Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Pekka Enberg @ 2010-04-07 16:49 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Zhang, Yanmin, Eric Dumazet, netdev, Tejun Heo, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <alpine.DEB.2.00.1004071130260.13261@router.home>

Christoph Lameter wrote:
> I wonder if this is not related to the kmem_cache_cpu structure straddling
> cache line boundaries under some conditions. On 2.6.33 the kmem_cache_cpu
> structure was larger and therefore tight packing resulted in different
> alignment.
> 
> Could you see how the following patch affects the results. It attempts to
> increase the size of kmem_cache_cpu to a power of 2 bytes. There is also
> the potential that other per cpu fetches to neighboring objects affect the
> situation. We could cacheline align the whole thing.
> 
> ---
>  include/linux/slub_def.h |    5 +++++
>  1 file changed, 5 insertions(+)
> 
> Index: linux-2.6/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6.orig/include/linux/slub_def.h	2010-04-07 11:33:50.000000000 -0500
> +++ linux-2.6/include/linux/slub_def.h	2010-04-07 11:35:18.000000000 -0500
> @@ -38,6 +38,11 @@ struct kmem_cache_cpu {
>  	void **freelist;	/* Pointer to first free per cpu object */
>  	struct page *page;	/* The slab from which we are allocating */
>  	int node;		/* The node of the page (or -1 for debug) */
> +#ifndef CONFIG_64BIT
> +	int dummy1;
> +#endif
> +	unsigned long dummy2;
> +
>  #ifdef CONFIG_SLUB_STATS
>  	unsigned stat[NR_SLUB_STAT_ITEMS];
>  #endif

Would __cacheline_aligned_in_smp do the trick here?

^ permalink raw reply

* [RFC] change dst_entry padding from using an array to using __attribute__
From: Laurent Chavey @ 2010-04-07 16:46 UTC (permalink / raw)
  To: netdev; +Cc: laurent chavey

What are the benefits of using a padding array to force a struct
member to be aligned on a 64-byte / 32-byte boundary,
versus using gcc's __attribute__((aligned))?


------------------------------------

struct dst_entry {

       /*
        * Align __refcnt to a 64 bytes alignment
        * (L1_CACHE_SIZE would be too much)
        */
- #ifdef CONFIG_64BIT
-       long                    __pad_to_align_refcnt[1];
- #else
-       long                    __pad_to_align_refcnt[0];
- #endif
+       atomic_t                __refcnt __attribute__((aligned(64))); /* client references    */
#undef __padding__
       /*
        * __refcnt wants to be on a different cache line from
        * input/output/ops or performance tanks badly
        */
       atomic_t                __refcnt;       /* client references    */
};

^ permalink raw reply

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Christoph Lameter @ 2010-04-07 16:43 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: Eric Dumazet, netdev, Tejun Heo, Pekka Enberg, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <1270607668.2078.259.camel@ymzhang.sh.intel.com>

On Wed, 7 Apr 2010, Zhang, Yanmin wrote:

> I collected retired instruction, dtlb miss and LLC miss.
> Below is data of LLC miss.
>
> Kernel 2.6.33:
>     20.94%        hackbench  [kernel.kallsyms]                                       [k] copy_user_generic_string
>     14.56%        hackbench  [kernel.kallsyms]                                       [k] unix_stream_recvmsg
>     12.88%        hackbench  [kernel.kallsyms]                                       [k] kfree
>      7.37%        hackbench  [kernel.kallsyms]                                       [k] kmem_cache_free
>      7.18%        hackbench  [kernel.kallsyms]                                       [k] kmem_cache_alloc_node
>      6.78%        hackbench  [kernel.kallsyms]                                       [k] kfree_skb
>      6.27%        hackbench  [kernel.kallsyms]                                       [k] __kmalloc_node_track_caller
>      2.73%        hackbench  [kernel.kallsyms]                                       [k] __slab_free
>      2.21%        hackbench  [kernel.kallsyms]                                       [k] get_partial_node
>      2.01%        hackbench  [kernel.kallsyms]                                       [k] _raw_spin_lock
>      1.59%        hackbench  [kernel.kallsyms]                                       [k] schedule
>      1.27%        hackbench  hackbench                                               [.] receiver
>      0.99%        hackbench  libpthread-2.9.so                                       [.] __read
>      0.87%        hackbench  [kernel.kallsyms]                                       [k] unix_stream_sendmsg
>
> Kernel 2.6.34-rc3:
>     18.55%        hackbench  [kernel.kallsyms]                                                     [k] copy_user_generic_string
>     13.19%        hackbench  [kernel.kallsyms]                                                     [k] unix_stream_recvmsg
>     11.62%        hackbench  [kernel.kallsyms]                                                     [k] kfree
>      8.54%        hackbench  [kernel.kallsyms]                                                     [k] kmem_cache_free
>      7.88%        hackbench  [kernel.kallsyms]                                                     [k] __kmalloc_node_track_caller

Seems that the overhead of __kmalloc_node_track_caller was increased. The
function inlines slab_alloc().

>      6.54%        hackbench  [kernel.kallsyms]                                                     [k] kmem_cache_alloc_node
>      5.94%        hackbench  [kernel.kallsyms]                                                     [k] kfree_skb
>      3.48%        hackbench  [kernel.kallsyms]                                                     [k] __slab_free
>      2.15%        hackbench  [kernel.kallsyms]                                                     [k] _raw_spin_lock
>      1.83%        hackbench  [kernel.kallsyms]                                                     [k] schedule
>      1.82%        hackbench  [kernel.kallsyms]                                                     [k] get_partial_node
>      1.59%        hackbench  hackbench                                                             [.] receiver
>      1.37%        hackbench  libpthread-2.9.so                                                     [.] __read

I wonder if this is not related to the kmem_cache_cpu structure straddling
cache line boundaries under some conditions. On 2.6.33 the kmem_cache_cpu
structure was larger and therefore tight packing resulted in different
alignment.

Could you see how the following patch affects the results. It attempts to
increase the size of kmem_cache_cpu to a power of 2 bytes. There is also
the potential that other per cpu fetches to neighboring objects affect the
situation. We could cacheline align the whole thing.

---
 include/linux/slub_def.h |    5 +++++
 1 file changed, 5 insertions(+)

Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2010-04-07 11:33:50.000000000 -0500
+++ linux-2.6/include/linux/slub_def.h	2010-04-07 11:35:18.000000000 -0500
@@ -38,6 +38,11 @@ struct kmem_cache_cpu {
 	void **freelist;	/* Pointer to first free per cpu object */
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
+#ifndef CONFIG_64BIT
+	int dummy1;
+#endif
+	unsigned long dummy2;
+
 #ifdef CONFIG_SLUB_STATS
 	unsigned stat[NR_SLUB_STAT_ITEMS];
 #endif

^ permalink raw reply

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Christoph Lameter @ 2010-04-07 16:30 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: Eric Dumazet, netdev, Tejun Heo, Pekka Enberg, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <1270607668.2078.259.camel@ymzhang.sh.intel.com>

On Wed, 7 Apr 2010, Zhang, Yanmin wrote:

> > booting with slub_min_order=3 do change hackbench results for example ;)
> By default, slub_min_order=3 on my Nehalem machines. I also tried different
> larger slub_min_order and didn't find help.

Let's stop fiddling with kernel command line parameters for these tests.
Leave them at the defaults. That is how I tested.

^ permalink raw reply

* Re: [Bug 15703] New: Getting "MD5 Hash failed for" for fragmented IP packets.
From: Stephen Hemminger @ 2010-04-07 16:28 UTC (permalink / raw)
  To: bugzilla-daemon, vishal.swarnkar; +Cc: netdev
In-Reply-To: <bug-15703-100@https.bugzilla.kernel.org/>

On Tue, 6 Apr 2010 09:52:46 GMT
bugzilla-daemon@bugzilla.kernel.org wrote:

> https://bugzilla.kernel.org/show_bug.cgi?id=15703
> 
>            Summary: Getting "MD5 Hash failed for" for fragmented IP
>                     packets.
>            Product: Networking
>            Version: 2.5
>     Kernel Version: 2.6.31-14-generic
>           Platform: All
>         OS/Version: Linux
>               Tree: Mainline
>             Status: NEW
>           Severity: normal
>           Priority: P1
>          Component: IPV4
>         AssignedTo: shemminger@linux-foundation.org
>         ReportedBy: vishal.swarnkar@gmail.com
>                 CC: vishal.swarnkar@gmail.com
>         Regression: No
> 
> 
> I am using Ubuntu 9.10 with a kernel 2.6.31-14-generic.
> 
> At my linux box I am receiving IP packets with TCP as payload with TCP-MD5
> option as it's a BGP update.
> 
> If I am getting fragmented IP packets on my Linux box then I am receiving only
> one fragment of the IP packet and I don't receive the second fragments because
> of some settings at my firewall ( I discard small packets, and the second
> fragment has only 20 bytes of data, so I consider it as small packet and always
> discard it).
> 
> I have ensured that I am not receiving the second fragment of the packet all
> the time ( in all further retransmission ) using sniffers.
> 
> Now for a IP fragmented packet I keep on getting messages 
> "MD5 hash failed for IP(src)--> IP(Dst)". I receive this message for all
> fragmented packets. The message is in tcp_ipv4.c - >tcp_v4_inbound_md5_hash
> method.
> 
> I hope I should not be getting these messages because the fragmented packets
> should not be pushed to the upper layer( TCP) for further sanity checks(
> MD5-check sum verification), until all fragments are assembled together.
> 
> I tried to look at the ip_rcv code and found that ip_local_deliver(struct
> sk_buff* skb) is calling ip_defrag() function to check if the fragmentation
> task is still in progress or not.
> 
> I tried to dig down more into ip_defrag function and the return values from
> ip_defrag function and I hope that the check in the ip_local_deliver function
> is not correct.
> 
> int ip_local_deliver(struct sk_buff *skb)
> {
>     /*
>      *    Reassemble IP fragments.
>      */
> 
>     if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
>         if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
>             return 0;
>     }
> 
>     return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
>                ip_local_deliver_finish);
> }
> 
> 
> I think the check for ip_defrag should be like this 
> ------>>>>if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER) < 0) 
> 
> we should check the negative return to avoid ip_local_deliver_finish, instead
> of checking the null return.

Maybe path MTU discovery can help with this.

^ permalink raw reply

* [PATCH] xtables: make XT_ALIGN() usable in exported headers by exporting __ALIGN_KERNEL()
From: Alexey Dobriyan @ 2010-04-07 16:22 UTC (permalink / raw)
  To: kaber; +Cc: linux-kernel, netdev, shemminger, bhutchings, andreas, hadi,
	hideaki

XT_ALIGN() was rewritten through ALIGN() by commit 42107f5009da223daa800d6da6904d77297ae829
"netfilter: xtables: symmetric COMPAT_XT_ALIGN definition".
ALIGN() is not exported in userspace headers, which created a compile problem for tc(8)
and will create a problem for iptables(8).

We can't export generic looking name ALIGN() but we can export less generic
__ALIGN_KERNEL() (suggested by Ben Hutchings).
Google knows nothing about __ALIGN_KERNEL().

COMPAT_XT_ALIGN() changed for symmetry.

Reported-by: Andreas Henriksson <andreas@fatal.se>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 include/linux/kernel.h             |    5 +++--
 include/linux/netfilter/x_tables.h |    6 +++---
 2 files changed, 6 insertions(+), 5 deletions(-)

--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -4,6 +4,8 @@
 /*
  * 'kernel.h' contains some often-used function prototypes etc
  */
+#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
+#define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
 
 #ifdef __KERNEL__
 
@@ -37,8 +39,7 @@ extern const char linux_proc_banner[];
 
 #define STACK_MAGIC	0xdeadbeef
 
-#define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
-#define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
+#define ALIGN(x, a)		__ALIGN_KERNEL((x), (a))
 #define PTR_ALIGN(p, a)		((typeof(p))ALIGN((unsigned long)(p), (a)))
 #define IS_ALIGNED(x, a)		(((x) & ((typeof(x))(a) - 1)) == 0)
 
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -1,6 +1,6 @@
 #ifndef _X_TABLES_H
 #define _X_TABLES_H
-
+#include <linux/kernel.h>
 #include <linux/types.h>
 
 #define XT_FUNCTION_MAXNAMELEN 30
@@ -93,7 +93,7 @@ struct _xt_align {
 	__u64 u64;
 };
 
-#define XT_ALIGN(s) ALIGN((s), __alignof__(struct _xt_align))
+#define XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _xt_align))
 
 /* Standard return verdict, or do jump. */
 #define XT_STANDARD_TARGET ""
@@ -598,7 +598,7 @@ struct _compat_xt_align {
 	compat_u64 u64;
 };
 
-#define COMPAT_XT_ALIGN(s) ALIGN((s), __alignof__(struct _compat_xt_align))
+#define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align))
 
 extern void xt_compat_lock(u_int8_t af);
 extern void xt_compat_unlock(u_int8_t af);

^ permalink raw reply

* Re: [RFC PATCH net-next 2/4] gianfar: Added timer feature for eTSEC
From: Scott Wood @ 2010-04-07 16:21 UTC (permalink / raw)
  To: Manfred Rudigier
  Cc: 'sandeep.kumar@freescale.com',
	'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'
In-Reply-To: <95DC1AA8EC908B48939B72CF375AA5E311A9F4E0@alice.at.omicron.at>

On Wed, Apr 07, 2010 at 11:46:24AM +0200, Manfred Rudigier wrote:
>  	switch (config.tx_type) {
>  	case HWTSTAMP_TX_OFF:
> +		priv->hwts_tx_en = 0;
>  		break;
>  	case HWTSTAMP_TX_ON:
> +		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
> +			return -ERANGE;
> +		priv->hwts_tx_en = 1;
>  		return -ERANGE;
>  	default:
>  		return -ERANGE;
> @@ -796,8 +802,12 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
>  
>  	switch (config.rx_filter) {
>  	case HWTSTAMP_FILTER_NONE:
> +		priv->hwts_rx_en = 0;
>  		break;
>  	default:
> +		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
> +			return -ERANGE;
> +		priv->hwts_rx_en = 1;
>  		return -ERANGE;
>  	}

You need to acquire bflock (or avoid using bitfields for this), or you could
race with the setting of wol_en or rx_csum_enable.

-Scott

^ permalink raw reply

* Re: [PATCH] IPVS: replace sprintf to snprintf to avoid stack buffer overflow
From: Patrick McHardy @ 2010-04-07 16:09 UTC (permalink / raw)
  To: Simon Horman
  Cc: wzt.wzt, linux-kernel, Wensong Zhang, Julian Anastasov, netdev,
	lvs-devel
In-Reply-To: <20100406032248.GF30359@verge.net.au>

[-- Attachment #1: Type: text/plain, Size: 968 bytes --]

Simon Horman wrote:
> On Tue, Apr 06, 2010 at 10:50:20AM +0800, wzt.wzt@gmail.com wrote:
>> IPVS does not check the length of pp->name, so using sprintf can cause a stack buffer overflow.
>> struct ip_vs_protocol{} declare name as char *, if register a protocol as:
>> struct ip_vs_protocol ip_vs_test = {
>>         .name =			"aaaaaaaa....128...aaa",
>> 	.debug_packet =         ip_vs_tcpudp_debug_packet,
>> };
>>
>> when called ip_vs_tcpudp_debug_packet(), sprintf(buf, "%s TRUNCATED", pp->name); 
>> will cause stack buffer overflow.
>>
>> Signed-off-by: Zhitong Wang <zhitong.wangzt@alibaba-inc.com>
> 
> I think that the simple answer is, don't do that.

Indeed.

> But your patch seems entirely reasonable to me.
> 
> Acked-by: Simon Horman <horms@verge.net.au>
> 
> Patrick, please consider merging this.

I think this fix is a bit silly, we can simply print the name in
the pr_debug() statement and avoid both the potential overflow
and truncation.

How does this look?

[-- Attachment #2: x --]
[-- Type: text/plain, Size: 3385 bytes --]

diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 0e58455..27add97 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -166,26 +166,24 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
 
 	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
 	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
+		sprintf(buf, "TRUNCATED");
 	else if (ih->frag_off & htons(IP_OFFSET))
-		sprintf(buf, "%s %pI4->%pI4 frag",
-			pp->name, &ih->saddr, &ih->daddr);
+		sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
 	else {
 		__be16 _ports[2], *pptr;
 		pptr = skb_header_pointer(skb, offset + ih->ihl*4,
 					  sizeof(_ports), _ports);
 		if (pptr == NULL)
-			sprintf(buf, "%s TRUNCATED %pI4->%pI4",
-				pp->name, &ih->saddr, &ih->daddr);
+			sprintf(buf, "TRUNCATED %pI4->%pI4",
+				&ih->saddr, &ih->daddr);
 		else
-			sprintf(buf, "%s %pI4:%u->%pI4:%u",
-				pp->name,
+			sprintf(buf, "%pI4:%u->%pI4:%u",
 				&ih->saddr, ntohs(pptr[0]),
 				&ih->daddr, ntohs(pptr[1]));
 	}
 
-	pr_debug("%s: %s\n", msg, buf);
+	pr_debug("%s: %s %s\n", msg, pp->name, buf);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -200,26 +198,24 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
 
 	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
 	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
+		sprintf(buf, "TRUNCATED");
 	else if (ih->nexthdr == IPPROTO_FRAGMENT)
-		sprintf(buf, "%s %pI6->%pI6 frag",
-			pp->name, &ih->saddr, &ih->daddr);
+		sprintf(buf, "%pI6->%pI6 frag",	&ih->saddr, &ih->daddr);
 	else {
 		__be16 _ports[2], *pptr;
 
 		pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
 					  sizeof(_ports), _ports);
 		if (pptr == NULL)
-			sprintf(buf, "%s TRUNCATED %pI6->%pI6",
-				pp->name, &ih->saddr, &ih->daddr);
+			sprintf(buf, "TRUNCATED %pI6->%pI6",
+				&ih->saddr, &ih->daddr);
 		else
-			sprintf(buf, "%s %pI6:%u->%pI6:%u",
-				pp->name,
+			sprintf(buf, "%pI6:%u->%pI6:%u",
 				&ih->saddr, ntohs(pptr[0]),
 				&ih->daddr, ntohs(pptr[1]));
 	}
 
-	pr_debug("%s: %s\n", msg, buf);
+	pr_debug("%s: %s %s\n", msg, pp->name, buf);
 }
 #endif
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index c30b43c..1892dfc 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -136,12 +136,11 @@ ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
 
 	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
 	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
+		sprintf(buf, "TRUNCATED");
 	else
-		sprintf(buf, "%s %pI4->%pI4",
-			pp->name, &ih->saddr, &ih->daddr);
+		sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
 
-	pr_debug("%s: %s\n", msg, buf);
+	pr_debug("%s: %s %s\n", msg, pp->name, buf);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -154,12 +153,11 @@ ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
 
 	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
 	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
+		sprintf(buf, "TRUNCATED");
 	else
-		sprintf(buf, "%s %pI6->%pI6",
-			pp->name, &ih->saddr, &ih->daddr);
+		sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
 
-	pr_debug("%s: %s\n", msg, buf);
+	pr_debug("%s: %s %s\n", msg, pp->name, buf);
 }
 #endif
 

^ permalink raw reply related

* [PATCH] macb: allow reception of large (>1518 bytes) frames
From: Peter Korsgaard @ 2010-04-07 14:59 UTC (permalink / raw)
  To: davem, haavard.skinnemoen, netdev, linux-kernel; +Cc: Peter Korsgaard

Enable BIG bit in the network configuration register, so the MAC doesn't
reject big frames (E.G. when vlans are used).

Signed-off-by: Peter Korsgaard <peter.korsgaard@barco.com>
---
 drivers/net/macb.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/macb.c b/drivers/net/macb.c
index c8a18a6..4dde9db 100644
--- a/drivers/net/macb.c
+++ b/drivers/net/macb.c
@@ -793,6 +793,7 @@ static void macb_init_hw(struct macb *bp)
 	config = macb_readl(bp, NCFGR) & MACB_BF(CLK, -1L);
 	config |= MACB_BIT(PAE);		/* PAuse Enable */
 	config |= MACB_BIT(DRFCS);		/* Discard Rx FCS */
+	config |= MACB_BIT(BIG);		/* Receive oversized frames */
 	if (bp->dev->flags & IFF_PROMISC)
 		config |= MACB_BIT(CAF);	/* Copy All Frames */
 	if (!(bp->dev->flags & IFF_BROADCAST))
-- 
1.7.0

^ permalink raw reply related

* Re: [Bugme-new] [Bug 15682] New: XFRM is not updating RTAX_ADVMSS metric
From: Herbert Xu @ 2010-04-07 13:54 UTC (permalink / raw)
  To: jamal
  Cc: Andrew Morton, netdev, bugzilla-daemon, bugme-daemon,
	eduardo.panisset, David S. Miller
In-Reply-To: <1270561240.7198.3.camel@bigi>

On Tue, Apr 06, 2010 at 09:40:40AM -0400, jamal wrote:
> 
> Herbert would give better answers. I dont think what Eduardo is 
> doing is correct. You cant just start factoring in tcp headers
> at the xfrm level - and besides, the mtu calculation
> already takes care of tunnel headers - so tcp should be able to
> compute correct MSS. 

I think Eduardo is basically right.

Unfortunately, we've kind of stuffed up in how we handle ADVMSS.
In particular, we don't update it at all when we get a normal
non-IPsec PMTU message.

As a result, if we now start updating it it may actually break
existing users.

We do provide a locked bit for the user to use; however, because
of the fact that we have never updated ADVMSS on the fly before,
people out there may not be aware of the locked bit.

Dave, what do you think about us starting to update ADVMSS on
the fly, just like the MTU?

The only risk is that existing users who are forcing ADVMSS to
a value higher than what it would otherwise be would now get a
lower value, if they're not using the "lock" keyword.

This shouldn't be fatal as it would only result in smaller packets
which should still work, and they can always start using the
"lock" keyword to get back the old behaviour.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH v3 0/4] xfrm: add x86 CONFIG_COMPAT support
From: Patrick McHardy @ 2010-04-07 13:45 UTC (permalink / raw)
  To: Florian Westphal; +Cc: David Miller, netdev, johannes
In-Reply-To: <20100407133528.GD22518@Chamillionaire.breakpoint.cc>

Florian Westphal wrote:
> David Miller <davem@davemloft.net> wrote:
>> From: Florian Westphal <fw@strlen.de>
>> Date: Tue,  6 Apr 2010 00:27:07 +0200
> 
> [..]
> 
>>> I sent a patch that solved this by adding a sys_compat_write syscall
>>> and a ->compat_aio_write() to struct file_operations to the
>>> vfs mailing list, but that patch was ignored by the vfs people,
>>> and the x86 folks did not exactly like the idea either.
>>>
>>> So this leaves three alternatives:
>>> 1 - drop the whole idea and keep the current status.
>>> 2 - Add new structure definitions (with new numbering) that would work
>>>     everywhere, keep the old ones for backwards compatibility (This
>>>     was suggested by Arnd Bergmann).

Given that there is only a quite small number of users of this
interface, that would in my opinion be the best way.

>>> 3 - apply this patch set and tell userspace to move to sendmsg() when
>>>     they want to work with xfrm on x86_64 with 32 bit userland.
>> So do we know of any xfrm netlink apps that do not use sendmsg()?

^ permalink raw reply

* Re: tulip_stop_rxtx() failed (CSR5 0xf0260000 CSR6 0xb3862002) on DEC Alpha Personal Workstation 433au
From: Adrian Glaubitz @ 2010-04-07 13:42 UTC (permalink / raw)
  To: Joe Perches
  Cc: Grant Grundler, Kyle McMartin, David S. Miller, netdev,
	linux-kernel
In-Reply-To: <1270488982.31062.28.camel@Joe-Laptop.home>

Hi,

On Mon, Apr 05, 2010 at 10:36:22AM -0700, Joe Perches wrote:
> On Mon, 2010-04-05 at 19:13 +0200, Adrian Glaubitz wrote:
> > Hi guys,
> > 
> > I installed Debian unstable on an old digital workstation "DEC Digital
> > Personal Workstation 433au" (Miata) which has an on-board tulip
> > network controller. I'm not really using that network controller but
> > an off-board intel e1000 controller. However, I found that the tulip
> > driver produces a lot of noise in the message log, the following
> > message is repeated periodically and spams the whole message log:
> > 
> > 0000:00:03.0: tulip_stop_rxtx() failed (CSR5 0xf0260000 CSR6 0xb3862002)
> > 
> > Do you think this is related to the fact that no cable is connected to
> > the network controller?
> 
> Probably something is trying periodically to open the device.
> Maybe this helps reduce the message log noise:
> 
> Signed-off-by: Joe Perches <joe@perches.com>

Acked-by: Adrian Glaubitz <glaubitz@physik.fu-berlin.de>

Patch works perfectly. Now the message log looks pretty normal.

Thanks,

Adrian

^ permalink raw reply

* Re: [PATCH v3 0/4] xfrm: add x86 CONFIG_COMPAT support
From: Florian Westphal @ 2010-04-07 13:35 UTC (permalink / raw)
  To: David Miller; +Cc: fw, netdev, johannes
In-Reply-To: <20100407.030850.04450543.davem@davemloft.net>

David Miller <davem@davemloft.net> wrote:
> From: Florian Westphal <fw@strlen.de>
> Date: Tue,  6 Apr 2010 00:27:07 +0200

[..]

> > I sent a patch that solved this by adding a sys_compat_write syscall
> > and a ->compat_aio_write() to struct file_operations to the
> > vfs mailing list, but that patch was ignored by the vfs people,
> > and the x86 folks did not exactly like the idea either.
> >
> > So this leaves three alternatives:
> > 1 - drop the whole idea and keep the current status.
> > 2 - Add new structure definitions (with new numbering) that would work
> >     everywhere, keep the old ones for backwards compatibility (This
> >     was suggested by Arnd Bergmann).
> > 3 - apply this patch set and tell userspace to move to sendmsg() when
> >     they want to work with xfrm on x86_64 with 32 bit userland.
>
> So do we know of any xfrm netlink apps that do not use sendmsg()?
>
> I doubt there are any, and if that's true for the most part, then
> option #3 seems the best.

Open- and strongswan do (in pluto/kernel_netlink.c), strongswan also uses
sendto() in charon (their ikev2 daemon); sendto does not work at the
moment either.

That being said, this is only in a single spot.
The openswan people indicated they'd accept a patch that converts the
write to a sendmsg call.

openikev2 also uses write on a netlink socket (looks rather similar
to the one in open/strongswan).

AFAICS, all the receive functions use recvfrom (which sets CMSG_COMPAT).

I did not find other ike software that uses xfrm.

> If we find a non-trivial number of apps using plain write() then
> we might have to consider championing your vfs patch to the
> lkml and vfs folks again.

There is probably no "nontrivial" number, simply because there are not
that many user apps out there to begin with.

But what's really bothering me is the number of sys_compat_* functions
needed to make all possibilities work; e.g. to make (unmodified)
strongswan work, sys_compat_write and sys_compat_sendto are needed.

What about applications that use any of the other countless
write functions?  Right now these do not set CMSG_COMPAT either.

And let's not get started with the abundance of read()-like functions...

[ I do not know about any such applications, but still ...]

> I'll help if this is needed.

Thank you.

Personally I hope that we can move the affected userspace tools to
sendmsg() and avoid introducing new compat syscalls, simply because
there is no telling what kind of mess we'd be walking into :-)

^ permalink raw reply

* RE: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver
From: Eric Dumazet @ 2010-04-07 13:31 UTC (permalink / raw)
  To: John Linn
  Cc: David Miller, netdev, linuxppc-dev, grant.likely, jwboyer,
	john.williams, michal.simek, jtyner
In-Reply-To: <b8be47d5-5db7-4e72-a45f-94fc54b7f370@VA3EHSMHS036.ehs.local>

Le mercredi 07 avril 2010 à 07:25 -0600, John Linn a écrit :
> > -----Original Message-----
> > From: David Miller [mailto:davem@davemloft.net]
> > Sent: Tuesday, April 06, 2010 8:52 PM
> > To: eric.dumazet@gmail.com
> > Cc: John Linn; netdev@vger.kernel.org; linuxppc-dev@ozlabs.org;
> grant.likely@secretlab.ca;
> > jwboyer@linux.vnet.ibm.com; john.williams@petalogix.com;
> michal.simek@petalogix.com; jtyner@cs.ucr.edu
> > Subject: Re: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver
> > 
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Mon, 05 Apr 2010 23:29:53 +0200
> > 
> > > So, I ask, cant you use netdev_alloc_skb_ip_align() in this driver ?
> > 
> > Thanks to everyone for getting this patch into shape.
> > 
> > I applied version 4 of the patch to net-next-2.6, thanks!
> 
> Thanks David, appreciate everyone's help and patience.
> 

You're welcome ;)

Thanks



^ permalink raw reply

* RE: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver
From: John Linn @ 2010-04-07 13:25 UTC (permalink / raw)
  To: David Miller, eric.dumazet
  Cc: netdev, linuxppc-dev, grant.likely, jwboyer, john.williams,
	michal.simek, jtyner
In-Reply-To: <20100406.195204.165441511.davem@davemloft.net>

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Tuesday, April 06, 2010 8:52 PM
> To: eric.dumazet@gmail.com
> Cc: John Linn; netdev@vger.kernel.org; linuxppc-dev@ozlabs.org;
grant.likely@secretlab.ca;
> jwboyer@linux.vnet.ibm.com; john.williams@petalogix.com;
michal.simek@petalogix.com; jtyner@cs.ucr.edu
> Subject: Re: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver
> 
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 05 Apr 2010 23:29:53 +0200
> 
> > So, I ask, cant you use netdev_alloc_skb_ip_align() in this driver ?
> 
> Thanks to everyone for getting this patch into shape.
> 
> I applied version 4 of the patch to net-next-2.6, thanks!

Thanks David, appreciate everyone's help and patience.

This email and any attachments are intended for the sole use of the named recipient(s) and contain(s) confidential information that may be proprietary, privileged or copyrighted under applicable law. If you are not the intended recipient, do not read, copy, or forward this email message or any attachments. Delete this email message and any attachments immediately.



^ permalink raw reply

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Jan Engelhardt @ 2010-04-07 13:26 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, netdev
In-Reply-To: <4BBB634D.6050109@trash.net>


On Tuesday 2010-04-06 18:37, Patrick McHardy wrote:
>Jan Engelhardt wrote:
>> On Thursday 2010-04-01 15:22, Patrick McHardy wrote:
>>>> Or should we be using skb_alloc and copying the data portion over, like 
>>>> ipt_REJECT does since v2.6.24-2931-g9ba99b0?
>>> I guess pskb_copy() would be most optimal since we can modify
>>> the header, but the non-linear area could be shared
>> 
>> Trying to improve my understanding: when doing skb_pull,
>> does the skb->head that is relevant for pskb_copy move?
>
>skb_pull() only changes skb->data.

But how does it interact, with, say, xt_TCPMSS which modifies not
only the L3 header, but also the L4 header?

^ permalink raw reply

* [PATCH] CAIF: write check
From: Alan Cox @ 2010-04-07 13:17 UTC (permalink / raw)
  To: Sjur BRENDELAND; +Cc: netdev@vger.kernel.org
In-Reply-To: <81C3A93C17462B4BBD7E272753C105791696B5DCF0@EXDCVYMBSTM005.EQ1STM.local>

caif: check write operations

From: Alan Cox <alan@linux.intel.com>

write is optional for a tty device. Check that we have a write op rather
than calling NULL.

Signed-off-by: Alan Cox <alan@linux.intel.com>
---

 drivers/net/caif/caif_serial.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)


diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index b271aa0..38c0186 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -312,6 +312,10 @@ static int ldisc_open(struct tty_struct *tty)
 	char name[64];
 	int result;
 
+	/* No write no play */
+	if (tty->ops->write == NULL)
+		return -EOPNOTSUPP;
+
 	sprintf(name, "cf%s", tty->name);
 	dev = alloc_netdev(sizeof(*ser), name, caifdev_setup);
 	ser = netdev_priv(dev);


^ permalink raw reply related

* [PATCH] Caif: Ref counting
From: Alan Cox @ 2010-04-07 13:13 UTC (permalink / raw)
  To: Sjur BRENDELAND; +Cc: netdev@vger.kernel.org
In-Reply-To: <81C3A93C17462B4BBD7E272753C105791696B5DCF0@EXDCVYMBSTM005.EQ1STM.local>

caif: tty's are kref objects so take a reference

From: Alan Cox <alan@linux.intel.com>

I don't think this can be abused in this case but do things properly.

Signed-off-by: Alan Cox <alan@linux.intel.com>
---

 drivers/net/caif/caif_serial.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)


diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index 3502f60..b271aa0 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -315,7 +315,7 @@ static int ldisc_open(struct tty_struct *tty)
 	sprintf(name, "cf%s", tty->name);
 	dev = alloc_netdev(sizeof(*ser), name, caifdev_setup);
 	ser = netdev_priv(dev);
-	ser->tty = tty;
+	ser->tty = tty_kref_get(tty);
 	ser->dev = dev;
 	debugfs_init(ser, tty);
 	tty->receive_room = N_TTY_BUF_SIZE;
@@ -348,6 +348,7 @@ static void ldisc_close(struct tty_struct *tty)
 	unregister_netdevice(ser->dev);
 	list_del(&ser->node);
 	debugfs_deinit(ser);
+	tty_kref_put(ser->tty);
 	if (!islocked)
 		rtnl_unlock();
 }

^ permalink raw reply related

* Re: [Bugme-new] [Bug 15682] New: XFRM is not updating RTAX_ADVMSS metric
From: jamal @ 2010-04-07 12:19 UTC (permalink / raw)
  To: Eduardo Panisset
  Cc: Andrew Morton, Herbert Xu, hideaki, netdev, bugzilla-daemon,
	bugme-daemon, David S. Miller
In-Reply-To: <y2jb7b22e81004061202uac6ef8eeodc70d6deb65f7699@mail.gmail.com>

Hi Eduardo,

As a first step, I don't know what the real cause is, but
what you showed as a solution didn't look right. I am also not familiar
with DSMIPv6. I know a lot of other xfrm subsystems work fine with
the current code. So that's why I pointed to Herbert, who is more
knowledgeable. I am adding Yoshfuji to the list.
If you tell me what kernel options to turn on and a very
basic test setup to reproduce it (I dont have much hardware, so make it
very minimal maybe requiring one or two PCs max) then I could make time
and try to reproduce it.

cheers,
jamal

On Tue, 2010-04-06 at 16:02 -0300, Eduardo Panisset wrote:
> Hi,
> 
> My intention is only to report a problem that I have faced. The
> solution proposed isn't (I know that) probably the best one to adopt
> as I'm not a kernel specialist, it is more illustrative to allow you
> guys understanding what I'm meaning and solve that on the better way
> (hence I haven't submited a patch).
> 
> Regards,
> Eduardo Panisset.

^ permalink raw reply

* [PATCH] corrected documentation for hardware time stamping
From: Patrick Loschmidt @ 2010-04-07 11:15 UTC (permalink / raw)
  To: netdev; +Cc: Patrick Ohly

From: Patrick Loschmidt <Patrick.Loschmidt@oeaw.ac.at>

The current documentation for hardware time stamping does not correctly specify the available kernel
functions since the implementation was changed later on.

Signed-off-by: Patrick Loschmidt <Patrick.Loschmidt@oeaw.ac.at>
---
--- Documentation/networking/timestamping.txt.orig	2010-04-07 12:52:47.000000000 +0200
+++ Documentation/networking/timestamping.txt	2010-04-07 11:43:57.000000000 +0200
@@ -41,11 +41,12 @@ SOF_TIMESTAMPING_SOFTWARE:     return sy
 SOF_TIMESTAMPING_TX/RX determine how time stamps are generated.
 SOF_TIMESTAMPING_RAW/SYS determine how they are reported in the
 following control message:
-    struct scm_timestamping {
-           struct timespec systime;
-           struct timespec hwtimetrans;
-           struct timespec hwtimeraw;
-    };
+
+struct scm_timestamping {
+	struct timespec systime;
+	struct timespec hwtimetrans;
+	struct timespec hwtimeraw;
+};

 recvmsg() can be used to get this control message for regular incoming
 packets. For send time stamps the outgoing packet is looped back to
@@ -87,12 +88,13 @@ by the network device and will be empty
 SIOCSHWTSTAMP:

 Hardware time stamping must also be initialized for each device driver
-that is expected to do hardware time stamping. The parameter is:
+that is expected to do hardware time stamping. The parameter is defined in
+/include/linux/net_tstamp.h as:

 struct hwtstamp_config {
-    int flags;           /* no flags defined right now, must be zero */
-    int tx_type;         /* HWTSTAMP_TX_* */
-    int rx_filter;       /* HWTSTAMP_FILTER_* */
+	int flags;	/* no flags defined right now, must be zero */
+	int tx_type;	/* HWTSTAMP_TX_* */
+	int rx_filter;	/* HWTSTAMP_FILTER_* */
 };

 Desired behavior is passed into the kernel and to a specific device by
@@ -139,42 +141,56 @@ enum {
 	/* time stamp any incoming packet */
 	HWTSTAMP_FILTER_ALL,

-        /* return value: time stamp all packets requested plus some others */
-        HWTSTAMP_FILTER_SOME,
+	/* return value: time stamp all packets requested plus some others */
+	HWTSTAMP_FILTER_SOME,

 	/* PTP v1, UDP, any kind of event packet */
 	HWTSTAMP_FILTER_PTP_V1_L4_EVENT,
-
-        ...
+	
+	/* for the complete list of values, please check
+	 * the include file /include/linux/net_tstamp.h
+	 */
 };


 DEVICE IMPLEMENTATION

 A driver which supports hardware time stamping must support the
-SIOCSHWTSTAMP ioctl. Time stamps for received packets must be stored
-in the skb with skb_hwtstamp_set().
+SIOCSHWTSTAMP ioctl and update the supplied struct hwtstamp_config with
+the actual values as described in the section on SIOCSHWTSTAMP.
+
+Time stamps for received packets must be stored in the skb. To get a pointer
+to the shared time stamp structure of the skb call skb_hwtstamps(). Then
+set the time stamps in the structure:
+
+struct skb_shared_hwtstamps {
+	/* hardware time stamp transformed into duration
+	 * since arbitrary point in time
+	 */
+	ktime_t	hwtstamp;
+	ktime_t	syststamp; /* hwtstamp transformed to system time base */
+};

 Time stamps for outgoing packets are to be generated as follows:
-- In hard_start_xmit(), check if skb_hwtstamp_check_tx_hardware()
-  returns non-zero. If yes, then the driver is expected
-  to do hardware time stamping.
+- In hard_start_xmit(), check if skb_tx(skb)->hardware is set no-zero.
+  If yes, then the driver is expected to do hardware time stamping.
 - If this is possible for the skb and requested, then declare
-  that the driver is doing the time stamping by calling
-  skb_hwtstamp_tx_in_progress(). A driver not supporting
-  hardware time stamping doesn't do that. A driver must never
-  touch sk_buff::tstamp! It is used to store how time stamping
-  for an outgoing packets is to be done.
+  that the driver is doing the time stamping by setting the field
+  skb_tx(skb)->in_progress non-zero. You might want to keep a pointer
+  to the associated skb for the next step and not free the skb. A driver
+  not supporting hardware time stamping doesn't do that. A driver must
+  never touch sk_buff::tstamp! It is used to store software generated
+  time stamps by the network subsystem.
 - As soon as the driver has sent the packet and/or obtained a
   hardware time stamp for it, it passes the time stamp back by
   calling skb_hwtstamp_tx() with the original skb, the raw
-  hardware time stamp and a handle to the device (necessary
-  to convert the hardware time stamp to system time). If obtaining
-  the hardware time stamp somehow fails, then the driver should
-  not fall back to software time stamping. The rationale is that
-  this would occur at a later time in the processing pipeline
-  than other software time stamping and therefore could lead
-  to unexpected deltas between time stamps.
-- If the driver did not call skb_hwtstamp_tx_in_progress(), then
+  hardware time stamp. skb_hwtstamp_tx() clones the original skb and
+  adds the timestamps, therefore the original skb has to be freed now.
+  If obtaining the hardware time stamp somehow fails, then the driver
+  should not fall back to software time stamping. The rationale is that
+  this would occur at a later time in the processing pipeline than other
+  software time stamping and therefore could lead to unexpected deltas
+  between time stamps.
+- If the driver did not call set skb_tx(skb)->in_progress, then
   dev_hard_start_xmit() checks whether software time stamping
   is wanted as fallback and potentially generates the time stamp.

^ permalink raw reply

* Re: [PATCH v3] Add Mergeable receive buffer support to vhost_net
From: Michael S. Tsirkin @ 2010-04-07 11:35 UTC (permalink / raw)
  To: David L Stevens; +Cc: kvm, netdev, rusty, virtualization
In-Reply-To: <20100407105910.GD9550@redhat.com>

Some corrections:

On Wed, Apr 07, 2010 at 01:59:10PM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 06, 2010 at 01:32:53PM -0700, David L Stevens wrote:
> > 
> > This patch adds support for the Mergeable Receive Buffers feature to
> > vhost_net.
> > 
> > 						+-DLS
> > 
> > Changes from previous revision:
> > 1) renamed:
> > 	vhost_discard_vq_desc -> vhost_discard_desc
> > 	vhost_get_heads -> vhost_get_desc_n
> > 	vhost_get_vq_desc -> vhost_get_desc
> > 2) added heads as argument to vhost_get_desc_n
> > 3) changed "vq->heads" from iovec to vring_used_elem, removed casts
> > 4) changed vhost_add_used to do multiple elements in a single
> > copy_to_user,
> > 	or two when we wrap the ring.
> > 5) removed rxmaxheadcount and available buffer checks in favor of
> > running until
> > 	an allocation failure, but making sure we break the loop if we get
> > 	two in a row, indicating we have at least 1 buffer, but not enough
> > 	for the current receive packet
> > 6) restore non-vnet header handling
> > 
> > Signed-Off-By: David L Stevens <dlstevens@us.ibm.com>
> 
> Thanks!
> There's some whitespace damage, are you sending with your new
> sendmail setup? It seems to have worked for qemu patches ...
> 
> > diff -ruNp net-next-p0/drivers/vhost/net.c
> > net-next-v3/drivers/vhost/net.c
> > --- net-next-p0/drivers/vhost/net.c	2010-03-22 12:04:38.000000000 -0700
> > +++ net-next-v3/drivers/vhost/net.c	2010-04-06 12:54:56.000000000 -0700
> > @@ -130,9 +130,8 @@ static void handle_tx(struct vhost_net *
> >  	hdr_size = vq->hdr_size;
> >  
> >  	for (;;) {
> > -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > -					 ARRAY_SIZE(vq->iov),
> > -					 &out, &in,
> > +		head = vhost_get_desc(&net->dev, vq, vq->iov,
> > +					 ARRAY_SIZE(vq->iov), &out, &in,
> >  					 NULL, NULL);
> >  		/* Nothing new?  Wait for eventfd to tell us they refilled. */
> >  		if (head == vq->num) {
> > @@ -167,8 +166,15 @@ static void handle_tx(struct vhost_net *
> >  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
> >  		err = sock->ops->sendmsg(NULL, sock, &msg, len);
> >  		if (unlikely(err < 0)) {
> > -			vhost_discard_vq_desc(vq);
> > -			tx_poll_start(net, sock);
> > +			if (err == -EAGAIN) {
> > +				vhost_discard_desc(vq, 1);
> > +				tx_poll_start(net, sock);
> > +			} else {
> > +				vq_err(vq, "sendmsg: errno %d\n", -err);
> > +				/* drop packet; do not discard/resend */
> > +				vhost_add_used_and_signal(&net->dev, vq, head,
> > +							  0);
> 
> vhost does not currently have a consistent error handling strategy:
> if we drop packets, need to think which other errors should cause
> packet drops.  I prefer to just call vq_err for now,
> and have us look at handling segfaults etc in a consistent way
> separately.
> 
> > +			}
> >  			break;
> >  		}
> >  		if (err != len)
> > @@ -186,12 +192,25 @@ static void handle_tx(struct vhost_net *
> >  	unuse_mm(net->dev.mm);
> >  }
> >  
> > +static int vhost_head_len(struct sock *sk)
> > +{
> > +	struct sk_buff *head;
> > +	int len = 0;
> > +
> > +	lock_sock(sk);
> > +	head = skb_peek(&sk->sk_receive_queue);
> > +	if (head)
> > +		len = head->len;
> > +	release_sock(sk);
> > +	return len;
> > +}
> > +
> 
> I wonder whether it makes sense to check
> skb_queue_empty(&sk->sk_receive_queue)
> outside the lock, to reduce the cost of this call
> on an empty queue (we know that it happens at least once
> each time we exit the loop on rx)?
> 
> >  /* Expects to be always run from workqueue - which acts as
> >   * read-size critical section for our kind of RCU. */
> >  static void handle_rx(struct vhost_net *net)
> >  {
> >  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> > -	unsigned head, out, in, log, s;
> > +	unsigned in, log, s;
> >  	struct vhost_log *vq_log;
> >  	struct msghdr msg = {
> >  		.msg_name = NULL,
> > @@ -202,13 +221,14 @@ static void handle_rx(struct vhost_net *
> >  		.msg_flags = MSG_DONTWAIT,
> >  	};
> >  
> > -	struct virtio_net_hdr hdr = {
> > -		.flags = 0,
> > -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
> > +	struct virtio_net_hdr_mrg_rxbuf hdr = {
> > +		.hdr.flags = 0,
> > +		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
> >  	};
> >  
> > +	int retries = 0;
> >  	size_t len, total_len = 0;
> > -	int err;
> > +	int err, headcount, datalen;
> >  	size_t hdr_size;
> >  	struct socket *sock = rcu_dereference(vq->private_data);
> >  	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > @@ -222,31 +242,25 @@ static void handle_rx(struct vhost_net *
> >  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> >  		vq->log : NULL;
> >  
> > -	for (;;) {
> > -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > -					 ARRAY_SIZE(vq->iov),
> > -					 &out, &in,
> > -					 vq_log, &log);
> > +	while ((datalen = vhost_head_len(sock->sk))) {
> > +		headcount = vhost_get_desc_n(vq, vq->heads, datalen, &in,
> > +					     vq_log, &log);
> 
> This looks like a bug, I think we need to pass
> datalen + header size to vhost_get_desc_n.
> Not sure how we know the header size that backend will use though.
> Maybe just look at our features.
> 
> >  		/* OK, now we need to know about added descriptors. */
> > -		if (head == vq->num) {
> > -			if (unlikely(vhost_enable_notify(vq))) {
> > +		if (!headcount) {
> > +			if (retries == 0 && unlikely(vhost_enable_notify(vq))) {
> >  				/* They have slipped one in as we were
> >  				 * doing that: check again. */
> >  				vhost_disable_notify(vq);
> > +				retries++;
> >  				continue;
> >  			}
> 
> Hmm. The reason we have the code at all, as the comment says, is because
> guest could have added more buffers between the time we read last index
> and the time we enabled notification. So if we just break like this
> the race still exists. We could remember the
> last head value we observed, and have vhost_enable_notify check
> against this value?
> 
> Need to think about it.
> 
> Another concern here is that on retries vhost_get_desc_n
> is doing extra work, rescanning the same descriptor
> again and again. Not sure how common this is, might be
> worthwhile to add a TODO to consider this at least.
> 
> > +			retries = 0;
> >  			/* Nothing new?  Wait for eventfd to tell us
> >  			 * they refilled. */
> >  			break;
> >  		}
> >  		/* We don't need to be notified again. */
> > -		if (out) {
> > -			vq_err(vq, "Unexpected descriptor format for RX: "
> > -			       "out %d, int %d\n",
> > -			       out, in);
> > -			break;
> > -		}
> > -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
> > +		/* Skip header. TODO: support TSO. */
> >  		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> >  		msg.msg_iovlen = in;
> >  		len = iov_length(vq->iov, in);
> > @@ -261,14 +275,33 @@ static void handle_rx(struct vhost_net *
> >  					 len, MSG_DONTWAIT | MSG_TRUNC);
> >  		/* TODO: Check specific error and bomb out unless EAGAIN? */
> >  		if (err < 0) {
> 
> I think we need to compare err and datalen and drop packet on mismatch as well.
> The check err > len won't be needed then.
> 
> > -			vhost_discard_vq_desc(vq);
> > +			vhost_discard_desc(vq, headcount);
> >  			break;
> >  		}
> >  		/* TODO: Should check and handle checksum. */
> > +		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
> > +			struct virtio_net_hdr_mrg_rxbuf *vhdr =
> > +				(struct virtio_net_hdr_mrg_rxbuf *)
> > +				vq->iov[0].iov_base;
> > +			/* add num_buffers */
> > +			if (vhost_has_feature(&net->dev,
> > +					      VHOST_NET_F_VIRTIO_NET_HDR))
> > +				hdr.num_buffers = headcount;
> 
> Why is the above necessary?
> 
> > +			else if (vq->iov[0].iov_len < sizeof(*vhdr)) {
> 
> I think length check is already done when we copy the header. No?
> 
> > +				vq_err(vq, "tiny buffers < %d unsupported",
> > +					vq->iov[0].iov_len);
> > +				vhost_discard_desc(vq, headcount);
> > +				break;
> 
> Problem here is that recvmsg might modify iov.
> This is why I think we need to use vq->hdr here.
> 
> > +			} else if (put_user(headcount, &vhdr->num_buffers)) {
> 
> The above put_user writes out a 32 bit value, right?
> This seems wrong.

Sorry, put_user looks at the pointer type, so that's ok.
I still suggest memcpy_toiovecend to remove layout assumptions.

> How about using
> 	memcpy_toiovecend(vq->hdr, &headcount,
> 			  offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers),
> 			  sizeof headcount);
> 
> this way we also do not make any assumptions about layout.
> 
> > +				vq_err(vq, "Failed num_buffers write");
> > +				vhost_discard_desc(vq, headcount);
> > +				break;
> > +			}
> > +		}
> >  		if (err > len) {
> >  			pr_err("Discarded truncated rx packet: "
> >  			       " len %d > %zd\n", err, len);
> > -			vhost_discard_vq_desc(vq);
> > +			vhost_discard_desc(vq, headcount);
> >  			continue;
> >  		}
> >  		len = err;
> > @@ -279,7 +312,7 @@ static void handle_rx(struct vhost_net *
> >  			break;
> >  		}
> >  		len += hdr_size;
> > -		vhost_add_used_and_signal(&net->dev, vq, head, len);
> > +		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, headcount);
> >  		if (unlikely(vq_log))
> >  			vhost_log_write(vq, vq_log, log, len);
> >  		total_len += len;
> > @@ -560,9 +593,14 @@ done:
> >  
> >  static int vhost_net_set_features(struct vhost_net *n, u64 features)
> >  {
> > -	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> > -		sizeof(struct virtio_net_hdr) : 0;
> > +	size_t hdr_size = 0;
> >  	int i;
> > +
> > +	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
> > +		hdr_size = sizeof(struct virtio_net_hdr);
> > +		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> > +			hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> 
> My personal style for this would be:
>   	if (!(features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)))
> 		hdr_size = 0
> 	else if (!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
> 		hdr_size = sizeof(virtio_net_hdr);
> 	else
> 		hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> 
> which results in more symmetry and less nesting.
> 
> >  	mutex_lock(&n->dev.mutex);
> >  	if ((features & (1 << VHOST_F_LOG_ALL)) &&
> >  	    !vhost_log_access_ok(&n->dev)) {
> > diff -ruNp net-next-p0/drivers/vhost/vhost.c
> > net-next-v3/drivers/vhost/vhost.c
> > --- net-next-p0/drivers/vhost/vhost.c	2010-03-22 12:04:38.000000000
> > -0700
> > +++ net-next-v3/drivers/vhost/vhost.c	2010-04-06 12:57:51.000000000
> > -0700
> > @@ -856,6 +856,47 @@ static unsigned get_indirect(struct vhos
> >  	return 0;
> >  }
> >  
> > +/* This is a multi-buffer version of vhost_get_vq_desc
> > + * @vq		- the relevant virtqueue
> > + * datalen	- data length we'll be reading
> > + * @iovcount	- returned count of io vectors we fill
> > + * @log		- vhost log
> > + * @log_num	- log offset
> > + *	returns number of buffer heads allocated, 0 on error
> 
> This is unusual. Let's return a negative error code on error.
> 
> > + */
> > +int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem
> > *heads,
> > +		     int datalen, int *iovcount, struct vhost_log *log,
> > +		     unsigned int *log_num)
> > +{
> > +	int out, in;
> > +	int seg = 0;		/* iov index */
> > +	int hc = 0;		/* head count */
> > +
> > +	while (datalen > 0) {
> > +		if (hc >= VHOST_NET_MAX_SG)
> > +			goto err;
> > +		heads[hc].id = vhost_get_desc(vq->dev, vq, vq->iov+seg,
> > +					      ARRAY_SIZE(vq->iov)-seg, &out,
> > +					      &in, log, log_num);
> > +		if (heads[hc].id == vq->num)
> > +			goto err;
> > +		if (out || in <= 0) {
> > +			vq_err(vq, "unexpected descriptor format for RX: "
> > +				"out %d, in %d\n", out, in);
> > +			goto err;
> > +		}
> > +		heads[hc].len = iov_length(vq->iov+seg, in);
> > +		datalen -= heads[hc].len;
> 
> This signed/unsigned mix makes me nervous.
> Let's make datalen unsigned, add unsigned total_len, and
> while (datalen < total_len).
> 
> > +		hc++;
> > +		seg += in;
> > +	}
> > +	*iovcount = seg;
> > +	return hc;
> > +err:
> > +	vhost_discard_desc(vq, hc);
> > +	return 0;
> > +}
> > +
> >  /* This looks in the virtqueue and for the first available buffer, and
> > converts
> >   * it to an iovec for convenient access.  Since descriptors consist of
> > some
> >   * number of output then some number of input descriptors, it's
> > actually two
> > @@ -863,7 +904,7 @@ static unsigned get_indirect(struct vhos
> >   *
> >   * This function returns the descriptor number found, or vq->num (which
> >   * is never a valid descriptor number) if none was found. */
> > -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct
> > vhost_virtqueue *vq,
> > +unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue
> > *vq,
> >  			   struct iovec iov[], unsigned int iov_size,
> >  			   unsigned int *out_num, unsigned int *in_num,
> >  			   struct vhost_log *log, unsigned int *log_num)
> > @@ -981,31 +1022,42 @@ unsigned vhost_get_vq_desc(struct vhost_
> >  }
> >  
> >  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling.
> > */
> > -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> > +void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
> >  {
> > -	vq->last_avail_idx--;
> > +	vq->last_avail_idx -= n;
> >  }
> >  
> >  /* After we've used one of their buffers, we tell them about it.  We'll
> > then
> >   * want to notify the guest, using eventfd. */
> > -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int
> > len)
> > +int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem
> > *heads,
> > +		   int count)
> 
> I think we are better off with vhost_add_used and vhost_add_used_n:
> the version with _n has a lot of extra complexity, and tx always
> adds them 1 by one.
> 
> >  {
> >  	struct vring_used_elem *used;
> > +	int start, n;
> > +
> > +	if (count <= 0)
> > +		return -EINVAL;
> >  
> > -	/* The virtqueue contains a ring of used buffers.  Get a pointer to
> > the
> > -	 * next entry in that used ring. */
> > -	used = &vq->used->ring[vq->last_used_idx % vq->num];
> > -	if (put_user(head, &used->id)) {
> > -		vq_err(vq, "Failed to write used id");
> > +	start = vq->last_used_idx % vq->num;
> > +	if (vq->num - start < count)
> > +		n = vq->num - start;
> > +	else
> > +		n = count;
> 
> use min?
> 
> > +	used = vq->used->ring + start;
> > +	if (copy_to_user(used, heads, sizeof(heads[0])*n)) {
> > +		vq_err(vq, "Failed to write used");
> >  		return -EFAULT;
> >  	}
> > -	if (put_user(len, &used->len)) {
> > -		vq_err(vq, "Failed to write used len");
> > -		return -EFAULT;
> > +	if (n < count) {	/* wrapped the ring */
> > +		used = vq->used->ring;
> > +		if (copy_to_user(used, heads+n, sizeof(heads[0])*(count-n))) {
> > +			vq_err(vq, "Failed to write used");
> > +			return -EFAULT;
> > +		}
> >  	}
> >  	/* Make sure buffer is written before we update index. */
> >  	smp_wmb();
> > -	if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
> > +	if (put_user(vq->last_used_idx+count, &vq->used->idx)) {
> 
> I am a bit confused ... will this write a 32 or 16 bit value?
> count is 32 bit ... Maybe we are better off with
>   u16 idx = vq->last_used_idx + count
>   put_user(idx, &vq->used->idx)
>   vq->last_used_idx = idx

The above's not necessary, put_user gets type from the pointer.

> >  		vq_err(vq, "Failed to increment used idx");
> >  		return -EFAULT;
> >  	}
> > @@ -1023,7 +1075,7 @@ int vhost_add_used(struct vhost_virtqueu
> >  		if (vq->log_ctx)
> >  			eventfd_signal(vq->log_ctx, 1);
> >  	}
> > -	vq->last_used_idx++;
> > +	vq->last_used_idx += count;
> >  	return 0;
> >  }
> >  
> > @@ -1049,10 +1101,23 @@ void vhost_signal(struct vhost_dev *dev,
> >  
> >  /* And here's the combo meal deal.  Supersize me! */
> >  void vhost_add_used_and_signal(struct vhost_dev *dev,
> > -			       struct vhost_virtqueue *vq,
> > -			       unsigned int head, int len)
> > +			       struct vhost_virtqueue *vq, unsigned int id,
> > +			       int len)
> > +{
> > +	struct vring_used_elem head;
> > +
> > +	head.id = id;
> > +	head.len = len;
> > +	vhost_add_used(vq, &head, 1);
> > +	vhost_signal(dev, vq);
> > +}
> > +
> > +/* multi-buffer version of vhost_add_used_and_signal */
> > +void vhost_add_used_and_signal_n(struct vhost_dev *dev,
> > +				 struct vhost_virtqueue *vq,
> > +				 struct vring_used_elem *heads, int count)
> >  {
> > -	vhost_add_used(vq, head, len);
> > +	vhost_add_used(vq, heads, count);
> >  	vhost_signal(dev, vq);
> >  }
> >  
> > diff -ruNp net-next-p0/drivers/vhost/vhost.h
> > net-next-v3/drivers/vhost/vhost.h
> > --- net-next-p0/drivers/vhost/vhost.h	2010-03-22 12:04:38.000000000
> > -0700
> > +++ net-next-v3/drivers/vhost/vhost.h	2010-04-05 20:33:57.000000000
> > -0700
> > @@ -85,6 +85,7 @@ struct vhost_virtqueue {
> >  	struct iovec iov[VHOST_NET_MAX_SG];
> >  	struct iovec hdr[VHOST_NET_MAX_SG];
> >  	size_t hdr_size;
> > +	struct vring_used_elem heads[VHOST_NET_MAX_SG];
> >  	/* We use a kind of RCU to access private pointer.
> >  	 * All readers access it from workqueue, which makes it possible to
> >  	 * flush the workqueue instead of synchronize_rcu. Therefore readers
> > do
> > @@ -120,16 +121,22 @@ long vhost_dev_ioctl(struct vhost_dev *,
> >  int vhost_vq_access_ok(struct vhost_virtqueue *vq);
> >  int vhost_log_access_ok(struct vhost_dev *);
> >  
> > -unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue
> > *,
> > +int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem
> > *heads,
> > +		     int datalen, int *iovcount, struct vhost_log *log,
> > +		     unsigned int *log_num);
> > +unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
> >  			   struct iovec iov[], unsigned int iov_count,
> >  			   unsigned int *out_num, unsigned int *in_num,
> >  			   struct vhost_log *log, unsigned int *log_num);
> > -void vhost_discard_vq_desc(struct vhost_virtqueue *);
> > +void vhost_discard_desc(struct vhost_virtqueue *, int);
> >  
> > -int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int
> > len);
> > -void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> > +int vhost_add_used(struct vhost_virtqueue *, struct vring_used_elem
> > *heads,
> > +		    int count);
> >  void vhost_add_used_and_signal(struct vhost_dev *, struct
> > vhost_virtqueue *,
> > -			       unsigned int head, int len);
> > +			       unsigned int id, int len);
> > +void vhost_add_used_and_signal_n(struct vhost_dev *, struct
> > vhost_virtqueue *,
> > +			       struct vring_used_elem *heads, int count);
> > +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> >  void vhost_disable_notify(struct vhost_virtqueue *);
> >  bool vhost_enable_notify(struct vhost_virtqueue *);
> >  
> > @@ -149,7 +156,8 @@ enum {
> >  	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
> >  			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
> >  			 (1 << VHOST_F_LOG_ALL) |
> > -			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
> > +			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
> > +			 (1 << VIRTIO_NET_F_MRG_RXBUF),
> >  };
> >  
> >  static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
> > 

^ permalink raw reply

* [PATCH] r6040: fix r6040_multicast_list
From: Florian Fainelli @ 2010-04-07 11:18 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, darkadept

As reported in <https://bugzilla.kernel.org/show_bug.cgi?id=15355>,
r6040_multicast_list currently crashes. This is due to a wrong maximum number
of multicast entries. This patch fixes the following issues with multicast:

- the maximum number of entries is off by one (4 instead of 3)

- the writing of the hash table index is not necessary and leads to invalid
values being written into the MCR1 register, so the MAC is simply put in a non
coherent state

- when we exceed the maximum number of multicast addresses, writing the
broadcast address should be done in registers MID_1{L,M,H} instead of
MID_0{L,M,H}, otherwise we would lose the adapter's MAC address

Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index f5a0e96..83ebc75 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -135,7 +135,7 @@
 #define RX_DESC_SIZE	(RX_DCNT * sizeof(struct r6040_descriptor))
 #define TX_DESC_SIZE	(TX_DCNT * sizeof(struct r6040_descriptor))
 #define MBCR_DEFAULT	0x012A	/* MAC Bus Control Register */
-#define MCAST_MAX	4	/* Max number multicast addresses to filter */
+#define MCAST_MAX	3	/* Max number multicast addresses to filter */
 
 /* Descriptor status */
 #define DSC_OWNER_MAC	0x8000	/* MAC is the owner of this descriptor */
@@ -983,9 +983,6 @@ static void r6040_multicast_list(struct net_device *dev)
 			crc >>= 26;
 			hash_table[crc >> 4] |= 1 << (15 - (crc & 0xf));
 		}
-		/* Write the index of the hash table */
-		for (i = 0; i < 4; i++)
-			iowrite16(hash_table[i] << 14, ioaddr + MCR1);
 		/* Fill the MAC hash tables with their values */
 		iowrite16(hash_table[0], ioaddr + MAR0);
 		iowrite16(hash_table[1], ioaddr + MAR1);
@@ -1001,9 +998,9 @@ static void r6040_multicast_list(struct net_device *dev)
 			iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
 			iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
 		} else {
-			iowrite16(0xffff, ioaddr + MID_0L + 8 * i);
-			iowrite16(0xffff, ioaddr + MID_0M + 8 * i);
-			iowrite16(0xffff, ioaddr + MID_0H + 8 * i);
+			iowrite16(0xffff, ioaddr + MID_1L + 8 * i);
+			iowrite16(0xffff, ioaddr + MID_1M + 8 * i);
+			iowrite16(0xffff, ioaddr + MID_1H + 8 * i);
 		}
 		i++;
 	}

^ permalink raw reply related

* Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
From: Michael S. Tsirkin @ 2010-04-07 11:17 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mingo, jdike
In-Reply-To: <1270630839-19876-1-git-send-email-xiaohui.xin@intel.com>

On Wed, Apr 07, 2010 at 05:00:39PM +0800, xiaohui.xin@intel.com wrote:
> From: Xin Xiaohui <xiaohui.xin@intel.com>
> 
> ---
> 
> Michael,
> Thanks a lot for the explanation. I have drafted a patch for the qemu write
> after I looked into tun driver. Does it do in right way?
> 
> Thanks
> Xiaohui
> 
>  drivers/vhost/mpassthru.c |   45 +++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 45 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
> index e9449ac..1cde097 100644
> --- a/drivers/vhost/mpassthru.c
> +++ b/drivers/vhost/mpassthru.c
> @@ -1065,6 +1065,49 @@ static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
>  	return mask;
>  }
>  
> +static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
> +				unsigned long count, loff_t pos)
> +{
> +	struct file *file = iocb->ki_filp;
> +	struct mp_struct *mp = mp_get(file->private_data);
> +	struct sock *sk = mp->socket.sk;
> +	struct sk_buff *skb;
> +	int len, err;
> +	ssize_t result;
> +
> +	if (!mp)
> +		return -EBADFD;
> +

Can this happen? When?

> +	/* currently, async is not supported */
> +	if (!is_sync_kiocb(iocb))
> +		return -EFAULT;

Really necessary? I think do_sync_write handles all this.

> +
> +	len = iov_length(iov, count);
> +	skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
> +				  file->f_flags & O_NONBLOCK, &err);
> +
> +	if (!skb)
> +		return -EFAULT;

Surely not EFAULT. -EAGAIN?

> +
> +	skb_reserve(skb, NET_IP_ALIGN);
> +	skb_put(skb, len);
> +
> +	if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
> +		kfree_skb(skb);
> +		return -EFAULT;
> +	}
> +	skb_set_network_header(skb, ETH_HLEN);

Is this really right or necessary? Also,
probably need to check that length is at least ETH_ALEN before
doing this.

> +	skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);

eth_type_trans?

> +	skb->dev = mp->dev;
> +
> +	dev_queue_xmit(skb);
> +	mp->dev->stats.tx_packets++;
> +	mp->dev->stats.tx_bytes += len;

Doesn't the hard start xmit function for the device
increment the counters?

> +
> +	mp_put(mp);
> +	return result;
> +}
> +
>  static int mp_chr_close(struct inode *inode, struct file *file)
>  {
>  	struct mp_file *mfile = file->private_data;
> @@ -1084,6 +1127,8 @@ static int mp_chr_close(struct inode *inode, struct file *file)
>  static const struct file_operations mp_fops = {
>  	.owner  = THIS_MODULE,
>  	.llseek = no_llseek,
> +	.write  = do_sync_write,
> +	.aio_write = mp_chr_aio_write,
>  	.poll   = mp_chr_poll,
>  	.unlocked_ioctl = mp_chr_ioctl,
>  	.open   = mp_chr_open,
> -- 
> 1.5.4.4

^ permalink raw reply

* [PATCH v2] can: Add esd board support to plx_pci CAN driver
From: Matthias Fuchs @ 2010-04-07 11:09 UTC (permalink / raw)
  To: netdev; +Cc: Socketcan-core

This patch adds support for SJA1000 based PCI CAN interface cards
from electronic system design gmbh.

Some changes have been done on the common code:
 - esd boards must not have the 2nd local interrupt enabled (PLX9030/9050)
 - a new path for PLX9056/PEX8311 chips has been added
 - new plx9056 reset function has been implemented
 - struct plx_card_info got a reset function entry

In detail the following additional boards are now supported:

        CAN-PCI/200 (PCI)
        CAN-PCI/266 (PCI)
        CAN-PMC266 (PMC module)
        CAN-PCIe/2000 (PCI Express)
        CAN-CPCI/200 (Compact PCI, 3U)
        CAN-PCI104 (PCI104)

Signed-off-by: Matthias Fuchs <matthias.fuchs@esd.eu>
---
v2:
 - update Kconfig
 - add proper plx9056 reset function
 - add reset function pointer to plx_card_info structure
 - use card's reset function in plx_pci_del_card()

 drivers/net/can/sja1000/Kconfig   |    4 +-
 drivers/net/can/sja1000/plx_pci.c |  153 ++++++++++++++++++++++++++++++++++---
 2 files changed, 145 insertions(+), 12 deletions(-)

diff --git a/drivers/net/can/sja1000/Kconfig b/drivers/net/can/sja1000/Kconfig
index c01eb4e..b49f609 100644
--- a/drivers/net/can/sja1000/Kconfig
+++ b/drivers/net/can/sja1000/Kconfig
@@ -62,7 +62,9 @@ config CAN_PLX_PCI
 	  Driver supports now:
 	   - Adlink PCI-7841/cPCI-7841 card (http://www.adlinktech.com/)
 	   - Adlink PCI-7841/cPCI-7841 SE card
+	   - esd CAN-PCI/CPCI/PCI104/200 (http://www.esd.eu/)
+	   - esd CAN-PCI/PMC/266
+	   - esd CAN-PCIe/2000
 	   - Marathon CAN-bus-PCI card (http://www.marathon.ru/)
 	   - TEWS TECHNOLOGIES TPMC810 card (http://www.tews.com/)
-
 endif
diff --git a/drivers/net/can/sja1000/plx_pci.c b/drivers/net/can/sja1000/plx_pci.c
index 6b46a63..8c8ed15 100644
--- a/drivers/net/can/sja1000/plx_pci.c
+++ b/drivers/net/can/sja1000/plx_pci.c
@@ -40,7 +40,10 @@ MODULE_DESCRIPTION("Socket-CAN driver for PLX90xx PCI-bridge cards with "
 MODULE_SUPPORTED_DEVICE("Adlink PCI-7841/cPCI-7841, "
 			"Adlink PCI-7841/cPCI-7841 SE, "
 			"Marathon CAN-bus-PCI, "
-			"TEWS TECHNOLOGIES TPMC810");
+			"TEWS TECHNOLOGIES TPMC810, "
+			"esd CAN-PCI/CPCI/PCI104/200, "
+			"esd CAN-PCI/PMC/266, "
+			"esd CAN-PCIe/2000")
 MODULE_LICENSE("GPL v2");
 
 #define PLX_PCI_MAX_CHAN 2
@@ -49,11 +52,14 @@ struct plx_pci_card {
 	int channels;			/* detected channels count */
 	struct net_device *net_dev[PLX_PCI_MAX_CHAN];
 	void __iomem *conf_addr;
+
+	/* Pointer to device-dependent reset function */
+	void (*reset_func)(struct pci_dev *pdev);
 };
 
 #define PLX_PCI_CAN_CLOCK (16000000 / 2)
 
-/* PLX90xx registers */
+/* PLX9030/9050/9052 registers */
 #define PLX_INTCSR	0x4c		/* Interrupt Control/Status */
 #define PLX_CNTRL	0x50		/* User I/O, Direct Slave Response,
 					 * Serial EEPROM, and Initialization
@@ -65,6 +71,14 @@ struct plx_pci_card {
 #define PLX_PCI_INT_EN	(1 << 6)	/* PCI Interrupt Enable */
 #define PLX_PCI_RESET	(1 << 30)	/* PCI Adapter Software Reset */
 
+/* PLX9056 registers */
+#define PLX9056_INTCSR	0x68		/* Interrupt Control/Status */
+#define PLX9056_CNTRL	0x6c		/* Control / Software Reset */
+
+#define PLX9056_LINTI	(1 << 11)
+#define PLX9056_PCI_INT_EN (1 << 8)
+#define PLX9056_PCI_RCR	(1 << 29)	/* Read Configuration Registers */
+
 /*
  * The board configuration is probably following:
  * RX1 is connected to ground.
@@ -100,6 +114,13 @@ struct plx_pci_card {
 #define ADLINK_PCI_VENDOR_ID		0x144A
 #define ADLINK_PCI_DEVICE_ID		0x7841
 
+#define ESD_PCI_SUB_SYS_ID_PCI200	0x0004
+#define ESD_PCI_SUB_SYS_ID_PCI266	0x0009
+#define ESD_PCI_SUB_SYS_ID_PMC266	0x000e
+#define ESD_PCI_SUB_SYS_ID_CPCI200	0x010b
+#define ESD_PCI_SUB_SYS_ID_PCIE2000	0x0200
+#define ESD_PCI_SUB_SYS_ID_PCI104200	0x0501
+
 #define MARATHON_PCI_DEVICE_ID		0x2715
 
 #define TEWS_PCI_VENDOR_ID		0x1498
@@ -107,6 +128,7 @@ struct plx_pci_card {
 
 static void plx_pci_reset_common(struct pci_dev *pdev);
 static void plx_pci_reset_marathon(struct pci_dev *pdev);
+static void plx9056_pci_reset_common(struct pci_dev *pdev);
 
 struct plx_pci_channel_map {
 	u32 bar;
@@ -147,6 +169,30 @@ static struct plx_pci_card_info plx_pci_card_info_adlink_se __devinitdata = {
 	/* based on PLX9052 */
 };
 
+static struct plx_pci_card_info plx_pci_card_info_esd200 __devinitdata = {
+	"esd CAN-PCI/CPCI/PCI104/200", 2,
+	PLX_PCI_CAN_CLOCK, PLX_PCI_OCR, PLX_PCI_CDR,
+	{0, 0x00, 0x00}, { {2, 0x00, 0x80}, {2, 0x100, 0x80} },
+	&plx_pci_reset_common
+	/* based on PLX9030/9050 */
+};
+
+static struct plx_pci_card_info plx_pci_card_info_esd266 __devinitdata = {
+	"esd CAN-PCI/PMC/266", 2,
+	PLX_PCI_CAN_CLOCK, PLX_PCI_OCR, PLX_PCI_CDR,
+	{0, 0x00, 0x00}, { {2, 0x00, 0x80}, {2, 0x100, 0x80} },
+	&plx9056_pci_reset_common
+	/* based on PLX9056 */
+};
+
+static struct plx_pci_card_info plx_pci_card_info_esd2000 __devinitdata = {
+	"esd CAN-PCIe/2000", 2,
+	PLX_PCI_CAN_CLOCK, PLX_PCI_OCR, PLX_PCI_CDR,
+	{0, 0x00, 0x00}, { {2, 0x00, 0x80}, {2, 0x100, 0x80} },
+	&plx9056_pci_reset_common
+	/* based on PEX8311 */
+};
+
 static struct plx_pci_card_info plx_pci_card_info_marathon __devinitdata = {
 	"Marathon CAN-bus-PCI", 2,
 	PLX_PCI_CAN_CLOCK, PLX_PCI_OCR, PLX_PCI_CDR,
@@ -179,6 +225,48 @@ static DEFINE_PCI_DEVICE_TABLE(plx_pci_tbl) = {
 		(kernel_ulong_t)&plx_pci_card_info_adlink_se
 	},
 	{
+		/* esd CAN-PCI/200 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9050,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_PCI200,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd200
+	},
+	{
+		/* esd CAN-CPCI/200 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9030,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_CPCI200,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd200
+	},
+	{
+		/* esd CAN-PCI104/200 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9030,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_PCI104200,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd200
+	},
+	{
+		/* esd CAN-PCI/266 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9056,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_PCI266,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd266
+	},
+	{
+		/* esd CAN-PMC/266 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9056,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_PMC266,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd266
+	},
+	{
+		/* esd CAN-PCIE/2000 */
+		PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9056,
+		PCI_VENDOR_ID_ESDGMBH, ESD_PCI_SUB_SYS_ID_PCIE2000,
+		0, 0,
+		(kernel_ulong_t)&plx_pci_card_info_esd2000
+	},
+	{
 		/* Marathon CAN-bus-PCI card */
 		PCI_VENDOR_ID_PLX, MARATHON_PCI_DEVICE_ID,
 		PCI_ANY_ID, PCI_ANY_ID,
@@ -241,7 +329,7 @@ static inline int plx_pci_check_sja1000(const struct sja1000_priv *priv)
 }
 
 /*
- * PLX90xx software reset
+ * PLX9030/50/52 software reset
  * Also LRESET# asserts and brings to reset device on the Local Bus (if wired).
  * For most cards it's enough for reset the SJA1000 chips.
  */
@@ -258,6 +346,38 @@ static void plx_pci_reset_common(struct pci_dev *pdev)
 	iowrite32(cntrl, card->conf_addr + PLX_CNTRL);
 };
 
+/*
+ * PLX9056 software reset
+ * Assert LRESET# and reset device(s) on the Local Bus (if wired).
+ */
+static void plx9056_pci_reset_common(struct pci_dev *pdev)
+{
+	struct plx_pci_card *card = pci_get_drvdata(pdev);
+	u32 cntrl;
+
+	/* issue a local bus reset */
+	cntrl = ioread32(card->conf_addr + PLX9056_CNTRL);
+	cntrl |= PLX_PCI_RESET;
+	iowrite32(cntrl, card->conf_addr + PLX9056_CNTRL);
+	udelay(100);
+	cntrl ^= PLX_PCI_RESET;
+	iowrite32(cntrl, card->conf_addr + PLX9056_CNTRL);
+
+	/* reload local configuration from EEPROM */
+	cntrl |= PLX9056_PCI_RCR;
+	iowrite32(cntrl, card->conf_addr + PLX9056_CNTRL);
+
+	/*
+	 * There is no safe way to poll for the end
+	 * of reconfiguration process. Waiting for 10ms
+	 * is safe.
+	 */
+	mdelay(10);
+
+	cntrl ^= PLX9056_PCI_RCR;
+	iowrite32(cntrl, card->conf_addr + PLX9056_CNTRL);
+};
+
 /* Special reset function for Marathon card */
 static void plx_pci_reset_marathon(struct pci_dev *pdev)
 {
@@ -301,13 +421,16 @@ static void plx_pci_del_card(struct pci_dev *pdev)
 		free_sja1000dev(dev);
 	}
 
-	plx_pci_reset_common(pdev);
+	card->reset_func(pdev);
 
 	/*
-	 * Disable interrupts from PCI-card (PLX90xx) and disable Local_1,
-	 * Local_2 interrupts
+	 * Disable interrupts from PCI-card and disable local
+	 * interrupts
 	 */
-	iowrite32(0x0, card->conf_addr + PLX_INTCSR);
+	if (pdev->device != PCI_DEVICE_ID_PLX_9056)
+		iowrite32(0x0, card->conf_addr + PLX_INTCSR);
+	else
+		iowrite32(0x0, card->conf_addr + PLX9056_INTCSR);
 
 	if (card->conf_addr)
 		pci_iounmap(pdev, card->conf_addr);
@@ -366,6 +489,7 @@ static int __devinit plx_pci_add_card(struct pci_dev *pdev,
 	card->conf_addr = addr + ci->conf_map.offset;
 
 	ci->reset_func(pdev);
+	card->reset_func = ci->reset_func;
 
 	/* Detect available channels */
 	for (i = 0; i < ci->channel_count; i++) {
@@ -437,10 +561,17 @@ static int __devinit plx_pci_add_card(struct pci_dev *pdev,
 	 * Enable interrupts from PCI-card (PLX90xx) and enable Local_1,
 	 * Local_2 interrupts from the SJA1000 chips
 	 */
-	val = ioread32(card->conf_addr + PLX_INTCSR);
-	val |= PLX_LINT1_EN | PLX_LINT2_EN | PLX_PCI_INT_EN;
-	iowrite32(val, card->conf_addr + PLX_INTCSR);
-
+	if (pdev->device != PCI_DEVICE_ID_PLX_9056) {
+		val = ioread32(card->conf_addr + PLX_INTCSR);
+		if (pdev->subsystem_vendor == PCI_VENDOR_ID_ESDGMBH)
+			val |= PLX_LINT1_EN | PLX_PCI_INT_EN;
+		else
+			val |= PLX_LINT1_EN | PLX_LINT2_EN | PLX_PCI_INT_EN;
+		iowrite32(val, card->conf_addr + PLX_INTCSR);
+	} else {
+		iowrite32(PLX9056_LINTI | PLX9056_PCI_INT_EN,
+			  card->conf_addr + PLX9056_INTCSR);
+	}
 	return 0;
 
 failure_cleanup:
-- 
1.6.1


^ permalink raw reply related

* Re: [PATCH v3] Add Mergeable receive buffer support to vhost_net
From: Michael S. Tsirkin @ 2010-04-07 10:59 UTC (permalink / raw)
  To: David L Stevens; +Cc: kvm, netdev, rusty, virtualization
In-Reply-To: <1270585973.28407.3.camel@lab1.dls>

On Tue, Apr 06, 2010 at 01:32:53PM -0700, David L Stevens wrote:
> 
> This patch adds support for the Mergeable Receive Buffers feature to
> vhost_net.
> 
> 						+-DLS
> 
> Changes from previous revision:
> 1) renamed:
> 	vhost_discard_vq_desc -> vhost_discard_desc
> 	vhost_get_heads -> vhost_get_desc_n
> 	vhost_get_vq_desc -> vhost_get_desc
> 2) added heads as argument to vhost_get_desc_n
> 3) changed "vq->heads" from iovec to vring_used_elem, removed casts
> 4) changed vhost_add_used to do multiple elements in a single
> copy_to_user,
> 	or two when we wrap the ring.
> 5) removed rxmaxheadcount and available buffer checks in favor of
> running until
> 	an allocation failure, but making sure we break the loop if we get
> 	two in a row, indicating we have at least 1 buffer, but not enough
> 	for the current receive packet
> 6) restore non-vnet header handling
> 
> Signed-Off-By: David L Stevens <dlstevens@us.ibm.com>

Thanks!
There's some whitespace damage, are you sending with your new
sendmail setup? It seems to have worked for qemu patches ...

> diff -ruNp net-next-p0/drivers/vhost/net.c
> net-next-v3/drivers/vhost/net.c
> --- net-next-p0/drivers/vhost/net.c	2010-03-22 12:04:38.000000000 -0700
> +++ net-next-v3/drivers/vhost/net.c	2010-04-06 12:54:56.000000000 -0700
> @@ -130,9 +130,8 @@ static void handle_tx(struct vhost_net *
>  	hdr_size = vq->hdr_size;
>  
>  	for (;;) {
> -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> -					 ARRAY_SIZE(vq->iov),
> -					 &out, &in,
> +		head = vhost_get_desc(&net->dev, vq, vq->iov,
> +					 ARRAY_SIZE(vq->iov), &out, &in,
>  					 NULL, NULL);
>  		/* Nothing new?  Wait for eventfd to tell us they refilled. */
>  		if (head == vq->num) {
> @@ -167,8 +166,15 @@ static void handle_tx(struct vhost_net *
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
>  		err = sock->ops->sendmsg(NULL, sock, &msg, len);
>  		if (unlikely(err < 0)) {
> -			vhost_discard_vq_desc(vq);
> -			tx_poll_start(net, sock);
> +			if (err == -EAGAIN) {
> +				vhost_discard_desc(vq, 1);
> +				tx_poll_start(net, sock);
> +			} else {
> +				vq_err(vq, "sendmsg: errno %d\n", -err);
> +				/* drop packet; do not discard/resend */
> +				vhost_add_used_and_signal(&net->dev, vq, head,
> +							  0);

vhost does not currently have a consistent error handling strategy:
if we drop packets, need to think which other errors should cause
packet drops.  I prefer to just call vq_err for now,
and have us look at handling segfaults etc in a consistent way
separately.

> +			}
>  			break;
>  		}
>  		if (err != len)
> @@ -186,12 +192,25 @@ static void handle_tx(struct vhost_net *
>  	unuse_mm(net->dev.mm);
>  }
>  
> +static int vhost_head_len(struct sock *sk)
> +{
> +	struct sk_buff *head;
> +	int len = 0;
> +
> +	lock_sock(sk);
> +	head = skb_peek(&sk->sk_receive_queue);
> +	if (head)
> +		len = head->len;
> +	release_sock(sk);
> +	return len;
> +}
> +

I wonder whether it makes sense to check
skb_queue_empty(&sk->sk_receive_queue)
outside the lock, to reduce the cost of this call
on an empty queue (we know that it happens at least once
each time we exit the loop on rx)?

>  /* Expects to be always run from workqueue - which acts as
>   * read-size critical section for our kind of RCU. */
>  static void handle_rx(struct vhost_net *net)
>  {
>  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> -	unsigned head, out, in, log, s;
> +	unsigned in, log, s;
>  	struct vhost_log *vq_log;
>  	struct msghdr msg = {
>  		.msg_name = NULL,
> @@ -202,13 +221,14 @@ static void handle_rx(struct vhost_net *
>  		.msg_flags = MSG_DONTWAIT,
>  	};
>  
> -	struct virtio_net_hdr hdr = {
> -		.flags = 0,
> -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
> +	struct virtio_net_hdr_mrg_rxbuf hdr = {
> +		.hdr.flags = 0,
> +		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
>  	};
>  
> +	int retries = 0;
>  	size_t len, total_len = 0;
> -	int err;
> +	int err, headcount, datalen;
>  	size_t hdr_size;
>  	struct socket *sock = rcu_dereference(vq->private_data);
>  	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> @@ -222,31 +242,25 @@ static void handle_rx(struct vhost_net *
>  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>  		vq->log : NULL;
>  
> -	for (;;) {
> -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> -					 ARRAY_SIZE(vq->iov),
> -					 &out, &in,
> -					 vq_log, &log);
> +	while ((datalen = vhost_head_len(sock->sk))) {
> +		headcount = vhost_get_desc_n(vq, vq->heads, datalen, &in,
> +					     vq_log, &log);

This looks like a bug, I think we need to pass
datalen + header size to vhost_get_desc_n.
Not sure how we know the header size that backend will use though.
Maybe just look at our features.

>  		/* OK, now we need to know about added descriptors. */
> -		if (head == vq->num) {
> -			if (unlikely(vhost_enable_notify(vq))) {
> +		if (!headcount) {
> +			if (retries == 0 && unlikely(vhost_enable_notify(vq))) {
>  				/* They have slipped one in as we were
>  				 * doing that: check again. */
>  				vhost_disable_notify(vq);
> +				retries++;
>  				continue;
>  			}

Hmm. The reason we have the code at all, as the comment says, is because
guest could have added more buffers between the time we read last index
and the time we enabled notification. So if we just break like this
the race still exists. We could remember the
last head value we observed, and have vhost_enable_notify check
against this value?

Need to think about it.

Another concern here is that on retries vhost_get_desc_n
is doing extra work, rescanning the same descriptor
again and again. Not sure how common this is, might be
worthwhile to add a TODO to consider this at least.

> +			retries = 0;
>  			/* Nothing new?  Wait for eventfd to tell us
>  			 * they refilled. */
>  			break;
>  		}
>  		/* We don't need to be notified again. */
> -		if (out) {
> -			vq_err(vq, "Unexpected descriptor format for RX: "
> -			       "out %d, int %d\n",
> -			       out, in);
> -			break;
> -		}
> -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
> +		/* Skip header. TODO: support TSO. */
>  		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
>  		msg.msg_iovlen = in;
>  		len = iov_length(vq->iov, in);
> @@ -261,14 +275,33 @@ static void handle_rx(struct vhost_net *
>  					 len, MSG_DONTWAIT | MSG_TRUNC);
>  		/* TODO: Check specific error and bomb out unless EAGAIN? */
>  		if (err < 0) {

I think we need to compare err and datalen and drop packet on mismatch as well.
The check err > len won't be needed then.

> -			vhost_discard_vq_desc(vq);
> +			vhost_discard_desc(vq, headcount);
>  			break;
>  		}
>  		/* TODO: Should check and handle checksum. */
> +		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
> +			struct virtio_net_hdr_mrg_rxbuf *vhdr =
> +				(struct virtio_net_hdr_mrg_rxbuf *)
> +				vq->iov[0].iov_base;
> +			/* add num_buffers */
> +			if (vhost_has_feature(&net->dev,
> +					      VHOST_NET_F_VIRTIO_NET_HDR))
> +				hdr.num_buffers = headcount;

Why is the above necessary?

> +			else if (vq->iov[0].iov_len < sizeof(*vhdr)) {

I think length check is already done when we copy the header. No?

> +				vq_err(vq, "tiny buffers < %d unsupported",
> +					vq->iov[0].iov_len);
> +				vhost_discard_desc(vq, headcount);
> +				break;

Problem here is that recvmsg might modify iov.
This is why I think we need to use vq->hdr here.

> +			} else if (put_user(headcount, &vhdr->num_buffers)) {

The above put_user writes out a 32 bit value, right?
This seems wrong.

How about using
	memcpy_toiovecend(vq->hdr, &headcount,
			  offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers),
			  sizeof headcount);

this way we also do not make any assumptions about layout.

> +				vq_err(vq, "Failed num_buffers write");
> +				vhost_discard_desc(vq, headcount);
> +				break;
> +			}
> +		}
>  		if (err > len) {
>  			pr_err("Discarded truncated rx packet: "
>  			       " len %d > %zd\n", err, len);
> -			vhost_discard_vq_desc(vq);
> +			vhost_discard_desc(vq, headcount);
>  			continue;
>  		}
>  		len = err;
> @@ -279,7 +312,7 @@ static void handle_rx(struct vhost_net *
>  			break;
>  		}
>  		len += hdr_size;
> -		vhost_add_used_and_signal(&net->dev, vq, head, len);
> +		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, headcount);
>  		if (unlikely(vq_log))
>  			vhost_log_write(vq, vq_log, log, len);
>  		total_len += len;
> @@ -560,9 +593,14 @@ done:
>  
>  static int vhost_net_set_features(struct vhost_net *n, u64 features)
>  {
> -	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> -		sizeof(struct virtio_net_hdr) : 0;
> +	size_t hdr_size = 0;
>  	int i;
> +
> +	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
> +		hdr_size = sizeof(struct virtio_net_hdr);
> +		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> +			hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);

My personal style for this would be:
  	if (!(features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)))
		hdr_size = 0
	else if (!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
		hdr_size = sizeof(virtio_net_hdr);
	else
		hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);

which results in more symmetry and less nesting.

>  	mutex_lock(&n->dev.mutex);
>  	if ((features & (1 << VHOST_F_LOG_ALL)) &&
>  	    !vhost_log_access_ok(&n->dev)) {
> diff -ruNp net-next-p0/drivers/vhost/vhost.c
> net-next-v3/drivers/vhost/vhost.c
> --- net-next-p0/drivers/vhost/vhost.c	2010-03-22 12:04:38.000000000
> -0700
> +++ net-next-v3/drivers/vhost/vhost.c	2010-04-06 12:57:51.000000000
> -0700
> @@ -856,6 +856,47 @@ static unsigned get_indirect(struct vhos
>  	return 0;
>  }
>  
> +/* This is a multi-buffer version of vhost_get_vq_desc
> + * @vq		- the relevant virtqueue
> + * datalen	- data length we'll be reading
> + * @iovcount	- returned count of io vectors we fill
> + * @log		- vhost log
> + * @log_num	- log offset
> + *	returns number of buffer heads allocated, 0 on error

This is unusual. Let's return a negative error code on error.

> + */
> +int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem
> *heads,
> +		     int datalen, int *iovcount, struct vhost_log *log,
> +		     unsigned int *log_num)
> +{
> +	int out, in;
> +	int seg = 0;		/* iov index */
> +	int hc = 0;		/* head count */
> +
> +	while (datalen > 0) {
> +		if (hc >= VHOST_NET_MAX_SG)
> +			goto err;
> +		heads[hc].id = vhost_get_desc(vq->dev, vq, vq->iov+seg,
> +					      ARRAY_SIZE(vq->iov)-seg, &out,
> +					      &in, log, log_num);
> +		if (heads[hc].id == vq->num)
> +			goto err;
> +		if (out || in <= 0) {
> +			vq_err(vq, "unexpected descriptor format for RX: "
> +				"out %d, in %d\n", out, in);
> +			goto err;
> +		}
> +		heads[hc].len = iov_length(vq->iov+seg, in);
> +		datalen -= heads[hc].len;

This signed/unsigned mix makes me nervous.
Let's make datalen unsigned, add unsigned total_len, and
while (datalen < total_len).

> +		hc++;
> +		seg += in;
> +	}
> +	*iovcount = seg;
> +	return hc;
> +err:
> +	vhost_discard_desc(vq, hc);
> +	return 0;
> +}
> +
>  /* This looks in the virtqueue and for the first available buffer, and
> converts
>   * it to an iovec for convenient access.  Since descriptors consist of
> some
>   * number of output then some number of input descriptors, it's
> actually two
> @@ -863,7 +904,7 @@ static unsigned get_indirect(struct vhos
>   *
>   * This function returns the descriptor number found, or vq->num (which
>   * is never a valid descriptor number) if none was found. */
> -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct
> vhost_virtqueue *vq,
> +unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue
> *vq,
>  			   struct iovec iov[], unsigned int iov_size,
>  			   unsigned int *out_num, unsigned int *in_num,
>  			   struct vhost_log *log, unsigned int *log_num)
> @@ -981,31 +1022,42 @@ unsigned vhost_get_vq_desc(struct vhost_
>  }
>  
>  /* Reverse the effect of vhost_get_vq_desc. Useful for error handling.
> */
> -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> +void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
>  {
> -	vq->last_avail_idx--;
> +	vq->last_avail_idx -= n;
>  }
>  
>  /* After we've used one of their buffers, we tell them about it.  We'll
> then
>   * want to notify the guest, using eventfd. */
> -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int
> len)
> +int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem
> *heads,
> +		   int count)

I think we are better off with vhost_add_used and vhost_add_used_n:
the version with _n has a lot of extra complexity, and tx always
adds them one by one.

>  {
>  	struct vring_used_elem *used;
> +	int start, n;
> +
> +	if (count <= 0)
> +		return -EINVAL;
>  
> -	/* The virtqueue contains a ring of used buffers.  Get a pointer to
> the
> -	 * next entry in that used ring. */
> -	used = &vq->used->ring[vq->last_used_idx % vq->num];
> -	if (put_user(head, &used->id)) {
> -		vq_err(vq, "Failed to write used id");
> +	start = vq->last_used_idx % vq->num;
> +	if (vq->num - start < count)
> +		n = vq->num - start;
> +	else
> +		n = count;

use min?

> +	used = vq->used->ring + start;
> +	if (copy_to_user(used, heads, sizeof(heads[0])*n)) {
> +		vq_err(vq, "Failed to write used");
>  		return -EFAULT;
>  	}
> -	if (put_user(len, &used->len)) {
> -		vq_err(vq, "Failed to write used len");
> -		return -EFAULT;
> +	if (n < count) {	/* wrapped the ring */
> +		used = vq->used->ring;
> +		if (copy_to_user(used, heads+n, sizeof(heads[0])*(count-n))) {
> +			vq_err(vq, "Failed to write used");
> +			return -EFAULT;
> +		}
>  	}
>  	/* Make sure buffer is written before we update index. */
>  	smp_wmb();
> -	if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
> +	if (put_user(vq->last_used_idx+count, &vq->used->idx)) {

I am a bit confused ... will this write a 32 or 16 bit value?
count is 32 bit ... Maybe we are better off with
  u16 idx = vq->last_used_idx + count
  put_user(idx, &vq->used->idx)
  vq->last_used_idx = idx

>  		vq_err(vq, "Failed to increment used idx");
>  		return -EFAULT;
>  	}
> @@ -1023,7 +1075,7 @@ int vhost_add_used(struct vhost_virtqueu
>  		if (vq->log_ctx)
>  			eventfd_signal(vq->log_ctx, 1);
>  	}
> -	vq->last_used_idx++;
> +	vq->last_used_idx += count;
>  	return 0;
>  }
>  
> @@ -1049,10 +1101,23 @@ void vhost_signal(struct vhost_dev *dev,
>  
>  /* And here's the combo meal deal.  Supersize me! */
>  void vhost_add_used_and_signal(struct vhost_dev *dev,
> -			       struct vhost_virtqueue *vq,
> -			       unsigned int head, int len)
> +			       struct vhost_virtqueue *vq, unsigned int id,
> +			       int len)
> +{
> +	struct vring_used_elem head;
> +
> +	head.id = id;
> +	head.len = len;
> +	vhost_add_used(vq, &head, 1);
> +	vhost_signal(dev, vq);
> +}
> +
> +/* multi-buffer version of vhost_add_used_and_signal */
> +void vhost_add_used_and_signal_n(struct vhost_dev *dev,
> +				 struct vhost_virtqueue *vq,
> +				 struct vring_used_elem *heads, int count)
>  {
> -	vhost_add_used(vq, head, len);
> +	vhost_add_used(vq, heads, count);
>  	vhost_signal(dev, vq);
>  }
>  
> diff -ruNp net-next-p0/drivers/vhost/vhost.h
> net-next-v3/drivers/vhost/vhost.h
> --- net-next-p0/drivers/vhost/vhost.h	2010-03-22 12:04:38.000000000
> -0700
> +++ net-next-v3/drivers/vhost/vhost.h	2010-04-05 20:33:57.000000000
> -0700
> @@ -85,6 +85,7 @@ struct vhost_virtqueue {
>  	struct iovec iov[VHOST_NET_MAX_SG];
>  	struct iovec hdr[VHOST_NET_MAX_SG];
>  	size_t hdr_size;
> +	struct vring_used_elem heads[VHOST_NET_MAX_SG];
>  	/* We use a kind of RCU to access private pointer.
>  	 * All readers access it from workqueue, which makes it possible to
>  	 * flush the workqueue instead of synchronize_rcu. Therefore readers
> do
> @@ -120,16 +121,22 @@ long vhost_dev_ioctl(struct vhost_dev *,
>  int vhost_vq_access_ok(struct vhost_virtqueue *vq);
>  int vhost_log_access_ok(struct vhost_dev *);
>  
> -unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue
> *,
> +int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem
> *heads,
> +		     int datalen, int *iovcount, struct vhost_log *log,
> +		     unsigned int *log_num);
> +unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
>  			   struct iovec iov[], unsigned int iov_count,
>  			   unsigned int *out_num, unsigned int *in_num,
>  			   struct vhost_log *log, unsigned int *log_num);
> -void vhost_discard_vq_desc(struct vhost_virtqueue *);
> +void vhost_discard_desc(struct vhost_virtqueue *, int);
>  
> -int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int
> len);
> -void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> +int vhost_add_used(struct vhost_virtqueue *, struct vring_used_elem
> *heads,
> +		    int count);
>  void vhost_add_used_and_signal(struct vhost_dev *, struct
> vhost_virtqueue *,
> -			       unsigned int head, int len);
> +			       unsigned int id, int len);
> +void vhost_add_used_and_signal_n(struct vhost_dev *, struct
> vhost_virtqueue *,
> +			       struct vring_used_elem *heads, int count);
> +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
>  void vhost_disable_notify(struct vhost_virtqueue *);
>  bool vhost_enable_notify(struct vhost_virtqueue *);
>  
> @@ -149,7 +156,8 @@ enum {
>  	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
>  			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
>  			 (1 << VHOST_F_LOG_ALL) |
> -			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
> +			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
> +			 (1 << VIRTIO_NET_F_MRG_RXBUF),
>  };
>  
>  static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox