Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
From: Eric Dumazet @ 2010-04-01 12:10 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Jorrit Kronjee, netfilter-devel, netdev
In-Reply-To: <4BB47D81.3060400@trash.net>

Le jeudi 01 avril 2010 à 13:03 +0200, Patrick McHardy a écrit :
> Eric Dumazet wrote:
> > xt_hashlimit uses a central lock per hash table and suffers from
> > contention on some workloads. (Multiqueue NIC or if RPS is enabled)
> > 
> > After RCU conversion, central lock is only used when a writer wants to
> > add or delete an entry.
> > 
> > For 'readers', updating an existing entry, they use an individual lock
> > per entry.
> 
> Looks good to me, thanks Eric.
> 
> > -/* allocate dsthash_ent, initialize dst, put in htable and lock it */
> > -static struct dsthash_ent *
> > -dsthash_alloc_init(struct xt_hashlimit_htable *ht,
> > -		   const struct dsthash_dst *dst)
> 
> Is there a reason for moving this function downwards in the file?
> That unnecessarily increases the diff and makes the patch harder to
> review. For review purposes I moved it back up, resulting in 42
> lines less diff.
> --

Well, this is because I had to move inside this function various
initializations and these inits use user2credits() which was defined
after dsthash_alloc_init().

But we can avoid this since we hold the entry spinlock, and before hash
insertion.

Only the lookup fields and the spinlock MUST be committed to memory
before the insert. Other fields can be initialized later by caller.

Here is V2 of patch, I added locking as well in dl_seq_real_show()
because we call rateinfo_recalc(). htable main lock doesnt anymore
protects each entry rateinfo.

Thanks

[PATCH nf-next-2.6] xt_hashlimit: RCU conversion

xt_hashlimit uses a central lock per hash table and suffers from
contention on some workloads. (Multiqueue NIC or if RPS is enabled)

After RCU conversion, central lock is only used when a writer wants to
add or delete an entry.

For 'readers', updating an existing entry, they use an individual lock
per entry.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 5470bb0..453178d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -81,12 +81,14 @@ struct dsthash_ent {
 	struct dsthash_dst dst;
 
 	/* modified structure members in the end */
+	spinlock_t lock;
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
 		u_int32_t credit;
 		u_int32_t credit_cap, cost;
 	} rateinfo;
+	struct rcu_head rcu;
 };
 
 struct xt_hashlimit_htable {
@@ -143,9 +145,11 @@ dsthash_find(const struct xt_hashlimit_htable *ht,
 	u_int32_t hash = hash_dst(ht, dst);
 
 	if (!hlist_empty(&ht->hash[hash])) {
-		hlist_for_each_entry(ent, pos, &ht->hash[hash], node)
-			if (dst_cmp(ent, dst))
+		hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+			if (dst_cmp(ent, dst)) {
+				spin_lock(&ent->lock);
 				return ent;
+			}
 	}
 	return NULL;
 }
@@ -157,9 +161,10 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
 {
 	struct dsthash_ent *ent;
 
+	spin_lock(&ht->lock);
 	/* initialize hash with random val at the time we allocate
 	 * the first hashtable entry */
-	if (!ht->rnd_initialized) {
+	if (unlikely(!ht->rnd_initialized)) {
 		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
 		ht->rnd_initialized = true;
 	}
@@ -168,27 +173,36 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
 		/* FIXME: do something. question is what.. */
 		if (net_ratelimit())
 			pr_err("max count of %u reached\n", ht->cfg.max);
-		return NULL;
-	}
-
-	ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+		ent = NULL;
+	} else
+		ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
 	if (!ent) {
 		if (net_ratelimit())
 			pr_err("cannot allocate dsthash_ent\n");
-		return NULL;
-	}
-	memcpy(&ent->dst, dst, sizeof(ent->dst));
+	} else {
+		memcpy(&ent->dst, dst, sizeof(ent->dst));
+		spin_lock_init(&ent->lock);
 
-	hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
-	ht->count++;
+		spin_lock(&ent->lock);
+		hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+		ht->count++;
+	}
+	spin_unlock(&ht->lock);
 	return ent;
 }
 
+static void dsthash_free_rcu(struct rcu_head *head)
+{
+	struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
+
+	kmem_cache_free(hashlimit_cachep, ent);
+}
+
 static inline void
 dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 {
-	hlist_del(&ent->node);
-	kmem_cache_free(hashlimit_cachep, ent);
+	hlist_del_rcu(&ent->node);
+	call_rcu_bh(&ent->rcu, dsthash_free_rcu);
 	ht->count--;
 }
 static void htable_gc(unsigned long htlong);
@@ -512,15 +526,14 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
 
-	spin_lock_bh(&hinfo->lock);
+	rcu_read_lock_bh();
 	dh = dsthash_find(hinfo, &dst);
 	if (dh == NULL) {
 		dh = dsthash_alloc_init(hinfo, &dst);
 		if (dh == NULL) {
-			spin_unlock_bh(&hinfo->lock);
+			rcu_read_unlock_bh();
 			goto hotdrop;
 		}
-
 		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
 		dh->rateinfo.prev = jiffies;
 		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
@@ -537,11 +550,13 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
 		/* below the limit */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
-		spin_unlock_bh(&hinfo->lock);
+		spin_unlock(&dh->lock);
+		rcu_read_unlock_bh();
 		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
 	}
 
-	spin_unlock_bh(&hinfo->lock);
+	spin_unlock(&dh->lock);
+	rcu_read_unlock_bh();
 	/* default match is underlimit - so over the limit, we need to invert */
 	return info->cfg.mode & XT_HASHLIMIT_INVERT;
 
@@ -666,12 +681,15 @@ static void dl_seq_stop(struct seq_file *s, void *v)
 static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				   struct seq_file *s)
 {
+	int res;
+
+	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
 	rateinfo_recalc(ent, jiffies);
 
 	switch (family) {
 	case NFPROTO_IPV4:
-		return seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+		res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip.src,
 				 ntohs(ent->dst.src_port),
@@ -679,9 +697,10 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				 ntohs(ent->dst.dst_port),
 				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
 				 ent->rateinfo.cost);
+		break;
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 	case NFPROTO_IPV6:
-		return seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+		res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip6.src,
 				 ntohs(ent->dst.src_port),
@@ -689,11 +708,14 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				 ntohs(ent->dst.dst_port),
 				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
 				 ent->rateinfo.cost);
+		break;
 #endif
 	default:
 		BUG();
-		return 0;
+		res = 0;
 	}
+	spin_unlock(&ent->lock);
+	return res;
 }
 
 static int dl_seq_show(struct seq_file *s, void *v)
@@ -817,9 +839,11 @@ err1:
 
 static void __exit hashlimit_mt_exit(void)
 {
-	kmem_cache_destroy(hashlimit_cachep);
 	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
 	unregister_pernet_subsys(&hashlimit_net_ops);
+
+	rcu_barrier_bh();
+	kmem_cache_destroy(hashlimit_cachep);
 }
 
 module_init(hashlimit_mt_init);



^ permalink raw reply related

* Re: [RFC] SPD basic actions per netdev
From: Timo Teräs @ 2010-04-01 12:10 UTC (permalink / raw)
  To: hadi; +Cc: Herbert Xu, David S. Miller, Patrick McHardy, netdev
In-Reply-To: <1270123246.26743.177.camel@bigi>

jamal wrote:
> On Thu, 2010-04-01 at 14:47 +0300, Timo Teräs wrote:
> 
>> The thing is that currently FWD 'dev blah' matches the interface
>> to which the packet is being forwarded to. Someone might be using
>> this feature already.
> 
> So this is the part i am missing i think. If i look at:
> 
> int ip_forward(struct sk_buff *skb)
> {
> .....
>         if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
>                 goto drop;
> ....
> ........later forwarding happens here ...
>         if (!xfrm4_route_forward(skb))
>                 goto drop;
> ...
> }
> 
> On entry we have a legit skb->skb_iif.
> The validity check is before forwarding decision (where the interface
> the packet is being forwarded to is recognized).

On entry to ip_forward the routing decision has already been made.
Both oif and iif are valid on entry.

Currently policy_check() uses oif for SPD matching.

Do note that xfrm4_route_forward() is a no-op if there's no matching
policy. It has nothing to do with routing decision, it's purpose
is to wrap the dst_entry with xfrm_dst if the flow matches a valid
SPD.

>> Your patch changes semantics on how FWD policies are matched.
> 
> I agree if what you say earlier is true.

^ permalink raw reply

* Re: [PATCH] xtables: make XT_ALIGN() usable in exported headers
From: Patrick McHardy @ 2010-04-01 12:06 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Stephen Hemminger, Ben Hutchings, Andreas Henriksson, jamal,
	netdev
In-Reply-To: <n2ib6fcc0a1004010502z8c25a68cg11c5aa2f90ad4826@mail.gmail.com>

Alexey Dobriyan wrote:
> On Thu, Apr 1, 2010 at 1:50 PM, Patrick McHardy <kaber@trash.net> wrote:
>> I can't think of anything but to restore the XT_ALIGN macro.
>> We could add a XT_ALIGN definition to xtables.h, but that might
>> still leave problems for other users.
>>
>> Alexey, do you have any better suggestions?
> 
> I like __KERNEL_ALIGN trick.
> 
> Sorry for attachment, my patch sending facility is broke.
> Tested on iptables compilation.
> 

Seems fine to me, thanks. I'll wait a bit for others to comment.

^ permalink raw reply

* [PATCH] xtables: make XT_ALIGN() usable in exported headers
From: Alexey Dobriyan @ 2010-04-01 12:02 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Stephen Hemminger, Ben Hutchings, Andreas Henriksson, jamal,
	netdev

[-- Attachment #1: Type: text/plain, Size: 411 bytes --]

On Thu, Apr 1, 2010 at 1:50 PM, Patrick McHardy <kaber@trash.net> wrote:
> I can't think of anything but to restore the XT_ALIGN macro.
> We could add a XT_ALIGN definition to xtables.h, but that might
> still leave problems for other users.
>
> Alexey, do you have any better suggestions?

I like __KERNEL_ALIGN trick.

Sorry for attachment, my patch sending facility is broke.
Tested on iptables compilation.

[-- Attachment #2: xt-align.patch --]
[-- Type: text/x-diff, Size: 2528 bytes --]

[PATCH] xtables: make XT_ALIGN() usable in exported headers

XT_ALIGN() was rewritten through ALIGN() by commit 42107f5009da223daa800d6da6904d77297ae829
"netfilter: xtables: symmetric COMPAT_XT_ALIGN definition".
ALIGN() is not exported in userspace headers, which created compile problem for tc(8)
and will create problem for iptables(8).

We can't export generic looking name ALIGN() but we can export less generic
__ALIGN_KERNEL() (suggested by Ben Hutchings).
Google knows nothing about __ALIGN_KERNEL().

COMPAT_XT_ALIGN() changed for symmetry.

Reported-by: Andreas Henriksson <andreas@fatal.se>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 include/linux/kernel.h             |    5 +++--
 include/linux/netfilter/x_tables.h |    6 +++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7f07074..284ea99 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -4,6 +4,8 @@
 /*
  * 'kernel.h' contains some often-used function prototypes etc
  */
+#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
+#define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
 
 #ifdef __KERNEL__
 
@@ -37,8 +39,7 @@ extern const char linux_proc_banner[];
 
 #define STACK_MAGIC	0xdeadbeef
 
-#define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
-#define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
+#define ALIGN(x, a)		__ALIGN_KERNEL((x), (a))
 #define PTR_ALIGN(p, a)		((typeof(p))ALIGN((unsigned long)(p), (a)))
 #define IS_ALIGNED(x, a)		(((x) & ((typeof(x))(a) - 1)) == 0)
 
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 84c7c92..f01ddbe 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -1,6 +1,6 @@
 #ifndef _X_TABLES_H
 #define _X_TABLES_H
-
+#include <linux/kernel.h>
 #include <linux/types.h>
 
 #define XT_FUNCTION_MAXNAMELEN 30
@@ -93,7 +93,7 @@ struct _xt_align {
 	__u64 u64;
 };
 
-#define XT_ALIGN(s) ALIGN((s), __alignof__(struct _xt_align))
+#define XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _xt_align))
 
 /* Standard return verdict, or do jump. */
 #define XT_STANDARD_TARGET ""
@@ -598,7 +598,7 @@ struct _compat_xt_align {
 	compat_u64 u64;
 };
 
-#define COMPAT_XT_ALIGN(s) ALIGN((s), __alignof__(struct _compat_xt_align))
+#define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align))
 
 extern void xt_compat_lock(u_int8_t af);
 extern void xt_compat_unlock(u_int8_t af);

^ permalink raw reply related

* Re: [RFC] SPD basic actions per netdev
From: jamal @ 2010-04-01 12:00 UTC (permalink / raw)
  To: Timo Teräs; +Cc: Herbert Xu, David S. Miller, Patrick McHardy, netdev
In-Reply-To: <4BB487CA.3020603@iki.fi>

On Thu, 2010-04-01 at 14:47 +0300, Timo Teräs wrote:

> 
> The thing is that currently FWD 'dev blah' matches the interface
> to which the packet is being forwarded to. Someone might be using
> this feature already.

So this is the part i am missing i think. If i look at:

int ip_forward(struct sk_buff *skb)
{
.....
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
                goto drop;
....
........later forwarding happens here ...
        if (!xfrm4_route_forward(skb))
                goto drop;
...
}

On entry we have a legit skb->skb_iif.
The validity check is before forwarding decision (where the interface
the packet is being forwarded to is recognized).

> Your patch changes semantics on how FWD policies are matched.

I agree if what you say earlier is true.

cheers,
jamal

^ permalink raw reply

* Re: [PATCH 3/5] netfilter: xtables: inclusion of xt_TEE
From: Patrick McHardy @ 2010-04-01 11:54 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: netfilter-devel, netdev
In-Reply-To: <alpine.LSU.2.01.1004011304350.17429@obet.zrqbmnf.qr>

Jan Engelhardt wrote:
> On Thursday 2010-04-01 12:34, Patrick McHardy wrote:
>>> +static bool
>>> +tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
>>> +{
>>> +	const struct iphdr *iph = ip_hdr(skb);
>>> +	struct rtable *rt;
>>> +	struct flowi fl;
>>> +	int err;
>>> +
>>> +	memset(&fl, 0, sizeof(fl));
>>> +	fl.iif  = skb->skb_iif;
>> I'm not sure you really should set iif here. We usually (tunnels, REJECT
>> etc) packets generated locally as new packets.
>>> +	fl.mark = skb->mark;
>> The same applies to mark.
> 
> If you use TEE in PREROUTING or INPUT, teeing acts more like FORWARD than
> OUTPUT, though. All TEE does is lookup a route to a new fl.dst, but it keeps
> the original src address in fl.src, so if somebody has some source-based policy
> routing, it could suddenly behave different. What do you think?

That might make it unnessarily complicated to use src-based routing
when using TEE. I guess you'd usually have a host for logging or IDS
somewhere on a private network and TEE packets there. So specifying
oif and gateway seems most useful to me.

>>> +/*
>>> + * To detect and deter routed packet loopback when using the --tee option, we
>>> + * take a page out of the raw.patch book: on the copied skb, we set up a fake
>>> + * ->nfct entry, pointing to the local &route_tee_track. We skip routing
>>> + * packets when we see they already have that ->nfct.
>> So without conntrack, people may create loops? If that's the case,
>> I'd suggest to simply forbid TEE'ing packets to loopback. That
>> doesn't seem to be very useful anyways.
> 
>>> +#ifdef WITH_CONNTRACK
>>> +	if (skb->nfct == &tee_track.ct_general)
>>> +		/*
>>> +		 * Loopback - a packet we already routed, is to be
>>> +		 * routed another time. Avoid that, now.
>>> +		 */
> 	printk("loopback - dropped\n");
>>> +		return NF_DROP;
>>> +#endif
> 
> We are looking at a historic piece of code - and comments, which
> traces back to when xt_NOTRACK was still in POM.
> 
> {
>     →   /* Previously seen (loopback)? Ignore. */
>     →   if ((*pskb)->nfct != NULL)
>     →       →   return IPT_CONTINUE;
> 
>     →   /* Attach fake conntrack entry.·
>     →      If there is a real ct entry correspondig to this packet,·
>     →      it'll hang aroun till timing out. We don't deal with it
>     →      for performance reasons. JK */
>     →   (*pskb)->nfct = &ip_conntrack_untracked.infos[IP_CT_NEW];
>     →   nf_conntrack_get((*pskb)->nfct);
> 
>     →   return IPT_CONTINUE;
> }
> 
> Let's look at the condition "skb->nfct == &tee_track.ct_general" in detail. An
> skb can only already have tee_track when it has been teed.
> 
> The teed packet however never traversed Xtables at all. Of course that changes
> once the nesting patch is applied. But was someone really thinking of this, 6
> years ago?
> 
> That actually made me wonder and dig in history, and it turns out that
> ipt_ROUTE allowed the packet to be fed back into netif_rx (commit
> bee4e80167e3d024bdb80f400f4ecc8de47cfb03 in pom-ng.git), which would
> explain all the loopback stuff. Since modern xt_TEE does not do
> that evil thing, the comment is a walnut-hard remainder of past times.
> 
> I shall remove it now that it has been spotted.

Yeah, but currently it does allow packets to be looped back. These
packets will also go through the netfilter hooks again.

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH net-next] bnx2x: Added GRO support
From: Dmitry Kravkov @ 2010-04-01 11:49 UTC (permalink / raw)
  To: davem; +Cc: netdev, eilong, vladz

Resend, since original mail has typo in Dave's address

---
Adding GRO support on top of the HW LRO (TPA) support –
there is no measurable performance drawback of adding GRO
on top of it, and it allows better performance when LRO (TPA)
is turned off for virtualization or bridging.


Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/bnx2x_main.c |   20 +++++++++++---------
 1 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 6c042a7..f4ea99d 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -57,8 +57,8 @@
 #include "bnx2x_init_ops.h"
 #include "bnx2x_dump.h"
 
-#define DRV_MODULE_VERSION	"1.52.1-7"
-#define DRV_MODULE_RELDATE	"2010/02/28"
+#define DRV_MODULE_VERSION	"1.52.1-8"
+#define DRV_MODULE_RELDATE	"2010/04/01"
 #define BNX2X_BC_VER		0x040200
 
 #include <linux/firmware.h>
@@ -1441,12 +1441,12 @@ static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 #ifdef BCM_VLAN
 			if ((bp->vlgrp != NULL) && is_vlan_cqe &&
 			    (!is_not_hwaccel_vlan_cqe))
-				vlan_hwaccel_receive_skb(skb, bp->vlgrp,
-						le16_to_cpu(cqe->fast_path_cqe.
-							    vlan_tag));
+				vlan_gro_receive(&fp->napi, bp->vlgrp,
+						 le16_to_cpu(cqe->fast_path_cqe.
+							     vlan_tag), skb);
 			else
 #endif
-				netif_receive_skb(skb);
+				napi_gro_receive(&fp->napi, skb);
 		} else {
 			DP(NETIF_MSG_RX_STATUS, "Failed to allocate new pages"
 			   " - dropping packet!\n");
@@ -1699,11 +1699,11 @@ reuse_rx:
 		if ((bp->vlgrp != NULL) && (bp->flags & HW_VLAN_RX_FLAG) &&
 		    (le16_to_cpu(cqe->fast_path_cqe.pars_flags.flags) &
 		     PARSING_FLAGS_VLAN))
-			vlan_hwaccel_receive_skb(skb, bp->vlgrp,
-				le16_to_cpu(cqe->fast_path_cqe.vlan_tag));
+			vlan_gro_receive(&fp->napi, bp->vlgrp,
+				le16_to_cpu(cqe->fast_path_cqe.vlan_tag), skb);
 		else
 #endif
-			netif_receive_skb(skb);
+			napi_gro_receive(&fp->napi, skb);
 
 
 next_rx:
@@ -8935,6 +8935,8 @@ static int __devinit bnx2x_init_bp(struct bnx2x *bp)
 	bp->multi_mode = multi_mode;
 
 
+	bp->dev->features |= NETIF_F_GRO;
+
 	/* Set TPA flags */
 	if (disable_tpa) {
 		bp->flags &= ~TPA_ENABLE_FLAG;
-- 
1.6.3.3





^ permalink raw reply related

* Re: [RFC] SPD basic actions per netdev
From: Timo Teräs @ 2010-04-01 11:47 UTC (permalink / raw)
  To: hadi; +Cc: Herbert Xu, David S. Miller, Patrick McHardy, netdev
In-Reply-To: <1270121385.26743.169.camel@bigi>

jamal wrote:
> Q: So if all i want to achieve for now is to make sure that i can
> specify a "dev blah" in the forward or in direction and have it work to
> identify the incoming device, wouldnt this patch suffice?
> 
> I am attaching this patch with a fix to check for FWD as well if you
> have a chance i would appreciate if you re-look at it again.

The thing is that currently FWD 'dev blah' matches the interface
to which the packet is being forwarded to. Someone might be using
this feature already.

Your patch changes semantics on how FWD policies are matched.

^ permalink raw reply

* Re: [PATCH 3/5] netfilter: xtables: inclusion of xt_TEE
From: Jan Engelhardt @ 2010-04-01 11:39 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, netdev
In-Reply-To: <4BB47698.6070102@trash.net>


On Thursday 2010-04-01 12:34, Patrick McHardy wrote:
>> +static bool
>> +tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
>> +{
>> +	const struct iphdr *iph = ip_hdr(skb);
>> +	struct rtable *rt;
>> +	struct flowi fl;
>> +	int err;
>> +
>> +	memset(&fl, 0, sizeof(fl));
>> +	fl.iif  = skb->skb_iif;
>
>I'm not sure you really should set iif here. We usually (tunnels, REJECT
>etc) packets generated locally as new packets.
>> +	fl.mark = skb->mark;
>
>The same applies to mark.

If you use TEE in PREROUTING or INPUT, teeing acts more like FORWARD than
OUTPUT, though. All TEE does is lookup a route to a new fl.dst, but it keeps
the original src address in fl.src, so if somebody has some source-based policy
routing, it could suddenly behave different. What do you think?

>> +/*
>> + * To detect and deter routed packet loopback when using the --tee option, we
>> + * take a page out of the raw.patch book: on the copied skb, we set up a fake
>> + * ->nfct entry, pointing to the local &route_tee_track. We skip routing
>> + * packets when we see they already have that ->nfct.
>
>So without conntrack, people may create loops? If that's the case,
>I'd suggest to simply forbid TEE'ing packets to loopback. That
>doesn't seem to be very useful anyways.

>> +#ifdef WITH_CONNTRACK
>> +	if (skb->nfct == &tee_track.ct_general)
>> +		/*
>> +		 * Loopback - a packet we already routed, is to be
>> +		 * routed another time. Avoid that, now.
>> +		 */
	printk("loopback - dropped\n");
>> +		return NF_DROP;
>> +#endif

We are looking at a historic piece of code - and comments, which
traces back to when xt_NOTRACK was still in POM.

{
    →   /* Previously seen (loopback)? Ignore. */
    →   if ((*pskb)->nfct != NULL)
    →       →   return IPT_CONTINUE;

    →   /* Attach fake conntrack entry.·
    →      If there is a real ct entry correspondig to this packet,·
    →      it'll hang aroun till timing out. We don't deal with it
    →      for performance reasons. JK */
    →   (*pskb)->nfct = &ip_conntrack_untracked.infos[IP_CT_NEW];
    →   nf_conntrack_get((*pskb)->nfct);

    →   return IPT_CONTINUE;
}

Let's look at the condition "skb->nfct == &tee_track.ct_general" in detail. An
skb can only already have tee_track when it has been teed.

The teed packet however never traversed Xtables at all. Of course that changes
once the nesting patch is applied. But was someone really thinking of this, 6
years ago?

That actually made me wonder and dig in history, and it turns out that
ipt_ROUTE allowed the packet to be fed back into netif_rx (commit
bee4e80167e3d024bdb80f400f4ecc8de47cfb03 in pom-ng.git), which would
explain all the loopback stuff. Since modern xt_TEE does not do
that evil thing, the comment is a walnut-hard remainder of past times.

I shall remove it now that it has been spotted.

>> +	/*
>> +	 * If we are in PREROUTING/INPUT, the checksum must be recalculated
>> +	 * since the length could have changed as a result of defragmentation.
>> +	 *
>> +	 * We also decrease the TTL to mitigate potential TEE loops
>> +	 * between two hosts.
>> +	 *
>> +	 * Set %IP_DF so that the original source is notified of a potentially
>> +	 * decreased MTU on the clone route. IPv6 does this too.
>> +	 */
>> +	iph = ip_hdr(skb);
>> +	iph->frag_off |= htons(IP_DF);
>> +	if (par->hooknum == NF_INET_PRE_ROUTING ||
>> +	    par->hooknum == NF_INET_LOCAL_IN)
>> +		--iph->ttl;
>> +	ip_send_check(iph);
>
>Shouldn't this only be done in PRE_ROUTING/INPUT as stated above?

The csum needs to be recomputed due to the addition of the DF flag.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC] SPD basic actions per netdev
From: jamal @ 2010-04-01 11:29 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Timo Teräs, David S. Miller, Patrick McHardy, netdev
In-Reply-To: <20100401063956.GA21422@gondor.apana.org.au>

[-- Attachment #1: Type: text/plain, Size: 1858 bytes --]

On Thu, 2010-04-01 at 14:39 +0800, Herbert Xu wrote:
> On Thu, Apr 01, 2010 at 09:32:06AM +0300, Timo Teräs wrote:
> >
> > On inbound it's always loopback interface. Does the same hold
> > true on forward? I was under the impression that it would
> > reflect the actual destination interface.
> 
> That's a good point.  I suppose there might just be some crazy
> people out there using it this way.
> 
> So Jamal, I think this is a good reason why we do need a new
> field instead of just overloading the current one.  Otherwise
> inbound selectors and forward selectors will have different
> semantics which is needlessly confusing.


So I followed the discussion up to about this point then confusion sets
in for me - in particular about loopback being used for policy_check()
which you guys seem to agree on.
Nod on:  IN+FWD should be treated the same way. Locally generated/OUT
works and I dont muck with that.
The current code is sufficiently clean such that all i need is to worry
about is __xfrm_policy_check() (which is invoked only for IN and FWD).
And thats the only thing i touch - the rest "works as it did before".

[Note: the flow struct used in __xfrm_policy_check() is local to it, so
my touching it affects only the scope of validation of IN/FWD. I dont
see loopback being used for policy check.
Note2: In the FWD policy check, the output dev hasnt been decided
yet at that point. So it sounds fair to define "dev blah" in FWD
direction to mean incoming device (as it is for IN/local destined).]

Q: So if all i want to achieve for now is to make sure that i can
specify a "dev blah" in the forward or in direction and have it work to
identify the incoming device, wouldnt this patch suffice?

I am attaching this patch with a fix to check for FWD as well if you
have a chance i would appreciate if you re-look at it again.

cheers,
jamal


[-- Attachment #2: spd-fw-in --]
[-- Type: text/x-patch, Size: 5827 bytes --]

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..ce18464 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -838,7 +838,7 @@ __be16 xfrm_flowi_dport(struct flowi *fl)
 }
 
 extern int xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl,
-			       unsigned short family);
+			       unsigned short family, int use_if);
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 /*	If neither has a context --> match
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 843e066..cad67b3 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -55,35 +55,37 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
 
 static inline int
-__xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl)
+__xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl, int uif)
 {
+	int use_if = uif ? uif : fl->oif;
 	return  addr_match(&fl->fl4_dst, &sel->daddr, sel->prefixlen_d) &&
 		addr_match(&fl->fl4_src, &sel->saddr, sel->prefixlen_s) &&
 		!((xfrm_flowi_dport(fl) ^ sel->dport) & sel->dport_mask) &&
 		!((xfrm_flowi_sport(fl) ^ sel->sport) & sel->sport_mask) &&
 		(fl->proto == sel->proto || !sel->proto) &&
-		(fl->oif == sel->ifindex || !sel->ifindex);
+		(use_if == sel->ifindex || !sel->ifindex);
 }
 
 static inline int
-__xfrm6_selector_match(struct xfrm_selector *sel, struct flowi *fl)
+__xfrm6_selector_match(struct xfrm_selector *sel, struct flowi *fl, int uif)
 {
+	int use_if = uif ? uif : fl->oif;
 	return  addr_match(&fl->fl6_dst, &sel->daddr, sel->prefixlen_d) &&
 		addr_match(&fl->fl6_src, &sel->saddr, sel->prefixlen_s) &&
 		!((xfrm_flowi_dport(fl) ^ sel->dport) & sel->dport_mask) &&
 		!((xfrm_flowi_sport(fl) ^ sel->sport) & sel->sport_mask) &&
 		(fl->proto == sel->proto || !sel->proto) &&
-		(fl->oif == sel->ifindex || !sel->ifindex);
+		(use_if == sel->ifindex || !sel->ifindex);
 }
 
 int xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl,
-		    unsigned short family)
+		    unsigned short family, int ifindex)
 {
 	switch (family) {
 	case AF_INET:
-		return __xfrm4_selector_match(sel, fl);
+		return __xfrm4_selector_match(sel, fl, ifindex);
 	case AF_INET6:
-		return __xfrm6_selector_match(sel, fl);
+		return __xfrm6_selector_match(sel, fl, ifindex);
 	}
 	return 0;
 }
@@ -917,14 +919,17 @@ static int xfrm_policy_match(struct xfrm_policy *pol, struct flowi *fl,
 			     u8 type, u16 family, int dir)
 {
 	struct xfrm_selector *sel = &pol->selector;
-	int match, ret = -ESRCH;
+	int use_if = 0, match, ret = -ESRCH;
 
 	if (pol->family != family ||
 	    (fl->mark & pol->mark.m) != pol->mark.v ||
 	    pol->type != type)
 		return ret;
 
-	match = xfrm_selector_match(sel, fl, family);
+	if (dir == FLOW_DIR_IN || dir == FLOW_DIR_FWD)
+		use_if = fl->iif;
+
+	match = xfrm_selector_match(sel, fl, family, use_if);
 	if (match)
 		ret = security_xfrm_policy_lookup(pol->security, fl->secid,
 						  dir);
@@ -1041,7 +1046,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struc
 	read_lock_bh(&xfrm_policy_lock);
 	if ((pol = sk->sk_policy[dir]) != NULL) {
 		int match = xfrm_selector_match(&pol->selector, fl,
-						sk->sk_family);
+						sk->sk_family, 0);
 		int err = 0;
 
 		if (match) {
@@ -1918,6 +1923,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	struct flowi fl;
 	u8 fl_dir;
 	int xerr_idx = -1;
+	int use_if = 0;
 
 	reverse = dir & ~XFRM_POLICY_MASK;
 	dir &= XFRM_POLICY_MASK;
@@ -1928,6 +1934,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		return 0;
 	}
 
+	if (fl_dir ==  FLOW_DIR_IN || fl_dir ==  FLOW_DIR_FWD)
+		fl.iif = use_if = skb->skb_iif;
+
 	nf_nat_decode_session(skb, &fl, family);
 
 	/* First, check used SA against their selectors. */
@@ -1936,7 +1945,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 
 		for (i=skb->sp->len-1; i>=0; i--) {
 			struct xfrm_state *x = skb->sp->xvec[i];
-			if (!xfrm_selector_match(&x->sel, &fl, family)) {
+			if (!xfrm_selector_match(&x->sel, &fl, family, use_if)) {
 				XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
 				return 0;
 			}
@@ -2243,7 +2252,7 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 		if (first->origin && !flow_cache_uli_match(first->origin, fl))
 			return 0;
 		if (first->partner &&
-		    !xfrm_selector_match(first->partner, fl, family))
+		    !xfrm_selector_match(first->partner, fl, family, 0))
 			return 0;
 	}
 #endif
@@ -2253,7 +2262,7 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 	do {
 		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 
-		if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family))
+		if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family, 0))
 			return 0;
 		if (fl && pol &&
 		    !security_xfrm_state_pol_flow_match(dst->xfrm, pol, fl))
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 17d5b96..f47c90b 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -756,7 +756,7 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
 	 */
 	if (x->km.state == XFRM_STATE_VALID) {
 		if ((x->sel.family &&
-		     !xfrm_selector_match(&x->sel, fl, x->sel.family)) ||
+		     !xfrm_selector_match(&x->sel, fl, x->sel.family, 0)) ||
 		    !security_xfrm_state_pol_flow_match(x, pol, fl))
 			return;
 
@@ -769,7 +769,7 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
 		*acq_in_progress = 1;
 	} else if (x->km.state == XFRM_STATE_ERROR ||
 		   x->km.state == XFRM_STATE_EXPIRED) {
-		if (xfrm_selector_match(&x->sel, fl, x->sel.family) &&
+		if (xfrm_selector_match(&x->sel, fl, x->sel.family, 0) &&
 		    security_xfrm_state_pol_flow_match(x, pol, fl))
 			*error = -ESRCH;
 	}

^ permalink raw reply related

* Re: [PATCH 3/4] xfrm: remove policy lock when accessing policy->walk.dead
From: Patrick McHardy @ 2010-04-01 11:24 UTC (permalink / raw)
  To: hadi; +Cc: Herbert Xu, Timo Teras, netdev, David S. Miller
In-Reply-To: <1270057677.26743.116.camel@bigi>

jamal wrote:
> On Wed, 2010-03-31 at 18:41 +0200, Patrick McHardy wrote:
> 
>> I agree with Herbert, the flush notification indicates that
>> the table is now empty, independant of its previous state.
> 
> What purpose does it serve?
> 
> As an example, if i delete an entry, the fact i deleted an entry is 
> of interest to some manager in user space for the purpose of syncing.
> If i brought a link down, same thing. If i brought a link down
> that was already down - why would that be of interest to generate
> as an event? etc.

That's true. Since both pfkey and xfrm process messages synchronously,
there shouldn't be any need for this. In fact I couldn't even find
a single keying daemon that cares about this message.

^ permalink raw reply

* [PATCH net-next] bnx2x: Added GRO support
From: Dmitry Kravkov @ 2010-04-01 11:10 UTC (permalink / raw)
  To: "David Miller [davem"; +Cc: netdev, eilong, vladz

Adding GRO support on top of the HW LRO (TPA) support –
there is no measurable performance drawback of adding GRO
on top of it, and it allows better performance when LRO (TPA)
is turned off for virtualization or bridging.

Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/bnx2x_main.c |   20 +++++++++++---------
 1 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 6c042a7..f4ea99d 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -57,8 +57,8 @@
 #include "bnx2x_init_ops.h"
 #include "bnx2x_dump.h"
 
-#define DRV_MODULE_VERSION	"1.52.1-7"
-#define DRV_MODULE_RELDATE	"2010/02/28"
+#define DRV_MODULE_VERSION	"1.52.1-8"
+#define DRV_MODULE_RELDATE	"2010/04/01"
 #define BNX2X_BC_VER		0x040200
 
 #include <linux/firmware.h>
@@ -1441,12 +1441,12 @@ static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 #ifdef BCM_VLAN
 			if ((bp->vlgrp != NULL) && is_vlan_cqe &&
 			    (!is_not_hwaccel_vlan_cqe))
-				vlan_hwaccel_receive_skb(skb, bp->vlgrp,
-						le16_to_cpu(cqe->fast_path_cqe.
-							    vlan_tag));
+				vlan_gro_receive(&fp->napi, bp->vlgrp,
+						 le16_to_cpu(cqe->fast_path_cqe.
+							     vlan_tag), skb);
 			else
 #endif
-				netif_receive_skb(skb);
+				napi_gro_receive(&fp->napi, skb);
 		} else {
 			DP(NETIF_MSG_RX_STATUS, "Failed to allocate new pages"
 			   " - dropping packet!\n");
@@ -1699,11 +1699,11 @@ reuse_rx:
 		if ((bp->vlgrp != NULL) && (bp->flags & HW_VLAN_RX_FLAG) &&
 		    (le16_to_cpu(cqe->fast_path_cqe.pars_flags.flags) &
 		     PARSING_FLAGS_VLAN))
-			vlan_hwaccel_receive_skb(skb, bp->vlgrp,
-				le16_to_cpu(cqe->fast_path_cqe.vlan_tag));
+			vlan_gro_receive(&fp->napi, bp->vlgrp,
+				le16_to_cpu(cqe->fast_path_cqe.vlan_tag), skb);
 		else
 #endif
-			netif_receive_skb(skb);
+			napi_gro_receive(&fp->napi, skb);
 
 
 next_rx:
@@ -8935,6 +8935,8 @@ static int __devinit bnx2x_init_bp(struct bnx2x *bp)
 	bp->multi_mode = multi_mode;
 
 
+	bp->dev->features |= NETIF_F_GRO;
+
 	/* Set TPA flags */
 	if (disable_tpa) {
 		bp->flags &= ~TPA_ENABLE_FLAG;
-- 
1.6.3.3





^ permalink raw reply related

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Patrick McHardy @ 2010-04-01 11:09 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: netfilter-devel, netdev
In-Reply-To: <alpine.LSU.2.01.1004011258060.17429@obet.zrqbmnf.qr>

Jan Engelhardt wrote:
> On Thursday 2010-04-01 12:37, Patrick McHardy wrote:
> 
>> Jan Engelhardt wrote:
>>> Since Xtables is now reentrant/nestable, the cloned packet can also go
>>> through Xtables and be subject to rules itself.
>> That sounds dangerous if conntrack isn't used to prevent loops.
> 
> Conntrack loops are prevented by using a dummy conntrack, just as 
> NOTRACK does.

My question was about the case without conntrack.

>> Is that really useful? For filtering, you can simply apply the
>> rules before deciding to TEE the packet.
> 
> I can think of a handful of applications:
>  - CLASSIFY

Good point, you should probably reset a couple of skb members
after the skb_copy().

>  - When the cloned packets gets XFRMed or tunneled, its status switches 
>    from "special" to "plain". Doing policy routing on them does not seem 
>    so far-fetched.

Fair enough, provided we can also handle loops when conntrack
isn't used.

^ permalink raw reply

* Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
From: Michael S. Tsirkin @ 2010-04-01 11:08 UTC (permalink / raw)
  To: Xin Xiaohui; +Cc: netdev, kvm, linux-kernel, mingo, jdike, yzhao81
In-Reply-To: <1270114038-5048-1-git-send-email-xiaohui.xin@intel.com>

On Thu, Apr 01, 2010 at 05:27:18PM +0800, Xin Xiaohui wrote:
> Add a device to utilize the vhost-net backend driver for
> copy-less data transfer between guest FE and host NIC.
> It pins the guest user space to the host memory and
> provides proto_ops as sendmsg/recvmsg to vhost-net.
> 
> Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
> Signed-off-by: Zhao Yu <yzhao81@gmail.com>
> Sigend-off-by: Jeff Dike <jdike@c2.user-mode-linux.org>
> ---
> 
> Micheal,
> Sorry, I did not resolve all your comments this time.
> I did not move the device out of vhost directory because I
> did not implement real asynchronous read/write operations
> to mp device for now, We wish we can do this after the network
> code checked in. 

Well, placement of code is not such a major issue.
It's just that code under drivers/net gets more and better
review than drivers/vhost. I'll try to get Dave's opinion.

> 
> For the DOS issue, I'm not sure how much the limit get_user_pages()
> can pin is reasonable, should we compute the bindwidth to make it?

There's a ulimit for locked memory. Can we use this, decreasing
the value for rlimit array? We can do this when backend is
enabled and re-increment when backend is disabled.

> We use get_user_pages_fast() and use set_page_dirty_lock().
> Remove read_rcu_lock()/unlock(), since the ctor pointer is
> only changed by BIND/UNBIND ioctl, and during that time,
> the NIC is always stoped, all outstanding requests are done,
> so the ctor pointer cannot be raced into wrong condition.
> 
> Qemu needs a userspace write, is that a synchronous one or
> asynchronous one?

It's a synchronous non-blocking write.

> Thanks
> Xiaohui
> 
>  drivers/vhost/Kconfig     |    5 +
>  drivers/vhost/Makefile    |    2 +
>  drivers/vhost/mpassthru.c | 1162 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/mpassthru.h |   29 ++
>  4 files changed, 1198 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/vhost/mpassthru.c
>  create mode 100644 include/linux/mpassthru.h
> 
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index 9f409f4..ee32a3b 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -9,3 +9,8 @@ config VHOST_NET
>  	  To compile this driver as a module, choose M here: the module will
>  	  be called vhost_net.
>  
> +config VHOST_PASSTHRU
> +	tristate "Zerocopy network driver (EXPERIMENTAL)"
> +	depends on VHOST_NET
> +	---help---
> +	  zerocopy network I/O support
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 72dd020..3f79c79 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -1,2 +1,4 @@
>  obj-$(CONFIG_VHOST_NET) += vhost_net.o
>  vhost_net-y := vhost.o net.o
> +
> +obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
> diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
> new file mode 100644
> index 0000000..6e8fc4d
> --- /dev/null
> +++ b/drivers/vhost/mpassthru.c
> @@ -0,0 +1,1162 @@
> +/*
> + *  MPASSTHRU - Mediate passthrough device.
> + *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License, or
> + *  (at your option) any later version.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + *  GNU General Public License for more details.
> + *
> + */
> +
> +#define DRV_NAME        "mpassthru"
> +#define DRV_DESCRIPTION "Mediate passthru device driver"
> +#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
> +
> +#include <linux/module.h>
> +#include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/major.h>
> +#include <linux/slab.h>
> +#include <linux/smp_lock.h>
> +#include <linux/poll.h>
> +#include <linux/fcntl.h>
> +#include <linux/init.h>
> +#include <linux/aio.h>
> +
> +#include <linux/skbuff.h>
> +#include <linux/netdevice.h>
> +#include <linux/etherdevice.h>
> +#include <linux/miscdevice.h>
> +#include <linux/ethtool.h>
> +#include <linux/rtnetlink.h>
> +#include <linux/if.h>
> +#include <linux/if_arp.h>
> +#include <linux/if_ether.h>
> +#include <linux/crc32.h>
> +#include <linux/nsproxy.h>
> +#include <linux/uaccess.h>
> +#include <linux/virtio_net.h>
> +#include <linux/mpassthru.h>
> +#include <net/net_namespace.h>
> +#include <net/netns/generic.h>
> +#include <net/rtnetlink.h>
> +#include <net/sock.h>
> +
> +#include <asm/system.h>
> +
> +#include "vhost.h"
> +
> +/* Uncomment to enable debugging */
> +/* #define MPASSTHRU_DEBUG 1 */
> +
> +#ifdef MPASSTHRU_DEBUG
> +static int debug;
> +
> +#define DBG  if (mp->debug) printk
> +#define DBG1 if (debug == 2) printk
> +#else
> +#define DBG(a...)
> +#define DBG1(a...)
> +#endif
> +
> +#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
> +#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
> +
> +struct frag {
> +	u16     offset;
> +	u16     size;
> +};
> +
> +struct page_ctor {
> +	struct list_head        readq;
> +	int 			w_len;
> +	int 			r_len;
> +	spinlock_t      	read_lock;
> +	struct kmem_cache   	*cache;
> +	struct net_device   	*dev;
> +	struct mpassthru_port	port;
> +};
> +
> +struct page_info {
> +	void 			*ctrl;
> +	struct list_head    	list;
> +	int         		header;
> +	/* indicate the actual length of bytes
> +	 * send/recv in the user space buffers
> +	 */
> +	int         		total;
> +	int         		offset;
> +	struct page     	*pages[MAX_SKB_FRAGS+1];
> +	struct skb_frag_struct 	frag[MAX_SKB_FRAGS+1];
> +	struct sk_buff      	*skb;
> +	struct page_ctor   	*ctor;
> +
> +	/* The pointer relayed to skb, to indicate
> +	 * it's a user space allocated skb or kernel
> +	 */
> +	struct skb_user_page    user;
> +	struct skb_shared_info	ushinfo;
> +
> +#define INFO_READ      		0
> +#define INFO_WRITE     		1
> +	unsigned        	flags;
> +	unsigned        	pnum;
> +
> +	/* It's meaningful for receive, means
> +	 * the max length allowed
> +	 */
> +	size_t          	len;
> +
> +	/* The fields after that is for backend
> +	 * driver, now for vhost-net.
> +	 */
> +
> +	struct kiocb		*iocb;
> +	unsigned int    	desc_pos;
> +	unsigned int 		log;
> +	struct iovec 		hdr[VHOST_NET_MAX_SG];
> +	struct iovec 		iov[VHOST_NET_MAX_SG];
> +};
> +
> +struct mp_struct {
> +	struct mp_file   	*mfile;
> +	struct net_device       *dev;
> +	struct page_ctor	*ctor;
> +	struct socket           socket;
> +
> +#ifdef MPASSTHRU_DEBUG
> +	int debug;
> +#endif
> +};
> +
> +struct mp_file {
> +	atomic_t count;
> +	struct mp_struct *mp;
> +	struct net *net;
> +};
> +
> +struct mp_sock {
> +	struct sock            	sk;
> +	struct mp_struct       	*mp;
> +};
> +
> +static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
> +{
> +	int ret = 0;
> +
> +	rtnl_lock();
> +	ret = dev_change_flags(dev, flags);
> +	rtnl_unlock();
> +
> +	if (ret < 0)
> +		printk(KERN_ERR "failed to change dev state of %s", dev->name);
> +
> +	return ret;
> +}
> +
> +/* The main function to allocate user space buffers */
> +static struct skb_user_page *page_ctor(struct mpassthru_port *port,
> +					struct sk_buff *skb, int npages)
> +{
> +	int i;
> +	unsigned long flags;
> +	struct page_ctor *ctor;
> +	struct page_info *info = NULL;
> +
> +	ctor = container_of(port, struct page_ctor, port);
> +
> +	spin_lock_irqsave(&ctor->read_lock, flags);
> +	if (!list_empty(&ctor->readq)) {
> +		info = list_first_entry(&ctor->readq, struct page_info, list);
> +		list_del(&info->list);
> +	}
> +	spin_unlock_irqrestore(&ctor->read_lock, flags);
> +	if (!info)
> +		return NULL;
> +
> +	for (i = 0; i < info->pnum; i++) {
> +		get_page(info->pages[i]);
> +		info->frag[i].page = info->pages[i];
> +		info->frag[i].page_offset = i ? 0 : info->offset;
> +		info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
> +			port->data_len;
> +	}
> +	info->skb = skb;
> +	info->user.frags = info->frag;
> +	info->user.ushinfo = &info->ushinfo;
> +	return &info->user;
> +}
> +
> +static void mp_ki_dtor(struct kiocb *iocb)
> +{
> +	struct page_info *info = (struct page_info *)(iocb->private);
> +	int i;
> +
> +	for (i = 0; i < info->pnum; i++) {
> +		if (info->pages[i])
> +			put_page(info->pages[i]);
> +	}
> +
> +	if (info->flags == INFO_READ) {
> +		skb_shinfo(info->skb)->destructor_arg = &info->user;
> +		info->skb->destructor = NULL;
> +		kfree_skb(info->skb);
> +	}
> +
> +	kmem_cache_free(info->ctor->cache, info);
> +
> +	return;
> +}
> +
> +static struct kiocb *create_iocb(struct page_info *info, int size)
> +{
> +	struct kiocb *iocb = NULL;
> +
> +	iocb = info->iocb;
> +	if (!iocb)
> +		return iocb;
> +	iocb->ki_flags = 0;
> +	iocb->ki_users = 1;
> +	iocb->ki_key = 0;
> +	iocb->ki_ctx = NULL;
> +	iocb->ki_cancel = NULL;
> +	iocb->ki_retry = NULL;
> +	iocb->ki_iovec = NULL;
> +	iocb->ki_eventfd = NULL;
> +	iocb->private = (void *)info;
> +	iocb->ki_pos = info->desc_pos;
> +	iocb->ki_nbytes = size;
> +	iocb->ki_user_data = info->log;
> +	iocb->ki_dtor = mp_ki_dtor;
> +	return iocb;
> +}
> +
> +/* A helper to clean the skb before the kfree_skb() */
> +
> +static void page_dtor_prepare(struct page_info *info)
> +{
> +	if (info->flags == INFO_READ)
> +		if (info->skb)
> +			info->skb->head = NULL;
> +}
> +
> +/* The callback to destruct the user space buffers or skb */
> +static void page_dtor(struct skb_user_page *user)
> +{
> +	struct page_info *info;
> +	struct page_ctor *ctor;
> +	struct sock *sk;
> +	struct sk_buff *skb;
> +	struct kiocb *iocb = NULL;
> +	struct vhost_virtqueue *vq = NULL;
> +	unsigned long flags;
> +	int i;
> +
> +	if (!user)
> +		return;
> +	info = container_of(user, struct page_info, user);
> +	if (!info)
> +		return;
> +	ctor = info->ctor;
> +	skb = info->skb;
> +
> +	page_dtor_prepare(info);
> +
> +	/* If the info->total is 0, make it to be reused */
> +	if (!info->total) {
> +		spin_lock_irqsave(&ctor->read_lock, flags);
> +		list_add(&info->list, &ctor->readq);
> +		spin_unlock_irqrestore(&ctor->read_lock, flags);
> +		return;
> +	}
> +
> +	if (info->flags == INFO_READ)
> +		return;
> +
> +	/* For transmit, we should wait for the DMA finish by hardware.
> +	 * Queue the notifier to wake up the backend driver
> +	 */
> +	vq = (struct vhost_virtqueue *)info->ctrl;
> +	iocb = create_iocb(info, info->total);
> +
> +	spin_lock_irqsave(&vq->notify_lock, flags);
> +	list_add_tail(&iocb->ki_list, &vq->notifier);
> +	spin_unlock_irqrestore(&vq->notify_lock, flags);
> +
> +	sk = ctor->port.sock->sk;
> +	sk->sk_write_space(sk);
> +
> +	return;
> +}
> +
> +static int page_ctor_attach(struct mp_struct *mp)
> +{
> +	int rc;
> +	struct page_ctor *ctor;
> +	struct net_device *dev = mp->dev;
> +
> +	/* locked by mp_mutex */
> +	if (rcu_dereference(mp->ctor))
> +		return -EBUSY;
> +
> +	ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
> +	if (!ctor)
> +		return -ENOMEM;
> +	rc = netdev_mp_port_prep(dev, &ctor->port);
> +	if (rc)
> +		goto fail;
> +
> +	ctor->cache = kmem_cache_create("skb_page_info",
> +			sizeof(struct page_info), 0,
> +			SLAB_HWCACHE_ALIGN, NULL);
> +
> +	if (!ctor->cache)
> +		goto cache_fail;
> +
> +	INIT_LIST_HEAD(&ctor->readq);
> +	spin_lock_init(&ctor->read_lock);
> +
> +	ctor->w_len = 0;
> +	ctor->r_len = 0;
> +
> +	dev_hold(dev);
> +	ctor->dev = dev;
> +	ctor->port.ctor = page_ctor;
> +	ctor->port.sock = &mp->socket;
> +
> +	rc = netdev_mp_port_attach(dev, &ctor->port);
> +	if (rc)
> +		goto fail;
> +
> +	/* locked by mp_mutex */
> +	rcu_assign_pointer(mp->ctor, ctor);
> +
> +	/* XXX:Need we do set_offload here ? */
> +
> +	return 0;
> +
> +fail:
> +	kmem_cache_destroy(ctor->cache);
> +cache_fail:
> +	kfree(ctor);
> +	dev_put(dev);
> +
> +	return rc;
> +}
> +
> +struct page_info *info_dequeue(struct page_ctor *ctor)
> +{
> +	unsigned long flags;
> +	struct page_info *info = NULL;
> +	spin_lock_irqsave(&ctor->read_lock, flags);
> +	if (!list_empty(&ctor->readq)) {
> +		info = list_first_entry(&ctor->readq,
> +				struct page_info, list);
> +		list_del(&info->list);
> +	}
> +	spin_unlock_irqrestore(&ctor->read_lock, flags);
> +	return info;
> +}
> +
> +static int page_ctor_detach(struct mp_struct *mp)
> +{
> +	struct page_ctor *ctor;
> +	struct page_info *info;
> +	struct vhost_virtqueue *vq = NULL;
> +	struct kiocb *iocb = NULL;
> +	int i;
> +	unsigned long flags;
> +
> +	/* locked by mp_mutex */
> +	ctor = rcu_dereference(mp->ctor);
> +	if (!ctor)
> +		return -ENODEV;
> +
> +	while ((info = info_dequeue(ctor))) {
> +		for (i = 0; i < info->pnum; i++)
> +			if (info->pages[i])
> +				put_page(info->pages[i]);
> +		vq = (struct vhost_virtqueue *)(info->ctrl);
> +		iocb = create_iocb(info, 0);
> +
> +		spin_lock_irqsave(&vq->notify_lock, flags);
> +		list_add_tail(&iocb->ki_list, &vq->notifier);
> +		spin_unlock_irqrestore(&vq->notify_lock, flags);
> +
> +		kmem_cache_free(ctor->cache, info);
> +	}
> +	kmem_cache_destroy(ctor->cache);
> +	netdev_mp_port_detach(ctor->dev);
> +	dev_put(ctor->dev);
> +
> +	/* locked by mp_mutex */
> +	rcu_assign_pointer(mp->ctor, NULL);
> +	synchronize_rcu();
> +
> +	kfree(ctor);
> +	return 0;
> +}
> +
> +/* For small user space buffers transmit, we don't need to call
> + * get_user_pages().
> + */
> +static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
> +						struct kiocb *iocb, int total)
> +{
> +	struct page_info *info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
> +
> +	if (!info)
> +		return NULL;
> +	info->total = total;
> +	info->user.dtor = page_dtor;
> +	info->ctor = ctor;
> +	info->flags = INFO_WRITE;
> +	info->iocb = iocb;
> +	return info;
> +}
> +
> +/* The main function to transform the guest user space address
> + * to host kernel address via get_user_pages(). Thus the hardware
> + * can do DMA directly to the user space address.
> + */
> +static struct page_info *alloc_page_info(struct page_ctor *ctor,
> +					struct kiocb *iocb, struct iovec *iov,
> +					int count, struct frag *frags,
> +					int npages, int total)
> +{
> +	int rc;
> +	int i, j, n = 0;
> +	int len;
> +	unsigned long base;
> +	struct page_info *info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);
> +
> +	if (!info)
> +		return NULL;
> +
> +	for (i = j = 0; i < count; i++) {
> +		base = (unsigned long)iov[i].iov_base;
> +		len = iov[i].iov_len;
> +
> +		if (!len)
> +			continue;
> +		n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
> +
> +		rc = get_user_pages_fast(base, n, npages ? 1 : 0,
> +						&info->pages[j]);
> +		if (rc != n)
> +			goto failed;
> +
> +		while (n--) {
> +			frags[j].offset = base & ~PAGE_MASK;
> +			frags[j].size = min_t(int, len,
> +					PAGE_SIZE - frags[j].offset);
> +			len -= frags[j].size;
> +			base += frags[j].size;
> +			j++;
> +		}
> +	}
> +
> +#ifdef CONFIG_HIGHMEM
> +	if (npages && !(dev->features & NETIF_F_HIGHDMA)) {
> +		for (i = 0; i < j; i++) {
> +			if (PageHighMem(info->pages[i]))
> +				goto failed;
> +		}
> +	}
> +#endif
> +
> +	info->total = total;
> +	info->user.dtor = page_dtor;
> +	info->ctor = ctor;
> +	info->pnum = j;
> +	info->iocb = iocb;
> +	if (!npages)
> +		info->flags = INFO_WRITE;
> +	if (info->flags == INFO_READ) {
> +		info->user.start = (u8 *)(((unsigned long)
> +				(pfn_to_kaddr(page_to_pfn(info->pages[0]))) +
> +				frags[0].offset) - NET_IP_ALIGN - NET_SKB_PAD);
> +		info->user.size = iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD;
> +		for (i = 0; i < j; i++)
> +			set_page_dirty_lock(info->pages[i]);
> +	}
> +	return info;
> +
> +failed:
> +	for (i = 0; i < j; i++)
> +		put_page(info->pages[i]);
> +
> +	kmem_cache_free(ctor->cache, info);
> +
> +	return NULL;
> +}
> +
> +static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
> +			struct msghdr *m, size_t total_len)
> +{
> +	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> +	struct page_ctor *ctor;
> +	struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private);
> +	struct iovec *iov = m->msg_iov;
> +	struct page_info *info = NULL;
> +	struct frag frags[MAX_SKB_FRAGS];
> +	struct sk_buff *skb;
> +	int count = m->msg_iovlen;
> +	int total = 0, header, n, i, len, rc;
> +	unsigned long base;
> +
> +	ctor = rcu_dereference(mp->ctor);
> +	if (!ctor)
> +		return -ENODEV;
> +
> +	total = iov_length(iov, count);
> +
> +	if (total < ETH_HLEN)
> +		return -EINVAL;
> +
> +	if (total <= COPY_THRESHOLD)
> +		goto copy;
> +
> +	n = 0;
> +	for (i = 0; i < count; i++) {
> +		base = (unsigned long)iov[i].iov_base;
> +		len = iov[i].iov_len;
> +		if (!len)
> +			continue;
> +		n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
> +		if (n > MAX_SKB_FRAGS)
> +			return -EINVAL;
> +	}
> +
> +copy:
> +	header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total;
> +
> +	skb = alloc_skb(header + NET_IP_ALIGN, GFP_ATOMIC);
> +	if (!skb)
> +		goto drop;
> +
> +	skb_reserve(skb, NET_IP_ALIGN);
> +
> +	skb_set_network_header(skb, ETH_HLEN);
> +
> +	memcpy_fromiovec(skb->data, iov, header);
> +	skb_put(skb, header);
> +	skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);
> +
> +	if (header == total) {
> +		rc = total;
> +		info = alloc_small_page_info(ctor, iocb, total);
> +	} else {
> +		info = alloc_page_info(ctor, iocb, iov, count, frags, 0, total);
> +		if (info)
> +			for (i = 0; info->pages[i]; i++) {
> +				skb_add_rx_frag(skb, i, info->pages[i],
> +						frags[i].offset, frags[i].size);
> +				info->pages[i] = NULL;
> +			}
> +	}
> +	if (info != NULL) {
> +		info->desc_pos = iocb->ki_pos;
> +		info->ctrl = vq;
> +		info->total = total;
> +		info->skb = skb;
> +		skb_shinfo(skb)->destructor_arg = &info->user;
> +		skb->dev = mp->dev;
> +		dev_queue_xmit(skb);
> +		mp->dev->stats.tx_packets++;
> +		mp->dev->stats.tx_bytes += total;
> +		return 0;
> +	}
> +drop:
> +	kfree_skb(skb);
> +	if (info) {
> +		for (i = 0; info->pages[i]; i++)
> +			put_page(info->pages[i]);
> +		kmem_cache_free(info->ctor->cache, info);
> +	}
> +	mp->dev->stats.tx_dropped++;
> +	return -ENOMEM;
> +}
> +
> +
> +static void mp_recvmsg_notify(struct vhost_virtqueue *vq)
> +{
> +	struct socket *sock = vq->private_data;
> +	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> +	struct page_ctor *ctor = NULL;
> +	struct sk_buff *skb = NULL;
> +	struct page_info *info = NULL;
> +	struct ethhdr *eth;
> +	struct kiocb *iocb = NULL;
> +	int len, i;
> +	unsigned long flags;
> +
> +	struct virtio_net_hdr hdr = {
> +		.flags = 0,
> +		.gso_type = VIRTIO_NET_HDR_GSO_NONE
> +	};
> +
> +	ctor = rcu_dereference(mp->ctor);
> +	if (!ctor)
> +		return;
> +
> +	while ((skb = skb_dequeue(&sock->sk->sk_receive_queue)) != NULL) {
> +		if (skb_shinfo(skb)->destructor_arg) {
> +			info = container_of(skb_shinfo(skb)->destructor_arg,
> +					struct page_info, user);
> +			info->skb = skb;
> +			if (skb->len > info->len) {
> +				mp->dev->stats.rx_dropped++;
> +				DBG(KERN_INFO "Discarded truncated rx packet: "
> +					" len %d > %zd\n", skb->len, info->len);
> +				info->total = skb->len;
> +				goto clean;
> +			} else {
> +				int i;
> +				struct skb_shared_info *gshinfo =
> +				(struct skb_shared_info *)(&info->ushinfo);
> +				struct skb_shared_info *hshinfo =
> +						skb_shinfo(skb);
> +
> +				if (gshinfo->nr_frags < hshinfo->nr_frags)
> +					goto clean;
> +				eth = eth_hdr(skb);
> +				skb_push(skb, ETH_HLEN);
> +
> +				hdr.hdr_len = skb_headlen(skb);
> +				info->total = skb->len;
> +
> +				for (i = 0; i < gshinfo->nr_frags; i++)
> +					gshinfo->frags[i].size = 0;
> +				for (i = 0; i < hshinfo->nr_frags; i++)
> +					gshinfo->frags[i].size =
> +						hshinfo->frags[i].size;
> +				memcpy(skb_shinfo(skb), &info->ushinfo,
> +						sizeof(struct skb_shared_info));
> +			}
> +		} else {
> +			/* The skb composed with kernel buffers
> +			 * in case user space buffers are not sufficent.
> +			 * The case should be rare.
> +			 */
> +			unsigned long flags;
> +			int i;
> +			struct skb_shared_info *gshinfo = NULL;
> +
> +			info = NULL;
> +
> +			spin_lock_irqsave(&ctor->read_lock, flags);
> +			if (!list_empty(&ctor->readq)) {
> +				info = list_first_entry(&ctor->readq,
> +						struct page_info, list);
> +				list_del(&info->list);
> +			}
> +			spin_unlock_irqrestore(&ctor->read_lock, flags);
> +			if (!info) {
> +				DBG(KERN_INFO "No user buffer avaliable %p\n",
> +									skb);
> +				skb_queue_head(&sock->sk->sk_receive_queue,
> +									skb);
> +				break;
> +			}
> +			info->skb = skb;
> +			/* compute the guest skb frags info */
> +			gshinfo = (struct skb_shared_info *)(info->user.start +
> +					SKB_DATA_ALIGN(info->user.size));
> +
> +			if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags)
> +				goto clean;
> +
> +			eth = eth_hdr(skb);
> +			skb_push(skb, ETH_HLEN);
> +			info->total = skb->len;
> +
> +			for (i = 0; i < gshinfo->nr_frags; i++)
> +				gshinfo->frags[i].size = 0;
> +			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> +				gshinfo->frags[i].size =
> +					skb_shinfo(skb)->frags[i].size;
> +			hdr.hdr_len = min_t(int, skb->len,
> +						info->iov[1].iov_len);
> +			skb_copy_datagram_iovec(skb, 0, info->iov, skb->len);
> +		}
> +
> +		len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr,
> +								 sizeof hdr);
> +		if (len) {
> +			DBG(KERN_INFO
> +				"Unable to write vnet_hdr at addr %p: %d\n",
> +				info->hdr->iov_base, len);
> +			goto clean;
> +		}
> +		iocb = create_iocb(info, skb->len + sizeof(hdr));
> +
> +		spin_lock_irqsave(&vq->notify_lock, flags);
> +		list_add_tail(&iocb->ki_list, &vq->notifier);
> +		spin_unlock_irqrestore(&vq->notify_lock, flags);
> +		continue;
> +
> +clean:
> +		kfree_skb(skb);
> +		for (i = 0; info->pages[i]; i++)
> +			put_page(info->pages[i]);
> +		kmem_cache_free(ctor->cache, info);
> +	}
> +	return;
> +}
> +
> +static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
> +			struct msghdr *m, size_t total_len,
> +			int flags)
> +{
> +	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> +	struct page_ctor *ctor;
> +	struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private);
> +	struct iovec *iov = m->msg_iov;
> +	int count = m->msg_iovlen;
> +	int npages, payload;
> +	struct page_info *info;
> +	struct frag frags[MAX_SKB_FRAGS];
> +	unsigned long base;
> +	int i, len;
> +	unsigned long flag;
> +
> +	if (!(flags & MSG_DONTWAIT))
> +		return -EINVAL;
> +
> +	ctor = rcu_dereference(mp->ctor);
> +	if (!ctor)
> +		return -EINVAL;
> +
> +	/* Error detections in case invalid user space buffer */
> +	if (count > 2 && iov[1].iov_len < ctor->port.hdr_len &&
> +			mp->dev->features & NETIF_F_SG) {
> +		return -EINVAL;
> +	}
> +
> +	npages = ctor->port.npages;
> +	payload = ctor->port.data_len;
> +
> +	/* If KVM guest virtio-net FE driver use SG feature */
> +	if (count > 2) {
> +		for (i = 2; i < count; i++) {
> +			base = (unsigned long)iov[i].iov_base & ~PAGE_MASK;
> +			len = iov[i].iov_len;
> +			if (npages == 1)
> +				len = min_t(int, len, PAGE_SIZE - base);
> +			else if (base)
> +				break;
> +			payload -= len;
> +			if (payload <= 0)
> +				goto proceed;
> +			if (npages == 1 || (len & ~PAGE_MASK))
> +				break;
> +		}
> +	}
> +
> +	if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK)
> +				- NET_SKB_PAD - NET_IP_ALIGN) >= 0)
> +		goto proceed;
> +
> +	return -EINVAL;
> +
> +proceed:
> +	/* skip the virtnet head */
> +	iov++;
> +	count--;
> +
> +	/* Translate address to kernel */
> +	info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
> +	if (!info)
> +		return -ENOMEM;
> +	info->len = total_len;
> +	info->hdr[0].iov_base = vq->hdr[0].iov_base;
> +	info->hdr[0].iov_len = vq->hdr[0].iov_len;
> +	info->offset = frags[0].offset;
> +	info->desc_pos = iocb->ki_pos;
> +	info->log = iocb->ki_user_data;
> +	info->ctrl = vq;
> +
> +	iov--;
> +	count++;
> +
> +	memcpy(info->iov, vq->iov, sizeof(struct iovec) * count);
> +
> +	spin_lock_irqsave(&ctor->read_lock, flag);
> +	list_add_tail(&info->list, &ctor->readq);
> +	spin_unlock_irqrestore(&ctor->read_lock, flag);
> +
> +	if (!vq->receiver)
> +		vq->receiver = mp_recvmsg_notify;
> +
> +	return 0;
> +}
> +
> +static void __mp_detach(struct mp_struct *mp)
> +{
> +	mp->mfile = NULL;
> +
> +	mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
> +	page_ctor_detach(mp);
> +	mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
> +
> +	/* Drop the extra count on the net device */
> +	dev_put(mp->dev);
> +}
> +
> +static DEFINE_MUTEX(mp_mutex);
> +
> +static void mp_detach(struct mp_struct *mp)
> +{
> +	mutex_lock(&mp_mutex);
> +	__mp_detach(mp);
> +	mutex_unlock(&mp_mutex);
> +}
> +
> +static void mp_put(struct mp_file *mfile)
> +{
> +	if (atomic_dec_and_test(&mfile->count))
> +		mp_detach(mfile->mp);
> +}
> +
> +static int mp_release(struct socket *sock)
> +{
> +	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> +	struct mp_file *mfile = mp->mfile;
> +
> +	mp_put(mfile);
> +	sock_put(mp->socket.sk);
> +	put_net(mfile->net);
> +
> +	return 0;
> +}
> +
> +/* Ops structure to mimic raw sockets with mp device */
> +static const struct proto_ops mp_socket_ops = {
> +	.sendmsg = mp_sendmsg,
> +	.recvmsg = mp_recvmsg,
> +	.release = mp_release,
> +};
> +
> +static struct proto mp_proto = {
> +	.name           = "mp",
> +	.owner          = THIS_MODULE,
> +	.obj_size       = sizeof(struct mp_sock),
> +};
> +
> +static int mp_chr_open(struct inode *inode, struct file * file)
> +{
> +	struct mp_file *mfile;
> +	cycle_kernel_lock();
> +	DBG1(KERN_INFO "mp: mp_chr_open\n");
> +
> +	mfile = kzalloc(sizeof(*mfile), GFP_KERNEL);
> +	if (!mfile)
> +		return -ENOMEM;
> +	atomic_set(&mfile->count, 0);
> +	mfile->mp = NULL;
> +	mfile->net = get_net(current->nsproxy->net_ns);
> +	file->private_data = mfile;
> +	return 0;
> +}
> +
> +
> +static struct mp_struct *mp_get(struct mp_file *mfile)
> +{
> +	struct mp_struct *mp = NULL;
> +	if (atomic_inc_not_zero(&mfile->count))
> +		mp = mfile->mp;
> +
> +	return mp;
> +}
> +
> +
> +static int mp_attach(struct mp_struct *mp, struct file *file)
> +{
> +	struct mp_file *mfile = file->private_data;
> +	int err;
> +
> +	netif_tx_lock_bh(mp->dev);
> +
> +	err = -EINVAL;
> +
> +	if (mfile->mp)
> +		goto out;
> +
> +	err = -EBUSY;
> +	if (mp->mfile)
> +		goto out;
> +
> +	err = 0;
> +	mfile->mp = mp;
> +	mp->mfile = mfile;
> +	mp->socket.file = file;
> +	dev_hold(mp->dev);
> +	sock_hold(mp->socket.sk);
> +	atomic_inc(&mfile->count);
> +
> +out:
> +	netif_tx_unlock_bh(mp->dev);
> +	return err;
> +}
> +
> +static void mp_sock_destruct(struct sock *sk)
> +{
> +	struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
> +	kfree(mp);
> +}
> +
> +static int do_unbind(struct mp_file *mfile)
> +{
> +	struct mp_struct *mp = mp_get(mfile);
> +
> +	if (!mp)
> +		return -EINVAL;
> +
> +	mp_detach(mp);
> +	sock_put(mp->socket.sk);
> +	mp_put(mfile);
> +	return 0;
> +}
> +
> +static void mp_sock_data_ready(struct sock *sk, int len)
> +{
> +	if (sk_has_sleeper(sk))
> +		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
> +}
> +
> +static void mp_sock_write_space(struct sock *sk)
> +{
> +	if (sk_has_sleeper(sk))
> +		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
> +}
> +
> +static long mp_chr_ioctl(struct file *file, unsigned int cmd,
> +		unsigned long arg)
> +{
> +	struct mp_file *mfile = file->private_data;
> +	struct mp_struct *mp;
> +	struct net_device *dev;
> +	void __user* argp = (void __user *)arg;
> +	struct ifreq ifr;
> +	struct sock *sk;
> +	int ret;
> +
> +	ret = -EINVAL;
> +
> +	switch (cmd) {
> +	case MPASSTHRU_BINDDEV:
> +		ret = -EFAULT;
> +		if (copy_from_user(&ifr, argp, sizeof ifr))
> +			break;
> +
> +		ifr.ifr_name[IFNAMSIZ-1] = '\0';
> +
> +		ret = -EBUSY;
> +
> +		if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
> +			break;
> +
> +		ret = -ENODEV;
> +		dev = dev_get_by_name(mfile->net, ifr.ifr_name);
> +		if (!dev)
> +			break;
> +
> +		mutex_lock(&mp_mutex);
> +
> +		ret = -EBUSY;
> +		mp = mfile->mp;
> +		if (mp)
> +			goto err_dev_put;
> +
> +		mp = kzalloc(sizeof(*mp), GFP_KERNEL);
> +		if (!mp) {
> +			ret = -ENOMEM;
> +			goto err_dev_put;
> +		}
> +		mp->dev = dev;
> +		ret = -ENOMEM;
> +
> +		sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
> +		if (!sk)
> +			goto err_free_mp;
> +
> +		init_waitqueue_head(&mp->socket.wait);
> +		mp->socket.ops = &mp_socket_ops;
> +		sock_init_data(&mp->socket, sk);
> +		sk->sk_sndbuf = INT_MAX;
> +		container_of(sk, struct mp_sock, sk)->mp = mp;
> +
> +		sk->sk_destruct = mp_sock_destruct;
> +		sk->sk_data_ready = mp_sock_data_ready;
> +		sk->sk_write_space = mp_sock_write_space;
> +
> +		ret = mp_attach(mp, file);
> +		if (ret < 0)
> +			goto err_free_sk;
> +
> +		ret = page_ctor_attach(mp);
> +		if (ret < 0)
> +			goto err_free_sk;
> +
> +		ifr.ifr_flags |= IFF_MPASSTHRU_EXCL;
> +		mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
> +out:
> +		mutex_unlock(&mp_mutex);
> +		break;
> +err_free_sk:
> +		sk_free(sk);
> +err_free_mp:
> +		kfree(mp);
> +err_dev_put:
> +		dev_put(dev);
> +		goto out;
> +
> +	case MPASSTHRU_UNBINDDEV:
> +		ret = do_unbind(mfile);
> +		break;
> +
> +	default:
> +		break;
> +	}
> +	return ret;
> +}
> +
> +static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
> +{
> +	struct mp_file *mfile = file->private_data;
> +	struct mp_struct *mp = mp_get(mfile);
> +	struct sock *sk;
> +	unsigned int mask = 0;
> +
> +	if (!mp)
> +		return POLLERR;
> +
> +	sk = mp->socket.sk;
> +
> +	poll_wait(file, &mp->socket.wait, wait);
> +
> +	if (!skb_queue_empty(&sk->sk_receive_queue))
> +		mask |= POLLIN | POLLRDNORM;
> +
> +	if (sock_writeable(sk) ||
> +		(!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
> +			 sock_writeable(sk)))
> +		mask |= POLLOUT | POLLWRNORM;
> +
> +	if (mp->dev->reg_state != NETREG_REGISTERED)
> +		mask = POLLERR;
> +
> +	mp_put(mfile);
> +	return mask;
> +}
> +
> +static int mp_chr_close(struct inode *inode, struct file *file)
> +{
> +	struct mp_file *mfile = file->private_data;
> +
> +	/*
> +	 * Ignore return value since an error only means there was nothing to
> +	 * do
> +	 */
> +	do_unbind(mfile);
> +
> +	put_net(mfile->net);
> +	kfree(mfile);
> +
> +	return 0;
> +}
> +
> +static const struct file_operations mp_fops = {
> +	.owner  = THIS_MODULE,
> +	.llseek = no_llseek,
> +	.poll   = mp_chr_poll,
> +	.unlocked_ioctl = mp_chr_ioctl,
> +	.open   = mp_chr_open,
> +	.release = mp_chr_close,
> +};
> +
> +static struct miscdevice mp_miscdev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = "mp",
> +	.nodename = "net/mp",
> +	.fops = &mp_fops,
> +};
> +
> +static int mp_device_event(struct notifier_block *unused,
> +		unsigned long event, void *ptr)
> +{
> +	struct net_device *dev = ptr;
> +	struct mpassthru_port *port;
> +	struct mp_struct *mp = NULL;
> +	struct socket *sock = NULL;
> +
> +	port = dev->mp_port;
> +	if (port == NULL)
> +		return NOTIFY_DONE;
> +
> +	switch (event) {
> +	case NETDEV_UNREGISTER:
> +			sock = dev->mp_port->sock;
> +			mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> +			do_unbind(mp->mfile);
> +			break;
> +	}
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block mp_notifier_block __read_mostly = {
> +	.notifier_call  = mp_device_event,
> +};
> +
> +static int mp_init(void)
> +{
> +	int ret = 0;
> +
> +	ret = misc_register(&mp_miscdev);
> +	if (ret)
> +		printk(KERN_ERR "mp: Can't register misc device\n");
> +	else {
> +		printk(KERN_INFO "Registering mp misc device - minor = %d\n",
> +			mp_miscdev.minor);
> +		register_netdevice_notifier(&mp_notifier_block);
> +	}
> +	return ret;
> +}
> +
> +void mp_cleanup(void)
> +{
> +	unregister_netdevice_notifier(&mp_notifier_block);
> +	misc_deregister(&mp_miscdev);
> +}
> +
> +/* Get an underlying socket object from mp file.  Returns error unless file is
> + * attached to a device.  The returned object works like a packet socket, it
> + * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
> + * holding a reference to the file for as long as the socket is in use. */
> +struct socket *mp_get_socket(struct file *file)
> +{
> +	struct mp_file *mfile = file->private_data;
> +	struct mp_struct *mp;
> +
> +	if (file->f_op != &mp_fops)
> +		return ERR_PTR(-EINVAL);
> +	mp = mp_get(mfile);
> +	if (!mp)
> +		return ERR_PTR(-EBADFD);
> +	mp_put(mfile);
> +	return &mp->socket;
> +}
> +EXPORT_SYMBOL_GPL(mp_get_socket);
> +
> +module_init(mp_init);
> +module_exit(mp_cleanup);
> +MODULE_AUTHOR(DRV_COPYRIGHT);
> +MODULE_DESCRIPTION(DRV_DESCRIPTION);
> +MODULE_LICENSE("GPL v2");
> diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
> new file mode 100644
> index 0000000..2be21c5
> --- /dev/null
> +++ b/include/linux/mpassthru.h
> @@ -0,0 +1,29 @@
> +#ifndef __MPASSTHRU_H
> +#define __MPASSTHRU_H
> +
> +#include <linux/types.h>
> +#include <linux/if_ether.h>
> +
> +/* ioctl defines */
> +#define MPASSTHRU_BINDDEV      _IOW('M', 213, int)
> +#define MPASSTHRU_UNBINDDEV    _IOW('M', 214, int)
> +
> +/* MPASSTHRU ifc flags */
> +#define IFF_MPASSTHRU		0x0001
> +#define IFF_MPASSTHRU_EXCL	0x0002
> +
> +#ifdef __KERNEL__
> +#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
> +struct socket *mp_get_socket(struct file *);
> +#else
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +struct file;
> +struct socket;
> +static inline struct socket *mp_get_socket(struct file *f)
> +{
> +	return ERR_PTR(-EINVAL);
> +}
> +#endif /* CONFIG_VHOST_PASSTHRU */
> +#endif /* __KERNEL__ */
> +#endif /* __MPASSTHRU_H */
> -- 
> 1.5.4.4

^ permalink raw reply

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Jan Engelhardt @ 2010-04-01 11:03 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netfilter-devel, netdev
In-Reply-To: <4BB47768.1050405@trash.net>

On Thursday 2010-04-01 12:37, Patrick McHardy wrote:

>Jan Engelhardt wrote:
>> Since Xtables is now reentrant/nestable, the cloned packet can also go
>> through Xtables and be subject to rules itself.
>
>That sounds dangerous if conntrack isn't used to prevent loops.

Conntrack loops are prevented by using a dummy conntrack, just as 
NOTRACK does.

>Is that really useful? For filtering, you can simply apply the
>rules before deciding to TEE the packet.

I can think of a handful of applications:
 - CLASSIFY

 - When the cloned packets gets XFRMed or tunneled, its status switches 
   from "special" to "plain". Doing policy routing on them does not seem 
   so far-fetched.

^ permalink raw reply

* Re: [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
From: Patrick McHardy @ 2010-04-01 11:03 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jorrit Kronjee, netfilter-devel, netdev
In-Reply-To: <1270038210.2103.23.camel@edumazet-laptop>

Eric Dumazet wrote:
> xt_hashlimit uses a central lock per hash table and suffers from
> contention on some workloads. (Multiqueue NIC or if RPS is enabled)
> 
> After RCU conversion, central lock is only used when a writer wants to
> add or delete an entry.
> 
> For 'readers', updating an existing entry, they use an individual lock
> per entry.

Looks good to me, thanks Eric.

> -/* allocate dsthash_ent, initialize dst, put in htable and lock it */
> -static struct dsthash_ent *
> -dsthash_alloc_init(struct xt_hashlimit_htable *ht,
> -		   const struct dsthash_dst *dst)

Is there a reason for moving this function downwards in the file?
That unnecessarily increases the diff and makes the patch harder to
review. For review purposes I moved it back up, resulting in 42
lines less diff.

^ permalink raw reply

* Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.
From: Michael S. Tsirkin @ 2010-04-01 11:02 UTC (permalink / raw)
  To: Xin Xiaohui; +Cc: netdev, kvm, linux-kernel, mingo, jdike
In-Reply-To: <1270113296-4999-1-git-send-email-xiaohui.xin@intel.com>

On Thu, Apr 01, 2010 at 05:14:56PM +0800, Xin Xiaohui wrote:
> The vhost-net backend now only supports synchronous send/recv
> operations. The patch provides multiple submits and asynchronous
> notifications. This is needed for zero-copy case.
> 
> Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
> ---
> 
> Michael,
> Now, I made vhost to alloc/destroy the kiocb, and transfer it from 
> sendmsg/recvmsg. I did not remove vq->receiver, since what the
> callback does is related to the structures owned by mp device,
> and I think isolation them to vhost is a good thing to us all.
> And it will not prevent mp device to be independent of vhost 
> in future. Later, when mp device can be a real device which
> provides asynchronous read/write operations and not just report
> proto_ops, it will use another callback function which is not
> related to vhost at all.

Thanks, I'll look at the code!

> For the write logging, do you have a function in hand that we can
> recompute the log? If that, I think I can use it to recompute the
> log info when the logging is suddenly enabled.
> For the outstanding requests, do you mean all the user buffers have
> submitted before the logging ioctl changed? That may be a lot, and
> some of them are still in NIC ring descriptors. Waiting them to be
> finished may be need some time. I think when logging ioctl changed,
> then the logging is changed just after that is also reasonable.

The key point is that after loggin ioctl returns, any
subsequent change to memory must be logged. It does not
matter when was the request submitted, otherwise we will
get memory corruption on migration.

> Thanks
> Xiaohui
> 
>  drivers/vhost/net.c   |  189 +++++++++++++++++++++++++++++++++++++++++++++++--
>  drivers/vhost/vhost.h |   10 +++
>  2 files changed, 192 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 22d5fef..2aafd90 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -17,11 +17,13 @@
>  #include <linux/workqueue.h>
>  #include <linux/rcupdate.h>
>  #include <linux/file.h>
> +#include <linux/aio.h>
>  
>  #include <linux/net.h>
>  #include <linux/if_packet.h>
>  #include <linux/if_arp.h>
>  #include <linux/if_tun.h>
> +#include <linux/mpassthru.h>
>  
>  #include <net/sock.h>
>  
> @@ -47,6 +49,7 @@ struct vhost_net {
>  	struct vhost_dev dev;
>  	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
>  	struct vhost_poll poll[VHOST_NET_VQ_MAX];
> +	struct kmem_cache       *cache;
>  	/* Tells us whether we are polling a socket for TX.
>  	 * We only do this when socket buffer fills up.
>  	 * Protected by tx vq lock. */
> @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
>  	net->tx_poll_state = VHOST_NET_POLL_STARTED;
>  }
>  
> +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> +{
> +	struct kiocb *iocb = NULL;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&vq->notify_lock, flags);
> +	if (!list_empty(&vq->notifier)) {
> +		iocb = list_first_entry(&vq->notifier,
> +				struct kiocb, ki_list);
> +		list_del(&iocb->ki_list);
> +	}
> +	spin_unlock_irqrestore(&vq->notify_lock, flags);
> +	return iocb;
> +}
> +
> +static void handle_async_rx_events_notify(struct vhost_net *net,
> +					struct vhost_virtqueue *vq)
> +{
> +	struct kiocb *iocb = NULL;
> +	struct vhost_log *vq_log = NULL;
> +	int rx_total_len = 0;
> +	int log, size;
> +
> +	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> +		return;
> +
> +	if (vq->receiver)
> +		vq->receiver(vq);
> +
> +	vq_log = unlikely(vhost_has_feature(
> +				&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> +	while ((iocb = notify_dequeue(vq)) != NULL) {
> +		vhost_add_used_and_signal(&net->dev, vq,
> +				iocb->ki_pos, iocb->ki_nbytes);
> +		log = (int)iocb->ki_user_data;
> +		size = iocb->ki_nbytes;
> +		rx_total_len += iocb->ki_nbytes;
> +
> +		if (iocb->ki_dtor)
> +			iocb->ki_dtor(iocb);
> +		kmem_cache_free(net->cache, iocb);
> +
> +		if (unlikely(vq_log))
> +			vhost_log_write(vq, vq_log, log, size);
> +		if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> +			vhost_poll_queue(&vq->poll);
> +			break;
> +		}
> +	}
> +}
> +
> +static void handle_async_tx_events_notify(struct vhost_net *net,
> +					struct vhost_virtqueue *vq)
> +{
> +	struct kiocb *iocb = NULL;
> +	int tx_total_len = 0;
> +
> +	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> +		return;
> +
> +	while ((iocb = notify_dequeue(vq)) != NULL) {
> +		vhost_add_used_and_signal(&net->dev, vq,
> +				iocb->ki_pos, 0);
> +		tx_total_len += iocb->ki_nbytes;
> +
> +		if (iocb->ki_dtor)
> +			iocb->ki_dtor(iocb);
> +
> +		kmem_cache_free(net->cache, iocb);
> +		if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> +			vhost_poll_queue(&vq->poll);
> +			break;
> +		}
> +	}
> +}
> +
>  /* Expects to be always run from workqueue - which acts as
>   * read-size critical section for our kind of RCU. */
>  static void handle_tx(struct vhost_net *net)
>  {
>  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> +	struct kiocb *iocb = NULL;
>  	unsigned head, out, in, s;
>  	struct msghdr msg = {
>  		.msg_name = NULL,
> @@ -124,6 +204,8 @@ static void handle_tx(struct vhost_net *net)
>  		tx_poll_stop(net);
>  	hdr_size = vq->hdr_size;
>  
> +	handle_async_tx_events_notify(net, vq);
> +
>  	for (;;) {
>  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>  					 ARRAY_SIZE(vq->iov),
> @@ -151,6 +233,15 @@ static void handle_tx(struct vhost_net *net)
>  		/* Skip header. TODO: support TSO. */
>  		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
>  		msg.msg_iovlen = out;
> +
> +		if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> +			iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> +			if (!iocb)
> +				break;
> +			iocb->ki_pos = head;
> +			iocb->private = (void *)vq;
> +		}
> +
>  		len = iov_length(vq->iov, out);
>  		/* Sanity check */
>  		if (!len) {
> @@ -160,12 +251,16 @@ static void handle_tx(struct vhost_net *net)
>  			break;
>  		}
>  		/* TODO: Check specific error and bomb out unless ENOBUFS? */
> -		err = sock->ops->sendmsg(NULL, sock, &msg, len);
> +		err = sock->ops->sendmsg(iocb, sock, &msg, len);
>  		if (unlikely(err < 0)) {
>  			vhost_discard_vq_desc(vq);
>  			tx_poll_start(net, sock);
>  			break;
>  		}
> +
> +		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> +			continue;
> +
>  		if (err != len)
>  			pr_err("Truncated TX packet: "
>  			       " len %d != %zd\n", err, len);
> @@ -177,6 +272,8 @@ static void handle_tx(struct vhost_net *net)
>  		}
>  	}
>  
> +	handle_async_tx_events_notify(net, vq);
> +
>  	mutex_unlock(&vq->mutex);
>  	unuse_mm(net->dev.mm);
>  }
> @@ -186,6 +283,7 @@ static void handle_tx(struct vhost_net *net)
>  static void handle_rx(struct vhost_net *net)
>  {
>  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> +	struct kiocb *iocb = NULL;
>  	unsigned head, out, in, log, s;
>  	struct vhost_log *vq_log;
>  	struct msghdr msg = {
> @@ -206,7 +304,8 @@ static void handle_rx(struct vhost_net *net)
>  	int err;
>  	size_t hdr_size;
>  	struct socket *sock = rcu_dereference(vq->private_data);
> -	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> +	if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
> +			vq->link_state == VHOST_VQ_LINK_SYNC))
>  		return;
>  
>  	use_mm(net->dev.mm);
> @@ -214,9 +313,18 @@ static void handle_rx(struct vhost_net *net)
>  	vhost_disable_notify(vq);
>  	hdr_size = vq->hdr_size;
>  
> -	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> +	/* In async cases, for write logging, the simple way is to get
> +	 * the log info always, and really logging is decided later.
> +	 * Thus, when logging enabled, we can get log, and when logging
> +	 * disabled, we can get log disabled accordingly.
> +	 */
> +
> +	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
> +		(vq->link_state == VHOST_VQ_LINK_ASYNC) ?
>  		vq->log : NULL;
>  
> +	handle_async_rx_events_notify(net, vq);
> +
>  	for (;;) {
>  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>  					 ARRAY_SIZE(vq->iov),
> @@ -245,6 +353,14 @@ static void handle_rx(struct vhost_net *net)
>  		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
>  		msg.msg_iovlen = in;
>  		len = iov_length(vq->iov, in);
> +		if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> +			iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> +			if (!iocb)
> +				break;
> +			iocb->private = vq;
> +			iocb->ki_pos = head;
> +			iocb->ki_user_data = log;
> +		}
>  		/* Sanity check */
>  		if (!len) {
>  			vq_err(vq, "Unexpected header len for RX: "
> @@ -252,13 +368,18 @@ static void handle_rx(struct vhost_net *net)
>  			       iov_length(vq->hdr, s), hdr_size);
>  			break;
>  		}
> -		err = sock->ops->recvmsg(NULL, sock, &msg,
> +
> +		err = sock->ops->recvmsg(iocb, sock, &msg,
>  					 len, MSG_DONTWAIT | MSG_TRUNC);
>  		/* TODO: Check specific error and bomb out unless EAGAIN? */
>  		if (err < 0) {
>  			vhost_discard_vq_desc(vq);
>  			break;
>  		}
> +
> +		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> +			continue;
> +
>  		/* TODO: Should check and handle checksum. */
>  		if (err > len) {
>  			pr_err("Discarded truncated rx packet: "
> @@ -284,10 +405,13 @@ static void handle_rx(struct vhost_net *net)
>  		}
>  	}
>  
> +	handle_async_rx_events_notify(net, vq);
> +
>  	mutex_unlock(&vq->mutex);
>  	unuse_mm(net->dev.mm);
>  }
>  
> +
>  static void handle_tx_kick(struct work_struct *work)
>  {
>  	struct vhost_virtqueue *vq;
> @@ -338,6 +462,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>  	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
>  	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
>  	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
> +	n->cache = NULL;
>  	return 0;
>  }
>  
> @@ -398,6 +523,17 @@ static void vhost_net_flush(struct vhost_net *n)
>  	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
>  }
>  
> +static void vhost_notifier_cleanup(struct vhost_net *n)
> +{
> +	struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
> +	struct kiocb *iocb = NULL;
> +	if (n->cache) {
> +		while ((iocb = notify_dequeue(vq)) != NULL)
> +			kmem_cache_free(n->cache, iocb);
> +		kmem_cache_destroy(n->cache);
> +	}
> +}
> +
>  static int vhost_net_release(struct inode *inode, struct file *f)
>  {
>  	struct vhost_net *n = f->private_data;
> @@ -414,6 +550,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
>  	/* We do an extra flush before freeing memory,
>  	 * since jobs can re-queue themselves. */
>  	vhost_net_flush(n);
> +	vhost_notifier_cleanup(n);
>  	kfree(n);
>  	return 0;
>  }
> @@ -462,7 +599,19 @@ static struct socket *get_tun_socket(int fd)
>  	return sock;
>  }
>  
> -static struct socket *get_socket(int fd)
> +static struct socket *get_mp_socket(int fd)
> +{
> +	struct file *file = fget(fd);
> +	struct socket *sock;
> +	if (!file)
> +		return ERR_PTR(-EBADF);
> +	sock = mp_get_socket(file);
> +	if (IS_ERR(sock))
> +		fput(file);
> +	return sock;
> +}
> +
> +static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
>  {
>  	struct socket *sock;
>  	if (fd == -1)
> @@ -473,9 +622,31 @@ static struct socket *get_socket(int fd)
>  	sock = get_tun_socket(fd);
>  	if (!IS_ERR(sock))
>  		return sock;
> +	sock = get_mp_socket(fd);
> +	if (!IS_ERR(sock)) {
> +		vq->link_state = VHOST_VQ_LINK_ASYNC;
> +		return sock;
> +	}
>  	return ERR_PTR(-ENOTSOCK);
>  }
>  
> +static void vhost_init_link_state(struct vhost_net *n, int index)
> +{
> +	struct vhost_virtqueue *vq = n->vqs + index;
> +
> +	WARN_ON(!mutex_is_locked(&vq->mutex));
> +	if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> +		vq->receiver = NULL;
> +		INIT_LIST_HEAD(&vq->notifier);
> +		spin_lock_init(&vq->notify_lock);
> +		if (!n->cache) {
> +			n->cache = kmem_cache_create("vhost_kiocb",
> +					sizeof(struct kiocb), 0,
> +					SLAB_HWCACHE_ALIGN, NULL);
> +		}
> +	}
> +}
> +
>  static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>  {
>  	struct socket *sock, *oldsock;
> @@ -493,12 +664,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>  	}
>  	vq = n->vqs + index;
>  	mutex_lock(&vq->mutex);
> -	sock = get_socket(fd);
> +	vq->link_state = VHOST_VQ_LINK_SYNC;
> +	sock = get_socket(vq, fd);
>  	if (IS_ERR(sock)) {
>  		r = PTR_ERR(sock);
>  		goto err;
>  	}
>  
> +	vhost_init_link_state(n, index);
> +
>  	/* start polling new socket */
>  	oldsock = vq->private_data;
>  	if (sock == oldsock)
> @@ -507,8 +681,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>  	vhost_net_disable_vq(n, vq);
>  	rcu_assign_pointer(vq->private_data, sock);
>  	vhost_net_enable_vq(n, vq);
> -	mutex_unlock(&vq->mutex);
>  done:
> +	mutex_unlock(&vq->mutex);
>  	mutex_unlock(&n->dev.mutex);
>  	if (oldsock) {
>  		vhost_net_flush_vq(n, index);
> @@ -516,6 +690,7 @@ done:
>  	}
>  	return r;
>  err:
> +	mutex_unlock(&vq->mutex);
>  	mutex_unlock(&n->dev.mutex);
>  	return r;
>  }
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index d1f0453..cffe39a 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -43,6 +43,11 @@ struct vhost_log {
>  	u64 len;
>  };
>  
> +enum vhost_vq_link_state {
> +	VHOST_VQ_LINK_SYNC = 	0,
> +	VHOST_VQ_LINK_ASYNC = 	1,
> +};
> +
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>  	struct vhost_dev *dev;
> @@ -96,6 +101,11 @@ struct vhost_virtqueue {
>  	/* Log write descriptors */
>  	void __user *log_base;
>  	struct vhost_log log[VHOST_NET_MAX_SG];
> +	/*Differiate async socket for 0-copy from normal*/
> +	enum vhost_vq_link_state link_state;
> +	struct list_head notifier;
> +	spinlock_t notify_lock;
> +	void (*receiver)(struct vhost_virtqueue *);
>  };
>  
>  struct vhost_dev {
> -- 
> 1.5.4.4

^ permalink raw reply

* Re: [PATCH 3/4] xfrm: remove policy lock when accessing policy->walk.dead
From: jamal @ 2010-04-01 10:53 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, timo.teras, netdev
In-Reply-To: <1270088351.26743.121.camel@bigi>


Ive tried it with net-next which is derived of 2.6.34-rc1
and it seems to work as expected
i.e. only the flush on a non-empty SAD is event-ed.
Probably older kernel you are running?

cheers,
jamal

On Wed, 2010-03-31 at 22:19 -0400, jamal wrote:
> On Thu, 2010-04-01 at 08:22 +0800, Herbert Xu wrote:
> 
> > OK, in that case please change xfrm_state_flush too.  Right now
> > it also notifies on empty.
> 
> It shouldnt. I will double check it tomorrow..
> 
> cheers,
> jamal


^ permalink raw reply

* Re: [PATCH] netfilter: CLUSTERIP: clusterip_seq_stop() fix
From: Patrick McHardy @ 2010-04-01 10:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netfilter-devel, netdev, David Miller
In-Reply-To: <1269546442.2894.5.camel@edumazet-laptop>

Eric Dumazet wrote:
> If clusterip_seq_start() memory allocation fails, we crash later in
> clusterip_seq_start(), trying to kfree(ERR_PTR(-ENOMEM))

Applied, thanks Eric.

^ permalink raw reply

* Re: [PATCH v2] Add Mergeable RX buffer feature to vhost_net
From: Michael S. Tsirkin @ 2010-04-01 10:54 UTC (permalink / raw)
  To: David Stevens; +Cc: netdev, kvm-owner, kvm, virtualization
In-Reply-To: <OF5F520CEF.01718090-ON882576F7.006C8152-882576F7.0079481F@us.ibm.com>

On Wed, Mar 31, 2010 at 03:04:43PM -0700, David Stevens wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 03/31/2010 05:02:28 AM:
> > 
> > attached patch seems to be whiespace damaged as well.
> > Does the origin pass checkpatch.pl for you?
> 
>         Yes, but I probably broke it in the transfer -- will be more
> careful with the next revision.
> 
> 
> 
> > > +               head.iov_base = (void *)vhost_get_vq_desc(&net->dev, 
> vq,
> > > +                       vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, 
> 
> > > NULL);
> > 
> > I this casting confusing.
> > Is it really expensive to add an array of heads so that
> > we do not need to cast?
> 
>         It needs the heads and the lengths, which looks a lot
> like an iovec. I was trying to resist adding a new
> struct XXX { unsigned head; unsigned len; } just for this,
> but I could make these parallel arrays, one with head index and
> the other with length.
> 
> > > +               if (vq->rxmaxheadcount < headcount)
> > > +                       vq->rxmaxheadcount = headcount;
> > 
> > This seems the only place where we set the rxmaxheadcount
> > value. Maybe it can be moved out of vhost.c to net.c?
> > If vhost library needs this it can get it as function
> > parameter.
> 
>         I can move it to vhost_get_heads(), sure.
> > 
> > > +               /* Skip header. TODO: support TSO. */
> > 
> > You seem to have removed the code that skips the header.
> > Won't this break non-GSO backends such as raw?
> 
>         From our prior discussion, what I had in mind was that
> we'd remove all special-header processing by using the ioctl
> you added for TUN and later, an equivalent ioctl for raw (when
> supported in qemu-kvm). Qemu would arrange headers with tap
> (and later raw) to get what the guest expects, and vhost then
> just passes all data as-is between the socket and the ring.
>         That not only removes the different-header-len code
> for mergeable buffers, but also eliminates making a copy of the
> header for every packet as before.

Yes but please note that in practice if this is what we do,
then vhost header size is 0 and then there is no actual copy.

> Vhost has no need to do
> anything with the header, or even know its length. It also
> doesn't have to care what the type of the backend is-- raw or
> tap.
>         I think that simplifies everything, but it does mean that
> raw socket support requires a header ioctl for the different
> vnethdr sizes if the guest wants a vnethdr with and without
> mergeable buffers. Actually, I thought this was your idea and
> the point of the TUNSETVNETHDRSZ ioctl, but I guess you had
> something else in mind?

Yes. However, we also have an ioctl that sets vhost header size, and it
allows supporting simple backends which do not support an (equivalent
of) TUNSETVNETHDRSZ. We have two of these now: packet and macvtap.
So I thought that unless there are gains in code simplicity and/or
performance we can keep supporting these just as well.

> > > -               /* TODO: Check specific error and bomb out unless 
> EAGAIN? 
> > > */
> > 
> > Do you think it's not a good idea?
> 
>         EAGAIN is not possible after the change, because we don't
> even enter the loop unless we have an skb on the read queue; the
> other cases bomb out, so I figured the comment for future work is
> now done. :-)

Guest could be buggy so we'll get EFAULT.
If skb is taken off the rx queue (as below), we might get EAGAIN.

> > 
> > >                 if (err < 0) {
> > > -                       vhost_discard_vq_desc(vq);
> > > +                       vhost_discard_vq_desc(vq, headcount);
> > >                         break;
> > >                 }
> > 
> > I think we should detect and discard truncated messages,
> > since len might not be reliable if userspace pulls
> > a packet from under us. Also, if new packet is
> > shorter than the old one, there's no truncation but
> > headcount is wrong.
> > 
> > So simplest fix IMO would be to compare err with expected len.
> > If there's a difference, we hit the race and so
> > we would discard the packet.
> 
> Sure.
> 
> > 
> > 
> > >                 /* TODO: Should check and handle checksum. */
> > > +               if (vhost_has_feature(&net->dev, 
> VIRTIO_NET_F_MRG_RXBUF)) 
> > > {
> > > +                       struct virtio_net_hdr_mrg_rxbuf *vhdr =
> > > +                               (struct virtio_net_hdr_mrg_rxbuf *)
> > > +                               vq->iov[0].iov_base;
> > > +                       /* add num_bufs */
> > > +                       if (put_user(headcount, &vhdr->num_buffers)) {
> > > +                               vq_err(vq, "Failed to write 
> num_buffers");
> > > +                               vhost_discard_vq_desc(vq, headcount);
> > 
> > Let's do memcpy_to_iovecend etc so that we do not assume layout.
> > This is also why we need move_iovec: sendmsg might modify the iovec.
> > It would also be nice not to corrupt memory and
> > get a reasonable error if buffer size
> > that we get is smaller than expected header size.
> 
>         I wanted to avoid the header copy here, with the reasonable 
> restriction
> that the guest gives you a buffer at least big enough for the vnet_hdr.
> A
> length check alone (on iov[0].iov_len) could enforce that without copying
> headers back and forth to support silly cases like 8-byte buffers or
> num_buffers spanning multiple iovecs, and I think paying the price of the
> copy on every packet to support guests that broken isn't worth it.

In practice, when the first iovec includes the header what
we will copy is iov[0].iov_len + iov[0].iov_base.

We also still have the problem that sendmsg might modify the iovec,
(for tap it doesn't) so using iov_len after sendmsg isn't legal,
you need to copy it anyway.

>         So, my preference here would be to add:
> 
>         if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
>                 struct virtio_net_mrg_rxbuf *vhdr...
> 
>                 if (vq->iov[0].iov_len < sizeof(*vhdr)) {
>                         vq_err(vq, "tiny buffers (len %d < %d) are not 
> supported",
>                                 vq->iov[0].iov_len, sizeof(*vhdr));
>                         break;
>                 }
>                 /* add num_bufs */
>                 ...
> 
> Does that work for you?

Seems more code, doesn't it? Do you really believe this is a performance gain?
It looks a bit like prepature optimization ...

> > > -               if (err) {
> > > -                       vq_err(vq, "Unable to write vnet_hdr at addr 
> %p: 
> > > %d\n",
> > > -                              vq->iov->iov_base, err);
> > > -                       break;
> > > -               }
> > > -               len += hdr_size;
> > 
> > This seems to break non-GSO backends as well.
> 
> I don't have any to test with w/ current qemu-kvm,

I can dig up the demo, but there's also the macvtap backend. Try using that.

> but again, I thought
> your plan was to remove special header processing from vhost and let
> the socket end do it via ioctl. Wouldn't a similar ioctl for raw
> sockets when supported by qemu do that?

I just don't want to require that backends support GSO/vnet header,
so vhost needs an ability to skip the header.

> > >         }
> > >         n->dev.acked_features = features;
> > >         smp_wmb();
> > > -       for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
> > > -               mutex_lock(&n->vqs[i].mutex);
> > > -               n->vqs[i].hdr_size = hdr_size;
> > > -               mutex_unlock(&n->vqs[i].mutex);
> > 
> > I expect the above is a leftover from the previous version
> > which calculated header size in kernel?
> 
>         Right. With the ioctl, and assuming raw gets a similar
> one, we don't need to know anything about the header size in
> vhost, except that qemu needs to make sure mergeable rxbufs is
> only set when the socket has the larger vnet_hdr that includes
> the num_bufs field.
> 

Note that vhost doesn't really *know* about header, per se,
it just skips the first X bytes in iovec.

> > > @@ -410,6 +410,7 @@ static long vhost_set_vring(struct vhost
> > >                 vq->last_avail_idx = s.num;
> > >                 /* Forget the cached index value. */
> > >                 vq->avail_idx = vq->last_avail_idx;
> > > +               vq->rxmaxheadcount = 0;
> > >                 break;
> > >         case VHOST_GET_VRING_BASE:
> > >                 s.index = idx;
> > > @@ -856,6 +857,48 @@ static unsigned get_indirect(struct vhos
> > >         return 0;
> > >  }
> > > 
> > > +/* This is a multi-head version of vhost_get_vq_desc
> > 
> > How about:
> > version of vhost_get_vq_desc that returns multiple
> > descriptors
> 
> OK.
> 
> > 
> > > + * @vq         - the relevant virtqueue
> > > + * datalen     - data length we'll be reading
> > > + * @iovcount   - returned count of io vectors we fill
> > > + * @log                - vhost log
> > > + * @log_num    - log offset
> > 
> > return value?
> > Also - why unsigned?
> 
>         Returned value is the number of heads, which is
> 0 or greater.

The advantage of using int is that you'll be able to return
an error code. Good for debugging.

> > 
> > > + */
> > > +unsigned vhost_get_heads(struct vhost_virtqueue *vq, int datalen, int 
> 
> > 
> > Would vhost_get_vq_desc_multiple be a better name?
> 
>         26 chars for the function name with indentation and
> argument list might not be pretty on the calls. What about
> changing vhost_get_desc() and vhost_get_desc_n() [make them
> both shorter]? ("_n" indicating >= 1 instead of just one).

OK.

> > > +       struct vhost_log *log, unsigned int *log_num)
> > > +{
> > > +       struct iovec *heads = vq->heads;
> > 
> > I think it's better to pass in heads array than take it from vq->heads.
> 
>         I'd actually prefer to go the other way, and remove the
> log stuff, for example, since they are accessible from vq which
> we have already.

The main reason is that long term, we might want to
detach the arrays from vq - they are there simply to
avoid sticking too much data on stack - and the
fixed size becomes limiting for vhost block.
The advantage of passing them would be that
vhost won't have to change.

> Adding heads make it look less similar to the
> arg list for vhost_get_vq_desc(), but I'm more interested in
> getting the feature than particular arg lists, so your call.

Well, vhost_get_desc_n is just like vhost_get_desc but returns
multiple heads. In C you can return a single int by value
but many ints must be returned in an array, so I don't
think it is that dissimilar.

> > 
> > > +       int out, in = 0;
> > 
> > Why is in initialized here?
> 
> Needed in the previous version, but not here.
> 
> > 
> > > +       int seg = 0;            /* iov index */
> > > +       int hc = 0;             /* head count */
> > > +
> > > +       while (datalen > 0) {
> > 
> > Can't this simply call vhost_get_vq_desc in a loop somehow,
> > or use a common function that both this and vhost_get_vq_desc call?
> 
> It is calling vhost_get_vq_desc() in this loop; that's how it
> gets the indiviual heads and iov entries. The rest of it is just
> massaging the offsets so vhost_get_vq_desc puts them all in the
> right places and tracks the heads and lengths correctly for add_used.

Sorry, will have to re-read.

> > 
> > > +               if (hc >= VHOST_NET_MAX_SG) {
> > > +                       vhost_discard_vq_desc(vq, hc);
> > > +                       return 0;
> > > +               }
> > > +               heads[hc].iov_base = (void 
> *)vhost_get_vq_desc(vq->dev, 
> > > vq,
> > > +                       vq->iov+seg, ARRAY_SIZE(vq->iov)-seg, &out, 
> &in, 
> > > log,
> > > +                       log_num);
> > > +               if (heads[hc].iov_base == (void *)vq->num) {
> > > +                       vhost_discard_vq_desc(vq, hc);
> > > +                       return 0;
> > > +               }
> > > +               if (out || in <= 0) {
> > > +                       vq_err(vq, "unexpected descriptor format for 
> RX: "
> > > +                               "out %d, in %d\n", out, in);
> > > +                       vhost_discard_vq_desc(vq, hc);
> > 
> > goto err above might help simplify cleanup.
> 
> I generally would rather see the error cleanup explicitly where the
> error detection and message are, for code readability, but I can change
> it to a goto here if you think that's clearer.

This is the idiom I use all over this code.

> > > -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int 
> 
> > > len)
> > > +int vhost_add_used(struct vhost_virtqueue *vq, struct iovec *heads, 
> int 
> > > count)
> > 
> > count is always 0 for send, right?
> > I think it is better to have two APIs here as well:
> > vhost_add_used
> > vhost_add_used_multiple
> > we can use functions to avoid code duplication.
> > 
> > >  {
> > > -       struct vring_used_elem *used;
> > > +       struct vring_used_elem *used = 0;
> > 
> > why is used initialized here?
> 
>         This is to remove a compiler warning that "used" may
> be used unitialized, for a case that can't happen (count <= 0).
> 

I think there's some macro in kernel to shut the compiler up.
Or just ignore the warning. Let's not add extra initializers
just to shut up compiler, I think this might hide real bugs.

> > > +               vq->last_used_idx++;
> > 
> > I think we should update last_used_idx on success only,
> > at the end. Simply use last_used_idx + count
> > instead of last_used_idx + 1.
> 
>         OK.
> 
> > > @@ -1023,22 +1071,35 @@ int vhost_add_used(struct vhost_virtqueu
> > >                 if (vq->log_ctx)
> > >                         eventfd_signal(vq->log_ctx, 1);
> > >         }
> > > -       vq->last_used_idx++;
> > >         return 0;
> > >  }
> > > 
> > > +int vhost_available(struct vhost_virtqueue *vq)
> > 
> > since this function is non-static, please document what it does.
> 
>         OK.
> 
> > > +{
> > > +       int avail;
> > > +
> > > +       if (!vq->rxmaxheadcount)        /* haven't got any yet */
> > > +               return 1;
> > 
> > since seems to make net - specific asumptions.
> > How about moving this check out to net.c?
> 
>         I can, but its only user is vhost_signal() in this file.

So why isn't it static?

> > > +       avail = vq->avail_idx - vq->last_avail_idx;
> > > +       if (avail < 0)
> > > +               avail += 0x10000; /* wrapped */
> > 
> > A branch that is likely non-taken. Also, rxmaxheadcount
> > is really unlikely to get as large as half the ring.
> > So this just wastes cycles?
> 
>         The indexes are free-running counters, so if
> avail_idx has overflowed but last_avail_idx has not
> then the result will be (incorrectly) negative. It
> has nothing to do with how many buffers are in use--
> consider avail_idx = 2 and last_avail_idx = 65534,
> then avail = will be -65532, but we want it to be 4.

Right. Sorry about the noise.

> > 
> > > +       return avail;
> > > +}
> > > +
> > >  /* This actually signals the guest, using eventfd. */
> > > -void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> > > +void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq, 
> bool 
> > > recvr)
> > 
> > This one is not elegant. receive is net.c concept, let's not
> > put it in vhost.c. Pass in headcount if you must.
> 
>         Mainly what I was after here is to not affect signal's
> for the transmit side. I thought about splitting it into a
> separate function entirely for the transmit, but came up with
> this, instead. I can't tell if it's a receiver from the vq,
> but I don't want to override NOTIFY_ON_EMPTY or the headcount
> check (which has nothing to do with transmitters) for the
> transmitter at all.
>         The vq doesn't tell you if it is a receiver or not,
> so I either need a flag as argument to tell, or I need to
> split into a transmit vhost_signal and a receive vhost_signal.

You see, receive is really a net concept. virtio has in and out,
and a single descriptor can do both.

>         I can do the available check before calling vhost_signal,
> but then I need to remove the NOTIFY check entirely so it'll
> still notify for the receiver case when available is not
> enough but still not 0.

My question is, guests can not rely on the 'available is not
enough but still not 0' to work because it relies on a heuristic
to figure out how much is 'not enough'. So why do we need it at all.

>         So, I think the options are:
> 1) a flag
> 2) separate transmit or receive signal functions
> 3) move NOTIFY_ON_EMPTY out of vhost_signal and
>         check that or available (for recvr) before
>         calling vhost_signal().

There are ordering rules that make 3 impossible.
Let's try to rethink the concept of redefining NOTIFY_ON_EMPTY.
Is it really necessary?

> > 
> > >  {
> > >         __u16 flags = 0;
> > > +
> > >         if (get_user(flags, &vq->avail->flags)) {
> > >                 vq_err(vq, "Failed to get flags");
> > >                 return;
> > >         }
> > > 
> > > -       /* If they don't want an interrupt, don't signal, unless 
> empty. */
> > > +       /* If they don't want an interrupt, don't signal, unless
> > > +        * empty or receiver can't get a max-sized packet. */
> > >         if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
> > > -           (vq->avail_idx != vq->last_avail_idx ||
> > > +           (!recvr || vhost_available(vq) >= vq->rxmaxheadcount ||
> > 
> > Is the above really worth the complexity?
> > Guests can't rely on this kind of fuzzy logic, can they?
> > Do you see this helping performance at all?
> 
>         It definitely hurts performance to allocate max-sized buffers
> and then free up the excess (the translate_desc is expensive).

Well, we do not need to rerun translate_desc, we can keep
the results around, however ...

> It is
> a heuristic, but it's one that keeps the notifies flowing without
> having to fail on a packet to reenable them.

I don't understand completely. We looked into the skb, so
we know the required length?

>         The notify code has to change because there is no recovery
> if we need 5 buffers and have only 3, while notification is
> disabled unless we go all the way to 0. We can't honor NOTIFY_ON_EMPTY
> in that case because it will never proceed. We can remove the
> NOTIFY_ON_EMPTY check in vhost_signal and do it in the caller; then allow
> receiver to fail a buffer allocation and signal unconditionally,
> but we cannot wait for it to be 0 even if NOTIFY_ON_EMPTY is set--
> that's a deadlock.
> 

Which guest kernel or spec are you looking at? As far as I know
F_NOTIFY_ON_EMPTY was only used for TX at some point.
I am not aware of this bit having the meaning you assign it.
I suspect that what you outline above simply means guest
can not combine F_NOTIFY_ON_EMPTY with mergeable buffers for
virtio net RX VQ.

> > >  /* OK, now we need to know about added descriptors. */
> > > -bool vhost_enable_notify(struct vhost_virtqueue *vq)
> > > +bool vhost_enable_notify(struct vhost_virtqueue *vq, bool recvr)
> > >  {
> > >         u16 avail_idx;
> > >         int r;
> > > @@ -1080,6 +1141,8 @@ bool vhost_enable_notify(struct vhost_vi
> > >                 return false;
> > >         }
> > > 
> > > +       if (recvr && vq->rxmaxheadcount)
> > > +               return (avail_idx - vq->last_avail_idx) >= 
> > > vq->rxmaxheadcount;
> > 
> > The fuzzy logic behind rxmaxheadcount might be a good
> > optimization, but I am not comfortable using it
> > for correctness. Maybe vhost_enable_notify should get
> > the last head so we can redo poll when another one
> > is added.
> 
>         I tried requiring at least 64K here always, but as
> I said before, the cost of doing the translate_desc and
> then throwing those away for small packets killed performance.
> I also considered using a highwater mark on the ring, but
> potentially differing buffer sizes makes that less useful,
> and a small ring may only have enough space for 1 max-sized
> packet. 
>         Maybe we should just remove NOTIFY_ON_EMPTY if we
> fail a packet allocation; I can try that out.
>         Another alternative I considered was splitting out
> the translate_desc stuff in the hopes that we could found
> out how many buffers we need in a less-expensive way; then
> we could go for max-sized and free the excess without too
> much overhead, maybe.
>         Anyway, I'll look harder at working around this.

I am kind of confused. We did a peek and have the skb length,
why do we need the maxheads?

Hmm, the iovec consumed is not just skb length, it is skb length + vnet
header. Is this it? Maybe we can just export this from socket somehow?

> > >         struct iovec indirect[VHOST_NET_MAX_SG];
> > > -       struct iovec iov[VHOST_NET_MAX_SG];
> > > -       struct iovec hdr[VHOST_NET_MAX_SG];
> > > -       size_t hdr_size;
> > > +       struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr 
> */
> > 
> > VHOST_NET_MAX_SG should already include iovec vnet hdr.
> 
>         That's actually an artifact from the previous patch. I
> was arranging the iovec to have just the header needed by the
> guest and this simplified the bounds checking. It can be
> removed if we leave header size management to the socket side,
> but if vhost is going to be doing vnet_hdr manipulation for
> raw sockets, then it'll depend on what that new code needs
> to do to manage the header, I suppose.
> 
>                                                 +-DLS

Note it doesn't manipluate the header, just skips it.

-- 
MST

^ permalink raw reply

* Re: XT_ALIGN changed to use ALIGN breaks iproute2
From: Patrick McHardy @ 2010-04-01 10:50 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Ben Hutchings, Alexey Dobriyan, Andreas Henriksson, jamal, netdev
In-Reply-To: <20100330082902.6d64d21d@nehalam>

Stephen Hemminger wrote:
> On Tue, 30 Mar 2010 16:01:18 +0100
> Ben Hutchings <bhutchings@solarflare.com> wrote:
> 
>> On Tue, 2010-03-30 at 16:15 +0300, Alexey Dobriyan wrote:
>>> On Tue, Mar 30, 2010 at 12:28 PM, Andreas Henriksson <andreas@fatal.se> wrote:
>>>> You updated the kernel header include/linux/netfilter/x_tables.h
>>>> in torvalds/linux-2.6.git commit 42107f5009da223daa800d6da6904d77297ae829
>>>> with the comment "Use ALIGN() macro while I'm at it for same types.".
>>>>
>>>> When this header was synced into iproute2 the build broke because the
>>>> ALIGN macro apparently only is defined in kernel headers.
>>>>
>>>> (For iproute2 the problem was introduced in
>>>> 8ecdcce08319d0e39b0d32c1d17db3f69d85a35c and found by Stephen
>>>> and worked around in 609ceb807deba8e23 and edaaa11e5a3cf2c9c1a39)
>>>>
>>>> I'm guessing the problem in the iproute2 header sync is just a heads
>>>> up for what's going to happen when distributions updates their
>>>> system headers to match linux 2.6.33.
>>>>
>>>>
>>>> Could someone who knows how the userspace version of the kernel
>>>> headers are generated please find a suitable solution?
>>> We can export ALIGN to userspace, but the name is so generic,
>>> so it's not clear what breakage more risky.
> 
> I put a hack in m_xt.c to keep iproute2 building.
> But this is a temporary workaround until you guys figure out the
> right answer.

I can't think of anything but to restore the XT_ALIGN macro.
We could add a XT_ALIGN definition to xtables.h, but that might
still leave problems for other users.

Alexey, do you have any better suggestions?


^ permalink raw reply

* Re: [net-next-2.6 PATCH] netfilter: ctnetlink: compute message size properly
From: Patrick McHardy @ 2010-04-01 10:50 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Eric Dumazet, netdev, netfilter-devel, netfilter, davem,
	Krzysztof Oledzki
In-Reply-To: <20100401104356.GD2879@psychotron.redhat.com>

Jiri Pirko wrote:
> Thu, Apr 01, 2010 at 12:40:23PM CEST, kaber@trash.net wrote:
>> Eric Dumazet wrote:
>>> Le mercredi 31 mars 2010 à 20:21 +0200, Jiri Pirko a écrit :
>>>> Okay, I see your point. How about this:
>>>>
>>>> Subject: [net-next-2.6 PATCH] netfilter: ctnetlink: compute message size properly V2
>>>>
>>>> Message size should be dependent on net->ct.sysctl_acct, not on
>>>> CONFIG_NF_CT_ACCT definition.
>>>>
>>> Then Changelog is not updated with the actual test :)
>> I've fixed up the changelog and applied the patch, thanks.
> 
> Cand I ask on which tree?

Since this doesn't confirm to Dave's definition of crucial fixes,
I've applied it to nf-next-2.6.git :)

git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [net-next-2.6 PATCH] netfilter: ctnetlink: compute message size properly
From: Patrick McHardy @ 2010-04-01 10:40 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jiri Pirko, netdev, netfilter-devel, netfilter, davem,
	Krzysztof Oledzki
In-Reply-To: <1270060543.1931.20.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le mercredi 31 mars 2010 à 20:21 +0200, Jiri Pirko a écrit :
>> Okay, I see your point. How about this:
>>
>> Subject: [net-next-2.6 PATCH] netfilter: ctnetlink: compute message size properly V2
>>
>> Message size should be dependent on net->ct.sysctl_acct, not on
>> CONFIG_NF_CT_ACCT definition.
>>
> 
> Then Changelog is not updated with the actual test :)

I've fixed up the changelog and applied the patch, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 5/5] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: Patrick McHardy @ 2010-04-01 10:37 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: netfilter-devel, netdev
In-Reply-To: <1270031934-15940-6-git-send-email-jengelh@medozas.de>

Jan Engelhardt wrote:
> Since Xtables is now reentrant/nestable, the cloned packet can also go
> through Xtables and be subject to rules itself.

That sounds dangerous if conntrack isn't used to prevent loops.
Is that really useful? For filtering, you can simply apply the
rules before deciding to TEE the packet.

^ permalink raw reply

* Re: [PATCH 3/5] netfilter: xtables: inclusion of xt_TEE
From: Patrick McHardy @ 2010-04-01 10:34 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: netfilter-devel, netdev
In-Reply-To: <1270031934-15940-4-git-send-email-jengelh@medozas.de>

Jan Engelhardt wrote:
> +static bool
> +tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
> +{
> +	const struct iphdr *iph = ip_hdr(skb);
> +	struct rtable *rt;
> +	struct flowi fl;
> +	int err;
> +
> +	memset(&fl, 0, sizeof(fl));
> +	fl.iif  = skb->skb_iif;

I'm not sure you really should set iif here. We usually (tunnels, REJECT
etc) packets generated locally as new packets.

> +	fl.mark = skb->mark;

The same applies to mark.

> +	fl.nl_u.ip4_u.daddr = info->gw.ip;
> +	fl.nl_u.ip4_u.tos   = RT_TOS(iph->tos);
> +	fl.nl_u.ip4_u.scope = RT_SCOPE_UNIVERSE;
> +
> +	/* Trying to route the packet using the standard routing table. */
> +	err = ip_route_output_key(dev_net(skb->dev), &rt, &fl);
> +	if (err != 0)
> +		return false;
> +
> +	dst_release(skb_dst(skb));
> +	skb_dst_set(skb, &rt->u.dst);
> +	skb->dev      = rt->u.dst.dev;
> +	skb->protocol = htons(ETH_P_IP);
> +	IPCB(skb)->flags |= IPSKB_REROUTED;
> +	return true;
> +}
> +
> +/*
> + * To detect and deter routed packet loopback when using the --tee option, we
> + * take a page out of the raw.patch book: on the copied skb, we set up a fake
> + * ->nfct entry, pointing to the local &route_tee_track. We skip routing
> + * packets when we see they already have that ->nfct.

So without conntrack, people may create loops? If that's the case,
I'd suggest to simply forbid TEE'ing packets to loopback. That
doesn't seem to be very useful anyways.

> + */
> +static unsigned int
> +tee_tg4(struct sk_buff *skb, const struct xt_target_param *par)
> +{
> +	const struct xt_tee_tginfo *info = par->targinfo;
> +	struct iphdr *iph;
> +
> +#ifdef WITH_CONNTRACK
> +	if (skb->nfct == &tee_track.ct_general)
> +		/*
> +		 * Loopback - a packet we already routed, is to be
> +		 * routed another time. Avoid that, now.
> +		 */
> +		return NF_DROP;
> +#endif
> +	/*
> +	 * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
> +	 * the original skb, which should continue on its way as if nothing has
> +	 * happened. The copy should be independently delivered to the TEE
> +	 * --gateway.
> +	 */
> +	skb = skb_copy(skb, GFP_ATOMIC);
> +	if (skb == NULL)
> +		return XT_CONTINUE;
> +	/*
> +	 * If we are in PREROUTING/INPUT, the checksum must be recalculated
> +	 * since the length could have changed as a result of defragmentation.
> +	 *
> +	 * We also decrease the TTL to mitigate potential TEE loops
> +	 * between two hosts.
> +	 *
> +	 * Set %IP_DF so that the original source is notified of a potentially
> +	 * decreased MTU on the clone route. IPv6 does this too.
> +	 */
> +	iph = ip_hdr(skb);
> +	iph->frag_off |= htons(IP_DF);
> +	if (par->hooknum == NF_INET_PRE_ROUTING ||
> +	    par->hooknum == NF_INET_LOCAL_IN)
> +		--iph->ttl;
> +	ip_send_check(iph);

Shouldn't this only be done in PRE_ROUTING/INPUT as stated above?

> +
> +#ifdef WITH_CONNTRACK
> +	nf_conntrack_put(skb->nfct);
> +	skb->nfct     = &tee_track.ct_general;
> +	skb->nfctinfo = IP_CT_NEW;
> +	nf_conntrack_get(skb->nfct);
> +#endif
> +	/*
> +	 * Xtables is not reentrant currently, so a choice has to be made:
> +	 * 1. return absolute verdict for the original and let the cloned
> +	 *    packet travel through the chains
> +	 * 2. let the original continue travelling and not pass the clone
> +	 *    to Xtables.
> +	 * #2 is chosen. Normally, we would use ip_local_out for the clone.
> +	 * Because iph->check is already correct and we don't pass it to
> +	 * Xtables anyway, a shortcut to dst_output [forwards to ip_output] can
> +	 * be taken. %IPSKB_REROUTED needs to be set so that ip_output does not
> +	 * invoke POSTROUTING on the cloned packet.
> +	 */
> +	IPCB(skb)->flags |= IPSKB_REROUTED;
> +	if (tee_tg_route4(skb, info))
> +		ip_output(skb);
> +
> +	return XT_CONTINUE;
> +}
> +

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox