Netdev List

Netdev List
 help / color / mirror / Atom feed

* bug fix patch lost: git problem or just incorrect merge?
From: James Bottomley @ 2010-05-21 15:41 UTC (permalink / raw)
  To: Linus Torvalds, David Miller; +Cc: linux-kernel, netdev, linux-scsi

The patch in question is this one (upstream for a while):

commit d7d05548a62c87ee55b0c81933669177f885aa8d
Author: Mike Christie <michaelc@cs.wisc.edu>
Date:   Wed Mar 31 14:41:35 2010 -0500

    [SCSI] iscsi_tcp: fix relogin/shutdown hang

It's a simple one line change in iscsi_tcp.c (diff clipped):

--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -599,7 +599,7 @@ static void iscsi_sw_tcp_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
-       if (sock->sk->sk_sleep && waitqueue_active(sock->sk->sk_sleep)) {
+       if (sock->sk->sk_sleep) {

It was killed by this merge commit in the net-next tree:

commit 278554bd6579206921f5d8a523649a7a57f8850d
Merge: 5a147e8 cea0d76
Author: David S. Miller <davem@davemloft.net>
Date:   Wed May 12 00:05:35 2010 -0700

However, the curious thing is that git seems to have lost trace of the
missing patch entirely:  if I try to find it in linus' tree with a git
log -- drivers/scsi/iscsi_tcp.c, it doesn't show up.  The merge commit
which killed it does list iscsi_tcp.c as a file conflict, but git show
on that commit doesn't list that file in the resolution diff ... even
though this is where it actually got killed.

Is this a git problem ... or is it just a mismerge in the net tree?

Either way, of course, we need the patch back ...

James

^ permalink raw reply

* Re: [PATCH] [RFC] Add ip route {save,restore}
From: Stephen Hemminger @ 2010-05-21 15:26 UTC (permalink / raw)
  To: Dan Smith; +Cc: netdev
In-Reply-To: <1274454863-5146-1-git-send-email-danms@us.ibm.com>

On Fri, 21 May 2010 08:14:23 -0700
Dan Smith <danms@us.ibm.com> wrote:

> This patch adds save and restore commands to "ip route". Save dumps
> the RTNL stream to stdout which can then be passed to restore later.
> This may be helpful in some normal situations, and will allow C/R to
> migrate the routing information in userspace.  Tweaking of the stream
> can be done by userspace helpers to convert between versions and adjust
> things like device indexes when restoring routes in a different
> environment.
> 
> By factoring out some of the common bits of print_route() into
> filter_nlmsg(), the "save" command can use the same selection logic
> as "list," allowing the caller to save only specific routes as
> necessary.
> 
> Signed-off-by: Dan Smith <danms@us.ibm.com>

Looks good, please add man page and doc entry for this in final
version.

^ permalink raw reply

* [PATCH] [RFC] Add ip route {save,restore}
From: Dan Smith @ 2010-05-21 15:14 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

This patch adds save and restore commands to "ip route". Save dumps
the RTNL stream to stdout which can then be passed to restore later.
This may be helpful in some normal situations, and will allow C/R to
migrate the routing information in userspace.  Tweaking of the stream
can be done by userspace helpers to convert between versions and adjust
things like device indexes when restoring routes in a different
environment.

By factoring out some of the common bits of print_route() into
filter_nlmsg(), the "save" command can use the same selection logic
as "list," allowing the caller to save only specific routes as
necessary.

Signed-off-by: Dan Smith <danms@us.ibm.com>
---
 ip/iproute.c |  169 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 127 insertions(+), 42 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 8252e18..3049975 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -23,6 +23,7 @@
 #include <netinet/ip.h>
 #include <arpa/inet.h>
 #include <linux/in_route.h>
+#include <errno.h>
 
 #include "rt_names.h"
 #include "utils.h"
@@ -32,7 +33,11 @@
 #define RTAX_RTTVAR RTAX_HOPS
 #endif
 
-
+enum list_action {
+	IPROUTE_LIST,
+	IPROUTE_FLUSH,
+	IPROUTE_SAVE,
+};
 static const char *mx_names[RTAX_MAX+1] = {
 	[RTAX_MTU]	= "mtu",
 	[RTAX_WINDOW]	= "window",
@@ -53,6 +58,8 @@ static void usage(void) __attribute__((noreturn));
 static void usage(void)
 {
 	fprintf(stderr, "Usage: ip route { list | flush } SELECTOR\n");
+	fprintf(stderr, "       ip route save SELECTOR\n");
+	fprintf(stderr, "       ip route restore\n");
 	fprintf(stderr, "       ip route get ADDRESS [ from ADDRESS iif STRING ]\n");
 	fprintf(stderr, "                            [ oif STRING ]  [ tos TOS ]\n");
 	fprintf(stderr, "       ip route { add | del | change | append | replace | monitor } ROUTE\n");
@@ -115,46 +122,16 @@ static int flush_update(void)
 	return 0;
 }
 
-int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+int filter_nlmsg(struct nlmsghdr *n, struct rtattr **tb, int host_len)
 {
-	FILE *fp = (FILE*)arg;
 	struct rtmsg *r = NLMSG_DATA(n);
-	int len = n->nlmsg_len;
-	struct rtattr * tb[RTA_MAX+1];
-	char abuf[256];
 	inet_prefix dst;
 	inet_prefix src;
-	inet_prefix prefsrc;
 	inet_prefix via;
-	int host_len = -1;
-	static int ip6_multiple_tables;
+	inet_prefix prefsrc;
 	__u32 table;
-	SPRINT_BUF(b1);
-	static int hz;
-
-	if (n->nlmsg_type != RTM_NEWROUTE && n->nlmsg_type != RTM_DELROUTE) {
-		fprintf(stderr, "Not a route: %08x %08x %08x\n",
-			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
-		return 0;
-	}
-	if (filter.flushb && n->nlmsg_type != RTM_NEWROUTE)
-		return 0;
-	len -= NLMSG_LENGTH(sizeof(*r));
-	if (len < 0) {
-		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
-		return -1;
-	}
-
-	if (r->rtm_family == AF_INET6)
-		host_len = 128;
-	else if (r->rtm_family == AF_INET)
-		host_len = 32;
-	else if (r->rtm_family == AF_DECnet)
-		host_len = 16;
-	else if (r->rtm_family == AF_IPX)
-		host_len = 80;
+	static int ip6_multiple_tables;
 
-	parse_rtattr(tb, RTA_MAX, RTM_RTA(r), len);
 	table = rtm_get_table(r, tb);
 
 	if (r->rtm_family == AF_INET6 && table != RT_TABLE_MAIN)
@@ -281,6 +258,56 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 	    *(int*)RTA_DATA(tb[RTA_PRIORITY]) == -1)
 		return 0;
 
+	return 1;
+}
+
+int calc_host_len(struct rtmsg *r)
+{
+	if (r->rtm_family == AF_INET6)
+		return 128;
+	else if (r->rtm_family == AF_INET)
+		return 32;
+	else if (r->rtm_family == AF_DECnet)
+		return 16;
+	else if (r->rtm_family == AF_IPX)
+		return 80;
+	else
+		return -1;
+}
+
+int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = (FILE*)arg;
+	struct rtmsg *r = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * tb[RTA_MAX+1];
+	char abuf[256];
+	int host_len = -1;
+	__u32 table;
+	SPRINT_BUF(b1);
+	static int hz;
+
+	if (n->nlmsg_type != RTM_NEWROUTE && n->nlmsg_type != RTM_DELROUTE) {
+		fprintf(stderr, "Not a route: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+		return 0;
+	}
+	if (filter.flushb && n->nlmsg_type != RTM_NEWROUTE)
+		return 0;
+	len -= NLMSG_LENGTH(sizeof(*r));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	host_len = calc_host_len(r);
+
+ 	parse_rtattr(tb, RTA_MAX, RTM_RTA(r), len);
+	table = rtm_get_table(r, tb);
+
+	if (!filter_nlmsg(n, tb, host_len))
+		return 0;
+
 	if (filter.flushb) {
 		struct nlmsghdr *fn;
 		if (NLMSG_ALIGN(filter.flushp) + n->nlmsg_len > filter.flushe) {
@@ -1041,17 +1068,51 @@ static int iproute_flush_cache(void)
 	return 0;
 }
 
+int save_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	int ret;
+	int len = n->nlmsg_len;
+	struct rtmsg *r = NLMSG_DATA(n);
+	struct rtattr *tb[RTA_MAX+1];
+	int host_len = -1;
+
+	if (isatty(STDOUT_FILENO)) {
+		fprintf(stderr, "Not sending binary stream to stdout\n");
+		return -1;
+	}
+
+	host_len = calc_host_len(r);
+	len -= NLMSG_LENGTH(sizeof(*r));
+ 	parse_rtattr(tb, RTA_MAX, RTM_RTA(r), len);
+
+	if (!filter_nlmsg(n, tb, host_len))
+		return 0;
+
+	ret = write(STDOUT_FILENO, n, n->nlmsg_len);
+	if ((ret > 0) && (ret != n->nlmsg_len)) {
+		fprintf(stderr, "Short write while saving nlmsg\n");
+		ret = -EIO;
+	}
+
+	return ret == n->nlmsg_len ? 0 : ret;
+}
 
-static int iproute_list_or_flush(int argc, char **argv, int flush)
+static int iproute_list_flush_or_save(int argc, char **argv, int action)
 {
 	int do_ipv6 = preferred_family;
 	char *id = NULL;
 	char *od = NULL;
+	rtnl_filter_t filter_fn;
+
+	if (action == IPROUTE_SAVE)
+		filter_fn = save_route;
+	else
+		filter_fn = print_route;
 
 	iproute_reset_filter();
 	filter.tb = RT_TABLE_MAIN;
 
-	if (flush && argc <= 0) {
+	if ((action == IPROUTE_FLUSH) && argc <= 0) {
 		fprintf(stderr, "\"ip route flush\" requires arguments.\n");
 		return -1;
 	}
@@ -1201,7 +1262,7 @@ static int iproute_list_or_flush(int argc, char **argv, int flush)
 		}
 	}
 
-	if (flush) {
+	if (action == IPROUTE_FLUSH) {
 		int round = 0;
 		char flushb[4096-512];
 		time_t start = time(0);
@@ -1226,7 +1287,7 @@ static int iproute_list_or_flush(int argc, char **argv, int flush)
 				exit(1);
 			}
 			filter.flushed = 0;
-			if (rtnl_dump_filter(&rth, print_route, stdout, NULL, NULL) < 0) {
+			if (rtnl_dump_filter(&rth, filter_fn, stdout, NULL, NULL) < 0) {
 				fprintf(stderr, "Flush terminated\n");
 				exit(1);
 			}
@@ -1269,7 +1330,7 @@ static int iproute_list_or_flush(int argc, char **argv, int flush)
 		}
 	}
 
-	if (rtnl_dump_filter(&rth, print_route, stdout, NULL, NULL) < 0) {
+	if (rtnl_dump_filter(&rth, filter_fn, stdout, NULL, NULL) < 0) {
 		fprintf(stderr, "Dump terminated\n");
 		exit(1);
 	}
@@ -1436,6 +1497,26 @@ int iproute_get(int argc, char **argv)
 	exit(0);
 }
 
+int restore_handler(struct sockaddr_nl *nl, struct nlmsghdr *n, void *arg)
+{
+	int ret;
+
+	n->nlmsg_flags |= NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
+
+	ll_init_map(&rth);
+
+	ret = rtnl_talk(&rth, n, 0, 0, n, NULL, NULL);
+	if ((ret < 0) && (errno == EEXIST))
+		ret = 0;
+
+	return ret;
+}
+
+int iproute_restore(void)
+{
+	exit(rtnl_from_file(stdin, &restore_handler, NULL));
+}
+
 void iproute_reset_filter()
 {
 	memset(&filter, 0, sizeof(filter));
@@ -1446,7 +1527,7 @@ void iproute_reset_filter()
 int do_iproute(int argc, char **argv)
 {
 	if (argc < 1)
-		return iproute_list_or_flush(0, NULL, 0);
+		return iproute_list_flush_or_save(0, NULL, IPROUTE_LIST);
 
 	if (matches(*argv, "add") == 0)
 		return iproute_modify(RTM_NEWROUTE, NLM_F_CREATE|NLM_F_EXCL,
@@ -1471,11 +1552,15 @@ int do_iproute(int argc, char **argv)
 				      argc-1, argv+1);
 	if (matches(*argv, "list") == 0 || matches(*argv, "show") == 0
 	    || matches(*argv, "lst") == 0)
-		return iproute_list_or_flush(argc-1, argv+1, 0);
+		return iproute_list_flush_or_save(argc-1, argv+1, IPROUTE_LIST);
 	if (matches(*argv, "get") == 0)
 		return iproute_get(argc-1, argv+1);
 	if (matches(*argv, "flush") == 0)
-		return iproute_list_or_flush(argc-1, argv+1, 1);
+		return iproute_list_flush_or_save(argc-1, argv+1, IPROUTE_FLUSH);
+	if (matches(*argv, "save") == 0)
+		return iproute_list_flush_or_save(argc-1, argv+1, IPROUTE_SAVE);
+	if (matches(*argv, "restore") == 0)
+		return iproute_restore();
 	if (matches(*argv, "help") == 0)
 		usage();
 	fprintf(stderr, "Command \"%s\" is unknown, try \"ip route help\".\n", *argv);
-- 
1.7.0.4


^ permalink raw reply related

* Re: linux-next: manual merge of the driver-core tree with the net tree
From: Greg KH @ 2010-05-21 15:13 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: David Miller, sfr, linux-next, linux-kernel, therbert, netdev
In-Reply-To: <m1pr0pwvgk.fsf@fess.ebiederm.org>

On Fri, May 21, 2010 at 12:43:55AM -0700, Eric W. Biederman wrote:
> David Miller <davem@davemloft.net> writes:
> 
> > From: ebiederm@xmission.com (Eric W. Biederman)
> > Date: Thu, 20 May 2010 23:46:22 -0700
> >
> >> It looks right, except perhaps the RPS code looks like it will cause a
> >> build failure with sysfs disabled, but that has nothing to do with your
> >> changes.
> >
> > CONFIG_RPS depends upon CONFIG_SMP && CONFIG_SYSFS, so no that build
> > failure is not possible.
> 
> That's the bit I'm missing.  My apologies if I caused any unnecessary worry.

I've now fixed this all up in my tree and have pushed it out.
I'll also go push the patches themselves to Linus later today.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH 1/3] cgroups: Add an API to attach a task to current task's cgroup
From: Sridhar Samudrala @ 2010-05-21 15:09 UTC (permalink / raw)
  To: Paul Menage
  Cc: Sridhar Samudrala, Michael S. Tsirkin, netdev,
	kvm@vger.kernel.org, lkml
In-Reply-To: <AANLkTinsrFoLVKDFM5pcKcL_6MvAzhR6IzbNmWKh3BDh@mail.gmail.com>

On 5/20/2010 3:22 PM, Paul Menage wrote:
> On Tue, May 18, 2010 at 5:04 PM, Sridhar Samudrala
> <samudrala.sridhar@gmail.com>  wrote:
>    
>> Add a new kernel API to attach a task to current task's cgroup
>> in all the active hierarchies.
>>
>> Signed-off-by: Sridhar Samudrala<sri@us.ibm.com>
>>      
> Reviewed-by: Paul Menage<menage@google.com>
>
> It would be more efficient to just attach directly to current->cgroups
> rather than potentially creating/destroying one css_set for each
> hierarchy until we've completely converged on current->cgroups - but
> that would require a bunch of refactoring of the guts of
> cgroup_attach_task() to ensure that the right can_attach()/attach()
> callbacks are made. That doesn't really seem worthwhile right now for
> the initial use, that I imagine isn't going to be
> performance-sensitive.
>    
Yes. In our use-case, this will be called only once per guest interface 
when the guest comes up.
Hope you or someone more familiar with cgroups subsystem can optimize 
this function later.

Thanks
Sridhar


^ permalink raw reply

* [PATCH] net: add additional lock to qdisc to increase throughput
From: Eric Dumazet @ 2010-05-21 15:08 UTC (permalink / raw)
  To: Alexander Duyck, David Miller; +Cc: netdev
In-Reply-To: <20100323202553.21598.10754.stgit@gitlad.jf.intel.com>

Le mardi 23 mars 2010 à 13:25 -0700, Alexander Duyck a écrit :
> The qdisc layer shows a significant issue when you start transmitting from
> multiple CPUs.  The issue is that the transmit rate drops significantly, and I
> believe it is due to the fact that the spinlock is shared between the 1
> dequeue, and n-1 enqueue cpu threads.  In order to improve this situation I am
> adding one additional lock which will need to be obtained during the enqueue
> portion of the path.  This essentially allows sch_direct_xmit to jump to
> near the head of the line when attempting to obtain the lock after
> completing a transmit.
> 
> Running the script below I saw an increase from 200K packets per second to
> 1.07M packets per second as a result of this patch.
> 
> for j in `seq 0 15`; do
> 	for i in `seq 0 7`; do
> 		netperf -H <ip> -t UDP_STREAM -l 600 -N -T $i -- -m 6 &
> 	done
> done
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> ---
> 
>  include/net/sch_generic.h |    3 ++-
>  net/core/dev.c            |    7 ++++++-
>  net/sched/sch_generic.c   |    1 +
>  3 files changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 67dc08e..f5088a9 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -51,7 +51,6 @@ struct Qdisc {
>  	struct list_head	list;
>  	u32			handle;
>  	u32			parent;
> -	atomic_t		refcnt;
>  	struct gnet_stats_rate_est	rate_est;
>  	int			(*reshape_fail)(struct sk_buff *skb,
>  					struct Qdisc *q);
> @@ -65,6 +64,8 @@ struct Qdisc {
>  	struct netdev_queue	*dev_queue;
>  	struct Qdisc		*next_sched;
>  
> +	atomic_t		refcnt;
> +	spinlock_t		enqueue_lock;
>  	struct sk_buff		*gso_skb;
>  	/*
>  	 * For performance sake on SMP, we put highly modified fields at the end
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a03aab4..8a2d508 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2006,8 +2006,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>  	spinlock_t *root_lock = qdisc_lock(q);
>  	int rc;
>  
> +	spin_lock(&q->enqueue_lock);
>  	spin_lock(root_lock);
>  	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
> +		spin_unlock(&q->enqueue_lock);
>  		kfree_skb(skb);
>  		rc = NET_XMIT_DROP;
>  	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
> @@ -2018,7 +2020,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>  		 * xmit the skb directly.
>  		 */
>  		__qdisc_update_bstats(q, skb->len);
> -		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
> +		spin_unlock(&q->enqueue_lock);
> +		rc = sch_direct_xmit(skb, q, dev, txq, root_lock);
> +		if (rc)
>  			__qdisc_run(q);
>  		else
>  			clear_bit(__QDISC_STATE_RUNNING, &q->state);
> @@ -2026,6 +2030,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>  		rc = NET_XMIT_SUCCESS;
>  	} else {
>  		rc = qdisc_enqueue_root(skb, q);
> +		spin_unlock(&q->enqueue_lock);
>  		qdisc_run(q);
>  	}
>  	spin_unlock(root_lock);
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 5173c1e..4b5e125 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -538,6 +538,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
>  	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
>  	sch->padded = (char *) sch - (char *) p;
>  
> +	spin_lock_init(&sch->enqueue_lock);
>  	INIT_LIST_HEAD(&sch->list);
>  	skb_queue_head_init(&sch->q);
>  	sch->ops = ops;
> 

Hi Alexander

What about following patch instead, adding an heuristic to avoid an
extra atomic operation in fast path ?

I also try to release the 'busylock' after the main lock to favor the
worker as much as possible. It must be released before main lock only
before a call to __qdisc_run()

Another refinement would be to make the spinning done outside of BH
disabled context, because we currently delay TX completion (if NAPI is
used by driver) that can make room in device TX ring buffer. This would
allow cpus to process incoming traffic too...

Thanks

[PATCH] net: add additional lock to qdisc to increase throughput

When many cpus compete for sending frames on a given qdisc, the qdisc
spinlock suffers from very high contention.

The cpu owning __QDISC_STATE_RUNNING bit has same priority to acquire
the lock, and cannot dequeue packets fast enough, since it must wait for
this lock for each dequeued packet.

One solution to this problem is to force all cpus spinning on a second
lock before trying to get the main lock, when/if they see
__QDISC_STATE_RUNNING already set.

The owning cpu then compete with at most one other cpu for the main
lock, allowing for higher dequeueing rate.

Based on a previous patch from Alexander Duyck. I added the heuristic to
avoid the atomic in fast path, and put the new lock far away from the
cache line used by the dequeue worker. Also try to release the busylock
lock as late as possible.

Tests with following script gave a boost from ~50.000 pps to ~600.000
pps on a dual quad core machine (E5450 @3.00GHz), tg3 driver.
(A single netperf flow can reach ~800.000 pps on this platform)

for j in `seq 0 3`; do
  for i in `seq 0 7`; do
    netperf -H 192.168.0.1 -t UDP_STREAM -l 60 -N -T $i -- -m 6 &
  done
done


Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/sch_generic.h |    3 ++-
 net/core/dev.c            |   29 +++++++++++++++++++++++++----
 net/sched/sch_generic.c   |    1 +
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 03ca5d8..2d55649 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -73,7 +73,8 @@ struct Qdisc {
 	struct sk_buff_head	q;
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue	qstats;
-	struct rcu_head     rcu_head;
+	struct rcu_head		rcu_head;
+	spinlock_t		busylock;
 };
 
 struct Qdisc_class_ops {
diff --git a/net/core/dev.c b/net/core/dev.c
index 6c82065..1b313eb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2040,6 +2040,16 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 {
 	spinlock_t *root_lock = qdisc_lock(q);
 	int rc;
+	bool contended = test_bit(__QDISC_STATE_RUNNING, &q->state);
+
+	/*
+	 * Heuristic to force contended enqueues to serialize on a
+	 * separate lock before trying to get qdisc main lock.
+	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
+	 * and dequeue packets faster.
+	 */
+	if (unlikely(contended))
+		spin_lock(&q->busylock);
 
 	spin_lock(root_lock);
 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
@@ -2055,19 +2065,30 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
 			skb_dst_force(skb);
 		__qdisc_update_bstats(q, skb->len);
-		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
+		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
+			if (unlikely(contended)) {
+				spin_unlock(&q->busylock);
+				contended = false;
+			}
 			__qdisc_run(q);
-		else
+		} else
 			clear_bit(__QDISC_STATE_RUNNING, &q->state);
 
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		skb_dst_force(skb);
 		rc = qdisc_enqueue_root(skb, q);
-		qdisc_run(q);
+		if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
+			if (unlikely(contended)) {
+				spin_unlock(&q->busylock);
+				contended = false;
+			}
+			__qdisc_run(q);
+		}
 	}
 	spin_unlock(root_lock);
-
+	if (unlikely(contended))
+		spin_unlock(&q->busylock);
 	return rc;
 }
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a63029e..0a44290 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -543,6 +543,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 
 	INIT_LIST_HEAD(&sch->list);
 	skb_queue_head_init(&sch->q);
+	spin_lock_init(&sch->busylock);
 	sch->ops = ops;
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;




^ permalink raw reply related

* [PATCH] iproute2: Add dsfield as alias for tos for ip rules
From: Arnd Hannemann @ 2010-05-21 14:10 UTC (permalink / raw)
  To: shemminger; +Cc: netdev, Arnd Hannemann

From: Arnd Hannemann <arnd@rhea.(none)>

Get ip rule parsing of "dsfield" in sync with ip route parsing and manual page.

Signed-off-by: Arnd Hannemann <hannemann@nets.rwth-aachen.de>
---
 ip/iprule.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/ip/iprule.c b/ip/iprule.c
index 7140375..cbf8226 100644
--- a/ip/iprule.c
+++ b/ip/iprule.c
@@ -270,7 +270,8 @@ static int iprule_modify(int cmd, int argc, char **argv)
 			if (get_u32(&pref, *argv, 0))
 				invarg("preference value is invalid\n", *argv);
 			addattr32(&req.n, sizeof(req), FRA_PRIORITY, pref);
-		} else if (strcmp(*argv, "tos") == 0) {
+		} else if (strcmp(*argv, "tos") == 0 ||
+			   matches(*argv, "dsfield") == 0) {
 			__u32 tos;
 			NEXT_ARG();
 			if (rtnl_dsfield_a2n(&tos, *argv))
-- 
1.7.0.4


^ permalink raw reply related

* Re: [PATCH] net-2.6 : V2 - fix dev_get_valid_name
From: Octavian Purdila @ 2010-05-21 13:25 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: davem, netdev
In-Reply-To: <4BF68635.4070202@free.fr>

On Friday 21 May 2010 16:10:13 you wrote:
> On 05/19/2010 10:12 PM, Daniel Lezcano wrote:
> > the commit:
> >
> > commit d90310243fd750240755e217c5faa13e24f41536
> > Author: Octavian Purdila<opurdila@ixiacom.com>
> > Date:   Wed Nov 18 02:36:59 2009 +0000
> >
> >      net: device name allocation cleanups
> >
> > introduced a bug when there is a hash collision making impossible
> > to rename a device with eth%d. This bug is very hard to reproduce
> > and appears rarely.
> >
> > The problem is coming from we don't pass a temporary buffer to
> > __dev_alloc_name but 'dev->name' which is modified by the function.
> >
> > A detailed explanation is here:
> >
> > http://marc.info/?l=linux-netdev&m=127417784011987&w=2
> >
> > Changelog:
> >   V2 : replaced strings comparison by pointers comparison
> >
> > Signed-off-by: Daniel Lezcano<daniel.lezcano@free.fr>
> > ---
> 
> Octavian, are you ok with this patch ?
> 

Yes, all looks good to me, thanks for the fix.

Reviewed-by: Octavian Purdila <opurdila@ixiacom.com>


^ permalink raw reply

* Re: bnx2x + SFP+ DA/2.6.33.3: Got bad status 0x0 when reading from SFP+ EEPROM -> SFP+ module is not initialized
From: Krzysztof Olędzki @ 2010-05-21 13:19 UTC (permalink / raw)
  To: eilong; +Cc: Rick Jones, Michael Chan, netdev@vger.kernel.org
In-Reply-To: <1274446713.11082.3.camel@lb-tlvb-eilong.il.broadcom.com>

On 2010-05-21 14:58, Eilon Greenstein wrote:
> On Thu, 2010-05-20 at 13:54 -0700, Krzysztof Olędzki wrote:
>> On 2010-05-20 22:25, Rick Jones wrote:
>>> Some simple/simplistic thoughts/questions...
>>>
>>> Has the DAC been used successfully prior to this?
>>
>> Yes. It was successfully used to connect two HP switches, before I
>> received SFP+ SR modules, that allowed me to put the switches into
>> distanced rooms.
>
> Krzysztof,
>
> I would like to check in what speed the I2C is running at, and while we
> are there, to make sure it is at low speed (100KHz and not 400KHz). Can
> you please add this patch and let me know what you see:

Sure.

>
> diff --git a/drivers/net/bnx2x_link.c b/drivers/net/bnx2x_link.c

<CUT>

[bnx2x_8727_read_sfp_module_eeprom:2774(eth4)]I2C 2W speed 0x0, bit 8 0
[bnx2x_8727_read_sfp_module_eeprom:2774(eth5)]I2C 2W speed 0x0, bit 8 0

> Since we are already off for the weekend, I couldn't find a machine with
> this kind of board to test it for myself. On top of that, I don't have
> this type of DAC cable, so I need your help in this debug process.

No problem. There is no need to ask for a help a person who depends on 
your support. ;) I would like to solve it as soon as possible so I'm 
willing to test everything you send me. ;)

best regards,

			Krzysztof Olędzki


^ permalink raw reply

* Re: [PATCH] net-2.6 : V2 - fix dev_get_valid_name
From: Daniel Lezcano @ 2010-05-21 13:10 UTC (permalink / raw)
  To: opurdila; +Cc: davem, netdev
In-Reply-To: <1274299939-28132-1-git-send-email-daniel.lezcano@free.fr>

On 05/19/2010 10:12 PM, Daniel Lezcano wrote:
> the commit:
>
> commit d90310243fd750240755e217c5faa13e24f41536
> Author: Octavian Purdila<opurdila@ixiacom.com>
> Date:   Wed Nov 18 02:36:59 2009 +0000
>
>      net: device name allocation cleanups
>
> introduced a bug when there is a hash collision making impossible
> to rename a device with eth%d. This bug is very hard to reproduce
> and appears rarely.
>
> The problem is coming from we don't pass a temporary buffer to
> __dev_alloc_name but 'dev->name' which is modified by the function.
>
> A detailed explanation is here:
>
> http://marc.info/?l=linux-netdev&m=127417784011987&w=2
>
> Changelog:
>   V2 : replaced strings comparison by pointers comparison
>
> Signed-off-by: Daniel Lezcano<daniel.lezcano@free.fr>
> ---
>    

Octavian, are you ok with this patch ?

^ permalink raw reply

* Re: tun: Use netif_receive_skb instead of netif_rx
From: Neil Horman @ 2010-05-21 12:59 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Neil Horman, David Miller, eric.dumazet, bmb, tgraf, netdev
In-Reply-To: <20100521110847.GA29878@gondor.apana.org.au>

On Fri, May 21, 2010 at 09:08:47PM +1000, Herbert Xu wrote:
> On Fri, May 21, 2010 at 06:51:28AM -0400, Neil Horman wrote:
> >
> > I read it, we just described the issue in diferent terms.
> 
> Yeah I wasn't clear enough with my email without an actual patch :)
> 
No worries, its not an easy issue to describe :)

> Thanks for testing Neil!
Thanks for the patch, I'll let you know how it works out in just a bit here!
Neil

> -- 
> Visit Openswan at http://www.openswan.org/
> Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: bnx2x + SFP+ DA/2.6.33.3: Got bad status 0x0 when reading from SFP+ EEPROM -> SFP+ module is not initialized
From: Eilon Greenstein @ 2010-05-21 12:58 UTC (permalink / raw)
  To: Krzysztof Olędzki; +Cc: Rick Jones, Michael Chan, netdev@vger.kernel.org
In-Reply-To: <4BF5A19E.5010003@ans.pl>

On Thu, 2010-05-20 at 13:54 -0700, Krzysztof Olędzki wrote:
> On 2010-05-20 22:25, Rick Jones wrote:
> > Some simple/simplistic thoughts/questions...
> >
> > Has the DAC been used successfully prior to this?
> 
> Yes. It was successfully used to connect two HP switches, before I 
> received SFP+ SR modules, that allowed me to put the switches into 
> distanced rooms.

Krzysztof,

I would like to check in what speed the I2C is running at, and while we
are there, to make sure it is at low speed (100KHz and not 400KHz). Can
you please add this patch and let me know what you see:

diff --git a/drivers/net/bnx2x_link.c b/drivers/net/bnx2x_link.c
index ff70be8..6e05f99 100644
--- a/drivers/net/bnx2x_link.c
+++ b/drivers/net/bnx2x_link.c
@@ -2766,6 +2766,21 @@ static u8
bnx2x_8727_read_sfp_module_eeprom(struct link_params *params,
                      MDIO_PMA_REG_SFP_TWO_WIRE_CTRL,
                      &val);
 
+       /* Make sure we are working in 100KHz */
+       bnx2x_cl45_read(bp, port,
+                     PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM8727,
+                     ext_phy_addr,
+                     MDIO_PMA_DEVAD,
+                     8005,
+                     &val);
+       BNX2X_ERR("I2C 2W speed 0x%x, bit 8 %d\n", val, val & 8);
+       bnx2x_cl45_write(bp, port,
+                      ext_phy_type,
+                      ext_phy_addr,
+                      MDIO_PMA_DEVAD,
+                      8005,
+                      val & ~8);
+
        /* Set the read command byte count */
        bnx2x_cl45_write(bp, port,
                       ext_phy_type,

Since we are already off for the weekend, I couldn't find a machine with
this kind of board to test it for myself. On top of that, I don't have
this type of DAC cable, so I need your help in this debug process.

Thanks,
Eilon




^ permalink raw reply related

* [PATCH] rtnetlink: Fix error handling in do_setlink()
From: David Howells @ 2010-05-21 12:25 UTC (permalink / raw)
  To: netdev; +Cc: David Howells, Chris Wright, David S. Miller

Commit c02db8c6290bb992442fec1407643c94cc414375:

	Author:  Chris Wright <chrisw@sous-sol.org>
	Date:    Sun May 16 01:05:45 2010 -0700
	Subject: rtnetlink: make SR-IOV VF interface symmetric

adds broken error handling to do_setlink() in net/core/rtnetlink.c.  The
problem is the following chunk of code:

	if (tb[IFLA_VFINFO_LIST]) {
		struct nlattr *attr;
		int rem;
		nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
			if (nla_type(attr) != IFLA_VF_INFO)
  ---->				goto errout;
			err = do_setvfinfo(dev, attr);
			if (err < 0)
				goto errout;
			modified = 1;
		}
	}

which can get to errout without setting err, resulting in the following error:

net/core/rtnetlink.c: In function 'do_setlink':
net/core/rtnetlink.c:904: warning: 'err' may be used uninitialized in this function

Change the code to return -EINVAL in this case.  Note that this might not be
the appropriate error though.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Chris Wright <chrisw@sous-sol.org>
cc: David S. Miller <davem@davemloft.net>
---

 net/core/rtnetlink.c |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)


diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 31e85d3..56b4157 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1030,8 +1030,10 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		struct nlattr *attr;
 		int rem;
 		nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
-			if (nla_type(attr) != IFLA_VF_INFO)
+			if (nla_type(attr) != IFLA_VF_INFO) {
+				err = -EINVAL;
 				goto errout;
+			}
 			err = do_setvfinfo(dev, attr);
 			if (err < 0)
 				goto errout;


^ permalink raw reply related

* [PATCH net-2.6 6/6] caif: Bugfix - use MSG_TRUNC in receive
From: sjur.brandeland @ 2010-05-21 12:16 UTC (permalink / raw)
  To: netdev, davem; +Cc: sjurbr, linus.walleij, marcel, Sjur Braendeland
In-Reply-To: <1274444172-31969-5-git-send-email-sjur.brandeland@stericsson.com>

From: Sjur Braendeland <sjur.brandeland@stericsson.com>

Fixed handling when skb don't fit in user buffer,
instead of returning -EMSGSIZE, the buffer is truncated (just
as unix seqpakcet does).

Signed-off-by: Sjur Braendeland <sjur.brandeland@stericsson.com>
---
 net/caif/caif_socket.c |   47 ++++++++++++++++++-----------------------------
 1 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 691a571..3d0e095 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -292,53 +292,42 @@ static void caif_check_flow_release(struct sock *sk)
 			caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_ON_REQ);
 	}
 }
+
 /*
- * Copied from sock.c:sock_queue_rcv_skb(), and added check that user buffer
- * has sufficient size.
+ * Copied from unix_dgram_recvmsg, but removed credit checks,
+ * changed locking, address handling and added MSG_TRUNC.
  */
-
 static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock,
-				struct msghdr *m, size_t buf_len, int flags)
+				struct msghdr *m, size_t len, int flags)
 
 {
 	struct sock *sk = sock->sk;
 	struct sk_buff *skb;
-	int ret = 0;
-	int len;
+	int ret;
+	int copylen;
 
-	if (unlikely(!buf_len))
-		return -EINVAL;
+	ret = -EOPNOTSUPP;
+	if (m->msg_flags&MSG_OOB)
+		goto read_error;
 
 	skb = skb_recv_datagram(sk, flags, 0 , &ret);
 	if (!skb)
 		goto read_error;
-
-	len = skb->len;
-
-	if (skb && skb->len > buf_len && !(flags & MSG_PEEK)) {
-		len = buf_len;
-		/*
-		 * Push skb back on receive queue if buffer too small.
-		 * This has a built-in race where multi-threaded receive
-		 * may get packet in wrong order, but multiple read does
-		 * not really guarantee ordered delivery anyway.
-		 * Let's optimize for speed without taking locks.
-		 */
-
-		skb_queue_head(&sk->sk_receive_queue, skb);
-		ret = -EMSGSIZE;
-		goto read_error;
+	copylen = skb->len;
+	if (len < copylen) {
+		m->msg_flags |= MSG_TRUNC;
+		copylen = len;
 	}
 
-	ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, len);
+	ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen);
 	if (ret)
-		goto read_error;
+		goto out_free;
 
+	ret = (flags & MSG_TRUNC) ? skb->len : copylen;
+out_free:
 	skb_free_datagram(sk, skb);
-
 	caif_check_flow_release(sk);
-
-	return len;
+	return ret;
 
 read_error:
 	return ret;
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH net-2.6 5/6] caif: Bugfix - missing spin_unlock
From: sjur.brandeland @ 2010-05-21 12:16 UTC (permalink / raw)
  To: netdev, davem; +Cc: sjurbr, linus.walleij, marcel, Sjur Braendeland
In-Reply-To: <1274444172-31969-4-git-send-email-sjur.brandeland@stericsson.com>

From: Sjur Braendeland <sjur.brandeland@stericsson.com>

Splint found missing spin_unlock.
Corrected this an some other trivial split warnings.

Signed-off-by: Sjur Braendeland <sjur.brandeland@stericsson.com>
---
 net/caif/caif_socket.c |   10 +++++-----
 net/caif/cfmuxl.c      |    3 ++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 732897d..691a571 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -60,7 +60,7 @@ struct debug_fs_counter {
 	atomic_t num_rx_flow_off;
 	atomic_t num_rx_flow_on;
 };
-struct debug_fs_counter cnt;
+static struct debug_fs_counter cnt;
 #define	dbfs_atomic_inc(v) atomic_inc(v)
 #define	dbfs_atomic_dec(v) atomic_dec(v)
 #else
@@ -128,13 +128,13 @@ static void caif_read_unlock(struct sock *sk)
 	mutex_unlock(&cf_sk->readlock);
 }
 
-int sk_rcvbuf_lowwater(struct caifsock *cf_sk)
+static int sk_rcvbuf_lowwater(struct caifsock *cf_sk)
 {
 	/* A quarter of full buffer is used a low water mark */
 	return cf_sk->sk.sk_rcvbuf / 4;
 }
 
-void caif_flow_ctrl(struct sock *sk, int mode)
+static void caif_flow_ctrl(struct sock *sk, int mode)
 {
 	struct caifsock *cf_sk;
 	cf_sk = container_of(sk, struct caifsock, sk);
@@ -146,7 +146,7 @@ void caif_flow_ctrl(struct sock *sk, int mode)
  * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are
  * not dropped, but CAIF is sending flow off instead.
  */
-int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int err;
 	int skb_len;
@@ -1184,7 +1184,7 @@ static struct net_proto_family caif_family_ops = {
 	.owner = THIS_MODULE,
 };
 
-int af_caif_init(void)
+static int af_caif_init(void)
 {
 	int err = sock_register(&caif_family_ops);
 	if (!err)
diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
index 7372f27..80c8d33 100644
--- a/net/caif/cfmuxl.c
+++ b/net/caif/cfmuxl.c
@@ -174,10 +174,11 @@ struct cflayer *cfmuxl_remove_uplayer(struct cflayer *layr, u8 id)
 	spin_lock(&muxl->receive_lock);
 	up = get_up(muxl, id);
 	if (up == NULL)
-		return NULL;
+		goto out;
 	memset(muxl->up_cache, 0, sizeof(muxl->up_cache));
 	list_del(&up->node);
 	cfsrvl_put(up);
+out:
 	spin_unlock(&muxl->receive_lock);
 	return up;
 }
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH net-2.6 4/6] caif: Bugfix - Poll can't return POLLHUP while connecting.
From: sjur.brandeland @ 2010-05-21 12:16 UTC (permalink / raw)
  To: netdev, davem; +Cc: sjurbr, linus.walleij, marcel, Sjur Braendeland
In-Reply-To: <1274444172-31969-3-git-send-email-sjur.brandeland@stericsson.com>

From: Sjur Braendeland <sjur.brandeland@stericsson.com>

Discovered bug when testing async connect.
While connecting poll should not return POLLHUP,
but POLLOUT when connected. 
Also fixed the sysfs flow-control-counters.

Signed-off-by: Sjur Braendeland <sjur.brandeland@stericsson.com>
---
 net/caif/caif_socket.c |   21 ++++++---------------
 1 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 77e9956..732897d 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -138,7 +138,7 @@ void caif_flow_ctrl(struct sock *sk, int mode)
 {
 	struct caifsock *cf_sk;
 	cf_sk = container_of(sk, struct caifsock, sk);
-	if (cf_sk->layer.dn)
+	if (cf_sk->layer.dn && cf_sk->layer.dn->modemcmd)
 		cf_sk->layer.dn->modemcmd(cf_sk->layer.dn, mode);
 }
 
@@ -162,9 +162,8 @@ int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			atomic_read(&cf_sk->sk.sk_rmem_alloc),
 			sk_rcvbuf_lowwater(cf_sk));
 		set_rx_flow_off(cf_sk);
-		if (cf_sk->layer.dn)
-			cf_sk->layer.dn->modemcmd(cf_sk->layer.dn,
-						CAIF_MODEMCMD_FLOW_OFF_REQ);
+		dbfs_atomic_inc(&cnt.num_rx_flow_off);
+		caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
 	}
 
 	err = sk_filter(sk, skb);
@@ -175,9 +174,8 @@ int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		trace_printk("CAIF: %s():"
 			" sending flow OFF due to rmem_schedule\n",
 			__func__);
-		if (cf_sk->layer.dn)
-			cf_sk->layer.dn->modemcmd(cf_sk->layer.dn,
-						CAIF_MODEMCMD_FLOW_OFF_REQ);
+		dbfs_atomic_inc(&cnt.num_rx_flow_off);
+		caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
 	}
 	skb->dev = NULL;
 	skb_set_owner_r(skb, sk);
@@ -285,16 +283,13 @@ static void caif_check_flow_release(struct sock *sk)
 {
 	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
 
-	if (cf_sk->layer.dn == NULL || cf_sk->layer.dn->modemcmd == NULL)
-		return;
 	if (rx_flow_is_on(cf_sk))
 		return;
 
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk_rcvbuf_lowwater(cf_sk)) {
 			dbfs_atomic_inc(&cnt.num_rx_flow_on);
 			set_rx_flow_on(cf_sk);
-			cf_sk->layer.dn->modemcmd(cf_sk->layer.dn,
-						CAIF_MODEMCMD_FLOW_ON_REQ);
+			caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_ON_REQ);
 	}
 }
 /*
@@ -1018,10 +1013,6 @@ static unsigned int caif_poll(struct file *file,
 		(sk->sk_shutdown & RCV_SHUTDOWN))
 		mask |= POLLIN | POLLRDNORM;
 
-	/* Connection-based need to check for termination and startup */
-	if (sk->sk_state == CAIF_DISCONNECTED)
-		mask |= POLLHUP;
-
 	/*
 	 * we set writable also when the other side has shut down the
 	 * connection. This prevents stuck sockets.
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH net-2.6 3/6] caif: Bugfix - handle mem-allocation failures
From: sjur.brandeland @ 2010-05-21 12:16 UTC (permalink / raw)
  To: netdev, davem; +Cc: sjurbr, linus.walleij, marcel, Sjur Braendeland
In-Reply-To: <1274444172-31969-2-git-send-email-sjur.brandeland@stericsson.com>

From: Sjur Braendeland <sjur.brandeland@stericsson.com>

Discovered bugs when injecting slab allocation failures.
Add checks on all memory allocation.

Signed-off-by: Sjur Braendeland <sjur.brandeland@stericsson.com>
---
 net/caif/cfpkt_skbuff.c |   25 +++++++++++++++++--------
 net/caif/cfserl.c       |    3 ++-
 net/caif/cfsrvl.c       |    6 ++++++
 3 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index 83fff2f..a6fdf89 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -238,6 +238,7 @@ int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len)
 	struct sk_buff *lastskb;
 	u8 *to;
 	const u8 *data = data2;
+	int ret;
 	if (unlikely(is_erronous(pkt)))
 		return -EPROTO;
 	if (unlikely(skb_headroom(skb) < len)) {
@@ -246,9 +247,10 @@ int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len)
 	}
 
 	/* Make sure data is writable */
-	if (unlikely(skb_cow_data(skb, 0, &lastskb) < 0)) {
+	ret = skb_cow_data(skb, 0, &lastskb);
+	if (unlikely(ret < 0)) {
 		PKT_ERROR(pkt, "cfpkt_add_head: cow failed\n");
-		return -EPROTO;
+		return ret;
 	}
 
 	to = skb_push(skb, len);
@@ -316,6 +318,8 @@ EXPORT_SYMBOL(cfpkt_setlen);
 struct cfpkt *cfpkt_create_uplink(const unsigned char *data, unsigned int len)
 {
 	struct cfpkt *pkt = cfpkt_create_pfx(len + PKT_POSTFIX, PKT_PREFIX);
+	if (!pkt)
+		return NULL;
 	if (unlikely(data != NULL))
 		cfpkt_add_body(pkt, data, len);
 	return pkt;
@@ -344,12 +348,13 @@ struct cfpkt *cfpkt_append(struct cfpkt *dstpkt,
 
 	if (dst->tail + neededtailspace > dst->end) {
 		/* Create a dumplicate of 'dst' with more tail space */
+		struct cfpkt *tmppkt;
 		dstlen = skb_headlen(dst);
 		createlen = dstlen + neededtailspace;
-		tmp = pkt_to_skb(
-			cfpkt_create(createlen + PKT_PREFIX + PKT_POSTFIX));
-		if (!tmp)
+		tmppkt = cfpkt_create(createlen + PKT_PREFIX + PKT_POSTFIX);
+		if (tmppkt == NULL)
 			return NULL;
+		tmp = pkt_to_skb(tmppkt);
 		skb_set_tail_pointer(tmp, dstlen);
 		tmp->len = dstlen;
 		memcpy(tmp->data, dst->data, dstlen);
@@ -368,6 +373,7 @@ struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos)
 {
 	struct sk_buff *skb2;
 	struct sk_buff *skb = pkt_to_skb(pkt);
+	struct cfpkt *tmppkt;
 	u8 *split = skb->data + pos;
 	u16 len2nd = skb_tail_pointer(skb) - split;
 
@@ -381,9 +387,12 @@ struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos)
 	}
 
 	/* Create a new packet for the second part of the data */
-	skb2 = pkt_to_skb(
-		cfpkt_create_pfx(len2nd + PKT_PREFIX + PKT_POSTFIX,
-				 PKT_PREFIX));
+	tmppkt = cfpkt_create_pfx(len2nd + PKT_PREFIX + PKT_POSTFIX,
+				  PKT_PREFIX);
+	if (tmppkt == NULL)
+		return NULL;
+	skb2 = pkt_to_skb(tmppkt);
+
 
 	if (skb2 == NULL)
 		return NULL;
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
index 06029ea..cb4325a 100644
--- a/net/caif/cfserl.c
+++ b/net/caif/cfserl.c
@@ -67,6 +67,8 @@ static int cfserl_receive(struct cflayer *l, struct cfpkt *newpkt)
 		layr->incomplete_frm =
 		    cfpkt_append(layr->incomplete_frm, newpkt, expectlen);
 		pkt = layr->incomplete_frm;
+		if (pkt == NULL)
+			return -ENOMEM;
 	} else {
 		pkt = newpkt;
 	}
@@ -154,7 +156,6 @@ static int cfserl_receive(struct cflayer *l, struct cfpkt *newpkt)
 			if (layr->usestx) {
 				if (tail_pkt != NULL)
 					pkt = cfpkt_append(pkt, tail_pkt, 0);
-
 				/* Start search for next STX if frame failed */
 				continue;
 			} else {
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
index aff31f3..6e5b707 100644
--- a/net/caif/cfsrvl.c
+++ b/net/caif/cfsrvl.c
@@ -123,6 +123,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
 			struct caif_payload_info *info;
 			u8 flow_off = SRVL_FLOW_OFF;
 			pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
+			if (!pkt) {
+				pr_warning("CAIF: %s(): Out of memory\n",
+					__func__);
+				return -ENOMEM;
+			}
+
 			if (cfpkt_add_head(pkt, &flow_off, 1) < 0) {
 				pr_err("CAIF: %s(): Packet is erroneous!\n",
 					__func__);
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH net-2.6 2/6] caif: Bugfix - use standard Linux lists
From: sjur.brandeland @ 2010-05-21 12:16 UTC (permalink / raw)
  To: netdev, davem; +Cc: sjurbr, linus.walleij, marcel, Sjur Braendeland
In-Reply-To: <1274444172-31969-1-git-send-email-sjur.brandeland@stericsson.com>

From: Sjur Braendeland <sjur.brandeland@stericsson.com>

Discovered bug when running high number of parallel connect requests.
Replace buggy home brewed list with linux/list.h.

Signed-off-by: Sjur Braendeland <sjur.brandeland@stericsson.com>
---
 include/net/caif/cfctrl.h |    4 +-
 net/caif/cfctrl.c         |   92 +++++++++++++--------------------------------
 2 files changed, 28 insertions(+), 68 deletions(-)

diff --git a/include/net/caif/cfctrl.h b/include/net/caif/cfctrl.h
index 997603f..9402543 100644
--- a/include/net/caif/cfctrl.h
+++ b/include/net/caif/cfctrl.h
@@ -94,8 +94,8 @@ struct cfctrl_request_info {
 	enum cfctrl_cmd cmd;
 	u8 channel_id;
 	struct cfctrl_link_param param;
-	struct cfctrl_request_info *next;
 	struct cflayer *client_layer;
+	struct list_head list;
 };
 
 struct cfctrl {
@@ -103,7 +103,7 @@ struct cfctrl {
 	struct cfctrl_rsp res;
 	atomic_t req_seq_no;
 	atomic_t rsp_seq_no;
-	struct cfctrl_request_info *first_req;
+	struct list_head list;
 	/* Protects from simultaneous access to first_req list */
 	spinlock_t info_list_lock;
 #ifndef CAIF_NO_LOOP
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index 0ffe1e1..fcfda98 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -44,13 +44,14 @@ struct cflayer *cfctrl_create(void)
 	dev_info.id = 0xff;
 	memset(this, 0, sizeof(*this));
 	cfsrvl_init(&this->serv, 0, &dev_info);
-	spin_lock_init(&this->info_list_lock);
 	atomic_set(&this->req_seq_no, 1);
 	atomic_set(&this->rsp_seq_no, 1);
 	this->serv.layer.receive = cfctrl_recv;
 	sprintf(this->serv.layer.name, "ctrl");
 	this->serv.layer.ctrlcmd = cfctrl_ctrlcmd;
 	spin_lock_init(&this->loop_linkid_lock);
+	spin_lock_init(&this->info_list_lock);
+	INIT_LIST_HEAD(&this->list);
 	this->loop_linkid = 1;
 	return &this->serv.layer;
 }
@@ -112,20 +113,10 @@ bool cfctrl_req_eq(struct cfctrl_request_info *r1,
 void cfctrl_insert_req(struct cfctrl *ctrl,
 			      struct cfctrl_request_info *req)
 {
-	struct cfctrl_request_info *p;
 	spin_lock(&ctrl->info_list_lock);
-	req->next = NULL;
 	atomic_inc(&ctrl->req_seq_no);
 	req->sequence_no = atomic_read(&ctrl->req_seq_no);
-	if (ctrl->first_req == NULL) {
-		ctrl->first_req = req;
-		spin_unlock(&ctrl->info_list_lock);
-		return;
-	}
-	p = ctrl->first_req;
-	while (p->next != NULL)
-		p = p->next;
-	p->next = req;
+	list_add_tail(&req->list, &ctrl->list);
 	spin_unlock(&ctrl->info_list_lock);
 }
 
@@ -133,46 +124,28 @@ void cfctrl_insert_req(struct cfctrl *ctrl,
 struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl,
 					      struct cfctrl_request_info *req)
 {
-	struct cfctrl_request_info *p;
-	struct cfctrl_request_info *ret;
+	struct cfctrl_request_info *p, *tmp, *first;
 
 	spin_lock(&ctrl->info_list_lock);
-	if (ctrl->first_req == NULL) {
-		spin_unlock(&ctrl->info_list_lock);
-		return NULL;
-	}
-
-	if (cfctrl_req_eq(req, ctrl->first_req)) {
-		ret = ctrl->first_req;
-		caif_assert(ctrl->first_req);
-		atomic_set(&ctrl->rsp_seq_no,
-				 ctrl->first_req->sequence_no);
-		ctrl->first_req = ctrl->first_req->next;
-		spin_unlock(&ctrl->info_list_lock);
-		return ret;
-	}
+	first = list_first_entry(&ctrl->list, struct cfctrl_request_info, list);
 
-	p = ctrl->first_req;
-
-	while (p->next != NULL) {
-		if (cfctrl_req_eq(req, p->next)) {
-			pr_warning("CAIF: %s(): Requests are not "
+	list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
+		if (cfctrl_req_eq(req, p)) {
+			if (p != first)
+				pr_warning("CAIF: %s(): Requests are not "
 					"received in order\n",
 					__func__);
-			ret = p->next;
+
 			atomic_set(&ctrl->rsp_seq_no,
-					p->next->sequence_no);
-			p->next = p->next->next;
-			spin_unlock(&ctrl->info_list_lock);
-			return ret;
+					 p->sequence_no);
+			list_del(&p->list);
+			goto out;
 		}
-		p = p->next;
 	}
+	p = NULL;
+out:
 	spin_unlock(&ctrl->info_list_lock);
-
-	pr_warning("CAIF: %s(): Request does not match\n",
-		   __func__);
-	return NULL;
+	return p;
 }
 
 struct cfctrl_rsp *cfctrl_get_respfuncs(struct cflayer *layer)
@@ -388,31 +361,18 @@ void cfctrl_getstartreason_req(struct cflayer *layer)
 
 void cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer)
 {
-	struct cfctrl_request_info *p, *req;
+	struct cfctrl_request_info *p, *tmp;
 	struct cfctrl *ctrl = container_obj(layr);
 	spin_lock(&ctrl->info_list_lock);
-
-	if (ctrl->first_req == NULL) {
-		spin_unlock(&ctrl->info_list_lock);
-		return;
-	}
-
-	if (ctrl->first_req->client_layer == adap_layer) {
-
-		req = ctrl->first_req;
-		ctrl->first_req = ctrl->first_req->next;
-		kfree(req);
-	}
-
-	p = ctrl->first_req;
-	while (p != NULL && p->next != NULL) {
-		if (p->next->client_layer == adap_layer) {
-
-			req = p->next;
-			p->next = p->next->next;
-			kfree(p->next);
+	pr_warning("CAIF: %s(): enter\n", __func__);
+
+	list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
+		if (p->client_layer == adap_layer) {
+			pr_warning("CAIF: %s(): cancel req :%d\n", __func__,
+					p->sequence_no);
+			list_del(&p->list);
+			kfree(p);
 		}
-		p = p->next;
 	}
 
 	spin_unlock(&ctrl->info_list_lock);
@@ -634,7 +594,7 @@ static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
 	case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND:
 	case CAIF_CTRLCMD_FLOW_OFF_IND:
 		spin_lock(&this->info_list_lock);
-		if (this->first_req != NULL) {
+		if (!list_empty(&this->list)) {
 			pr_debug("CAIF: %s(): Received flow off in "
 				   "control layer", __func__);
 		}
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH net-2.6 1/6] caif: Bugfix - wait_ev*_timeout returns long.
From: sjur.brandeland @ 2010-05-21 12:16 UTC (permalink / raw)
  To: netdev, davem; +Cc: sjurbr, linus.walleij, marcel, Sjur Braendeland

From: Sjur Braendeland <sjur.brandeland@stericsson.com>

Discovered bug when testing on 64bit architecture.
Fixed by using long to store result from wait_event_interruptible_timeout.

Signed-off-by: Sjur Braendeland <sjur.brandeland@stericsson.com>
---
 net/caif/caif_socket.c |   13 ++++++-------
 1 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index c3a70c5..77e9956 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -920,17 +920,17 @@ wait_connect:
 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
 
 	release_sock(sk);
-	err = wait_event_interruptible_timeout(*sk_sleep(sk),
+	err = -ERESTARTSYS;
+	timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
 			sk->sk_state != CAIF_CONNECTING,
 			timeo);
 	lock_sock(sk);
-	if (err < 0)
+	if (timeo < 0)
 		goto out; /* -ERESTARTSYS */
-	if (err == 0 && sk->sk_state != CAIF_CONNECTED) {
-		err = -ETIMEDOUT;
-		goto out;
-	}
 
+	err = -ETIMEDOUT;
+	if (timeo == 0 && sk->sk_state != CAIF_CONNECTED)
+		goto out;
 	if (sk->sk_state != CAIF_CONNECTED) {
 		sock->state = SS_UNCONNECTED;
 		err = sock_error(sk);
@@ -945,7 +945,6 @@ out:
 	return err;
 }
 
-
 /*
  * caif_release() - Disconnect a CAIF Socket
  * Copied and modified af_irda.c:irda_release().
-- 
1.6.3.3


^ permalink raw reply related

* Re: tun: Use netif_receive_skb instead of netif_rx
From: Herbert Xu @ 2010-05-21 11:08 UTC (permalink / raw)
  To: Neil Horman; +Cc: David Miller, eric.dumazet, bmb, tgraf, nhorman, netdev
In-Reply-To: <20100521105128.GA29521@hmsreliant.think-freely.org>

On Fri, May 21, 2010 at 06:51:28AM -0400, Neil Horman wrote:
>
> I read it, we just described the issue in diferent terms.

Yeah I wasn't clear enough with my email without an actual patch :)

Thanks for testing Neil!
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: tun: Use netif_receive_skb instead of netif_rx
From: Neil Horman @ 2010-05-21 10:51 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, eric.dumazet, bmb, tgraf, nhorman, netdev
In-Reply-To: <20100520.224944.102694876.davem@davemloft.net>

On Thu, May 20, 2010 at 10:49:44PM -0700, David Miller wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Thu, 20 May 2010 20:39:39 -0400
> 
> > On Fri, May 21, 2010 at 09:16:30AM +1000, Herbert Xu wrote:
> >> On Thu, May 20, 2010 at 01:29:18PM -0400, Neil Horman wrote:
> >> >
> >> > So, I'm testing this patch out now, and unfotunately it doesn't seem to be
> >> > working.  Every frame seems to be holding a classid of 0.  Trying to figure out
> >> > why now.
> >> 
> >> Not very surprising since tun.c doesn't go through the normal
> >> socket interface.  I'll send a additional patch for that.
> >> 
> > I don't think thats it.  I think its a chicken and egg situation.  I think the
> > problem is that tasks can't be assigned to cgroups until their created, and in
> > that time a sock can be created.  Its a natural race.  If you create a socket
> > before you assign it to a cgroup, that socket retains a classid of zero.  I'm
> > going to try modify the patch to update sockets owned by tasks when the cgroup
> > is assigned.
> 
> Neil, you must not be using Herbert's most recent patch.
> 
I'm not, I was testing the version prior. I wrote my note before he posted the
update for the tun driver

> Either that or you haven't even read it.
> 
I read it, we just described the issue in diferent terms.

Neil

> Herbert's most recent patch doesn't create this chicken and egg
> problem you mention because it explicitly watches for cgroupid changes
> at all socket I/O operations including sendmsg() and sendmsg().  And
> if it sees a different cgroupid at a socket I/O call, it updates the
> cgroupid value in the socket.
> 
> So you very much can change the cgroup of the process mid-socket
> ownership and it will work.
> 
> The only problem is, as Herbert stated, tun.  Because it does it's
> networking I/O directly by calling netif_receive_skb() so it won't
> hit any of Herbert's cgroup check points.
> 

^ permalink raw reply

* [RFC][PATCH v6 19/19] Provides multiple submits and asynchronous notifications.
From: xiaohui.xin @ 2010-05-21  9:30 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1274434235-5929-19-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

    The vhost-net backend now only supports synchronous send/recv
    operations. The patch provides multiple submits and asynchronous
    notifications. This is needed for zero-copy case.

    Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
---
 drivers/vhost/net.c   |  255 ++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.c |  120 +++++++++++++----------
 drivers/vhost/vhost.h |   14 +++
 3 files changed, 333 insertions(+), 56 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9777583..9a0d162 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -24,6 +24,8 @@
 #include <linux/if_arp.h>
 #include <linux/if_tun.h>
 #include <linux/if_macvlan.h>
+#include <linux/mpassthru.h>
+#include <linux/aio.h>
 
 #include <net/sock.h>
 
@@ -45,10 +47,13 @@ enum vhost_net_poll_state {
 	VHOST_NET_POLL_STOPPED = 2,
 };
 
+static struct kmem_cache *notify_cache;
+
 struct vhost_net {
 	struct vhost_dev dev;
 	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
 	struct vhost_poll poll[VHOST_NET_VQ_MAX];
+	struct kmem_cache       *cache;
 	/* Tells us whether we are polling a socket for TX.
 	 * We only do this when socket buffer fills up.
 	 * Protected by tx vq lock. */
@@ -93,11 +98,146 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
 	net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+	struct kiocb *iocb = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vq->notify_lock, flags);
+	if (!list_empty(&vq->notifier)) {
+		iocb = list_first_entry(&vq->notifier,
+				struct kiocb, ki_list);
+		list_del(&iocb->ki_list);
+	}
+	spin_unlock_irqrestore(&vq->notify_lock, flags);
+	return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+	struct vhost_virtqueue *vq = iocb->private;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vq->notify_lock, flags);
+	list_add_tail(&iocb->ki_list, &vq->notifier);
+	spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static int is_async_vq(struct vhost_virtqueue *vq)
+{
+	return (vq->link_state == VHOST_VQ_LINK_ASYNC);
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+					  struct vhost_virtqueue *vq,
+					  struct socket *sock)
+{
+	struct kiocb *iocb = NULL;
+	struct vhost_log *vq_log = NULL;
+	int rx_total_len = 0;
+	unsigned int head, log, in, out;
+	int size;
+
+	if (!is_async_vq(vq))
+		return;
+
+	if (sock->sk->sk_data_ready)
+		sock->sk->sk_data_ready(sock->sk, 0);
+
+	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+		vq->log : NULL;
+
+	while ((iocb = notify_dequeue(vq)) != NULL) {
+		vhost_add_used_and_signal(&net->dev, vq,
+				iocb->ki_pos, iocb->ki_nbytes);
+		size = iocb->ki_nbytes;
+		head = iocb->ki_pos;
+		rx_total_len += iocb->ki_nbytes;
+
+		if (iocb->ki_dtor)
+			iocb->ki_dtor(iocb);
+		kmem_cache_free(net->cache, iocb);
+
+		/* when log is enabled, recomputing the log info is needed,
+		 * since these buffers are in async queue, and may not get
+		 * the log info before.
+		 */
+		if (unlikely(vq_log)) {
+			if (!log)
+				__vhost_get_vq_desc(&net->dev, vq, vq->iov,
+						    ARRAY_SIZE(vq->iov),
+						    &out, &in, vq_log,
+						    &log, head);
+			vhost_log_write(vq, vq_log, log, size);
+		}
+		if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+					  struct vhost_virtqueue *vq)
+{
+	struct kiocb *iocb = NULL;
+	struct list_head *entry, *tmp;
+	unsigned long flags;
+	int tx_total_len = 0;
+
+	if (!is_async_vq(vq))
+		return;
+	spin_lock_irqsave(&vq->notify_lock, flags);
+	list_for_each_safe(entry, tmp, &vq->notifier) {
+		iocb = list_entry(entry,
+                                struct kiocb, ki_list);
+		if (!iocb->ki_flags)
+			continue;
+		list_del(&iocb->ki_list);	
+		vhost_add_used_and_signal(&net->dev, vq,
+				iocb->ki_pos, 0);
+		tx_total_len += iocb->ki_nbytes;
+
+		if (iocb->ki_dtor)
+			iocb->ki_dtor(iocb);
+
+		kmem_cache_free(net->cache, iocb);
+		if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static struct kiocb *create_iocb(struct vhost_net *net,
+				 struct vhost_virtqueue *vq,
+				 unsigned head)
+{
+	struct kiocb *iocb = NULL;
+
+	if (!is_async_vq(vq))
+		return NULL;
+
+	iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+	if (!iocb)
+		return NULL;
+	iocb->private = vq;
+	iocb->ki_pos = head;
+	iocb->ki_dtor = handle_iocb;
+	if (vq == &net->dev.vqs[VHOST_NET_VQ_RX]) {
+		iocb->ki_user_data = vq->num;
+		iocb->ki_iovec = vq->hdr;
+	}
+	return iocb;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+	struct kiocb *iocb = NULL;
 	unsigned head, out, in, s;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -130,6 +270,8 @@ static void handle_tx(struct vhost_net *net)
 		tx_poll_stop(net);
 	hdr_size = vq->hdr_size;
 
+	handle_async_tx_events_notify(net, vq);
+
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
@@ -157,6 +299,13 @@ static void handle_tx(struct vhost_net *net)
 		/* Skip header. TODO: support TSO. */
 		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
 		msg.msg_iovlen = out;
+
+		if (is_async_vq(vq)) {
+			iocb = create_iocb(net, vq, head);
+			if (!iocb)
+				break;
+		}
+
 		len = iov_length(vq->iov, out);
 		/* Sanity check */
 		if (!len) {
@@ -166,12 +315,18 @@ static void handle_tx(struct vhost_net *net)
 			break;
 		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
-		err = sock->ops->sendmsg(NULL, sock, &msg, len);
+		err = sock->ops->sendmsg(iocb, sock, &msg, len);
 		if (unlikely(err < 0)) {
+			if (is_async_vq(vq))
+				kmem_cache_free(net->cache, iocb);
 			vhost_discard_vq_desc(vq);
 			tx_poll_start(net, sock);
 			break;
 		}
+
+		if (is_async_vq(vq))
+			continue;
+
 		if (err != len)
 			pr_err("Truncated TX packet: "
 			       " len %d != %zd\n", err, len);
@@ -183,6 +338,8 @@ static void handle_tx(struct vhost_net *net)
 		}
 	}
 
+	handle_async_tx_events_notify(net, vq);
+
 	mutex_unlock(&vq->mutex);
 	unuse_mm(net->dev.mm);
 }
@@ -192,6 +349,7 @@ static void handle_tx(struct vhost_net *net)
 static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+	struct kiocb *iocb = NULL;
 	unsigned head, out, in, log, s;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
@@ -212,7 +370,8 @@ static void handle_rx(struct vhost_net *net)
 	int err;
 	size_t hdr_size;
 	struct socket *sock = rcu_dereference(vq->private_data);
-	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+	if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+			vq->link_state == VHOST_VQ_LINK_SYNC))
 		return;
 
 	use_mm(net->dev.mm);
@@ -220,9 +379,17 @@ static void handle_rx(struct vhost_net *net)
 	vhost_disable_notify(vq);
 	hdr_size = vq->hdr_size;
 
+	/* In async cases, when write log is enabled, in case the submitted
+	 * buffers did not get log info before the log enabling, so we'd
+	 * better recompute the log info when needed. We do this in
+	 * handle_async_rx_events_notify().
+	 */
+
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 
+	handle_async_rx_events_notify(net, vq, sock);
+
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
@@ -251,6 +418,13 @@ static void handle_rx(struct vhost_net *net)
 		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
+
+		if (is_async_vq(vq)) {
+			iocb = create_iocb(net, vq, head);
+			if (!iocb)
+				break;
+		}
+
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for RX: "
@@ -258,13 +432,20 @@ static void handle_rx(struct vhost_net *net)
 			       iov_length(vq->hdr, s), hdr_size);
 			break;
 		}
-		err = sock->ops->recvmsg(NULL, sock, &msg,
+
+		err = sock->ops->recvmsg(iocb, sock, &msg,
 					 len, MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
+			if (is_async_vq(vq))
+				kmem_cache_free(net->cache, iocb);
 			vhost_discard_vq_desc(vq);
 			break;
 		}
+
+		if (is_async_vq(vq))
+			continue;
+
 		/* TODO: Should check and handle checksum. */
 		if (err > len) {
 			pr_err("Discarded truncated rx packet: "
@@ -290,6 +471,8 @@ static void handle_rx(struct vhost_net *net)
 		}
 	}
 
+	handle_async_rx_events_notify(net, vq, sock);
+
 	mutex_unlock(&vq->mutex);
 	unuse_mm(net->dev.mm);
 }
@@ -343,6 +526,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
 	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
 	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
+	n->cache = NULL;
 
 	f->private_data = n;
 
@@ -406,6 +590,21 @@ static void vhost_net_flush(struct vhost_net *n)
 	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
 }
 
+static void vhost_async_cleanup(struct vhost_net *n)
+{
+	/* clean the notifier */
+	struct vhost_virtqueue *vq;
+	struct kiocb *iocb = NULL;
+	if (n->cache) {
+		vq = &n->dev.vqs[VHOST_NET_VQ_RX];
+		while ((iocb = notify_dequeue(vq)) != NULL)
+			kmem_cache_free(n->cache, iocb);
+		vq = &n->dev.vqs[VHOST_NET_VQ_TX];
+		while ((iocb = notify_dequeue(vq)) != NULL)
+			kmem_cache_free(n->cache, iocb);
+	}
+}
+
 static int vhost_net_release(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = f->private_data;
@@ -422,6 +621,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
 	/* We do an extra flush before freeing memory,
 	 * since jobs can re-queue themselves. */
 	vhost_net_flush(n);
+	vhost_async_cleanup(n);
 	kfree(n);
 	return 0;
 }
@@ -473,21 +673,58 @@ static struct socket *get_tap_socket(int fd)
 	return sock;
 }
 
-static struct socket *get_socket(int fd)
+static struct socket *get_mp_socket(int fd)
+{
+	struct file *file = fget(fd);
+	struct socket *sock;
+	if (!file)
+		return ERR_PTR(-EBADF);
+	sock = mp_get_socket(file);
+	if (IS_ERR(sock))
+		fput(file);
+	return sock;
+}
+
+static struct socket *get_socket(struct vhost_virtqueue *vq, int fd,
+				 enum vhost_vq_link_state *state)
 {
 	struct socket *sock;
 	/* special case to disable backend */
 	if (fd == -1)
 		return NULL;
+
+	*state = VHOST_VQ_LINK_SYNC;
+
 	sock = get_raw_socket(fd);
 	if (!IS_ERR(sock))
 		return sock;
 	sock = get_tap_socket(fd);
 	if (!IS_ERR(sock))
 		return sock;
+	/* If we dont' have notify_cache, then dont do mpassthru */
+	if (!notify_cache)
+		return ERR_PTR(-ENOTSOCK);
+	sock = get_mp_socket(fd);
+	if (!IS_ERR(sock)) {
+		*state = VHOST_VQ_LINK_ASYNC;
+		return sock;
+	}
 	return ERR_PTR(-ENOTSOCK);
 }
 
+static void vhost_init_link_state(struct vhost_net *n, int index)
+{
+	struct vhost_virtqueue *vq = n->vqs + index;
+
+	WARN_ON(!mutex_is_locked(&vq->mutex));
+	if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+		INIT_LIST_HEAD(&vq->notifier);
+		spin_lock_init(&vq->notify_lock);
+		if (!n->cache)
+			n->cache = notify_cache;
+	}
+}
+
 static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 {
 	struct socket *sock, *oldsock;
@@ -511,12 +748,14 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 		r = -EFAULT;
 		goto err_vq;
 	}
-	sock = get_socket(fd);
+	sock = get_socket(vq, fd, &vq->link_state);
 	if (IS_ERR(sock)) {
 		r = PTR_ERR(sock);
 		goto err_vq;
 	}
 
+	vhost_init_link_state(n, index);
+
 	/* start polling new socket */
 	oldsock = vq->private_data;
 	if (sock == oldsock)
@@ -650,6 +889,10 @@ int vhost_net_init(void)
 	r = misc_register(&vhost_net_misc);
 	if (r)
 		goto err_reg;
+
+	notify_cache = kmem_cache_create("vhost_kiocb",
+					sizeof(struct kiocb), 0,
+					SLAB_HWCACHE_ALIGN, NULL);
 	return 0;
 err_reg:
 	vhost_cleanup();
@@ -663,6 +906,8 @@ void vhost_net_exit(void)
 {
 	misc_deregister(&vhost_net_misc);
 	vhost_cleanup();
+	if (notify_cache)
+		kmem_cache_destroy(notify_cache);
 }
 module_exit(vhost_net_exit);
 
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e69d238..ad3779c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -861,61 +861,17 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 	return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
-			   struct iovec iov[], unsigned int iov_size,
-			   unsigned int *out_num, unsigned int *in_num,
-			   struct vhost_log *log, unsigned int *log_num)
+/* This computes the log info according to the index of buffer */
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+			     struct iovec iov[], unsigned int iov_size,
+			     unsigned int *out_num, unsigned int *in_num,
+			     struct vhost_log *log, unsigned int *log_num,
+			     unsigned int head)
 {
 	struct vring_desc desc;
-	unsigned int i, head, found = 0;
-	u16 last_avail_idx;
+	unsigned int i = head, found = 0;
 	int ret;
 
-	/* Check it isn't doing very strange things with descriptor numbers. */
-	last_avail_idx = vq->last_avail_idx;
-	if (get_user(vq->avail_idx, &vq->avail->idx)) {
-		vq_err(vq, "Failed to access avail idx at %p\n",
-		       &vq->avail->idx);
-		return vq->num;
-	}
-
-	if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
-		vq_err(vq, "Guest moved used index from %u to %u",
-		       last_avail_idx, vq->avail_idx);
-		return vq->num;
-	}
-
-	/* If there's nothing new since last we looked, return invalid. */
-	if (vq->avail_idx == last_avail_idx)
-		return vq->num;
-
-	/* Only get avail ring entries after they have been exposed by guest. */
-	smp_rmb();
-
-	/* Grab the next descriptor number they're advertising, and increment
-	 * the index we've seen. */
-	if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
-		vq_err(vq, "Failed to read head: idx %d address %p\n",
-		       last_avail_idx,
-		       &vq->avail->ring[last_avail_idx % vq->num]);
-		return vq->num;
-	}
-
-	/* If their number is silly, that's an error. */
-	if (head >= vq->num) {
-		vq_err(vq, "Guest says index %u > %u is available",
-		       head, vq->num);
-		return vq->num;
-	}
-
-	/* When we start there are none of either input nor output. */
 	*out_num = *in_num = 0;
 	if (unlikely(log))
 		*log_num = 0;
@@ -979,8 +935,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			*out_num += ret;
 		}
 	} while ((i = next_desc(&desc)) != -1);
+	return head;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which
+ * is never a valid descriptor number) if none was found. */
+unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+			   struct iovec iov[], unsigned int iov_size,
+			   unsigned int *out_num, unsigned int *in_num,
+			   struct vhost_log *log, unsigned int *log_num)
+{
+	struct vring_desc desc;
+	unsigned int i, head, found = 0;
+	u16 last_avail_idx;
+	int ret;
+
+	/* Check it isn't doing very strange things with descriptor numbers. */
+	last_avail_idx = vq->last_avail_idx;
+	if (get_user(vq->avail_idx, &vq->avail->idx)) {
+		vq_err(vq, "Failed to access avail idx at %p\n",
+		       &vq->avail->idx);
+		return vq->num;
+	}
+
+	if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
+		vq_err(vq, "Guest moved used index from %u to %u",
+		       last_avail_idx, vq->avail_idx);
+		return vq->num;
+	}
+
+	/* If there's nothing new since last we looked, return invalid. */
+	if (vq->avail_idx == last_avail_idx)
+		return vq->num;
+
+	/* Only get avail ring entries after they have been exposed by guest. */
+	smp_rmb();
+
+	/* Grab the next descriptor number they're advertising, and increment
+	 * the index we've seen. */
+	if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
+		vq_err(vq, "Failed to read head: idx %d address %p\n",
+		       last_avail_idx,
+		       &vq->avail->ring[last_avail_idx % vq->num]);
+		return vq->num;
+	}
+
+	/* If their number is silly, that's an error. */
+	if (head >= vq->num) {
+		vq_err(vq, "Guest says index %u > %u is available",
+		       head, vq->num);
+		return vq->num;
+	}
+
+	ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
+				  out_num, in_num,
+				  log, log_num, head);
 
 	/* On success, increment avail index. */
+	if (ret == vq->num)
+		return ret;
 	vq->last_avail_idx++;
 	return head;
 }
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 44591ba..3c9cbce 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -43,6 +43,11 @@ struct vhost_log {
 	u64 len;
 };
 
+enum vhost_vq_link_state {
+	VHOST_VQ_LINK_SYNC = 0,
+	VHOST_VQ_LINK_ASYNC = 1,
+};
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -96,6 +101,10 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log log[VHOST_NET_MAX_SG];
+	/* Differiate async socket for 0-copy from normal */
+	enum vhost_vq_link_state link_state;
+	struct list_head notifier;
+	spinlock_t notify_lock;
 };
 
 struct vhost_dev {
@@ -124,6 +133,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
 			   struct iovec iov[], unsigned int iov_count,
 			   unsigned int *out_num, unsigned int *in_num,
 			   struct vhost_log *log, unsigned int *log_num);
+unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+			   struct iovec iov[], unsigned int iov_count,
+			   unsigned int *out_num, unsigned int *in_num,
+			   struct vhost_log *log, unsigned int *log_num,
+			   unsigned int head);
 void vhost_discard_vq_desc(struct vhost_virtqueue *);
 
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-- 
1.5.4.4

^ permalink raw reply related

* [RFC][PATCH v6 18/19] Add a kconfig entry and make entry for mp device.
From: xiaohui.xin @ 2010-05-21  9:30 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1274434235-5929-18-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 drivers/vhost/Kconfig  |   10 ++++++++++
 drivers/vhost/Makefile |    2 ++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index e4e2fd1..a6b8cbf 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
 	  To compile this driver as a module, choose M here: the module will
 	  be called vhost_net.
 
+config MEDIATE_PASSTHRU
+	tristate "mediate passthru network driver (EXPERIMENTAL)"
+	depends on VHOST_NET
+	---help---
+	  zerocopy network I/O support, we call it as mediate passthru to
+	  be distiguish with hardare passthru.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
-- 
1.5.4.4


^ permalink raw reply related

* [PATCH v6 17/19] Export proto_ops to vhost-net driver.
From: xiaohui.xin @ 2010-05-21  9:30 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1274434235-5929-17-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Currently, vhost-net is only user to the mp device.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 drivers/vhost/mpassthru.c |  330 ++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 325 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index de07f1e..d0df691 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -414,6 +414,11 @@ static void mp_put(struct mp_file *mfile)
 		mp_detach(mfile->mp);
 }
 
+static void iocb_tag(struct kiocb *iocb)
+{
+	iocb->ki_flags = 1;
+}
+
 /* The callback to destruct the external buffers or skb */
 static void page_dtor(struct skb_external_page *ext_page)
 {
@@ -449,7 +454,7 @@ static void page_dtor(struct skb_external_page *ext_page)
 	 * Queue the notifier to wake up the backend driver
 	 */
 
-	create_iocb(info, info->total);
+	iocb_tag(info->iocb);
 
 	sk = ctor->port.sock->sk;
 	sk->sk_write_space(sk);
@@ -569,8 +574,323 @@ failed:
 	return NULL;
 }
 
+static void mp_sock_destruct(struct sock *sk)
+{
+	struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+	kfree(mp);
+}
+
+static void mp_sock_state_change(struct sock *sk)
+{
+	if (sk_has_sleeper(sk))
+		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+	if (sk_has_sleeper(sk))
+		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
+}
+
+static void mp_sock_data_ready(struct sock *sk, int coming)
+{
+	struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+	struct page_ctor *ctor = NULL;
+	struct sk_buff *skb = NULL;
+	struct page_info *info = NULL;
+	struct ethhdr *eth;
+	struct kiocb *iocb = NULL;
+	int len, i;
+
+	struct virtio_net_hdr hdr = {
+		.flags = 0,
+		.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	};
+
+	ctor = rcu_dereference(mp->ctor);
+	if (!ctor)
+		return;
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		if (skb_shinfo(skb)->destructor_arg) {
+			info = container_of(skb_shinfo(skb)->destructor_arg,
+					struct page_info, ext_page);
+			info->skb = skb;
+			if (skb->len > info->len) {
+				mp->dev->stats.rx_dropped++;
+				DBG(KERN_INFO "Discarded truncated rx packet: "
+				    " len %d > %zd\n", skb->len, info->len);
+				info->total = skb->len;
+				goto clean;
+			} else {
+				int i;
+				struct skb_shared_info *gshinfo =
+					(struct skb_shared_info *)
+					(&info->ushinfo);
+				struct skb_shared_info *hshinfo =
+					skb_shinfo(skb);
+
+				if (gshinfo->nr_frags < hshinfo->nr_frags)
+					goto clean;
+				eth = eth_hdr(skb);
+				skb_push(skb, ETH_HLEN);
+
+				hdr.hdr_len = skb_headlen(skb);
+				info->total = skb->len;
+
+				for (i = 0; i < gshinfo->nr_frags; i++)
+					gshinfo->frags[i].size = 0;
+				for (i = 0; i < hshinfo->nr_frags; i++)
+					gshinfo->frags[i].size =
+						hshinfo->frags[i].size;
+			}
+		} else {
+			/* The skb composed with kernel buffers
+			 * in case external buffers are not sufficent.
+			 * The case should be rare.
+			 */
+			unsigned long flags;
+			int i;
+			struct skb_shared_info *gshinfo = NULL;
+
+			info = NULL;
+
+			spin_lock_irqsave(&ctor->read_lock, flags);
+			if (!list_empty(&ctor->readq)) {
+				info = list_first_entry(&ctor->readq,
+						struct page_info, list);
+				list_del(&info->list);
+			}
+			spin_unlock_irqrestore(&ctor->read_lock, flags);
+			if (!info) {
+				DBG(KERN_INFO
+				    "No external buffer avaliable %p\n",
+				    skb);
+				skb_queue_head(&sk->sk_receive_queue,
+						skb);
+				break;
+			}
+			info->skb = skb;
+			/* compute the guest skb frags info */
+			gshinfo = (struct skb_shared_info *)
+				  (info->ext_page.start +
+				  SKB_DATA_ALIGN(info->ext_page.size));
+
+			if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags)
+				goto clean;
+
+			eth = eth_hdr(skb);
+			skb_push(skb, ETH_HLEN);
+			info->total = skb->len;
+
+			for (i = 0; i < gshinfo->nr_frags; i++)
+				gshinfo->frags[i].size = 0;
+			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+				gshinfo->frags[i].size =
+					skb_shinfo(skb)->frags[i].size;
+			hdr.hdr_len = min_t(int, skb->len,
+					info->iov[1].iov_len);
+			skb_copy_datagram_iovec(skb, 0, info->iov, skb->len);
+		}
+
+		len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr,
+				sizeof hdr);
+		if (len) {
+			DBG(KERN_INFO
+				"Unable to write vnet_hdr at addr %p: %d\n",
+				info->hdr->iov_base, len);
+			goto clean;
+		}
+
+		iocb = create_iocb(info, skb->len + sizeof(hdr));
+		continue;
+
+clean:
+		kfree_skb(skb);
+		for (i = 0; info->pages[i]; i++)
+			put_page(info->pages[i]);
+		kmem_cache_free(ext_page_info_cache, info);
+	}
+	return;
+}
+
+static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t total_len)
+{
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct page_ctor *ctor;
+	struct iovec *iov = m->msg_iov;
+	struct page_info *info = NULL;
+	struct frag frags[MAX_SKB_FRAGS];
+	struct sk_buff *skb;
+	int count = m->msg_iovlen;
+	int total = 0, header, n, i, len, rc;
+	unsigned long base;
+
+	ctor = rcu_dereference(mp->ctor);
+	if (!ctor)
+		return -ENODEV;
+
+	total = iov_length(iov, count);
+
+	if (total < ETH_HLEN)
+		return -EINVAL;
+
+	if (total <= COPY_THRESHOLD)
+		goto copy;
+
+	n = 0;
+	for (i = 0; i < count; i++) {
+		base = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+		if (!len)
+			continue;
+		n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		if (n > MAX_SKB_FRAGS)
+			return -EINVAL;
+	}
+
+copy:
+	header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total;
+
+	skb = alloc_skb(header + NET_IP_ALIGN, GFP_ATOMIC);
+	if (!skb)
+		goto drop;
+
+	skb_reserve(skb, NET_IP_ALIGN);
+
+	skb_set_network_header(skb, ETH_HLEN);
+
+	memcpy_fromiovec(skb->data, iov, header);
+	skb_put(skb, header);
+	skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);
+
+	if (header == total) {
+		rc = total;
+		info = alloc_small_page_info(ctor, iocb, total);
+	} else {
+		info = alloc_page_info(ctor, iocb, iov, count, frags, 0, total);
+		if (info)
+			for (i = 0; info->pages[i]; i++) {
+				skb_add_rx_frag(skb, i, info->pages[i],
+						frags[i].offset, frags[i].size);
+				info->pages[i] = NULL;
+			}
+	}
+	if (info != NULL) {
+		info->desc_pos = iocb->ki_pos;
+		info->total = total;
+		info->skb = skb;
+		skb_shinfo(skb)->destructor_arg = &info->ext_page;
+		skb->dev = mp->dev;
+		ctor->wq_len++;
+		create_iocb(info, info->total);
+		dev_queue_xmit(skb);
+		return 0;
+	}
+drop:
+	kfree_skb(skb);
+	if (info) {
+		for (i = 0; info->pages[i]; i++)
+			put_page(info->pages[i]);
+		kmem_cache_free(ext_page_info_cache, info);
+	}
+	mp->dev->stats.tx_dropped++;
+	return -ENOMEM;
+}
+
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t total_len,
+		int flags)
+{
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct page_ctor *ctor;
+	struct iovec *iov = m->msg_iov;
+	int count = m->msg_iovlen;
+	int npages, payload;
+	struct page_info *info;
+	struct frag frags[MAX_SKB_FRAGS];
+	unsigned long base;
+	int i, len;
+	unsigned long flag;
+
+	if (!(flags & MSG_DONTWAIT))
+		return -EINVAL;
+
+	ctor = rcu_dereference(mp->ctor);
+	if (!ctor)
+		return -EINVAL;
+
+	/* Error detections in case invalid external buffer */
+	if (count > 2 && iov[1].iov_len < ctor->port.hdr_len &&
+			mp->dev->features & NETIF_F_SG) {
+		return -EINVAL;
+	}
+
+	npages = ctor->port.npages;
+	payload = ctor->port.data_len;
+
+	/* If KVM guest virtio-net FE driver use SG feature */
+	if (count > 2) {
+		for (i = 2; i < count; i++) {
+			base = (unsigned long)iov[i].iov_base & ~PAGE_MASK;
+			len = iov[i].iov_len;
+			if (npages == 1)
+				len = min_t(int, len, PAGE_SIZE - base);
+			else if (base)
+				break;
+			payload -= len;
+			if (payload <= 0)
+				goto proceed;
+			if (npages == 1 || (len & ~PAGE_MASK))
+				break;
+		}
+	}
+
+	if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK)
+				- NET_SKB_PAD - NET_IP_ALIGN) >= 0)
+		goto proceed;
+
+	return -EINVAL;
+
+proceed:
+	/* skip the virtnet head */
+	iov++;
+	count--;
+
+	if (!ctor->lock_pages)
+		set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+				iocb->ki_user_data * 4096 * 2,
+				iocb->ki_user_data * 4096 * 2);
+
+	/* Translate address to kernel */
+	info = alloc_page_info(ctor, iocb, iov, count, frags, npages, 0);
+	if (!info)
+		return -ENOMEM;
+	info->len = total_len;
+	info->hdr[0].iov_base = iocb->ki_iovec[0].iov_base;
+	info->hdr[0].iov_len = iocb->ki_iovec[0].iov_len;
+	info->offset = frags[0].offset;
+	info->desc_pos = iocb->ki_pos;
+
+	iov--;
+	count++;
+
+	memcpy(info->iov, iov, sizeof(struct iovec) * count);
+
+	spin_lock_irqsave(&ctor->read_lock, flag);
+	list_add_tail(&info->list, &ctor->readq);
+	spin_unlock_irqrestore(&ctor->read_lock, flag);
+
+	ctor->rq_len++;
+
+	return 0;
+}
+
 /* Ops structure to mimic raw sockets with mp device */
 static const struct proto_ops mp_socket_ops = {
+	.sendmsg = mp_sendmsg,
+	.recvmsg = mp_recvmsg,
 };
 
 static struct proto mp_proto = {
@@ -693,10 +1013,10 @@ static long mp_chr_ioctl(struct file *file, unsigned int cmd,
 		sk->sk_sndbuf = INT_MAX;
 		container_of(sk, struct mp_sock, sk)->mp = mp;
 
-		sk->sk_destruct = NULL;
-		sk->sk_data_ready = NULL;
-		sk->sk_write_space = NULL;
-		sk->sk_state_change = NULL;
+		sk->sk_destruct = mp_sock_destruct;
+		sk->sk_data_ready = mp_sock_data_ready;
+		sk->sk_write_space = mp_sock_write_space;
+		sk->sk_state_change = mp_sock_state_change;
 		ret = mp_attach(mp, file);
 		if (ret < 0)
 			goto err_free_sk;
-- 
1.5.4.4


^ permalink raw reply related

* [RFC][PATCH v6 15/19] Add basic funcs and ioctl to mp device.
From: xiaohui.xin @ 2010-05-21  9:30 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1274434235-5929-15-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

The ioctl is used by mp device to bind an underlying
NIC, it will query hardware capability and declare the
NIC to use external buffers.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---

	memory leak fixed,
	kconfig made,
	do_unbind() made,
	mp_chr_ioctl() cleanup

	by Jeff Dike <jdike@linux.intel.com>

 drivers/vhost/mpassthru.c |  681 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 681 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..25e2f3e
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,681 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME        "mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include <linux/compat.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/aio.h>
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/miscdevice.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/crc32.h>
+#include <linux/nsproxy.h>
+#include <linux/uaccess.h>
+#include <linux/virtio_net.h>
+#include <linux/mpassthru.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+	u16     offset;
+	u16     size;
+};
+
+struct page_info {
+	struct list_head        list;
+	int                     header;
+	/* indicate the actual length of bytes
+	 * send/recv in the external buffers
+	 */
+	int                     total;
+	int                     offset;
+	struct page             *pages[MAX_SKB_FRAGS+1];
+	struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+	struct sk_buff          *skb;
+	struct page_ctor        *ctor;
+
+	/* The pointer relayed to skb, to indicate
+	 * it's a external allocated skb or kernel
+	 */
+	struct skb_external_page    ext_page;
+	struct skb_shared_info  ushinfo;
+
+#define INFO_READ                      0
+#define INFO_WRITE                     1
+	unsigned                flags;
+	unsigned                pnum;
+
+	/* It's meaningful for receive, means
+	 * the max length allowed
+	 */
+	size_t                  len;
+
+	/* The fields after that is for backend
+	 * driver, now for vhost-net.
+	 */
+
+	struct kiocb            *iocb;
+	unsigned int            desc_pos;
+	struct iovec            hdr[MAX_SKB_FRAGS + 2];
+	struct iovec            iov[MAX_SKB_FRAGS + 2];
+};
+
+static struct kmem_cache *ext_page_info_cache;
+
+struct page_ctor {
+	struct list_head        readq;
+	int 			wq_len;
+	int 			rq_len;
+	spinlock_t      	read_lock;
+	/* record the locked pages */
+	int			lock_pages;
+	struct rlimit		o_rlim;
+	struct net_device   	*dev;
+	struct mpassthru_port	port;
+};
+
+struct mp_struct {
+	struct mp_file   	*mfile;
+	struct net_device       *dev;
+	struct page_ctor	*ctor;
+	struct socket           socket;
+
+#ifdef MPASSTHRU_DEBUG
+	int debug;
+#endif
+};
+
+struct mp_file {
+	atomic_t count;
+	struct mp_struct *mp;
+	struct net *net;
+};
+
+struct mp_sock {
+	struct sock            	sk;
+	struct mp_struct       	*mp;
+};
+
+static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
+{
+	int ret = 0;
+
+	rtnl_lock();
+	ret = dev_change_flags(dev, flags);
+	rtnl_unlock();
+
+	if (ret < 0)
+		printk(KERN_ERR "failed to change dev state of %s", dev->name);
+
+	return ret;
+}
+
+static int page_ctor_attach(struct mp_struct *mp)
+{
+	int rc;
+	struct page_ctor *ctor;
+	struct net_device *dev = mp->dev;
+
+	/* locked by mp_mutex */
+	if (rcu_dereference(mp->ctor))
+		return -EBUSY;
+
+	ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
+	if (!ctor)
+		return -ENOMEM;
+	rc = netdev_mp_port_prep(dev, &ctor->port);
+	if (rc)
+		goto fail;
+
+	INIT_LIST_HEAD(&ctor->readq);
+	spin_lock_init(&ctor->read_lock);
+
+	ctor->rq_len = 0;
+	ctor->wq_len = 0;
+
+	dev_hold(dev);
+	ctor->dev = dev;
+	ctor->port.ctor = NULL;
+	ctor->port.sock = &mp->socket;
+	ctor->lock_pages = 0;
+	rc = netdev_mp_port_attach(dev, &ctor->port);
+	if (rc)
+		goto fail;
+
+	/* locked by mp_mutex */
+	rcu_assign_pointer(mp->ctor, ctor);
+
+	/* XXX:Need we do set_offload here ? */
+
+	return 0;
+
+fail:
+	kfree(ctor);
+	dev_put(dev);
+
+	return rc;
+}
+
+struct page_info *info_dequeue(struct page_ctor *ctor)
+{
+	unsigned long flags;
+	struct page_info *info = NULL;
+	spin_lock_irqsave(&ctor->read_lock, flags);
+	if (!list_empty(&ctor->readq)) {
+		info = list_first_entry(&ctor->readq,
+				struct page_info, list);
+		list_del(&info->list);
+	}
+	spin_unlock_irqrestore(&ctor->read_lock, flags);
+	return info;
+}
+
+static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
+			      unsigned long cur, unsigned long max)
+{
+	struct rlimit new_rlim, *old_rlim;
+	int retval;
+
+	if (resource != RLIMIT_MEMLOCK)
+		return -EINVAL;
+	new_rlim.rlim_cur = cur;
+	new_rlim.rlim_max = max;
+
+	old_rlim = current->signal->rlim + resource;
+
+	/* remember the old rlimit value when backend enabled */
+	ctor->o_rlim.rlim_cur = old_rlim->rlim_cur;
+	ctor->o_rlim.rlim_max = old_rlim->rlim_max;
+
+	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
+			!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	retval = security_task_setrlimit(resource, &new_rlim);
+	if (retval)
+		return retval;
+
+	task_lock(current->group_leader);
+	*old_rlim = new_rlim;
+	task_unlock(current->group_leader);
+	return 0;
+}
+
+static int page_ctor_detach(struct mp_struct *mp)
+{
+	struct page_ctor *ctor;
+	struct page_info *info;
+	struct kiocb *iocb = NULL;
+	int i;
+
+	/* locked by mp_mutex */
+	ctor = rcu_dereference(mp->ctor);
+	if (!ctor)
+		return -ENODEV;
+
+	while ((info = info_dequeue(ctor))) {
+		for (i = 0; i < info->pnum; i++)
+			if (info->pages[i])
+				put_page(info->pages[i]);
+		kmem_cache_free(ext_page_info_cache, info);
+	}
+	set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+			   ctor->o_rlim.rlim_cur,
+			   ctor->o_rlim.rlim_max);
+	netdev_mp_port_detach(ctor->dev);
+	dev_put(ctor->dev);
+
+	/* locked by mp_mutex */
+	rcu_assign_pointer(mp->ctor, NULL);
+	synchronize_rcu();
+
+	kfree(ctor);
+	return 0;
+}
+
+static void __mp_detach(struct mp_struct *mp)
+{
+	mp->mfile = NULL;
+
+	mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
+	page_ctor_detach(mp);
+	mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+
+	/* Drop the extra count on the net device */
+	dev_put(mp->dev);
+}
+
+static DEFINE_MUTEX(mp_mutex);
+
+static void mp_detach(struct mp_struct *mp)
+{
+	mutex_lock(&mp_mutex);
+	__mp_detach(mp);
+	mutex_unlock(&mp_mutex);
+}
+
+static struct mp_struct *mp_get(struct mp_file *mfile)
+{
+	struct mp_struct *mp = NULL;
+	if (atomic_inc_not_zero(&mfile->count))
+		mp = mfile->mp;
+
+	return mp;
+}
+
+static void mp_put(struct mp_file *mfile)
+{
+	if (atomic_dec_and_test(&mfile->count))
+		mp_detach(mfile->mp);
+}
+
+/* Ops structure to mimic raw sockets with mp device */
+static const struct proto_ops mp_socket_ops = {
+};
+
+static struct proto mp_proto = {
+	.name           = "mp",
+	.owner          = THIS_MODULE,
+	.obj_size       = sizeof(struct mp_sock),
+};
+
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+	struct mp_file *mfile;
+	cycle_kernel_lock();
+	DBG1(KERN_INFO "mp: mp_chr_open\n");
+
+	mfile = kzalloc(sizeof(*mfile), GFP_KERNEL);
+	if (!mfile)
+		return -ENOMEM;
+	atomic_set(&mfile->count, 0);
+	mfile->mp = NULL;
+	mfile->net = get_net(current->nsproxy->net_ns);
+	file->private_data = mfile;
+	return 0;
+}
+
+static int mp_attach(struct mp_struct *mp, struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+	int err;
+
+	netif_tx_lock_bh(mp->dev);
+
+	err = -EINVAL;
+
+	if (mfile->mp)
+		goto out;
+
+	err = -EBUSY;
+	if (mp->mfile)
+		goto out;
+
+	err = 0;
+	mfile->mp = mp;
+	mp->mfile = mfile;
+	mp->socket.file = file;
+	dev_hold(mp->dev);
+	sock_hold(mp->socket.sk);
+	atomic_inc(&mfile->count);
+
+out:
+	netif_tx_unlock_bh(mp->dev);
+	return err;
+}
+
+static int do_unbind(struct mp_file *mfile)
+{
+	struct mp_struct *mp = mp_get(mfile);
+
+	if (!mp)
+		return -EINVAL;
+
+	mp_detach(mp);
+	sock_put(mp->socket.sk);
+	mp_put(mfile);
+	return 0;
+}
+
+static long mp_chr_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp;
+	struct net_device *dev;
+	void __user* argp = (void __user *)arg;
+	struct ifreq ifr;
+	struct sock *sk;
+	int ret;
+
+	ret = -EINVAL;
+
+	switch (cmd) {
+	case MPASSTHRU_BINDDEV:
+		ret = -EFAULT;
+		if (copy_from_user(&ifr, argp, sizeof ifr))
+			break;
+
+		ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+		ret = -ENODEV;
+		dev = dev_get_by_name(mfile->net, ifr.ifr_name);
+		if (!dev)
+			break;
+
+		mutex_lock(&mp_mutex);
+
+		ret = -EBUSY;
+
+		/* the device can be only bind once */
+		if (dev_is_mpassthru(dev))
+			goto err_dev_put;
+
+		mp = mfile->mp;
+		if (mp)
+			goto err_dev_put;
+
+		mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+		if (!mp) {
+			ret = -ENOMEM;
+			goto err_dev_put;
+		}
+		mp->dev = dev;
+		ret = -ENOMEM;
+
+		sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
+		if (!sk)
+			goto err_free_mp;
+
+		init_waitqueue_head(&mp->socket.wait);
+		mp->socket.ops = &mp_socket_ops;
+		sock_init_data(&mp->socket, sk);
+		sk->sk_sndbuf = INT_MAX;
+		container_of(sk, struct mp_sock, sk)->mp = mp;
+
+		sk->sk_destruct = NULL;
+		sk->sk_data_ready = NULL;
+		sk->sk_write_space = NULL;
+		sk->sk_state_change = NULL;
+		ret = mp_attach(mp, file);
+		if (ret < 0)
+			goto err_free_sk;
+
+		ret = page_ctor_attach(mp);
+		if (ret < 0)
+			goto err_free_sk;
+
+		mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+out:
+		mutex_unlock(&mp_mutex);
+		break;
+err_free_sk:
+		sk_free(sk);
+err_free_mp:
+		kfree(mp);
+err_dev_put:
+		dev_put(dev);
+		goto out;
+
+	case MPASSTHRU_UNBINDDEV:
+		ret = do_unbind(mfile);
+		break;
+
+	default:
+		break;
+	}
+	return ret;
+}
+
+static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp = mp_get(mfile);
+	struct sock *sk;
+	unsigned int mask = 0;
+
+	if (!mp)
+		return POLLERR;
+
+	sk = mp->socket.sk;
+
+	poll_wait(file, &mp->socket.wait, wait);
+
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		mask |= POLLIN | POLLRDNORM;
+
+	if (sock_writeable(sk) ||
+		(!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+			 sock_writeable(sk)))
+		mask |= POLLOUT | POLLWRNORM;
+
+	if (mp->dev->reg_state != NETREG_REGISTERED)
+		mask = POLLERR;
+
+	mp_put(mfile);
+	return mask;
+}
+
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long count, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct mp_struct *mp = mp_get(file->private_data);
+	struct sock *sk = mp->socket.sk;
+	struct sk_buff *skb;
+	int len, err;
+	ssize_t result = 0;
+
+	if (!mp)
+		return -EBADFD;
+
+	/* currently, async is not supported.
+	 * but we may support real async aio from user application,
+	 * maybe qemu virtio-net backend.
+	 */
+	if (!is_sync_kiocb(iocb))
+		return -EFAULT;
+
+	len = iov_length(iov, count);
+
+	if (unlikely(len) < ETH_HLEN)
+		return -EINVAL;
+
+	skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+				  file->f_flags & O_NONBLOCK, &err);
+
+	if (!skb)
+		return -EFAULT;
+
+	skb_reserve(skb, NET_IP_ALIGN);
+	skb_put(skb, len);
+
+	if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+		kfree_skb(skb);
+		return -EAGAIN;
+	}
+
+	skb->protocol = eth_type_trans(skb, mp->dev);
+	skb->dev = mp->dev;
+
+	dev_queue_xmit(skb);
+
+	mp_put(file->private_data);
+	return result;
+}
+
+static int mp_chr_close(struct inode *inode, struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+
+	/*
+	 * Ignore return value since an error only means there was nothing to
+	 * do
+	 */
+	do_unbind(mfile);
+
+	put_net(mfile->net);
+	kfree(mfile);
+
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+static long mp_chr_compat_ioctl(struct file *f, unsigned int ioctl,
+				unsigned long arg)
+{
+	return mp_chr_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations mp_fops = {
+	.owner  = THIS_MODULE,
+	.llseek = no_llseek,
+	.write  = do_sync_write,
+	.aio_write = mp_chr_aio_write,
+	.poll   = mp_chr_poll,
+	.unlocked_ioctl = mp_chr_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = mp_chr_compat_ioctl,
+#endif
+	.open   = mp_chr_open,
+	.release = mp_chr_close,
+};
+
+static struct miscdevice mp_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "mp",
+	.nodename = "net/mp",
+	.fops = &mp_fops,
+};
+
+static int mp_device_event(struct notifier_block *unused,
+		unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct mpassthru_port *port;
+	struct mp_struct *mp = NULL;
+	struct socket *sock = NULL;
+
+	port = dev->mp_port;
+	if (port == NULL)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		sock = dev->mp_port->sock;
+		mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+		do_unbind(mp->mfile);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block mp_notifier_block __read_mostly = {
+	.notifier_call  = mp_device_event,
+};
+
+static int mp_init(void)
+{
+	int err = 0;
+
+	ext_page_info_cache = kmem_cache_create("skb_page_info",
+						sizeof(struct page_info),
+						0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!ext_page_info_cache)
+		return -ENOMEM;
+
+	err = misc_register(&mp_miscdev);
+	if (err) {
+		printk(KERN_ERR "mp: Can't register misc device\n");
+		kmem_cache_destroy(ext_page_info_cache);
+	} else {
+		printk(KERN_INFO "Registering mp misc device - minor = %d\n",
+				mp_miscdev.minor);
+		register_netdevice_notifier(&mp_notifier_block);
+	}
+	return err;
+}
+
+void mp_exit(void)
+{
+	unregister_netdevice_notifier(&mp_notifier_block);
+	misc_deregister(&mp_miscdev);
+	kmem_cache_destroy(ext_page_info_cache);
+}
+
+/* Get an underlying socket object from mp file.  Returns error unless file is
+ * attached to a device.  The returned object works like a packet socket, it
+ * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
+ * holding a reference to the file for as long as the socket is in use. */
+struct socket *mp_get_socket(struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp;
+
+	if (file->f_op != &mp_fops)
+		return ERR_PTR(-EINVAL);
+	mp = mp_get(mfile);
+	if (!mp)
+		return ERR_PTR(-EBADFD);
+	mp_put(mfile);
+	return &mp->socket;
+}
+EXPORT_SYMBOL_GPL(mp_get_socket);
+
+module_init(mp_init);
+module_exit(mp_exit);
+MODULE_AUTHOR(DRV_COPYRIGHT);
+MODULE_DESCRIPTION(DRV_DESCRIPTION);
+MODULE_LICENSE("GPL v2");
-- 
1.5.4.4


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox