Netdev List
 help / color / mirror / Atom feed
* Re: [RFC PATCH v7 08/19] Make __alloc_skb() to get external buffer.
From: Eric Dumazet @ 2010-06-05 14:53 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike
In-Reply-To: <1275732899-5423-8-git-send-email-xiaohui.xin@intel.com>

Le samedi 05 juin 2010 à 18:14 +0800, xiaohui.xin@intel.com a écrit :
> From: Xin Xiaohui <xiaohui.xin@intel.com>
> 	child->fclone = SKB_FCLONE_UNAVAILABLE;
>  	}
> +	/* Record the external buffer info in this field. It's not so good,
> +	 * but we cannot find another place easily.
> +	 */
> +	shinfo->destructor_arg = ext_page;
> +


Yes, this is a big problem; it's basically using a cache line that was not
touched before.

^ permalink raw reply

* Re: [RFC PATCH v7 03/19] Export 2 func for device to assign/deassign new strucure
From: Eric Dumazet @ 2010-06-05 14:51 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike
In-Reply-To: <1275732899-5423-3-git-send-email-xiaohui.xin@intel.com>

Le samedi 05 juin 2010 à 18:14 +0800, xiaohui.xin@intel.com a écrit :
> From: Xin Xiaohui <xiaohui.xin@intel.com>
> 
> Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
> Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
> Reviewed-by: Jeff Dike <jdike@linux.intel.com>
> ---
>  include/linux/netdevice.h |    3 +++
>  net/core/dev.c            |   28 ++++++++++++++++++++++++++++
>  2 files changed, 31 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index bae725c..efb575a 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1592,6 +1592,9 @@ extern gro_result_t	napi_frags_finish(struct napi_struct *napi,
>  					  gro_result_t ret);
>  extern struct sk_buff *	napi_frags_skb(struct napi_struct *napi);
>  extern gro_result_t	napi_gro_frags(struct napi_struct *napi);
> +extern int netdev_mp_port_attach(struct net_device *dev,
> +				 struct mpassthru_port *port);
> +extern void netdev_mp_port_detach(struct net_device *dev);
>  
>  static inline void napi_free_frags(struct napi_struct *napi)
>  {
> diff --git a/net/core/dev.c b/net/core/dev.c
> index f769098..ecbb6b1 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2469,6 +2469,34 @@ void netif_nit_deliver(struct sk_buff *skb)
>  	rcu_read_unlock();
>  }
>  
> +/* Export two functions to assign/de-assign mp_port pointer
> + * to a net device.
> + */
> +
> +int netdev_mp_port_attach(struct net_device *dev,
> +			struct mpassthru_port *port)
> +{
> +	/* locked by mp_mutex */
> +	if (rcu_dereference(dev->mp_port))
> +		return -EBUSY;
> +

Please... this is bogus...

Try with following config settings :

CONFIG_PROVE_LOCKING=y
CONFIG_PROVE_RCU=y
CONFIG_PROVE_RCU_REPEATEDLY=y




> +	rcu_assign_pointer(dev->mp_port, port);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(netdev_mp_port_attach);
> +
> +void netdev_mp_port_detach(struct net_device *dev)
> +{
> +	/* locked by mp_mutex */
> +	if (!rcu_dereference(dev->mp_port))
> +		return;

same problem here

> +
> +	rcu_assign_pointer(dev->mp_port, NULL);
> +	synchronize_rcu();
> +}
> +EXPORT_SYMBOL(netdev_mp_port_detach);
> +
>  /**
>   *	netif_receive_skb - process receive buffer from network
>   *	@skb: buffer to process

^ permalink raw reply

* Re: [PATCH v2] act_mirred: don't clone skb when skb isn't shared
From: jamal @ 2010-06-05 14:49 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev
In-Reply-To: <1275741553-21463-1-git-send-email-xiaosuo@gmail.com>

On Sat, 2010-06-05 at 20:39 +0800, Changli Gao wrote:

> +	if ((action == TC_ACT_SHOT || action == TC_ACT_STOLEN ||

I am not so sure about SHOT; the other two are fine.
 
> -	skb2 = skb_act_clone(skb, GFP_ATOMIC);
> +	at = G_TC_AT(skb->tc_verd);

Was there any need to move above line?

> +	skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action);


> -	skb2->dev = dev;

Or this one?

>  	skb2->skb_iif = skb->dev->ifindex;
> +	skb2->dev = dev;


cheers,
jamal


^ permalink raw reply

* [PATCH v2] act_mirred: don't clone skb when skb isn't shared
From: Changli Gao @ 2010-06-05 12:39 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: David S. Miller, netdev, Changli Gao

don't clone skb when skb isn't shared

When the tcf_action is TC_ACT_STOLEN, and the skb isn't shared, we don't need
to clone a new skb. As the skb will be freed after this function returns, we
can use it freely once we get a reference to it.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 include/net/sch_generic.h |   11 +++++++++--
 net/sched/act_mirred.c    |    6 +++---
 2 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 03ca5d8..d7d4439 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -571,9 +571,16 @@ static inline u32 qdisc_l2t(struct qdisc_rate_table* rtab, unsigned int pktlen)
 }
 
 #ifdef CONFIG_NET_CLS_ACT
-static inline struct sk_buff *skb_act_clone(struct sk_buff *skb, gfp_t gfp_mask)
+static inline struct sk_buff *skb_act_clone(struct sk_buff *skb, gfp_t gfp_mask,
+					    int action)
 {
-	struct sk_buff *n = skb_clone(skb, gfp_mask);
+	struct sk_buff *n;
+
+	if ((action == TC_ACT_SHOT || action == TC_ACT_STOLEN ||
+	     action == TC_ACT_QUEUED) && !skb_shared(skb))
+		n = skb_get(skb);
+	else
+		n = skb_clone(skb, gfp_mask);
 
 	if (n) {
 		n->tc_verd = SET_TC_VERD(n->tc_verd, 0);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index c0b6863..2e9a7b9 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -169,13 +169,13 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
 		goto out;
 	}
 
-	skb2 = skb_act_clone(skb, GFP_ATOMIC);
+	at = G_TC_AT(skb->tc_verd);
+	skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action);
 	if (skb2 == NULL)
 		goto out;
 
 	m->tcf_bstats.bytes += qdisc_pkt_len(skb2);
 	m->tcf_bstats.packets++;
-	at = G_TC_AT(skb->tc_verd);
 	if (!(at & AT_EGRESS)) {
 		if (m->tcfm_ok_push)
 			skb_push(skb2, skb2->dev->hard_header_len);
@@ -185,8 +185,8 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
 	if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
 		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
 
-	skb2->dev = dev;
 	skb2->skb_iif = skb->dev->ifindex;
+	skb2->dev = dev;
 	dev_queue_xmit(skb2);
 	err = 0;
 

^ permalink raw reply related

* Re: [RFC] act_cpu: redirect skb receiving to a special CPU.
From: jamal @ 2010-06-05 14:26 UTC (permalink / raw)
  To: Changli Gao; +Cc: Eric Dumazet, David S. Miller, Tom Herbert, Linux Netdev List
In-Reply-To: <AANLkTim83nfQDuLNmYAvk6RBqxot0t6cGq-lbic4-DSs@mail.gmail.com>

On Sat, 2010-06-05 at 22:15 +0800, Changli Gao wrote:

> For instance: there are 4 CPUs. I want redirect traffic to CPU 1-3
> evenly. If the qdisc is linear the rules as
> 
> flow classify(flow classid ffff:2-4) | tc_index 2 action cpu 1 |
> tc_index 3 action cpu 2 | tc_index 4 action cpu3
> 
> a tree variant:
> 
> class ffff:1 : flow classify(flow classid ffff:2-4)
> class ffff:2 parent ffff:1 : action cpu 1
> class ffff:3 parent ffff:1 : action cpu 2
> class ffff:4 parent ffff:1 : action cpu 3
> 
> ingress_classify: use flow classify to get the subclass ID, then find
> the corresponding class and exec action.
> 
> When there are lots of CPUs, tree is more efficient.

I still didn't follow ..
Even if I had a million CPUs, a classifier matches some filter
and an action already bound to the filter is executed. So the expensive
part is the classifier lookup.

> It seems AMD specific. Why do the AMD guys use this to implement async
> smp_call_function() if it is useful as you said?

Indeed it is AMD specific - but my view is that if I were using AMD, that
would be a more efficient way of doing it; i.e. IPI is the lowest common
denominator which works on all archs. Essentially what I am saying is
this would be an "inter-cpu messaging netdev" and I could replace its
send/recv parts from what we do in the RPS path right now to one that
uses AMD hypertransport etc.

cheers,
jamal


^ permalink raw reply

* Re: [RFC] act_cpu: redirect skb receiving to a special CPU.
From: Changli Gao @ 2010-06-05 14:15 UTC (permalink / raw)
  To: hadi; +Cc: Eric Dumazet, David S. Miller, Tom Herbert, Linux Netdev List
In-Reply-To: <1275746045.3490.60.camel@bigi>

On Sat, Jun 5, 2010 at 9:54 PM, jamal <hadi@cyberus.ca> wrote:
> On Sat, 2010-06-05 at 21:26 +0800, Changli Gao wrote:
>
>> ingress doesn't have any qdisc, but a class tree. The ingress_queue
>> will be sth. like this:
> [..]
>
>> Then we can classify skbs in tree  manner.
> [..]
>> > The cpuid should be sufficient to map to a remote cpu queue, no?
>>
>> It should be sufficient, but it isn't efficient. With map option, we
>> can use cls_flow to map traffic to classid, and use act_cpu map to map
>> classid to cpuid.
>
> I am missing something, I would see the flow as:
> -->ethx/lo/etc->ingressqdisc->classify-->action(redirect to cpuidX)
> Why/when do you need the tree variant? If you are thinking of maybe
> rate limiting to a specific CPU, then would passing it to a policer
> first not be sufficient?  IOW, classid is not very useful.

For instance: there are 4 CPUs. I want redirect traffic to CPU 1-3
evenly. If the qdisc is linear the rules as

flow classify(flow classid ffff:2-4) | tc_index 2 action cpu 1 |
tc_index 3 action cpu 2 | tc_index 4 action cpu3

a tree variant:

class ffff:1 : flow classify(flow classid ffff:2-4)
class ffff:2 parent ffff:1 : action cpu 1
class ffff:3 parent ffff:1 : action cpu 2
class ffff:4 parent ffff:1 : action cpu 3

ingress_classify: use flow classify to get the subclass ID, then find
the corresponding class and exec action.

When there are lots of CPUs, tree is more efficient.

>
>> I won't implement a new netdevice, but reuse the softnet. Even, I'll
>> reuse the enqueue_to_backlog() introduced by RPS, and of course, use
>> IPIs as RPS. Is there another way to trigger an IRQ of the remote CPU?
>
> I would look at it as "messaging of remote CPU" which may not result
> in an IRQ. I am pretty sure if you tried hard you could use HT in AMD
> hardware - the remote cpu may have an IRQ triggered but it wont be as
> expensive as IPI.
>

It seems AMD specific. Why do the AMD guys use this to implement async
smp_call_function() if it is useful as you said?

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH] act_mirred: don't clone skb when skb isn't shared
From: jamal @ 2010-06-05 13:59 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev
In-Reply-To: <AANLkTinh6OylNU8NLXB6ZSb012AmzkFhzFQWPR-iAsrf@mail.gmail.com>

On Sat, 2010-06-05 at 21:33 +0800, Changli Gao wrote:

> 
> If you kill TC_OK2MUNGE, you should kill TC_MUNGED too, as it will
> become useless.

Those two have different semantics and use different verdict bits:
one is saying "this packet has been munged" another is "I give you ok to
munge this packet".
Thanks for the suggestion - I will think about it some more before
deleting anything.

cheers,
jamal


^ permalink raw reply

* Re: [PATCH] phylib: Add support for the LXT973 phy.
From: Richard Cochran @ 2010-06-05 14:00 UTC (permalink / raw)
  To: Andy Fleming; +Cc: David Miller, netdev
In-Reply-To: <AANLkTinR9_VoWX8zZu93MeohHvCbCy_P5XWHH3rxlPuH@mail.gmail.com>

On Wed, Jun 02, 2010 at 02:32:11PM -0500, Andy Fleming wrote:
> Yeah, I was clearly not thinking clearly.  dev_flags will be
> overwritten, and is not meant for this.  I believe, what we should do
> is add a "port" field to the PHY device, and if PCR_FIBER_SELECT is
> set, then set the port field to PORT_FIBRE.  I'm not entirely clear on
> the semantics of that field in the ethtool cmd, but it seems like the
> right idea.

Here is another try. Is that more like it?

Richard


This patch implements a work around for Erratum 5, "3.3 V Fiber Speed
Selection." If the hardware wiring does not respect this erratum, then
fiber optic mode will not work properly.

As part of the fix, the patch introduces a new field 'port_flags' into
the 'struct phy_device'. This field allows phy drivers to describe
fixed attributes of the port. Only phy drivers should write this field.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
---
 drivers/net/phy/lxt.c |   52 ++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/phy.h   |    8 +++++++
 2 files changed, 59 insertions(+), 1 deletions(-)

diff --git a/drivers/net/phy/lxt.c b/drivers/net/phy/lxt.c
index 8ee929b..ef4a320 100644
--- a/drivers/net/phy/lxt.c
+++ b/drivers/net/phy/lxt.c
@@ -53,6 +53,9 @@
 
 #define MII_LXT971_ISR		19  /* Interrupt Status Register */
 
+/* register definitions for the 973 */
+#define MII_LXT973_PCR 16 /* Port Configuration Register */
+#define PCR_FIBER_SELECT 1
 
 MODULE_DESCRIPTION("Intel LXT PHY driver");
 MODULE_AUTHOR("Andy Fleming");
@@ -119,6 +122,34 @@ static int lxt971_config_intr(struct phy_device *phydev)
 	return err;
 }
 
+static int lxt973_probe(struct phy_device *phydev)
+{
+	int val = phy_read(phydev, MII_LXT973_PCR);
+
+	if (val & PCR_FIBER_SELECT) {
+		/*
+		 * If fiber is selected, then the only correct setting
+		 * is 100Mbps, full duplex, and auto negotiation off.
+		 */
+		val = phy_read(phydev, MII_BMCR);
+		val |= (BMCR_SPEED100 | BMCR_FULLDPLX);
+		val &= ~BMCR_ANENABLE;
+		phy_write(phydev, MII_BMCR, val);
+		/* Remember that the port is in fiber mode. */
+		phydev->port_flags |= PHY_PORT_FIBER;
+	} else {
+		phydev->port_flags &= ~PHY_PORT_FIBER;
+	}
+	return 0;
+}
+
+static int lxt973_config_aneg(struct phy_device *phydev)
+{
+	/* Do nothing if port is in fiber mode. */
+	return phydev->port_flags & PHY_PORT_FIBER ?
+		0 : genphy_config_aneg(phydev);
+}
+
 static struct phy_driver lxt970_driver = {
 	.phy_id		= 0x78100000,
 	.name		= "LXT970",
@@ -146,6 +177,18 @@ static struct phy_driver lxt971_driver = {
 	.driver 	= { .owner = THIS_MODULE,},
 };
 
+static struct phy_driver lxt973_driver = {
+	.phy_id		= 0x00137a10,
+	.name		= "LXT973",
+	.phy_id_mask	= 0xfffffff0,
+	.features	= PHY_BASIC_FEATURES,
+	.flags		= 0,
+	.probe		= lxt973_probe,
+	.config_aneg	= lxt973_config_aneg,
+	.read_status	= genphy_read_status,
+	.driver		= { .owner = THIS_MODULE,},
+};
+
 static int __init lxt_init(void)
 {
 	int ret;
@@ -157,9 +200,15 @@ static int __init lxt_init(void)
 	ret = phy_driver_register(&lxt971_driver);
 	if (ret)
 		goto err2;
+
+	ret = phy_driver_register(&lxt973_driver);
+	if (ret)
+		goto err3;
 	return 0;
 
- err2:	
+ err3:
+	phy_driver_unregister(&lxt971_driver);
+ err2:
 	phy_driver_unregister(&lxt970_driver);
  err1:
 	return ret;
@@ -169,6 +218,7 @@ static void __exit lxt_exit(void)
 {
 	phy_driver_unregister(&lxt970_driver);
 	phy_driver_unregister(&lxt971_driver);
+	phy_driver_unregister(&lxt973_driver);
 }
 
 module_init(lxt_init);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 1c75b6b..602228c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -234,6 +234,11 @@ enum phy_state {
 	PHY_RESUMING
 };
 
+/*
+ * PHY_PORT_xxx: flags to describe the port's fixed attributes.
+ */
+#define PHY_PORT_FIBER 0x00000001 /* Port has a fiber optic transceiver */
+
 /* phy_device: An instance of a PHY
  *
  * drv: Pointer to the driver for this PHY instance
@@ -246,6 +251,7 @@ enum phy_state {
  * link_timeout: The number of timer firings to wait before the
  * giving up on the current attempt at acquiring a link
  * irq: IRQ number of the PHY's interrupt (-1 if none)
+ * port_flags: Bit field of PHY_PORT_xxx flags
  * phy_timer: The timer for handling the state machine
  * phy_queue: A work_queue for the interrupt
  * attached_dev: The attached enet driver's device instance ptr
@@ -314,6 +320,8 @@ struct phy_device {
 	 */
 	int irq;
 
+	int port_flags;
+
 	/* private data pointer */
 	/* For use by PHYs to maintain extra state */
 	void *priv;
-- 
1.6.3.3


^ permalink raw reply related

* Re: [RFC] act_cpu: redirect skb receiving to a special CPU.
From: jamal @ 2010-06-05 13:54 UTC (permalink / raw)
  To: Changli Gao; +Cc: Eric Dumazet, David S. Miller, Tom Herbert, Linux Netdev List
In-Reply-To: <AANLkTiltF2APEnqHyHTtHA7JMNp3HX3aCu9UL3nUJ5_u@mail.gmail.com>

On Sat, 2010-06-05 at 21:26 +0800, Changli Gao wrote:

> ingress doesn't have any qdisc, but a class tree. The ingress_queue
> will be sth. like this:
[..]

> Then we can classify skbs in tree  manner.
[..]
> > The cpuid should be sufficient to map to a remote cpu queue, no?
> 
> It should be sufficient, but it isn't efficient. With map option, we
> can use cls_flow to map traffic to classid, and use act_cpu map to map
> classid to cpuid.

I am missing something, I would see the flow as:
-->ethx/lo/etc->ingressqdisc->classify-->action(redirect to cpuidX)
Why/when do you need the tree variant? If you are thinking of maybe
rate limiting to a specific CPU, then would passing it to a policer
first not be sufficient?  IOW, classid is not very useful.

> I won't implement a new netdevice, but reuse the softnet. Even, I'll
> reuse the enqueue_to_backlog() introduced by RPS, and of course, use
> IPIs as RPS. Is there another way to trigger an IRQ of the remote CPU?

I would look at it as "messaging of remote CPU" which may not result
in an IRQ. I am pretty sure if you tried hard you could use HT in AMD
hardware - the remote cpu may have an IRQ triggered but it wont be as
expensive as IPI.

cheers,
jamal


^ permalink raw reply

* Re: [PATCH] act_mirred: don't clone skb when skb isn't shared
From: Changli Gao @ 2010-06-05 13:33 UTC (permalink / raw)
  To: hadi; +Cc: David S. Miller, netdev
In-Reply-To: <1275744299.3490.48.camel@bigi>

On Sat, Jun 5, 2010 at 9:24 PM, jamal <hadi@cyberus.ca> wrote:
> On Sat, 2010-06-05 at 21:07 +0800, Changli Gao wrote:
>
>> Thanks. BTW: act_nat.c doesn't obey the following rule, and you plan
>> to remove TC_MUNGED and TC_OK2MUNGE?
>
>> 2) If you munge any packet thou shalt call pskb_expand_head in the case
>> someone else is referencing the skb. After that you "own" the skb.
>> You must also tell us if it is ok to munge the packet (TC_OK2MUNGE),
>> this way any action downstream can stomp on the packet.
>
> That rule still applies but it is upto the discretion of the action.
> i.e if the act_nat thinks it is ok for others down the street to trample
> on the packet, it should tell us so. Maybe i should change the wording
> to use the word "may" in that 3rd sentence.
> [I will kill (low prio) TC_OK2MUNGE but not TC_MUNGED.]
>

If you kill TC_OK2MUNGE, you should kill TC_MUNGED too, as it will
become useless.

localhost linux # grep MUNGE net/sched/ -R
net/sched/act_pedit.c:  if (!(skb->tc_verd & TC_OK2MUNGE)) {
net/sched/act_pedit.c:                  skb->tc_verd =
SET_TC_MUNGED(skb->tc_verd);
net/sched/act_api.c:                    if (TC_MUNGED & skb->tc_verd) {
net/sched/act_api.c:                            skb->tc_verd =
SET_TC_OK2MUNGE(skb->tc_verd);
net/sched/act_api.c:                            skb->tc_verd =
CLR_TC_MUNGED(skb->tc_verd);

int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
                    struct tcf_result *res)
{
        struct tc_action *a;
        int ret = -1;

        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                ret = TC_ACT_OK;
                goto exec_done;
        }
        while ((a = act) != NULL) {
repeat:
                if (a->ops && a->ops->act) {
                        ret = a->ops->act(skb, a, res);
                        if (TC_MUNGED & skb->tc_verd) {
                                /* copied already, allow trampling */
                                skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
                                skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
                        }
                        if (ret == TC_ACT_REPEAT)
                                goto repeat;    /* we need a ttl - JHS */
                        if (ret != TC_ACT_PIPE)
                                goto exec_done;
                }
                act = a->next;
        }
exec_done:
        return ret;
}

The bit OK2MUNGE relies on MUNGED only.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [RFC] act_cpu: redirect skb receiving to a special CPU.
From: Changli Gao @ 2010-06-05 13:26 UTC (permalink / raw)
  To: hadi; +Cc: Eric Dumazet, David S. Miller, Tom Herbert, Linux Netdev List
In-Reply-To: <1275743224.3490.44.camel@bigi>

On Sat, Jun 5, 2010 at 9:07 PM, jamal <hadi@cyberus.ca> wrote:
> Changli,
>
> I like the idea..
>
> My preference would be to not change ingress qdisc to have queues.

ingress doesn't have any qdisc, but a class tree. The ingress_queue
will be sth. like this:

while (1) {
 result = tc_classify(..., &res);
 cl = ingress_find(res.classid, ...);
 if (!cl->level)
     break;
 ...
}

Then we can classify skbs in tree  manner.

> The cpuid should be sufficient to map to a remote cpu queue, no?

It should be sufficient, but it isn't efficient. With map option, we
can use cls_flow to map traffic to classid, and use act_cpu map to map
classid to cpuid.

> Now, if you could represent each cpu as a netdevice, then we wouldnt
> need any change;-> And we could have multiple types of ways to redirect
> to cpus instead of just doing IPIs - example, ive always thought of
> sending over something like HT (I think it would be a lot cheaper).

I won't implement a new netdevice, but reuse the softnet. Even, I'll
reuse the enqueue_to_backlog() introduced by RPS, and of course, use
IPIs as RPS. Is there another way to trigger an IRQ of the remote CPU?

>
> I didnt queit understand the map OFFSET part. is this part of rfs?
>

No. Class IDs start from 1 but CPU IDs start from 0, so I need to
subtract/add an offset from/to the class IDs to map them to CPU
IDs.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH] act_mirred: don't clone skb when skb isn't shared
From: jamal @ 2010-06-05 13:24 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev
In-Reply-To: <AANLkTikHqzIr-SjPZe-iiMPxVG9PRR9-0A9qeMwzfLYp@mail.gmail.com>

On Sat, 2010-06-05 at 21:07 +0800, Changli Gao wrote:

> Thanks. BTW: act_nat.c doesn't obey the following rule, and you plan
> to remove TC_MUNGED and TC_OK2MUNGE?

> 2) If you munge any packet thou shalt call pskb_expand_head in the case
> someone else is referencing the skb. After that you "own" the skb.
> You must also tell us if it is ok to munge the packet (TC_OK2MUNGE),
> this way any action downstream can stomp on the packet.

That rule still applies but it is up to the discretion of the action.
i.e. if act_nat thinks it is ok for others down the street to trample
on the packet, it should tell us so. Maybe I should change the wording
to use the word "may" in that 3rd sentence.
[I will kill (low prio) TC_OK2MUNGE but not TC_MUNGED.]

cheers,
jamal


^ permalink raw reply

* Re: [PATCH] act_mirred: don't clone skb when skb isn't shared
From: Changli Gao @ 2010-06-05 13:07 UTC (permalink / raw)
  To: hadi; +Cc: David S. Miller, netdev
In-Reply-To: <1275742435.3490.31.camel@bigi>

On Sat, Jun 5, 2010 at 8:53 PM, jamal <hadi@cyberus.ca> wrote:
> On Fri, 2010-06-04 at 21:43 +0800, Changli Gao wrote:
>> don't clone skb when skb isn't shared
>>
>> When the tcf_action is TC_ACT_STOLEN, and the skb isn't shared, we don't need
>> to clone a new skb. As the skb will be freed after this function returns, we
>> can use it freely once we get a reference to it.
>
> It looks like a good optimization - but i am not a big fan of one-offs
> [because usability goes down and I am forced to explain it longer in the
> rules (refer to: Documentation/networking/tc-actions-env-rules.txt)]

Thanks. BTW: act_nat.c doesn't obey the following rule, and you plan
to remove TC_MUNGED and TC_OK2MUNGE?

2) If you munge any packet thou shalt call pskb_expand_head in the case
someone else is referencing the skb. After that you "own" the skb.
You must also tell us if it is ok to munge the packet (TC_OK2MUNGE),
this way any action downstream can stomp on the packet.

>
> How about you update skb_act_clone to take take the action code as well
> and do the check the if stolen/queued it does a skb_get otherwise it
> calls skb_clone?
>

Good idea. Thanks.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [RFC] act_cpu: redirect skb receiving to a special CPU.
From: jamal @ 2010-06-05 13:07 UTC (permalink / raw)
  To: Changli Gao; +Cc: Eric Dumazet, David S. Miller, Tom Herbert, Linux Netdev List
In-Reply-To: <AANLkTikskTWJrULj2IKYhebxh9O_XTOMBy3i0wPY8thq@mail.gmail.com>

Changli,

I like the idea..

My preference would be to not change ingress qdisc to have queues.
The cpuid should be sufficient to map to a remote cpu queue, no?
Now, if you could represent each cpu as a netdevice, then we wouldnt
need any change;-> And we could have multiple types of ways to redirect
to cpus instead of just doing IPIs - example, ive always thought of 
sending over something like HT (I think it would be a lot cheaper).

I didn't quite understand the map OFFSET part. Is this part of RFS?

cheers,
jamal

On Sat, 2010-06-05 at 18:56 +0800, Changli Gao wrote:
> I am going to implement a CPU action, which can be used with ingress
> qdisc to redirect skb receiving to a special cpu. It is much like RPS,
> but more flexible:
> 
> * choose the hash function with the help of cls_flow.c
> * pin special traffic to a dedicate CPU
> * weighted packets distributing
> 
> act_cpu will use the function enqueue_to_backlog() supplied by RPS to
> redirect skb receiving, and have two kind paramter:
> 
> * cpu CPUID - the ID of CPU, which handles this traffic
> * map OFFSET - map the mirror class ID to CPUID: CPUID = mirror class ID + CPUID
> 
> sch_ingress will be enhanced to support class tree.
> 


^ permalink raw reply

* Re: [PATCH] act_mirred: don't clone skb when skb isn't shared
From: jamal @ 2010-06-05 12:53 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev
In-Reply-To: <1275658990-15838-1-git-send-email-xiaosuo@gmail.com>

On Fri, 2010-06-04 at 21:43 +0800, Changli Gao wrote:
> don't clone skb when skb isn't shared
> 
> When the tcf_action is TC_ACT_STOLEN, and the skb isn't shared, we don't need
> to clone a new skb. As the skb will be freed after this function returns, we
> can use it freely once we get a reference to it.

It looks like a good optimization - but i am not a big fan of one-offs
[because usability goes down and I am forced to explain it longer in the
rules (refer to: Documentation/networking/tc-actions-env-rules.txt)]

How about you update skb_act_clone to take the action code as well
and do the check: if stolen/queued it does a skb_get, otherwise it
calls skb_clone?

cheers,
jamal


^ permalink raw reply

* Re: [PATCH] r8169: fix random mdio_write failures
From: Francois Romieu @ 2010-06-05 12:41 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev, Edward Hsu, Hayes, davem
In-Reply-To: <1275733273-28321-1-git-send-email-timo.teras@iki.fi>

Timo Teräs <timo.teras@iki.fi> :
[...]
> diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
> index 217e709..03a8318 100644
> --- a/drivers/net/r8169.c
> +++ b/drivers/net/r8169.c
> @@ -559,6 +559,11 @@ static void mdio_write(void __iomem *ioaddr, int reg_addr, int value)
>  			break;
>  		udelay(25);
>  	}
> +	/*
> +	 * Some configurations require a small delay even after the write
> +	 * completed indication or the next write might fail.
> +	 */
> +	udelay(25);

Acked-off-by: Francois Romieu <romieu@fr.zoreil.com>

Good work.

I wonder if increasing the in-loop delay as well would help the write
succeed faster (or slower ?).

-- 
Ueimor

^ permalink raw reply

* Re: 2.6.32 and Multicast group membership
From: Mark Smith @ 2010-06-05 11:33 UTC (permalink / raw)
  To: Mr. Berkley Shands; +Cc: Net Dev
In-Reply-To: <4C057DF4.3000305@exegy.com>

On Tue, 01 Jun 2010 16:39:00 -0500
"Mr. Berkley Shands" <bshands@exegy.com> wrote:

> starting in 2.6.32, my multicast connections stop getting data after 
> 60-100 seconds.
> The identical user code works fine under 2.6.22 through 2.6.31.
> 
> The NIC (an intel 82586) has two ports on the same subnet
> (eth0 at 172.16.21.55/24 and eth1 at 172.16.21.56/24)
> 
>       if (setsockopt(sock, IPPROTO_IP, IP_ADD_MEMBERSHIP, (char*)&req, 
> sizeof(req)))
>       {
>          perror("setsockopt IP_ADD_MEMBERSHIP failed");
>          ::exit(-1);
>       }
> 
> If I do ADD_MEMBERSHIP on just one of these interfaces, the sockets 
> still get data.
> But if I join on both interfaces, one or both will stop getting packets
> after 60-100 seconds. Sniffing with tcpdump shows the Cisco layer 3 switch
> is not getting its responses back to keep the multicast group open.

While it could be a kernel change, that sounds like it might also be
related to IGMP snooping on the Cisco switch. Any changes made to that
recently?

> The HP layer 2 switch does not seem to care, it keeps the data flowing 
> regardless
> of which physical port the join is executed on.
> 

This is expected behaviour on a 'dumb' layer 2 switch i.e. one that
doesn't perform IGMP snooping and therefore doesn't suppress multicasts
towards end-nodes that aren't subscribed. That further suggests a
change on the Cisco.

FWIW, I'm having no trouble with IPv4 multicast under 2.6.33, plugged
into Cisco switch, with IGMP snooping, but not multicast suppression,
enabled.

> Changing IP_MULTICAST_ALL has no effect :-(
> Did I miss something? New code that I have to specify to keep the Cisco 
> happy?
> 
> tia
> 
> Berkley
> 
> 
> -- 
> 
> // E. F. Berkley Shands, MSc//
> 
> ** Exegy Inc.**
> 
> 349 Marshall Road, Suite 100
> 
> St. Louis , MO  63119
> 
> Direct:  (314) 218-3600 X450
> 
> Cell:  (314) 303-2546
> 
> Office:  (314) 218-3600
> 
> Fax:  (314) 218-3601
> 
>  
> 
> The Usual Disclaimer follows...
> 
>  
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [RFC] act_cpu: redirect skb receiving to a special CPU.
From: Changli Gao @ 2010-06-05 10:56 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Tom Herbert, Jamal Hadi Salim
  Cc: Linux Netdev List

I am going to implement a CPU action, which can be used with ingress
qdisc to redirect skb receiving to a special cpu. It is much like RPS,
but more flexible:

* choose the hash function with the help of cls_flow.c
* pin special traffic to a dedicated CPU
* weighted packet distribution

act_cpu will use the function enqueue_to_backlog() supplied by RPS to
redirect skb receiving, and will take two kinds of parameter:

* cpu CPUID - the ID of CPU, which handles this traffic
* map OFFSET - map the mirror class ID to CPUID: CPUID = mirror class ID + CPUID

sch_ingress will be enhanced to support class tree.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* [PATCH] r8169: fix random mdio_write failures
From: Timo Teräs @ 2010-06-05 10:21 UTC (permalink / raw)
  To: netdev; +Cc: Timo Teräs, françois romieu, Edward Hsu
In-Reply-To: <4C0A1736.9030209@iki.fi>

Some configurations need delay between the "write completed" indication
and new write to work reliably.

Realtek driver seems to use longer delay when polling the "write complete"
bit, so it waits long enough between writes with high probability (but
could probably break too). This patch adds a new udelay to make sure we
wait unconditionally some time after the write complete indication.

This caused a regression with XID 18000000 boards when the board specific
phy configuration writing many mdio registers was added in commit
2e955856ff (r8169: phy init for the 8169scd). Some of the configuration
mdio writes would almost always fail, and depending on failure might leave
the PHY in a non-working state.

Signed-off-by: Timo Teräs <timo.teras@iki.fi>
Cc: françois romieu <romieu@fr.zoreil.com>
Cc: Edward Hsu <edward_hsu@realtek.com.tw>
---
 drivers/net/r8169.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 217e709..03a8318 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -559,6 +559,11 @@ static void mdio_write(void __iomem *ioaddr, int reg_addr, int value)
 			break;
 		udelay(25);
 	}
+	/*
+	 * Some configurations require a small delay even after the write
+	 * completed indication or the next write might fail.
+	 */
+	udelay(25);
 }
 
 static int mdio_read(void __iomem *ioaddr, int reg_addr)
-- 
1.7.0.4


^ permalink raw reply related

* [RFC PATCH v7 00/19] Provide a zero-copy method on KVM virtio-net.
From: xiaohui.xin @ 2010-06-05 10:14 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike
In-Reply-To: <1275732899-5423-19-git-send-email-xiaohui.xin@intel.com>

We provide a zero-copy method by which the driver side may get external
buffers to DMA into. Here "external" means the driver doesn't use kernel
space to allocate skb buffers. Currently the external buffers can come from
the guest virtio-net driver.

The idea is simple: just pin the guest VM user space and then
give the host NIC driver the chance to DMA directly to it.
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops such as sendmsg/recvmsg to vhost-net to
send/recv directly to/from the NIC driver. A KVM guest that uses the
vhost-net backend may bind any ethX interface on the host side to
get copyless data transfer through the guest virtio-net frontend.

patch 01-13:  	net core changes.
patch 14-18:  	new device as interface to manipulate external buffers.
patch 19: 	for vhost-net.

The guest virtio-net driver submits multiple requests thru vhost-net
backend driver to the kernel. And the requests are queued and then
completed after corresponding actions in h/w are done.

For read, user space buffers are dispensed to the NIC driver for rx when
a page constructor API is invoked. This means NICs can allocate user buffers
from a page constructor. We add a hook in the netif_receive_skb() function
to intercept the incoming packets and notify the zero-copy device.

For write, the zero-copy device may allocate a new host skb, put the
payload on skb_shinfo(skb)->frags, and copy the header to skb->data.
The request remains pending until the skb is transmitted by h/w.

Here, we have considered 2 ways to utilize the page constructor
API to dispense the user buffers.

One:	Modify the __alloc_skb() function a bit so that it allocates only
	the sk_buff structure, with the data pointer pointing to a
	user buffer which comes from a page constructor API.
	Then the shinfo of the skb is also from the guest.
	When a packet is received from hardware, the skb->data is filled
	directly by h/w. This is the way we have implemented.

	Pros:	We can avoid any copy here.
	Cons:	The guest virtio-net driver needs to allocate skbs in almost
		the same way as the host NIC drivers, i.e. the size
		of netdev_alloc_skb() and the same reserved space in the
		head of the skb. Many NIC drivers are the same as the guest
		and ok for this. But some of the latest NIC drivers reserve
		special room in the skb head. To deal with it, we suggest
		providing a method in the guest virtio-net driver to ask for
		the parameters we are interested in from the NIC driver once
		we know which device we have bound to do zero-copy. Then we
		ask the guest to do so.
		

Two:	Modify the driver to get user buffers allocated from a page constructor
	API (to substitute alloc_page()); the user buffers are used as payload
	buffers and filled by h/w directly when a packet is received. The driver
	should associate the pages with the skb (skb_shinfo(skb)->frags). For
	the head buffer side, let the host allocate the skb, and h/w fills it.
	After that, the data filled in the host skb header will be copied into
	the guest header buffer which is submitted together with the payload buffer.

	Pros:	We need to care less about how the guest or host allocates
		their buffers.
	Cons:	We still need a small copy here for the skb header.

We are not sure which way is better here. This is the first thing we want
to get comments on from the community. We want the modification to the network
part to be generic, so that it is not used by the vhost-net backend only, but
a user application may use it as well when the zero-copy device provides async
read/write operations later.

We have got comments from Michael. He said the first method will break
the compatibility of the virtio-net driver and may complicate qemu live
migration. Currently, we try to ignore the skb_reserve() if the device
is doing zero-copy. Then the guest virtio-net driver will not be changed.
So we now continue to go with the first way.
But comments about the two ways are still appreciated.

We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But for a simple
test with netperf, we found bandwidth up and CPU % up too,
but the bandwidth up ratio is much more than the CPU % up ratio.

What we have not done yet:
	packet split support
	To support GRO
	Performance tuning

what we have done in v1:
	polish the RCU usage
	deal with write logging in asynchronous mode in vhost
	add notifier block for mp device
	rename page_ctor to mp_port in netdevice.h to make it looks generic
	add mp_dev_change_flags() for mp device to change NIC state
	add CONFIG_VHOST_MPASSTHRU to limit the usage when module is not loaded
	a small fix for missing dev_put when fail
	using dynamic minor instead of static minor number
	a __KERNEL__ protect to mp_get_sock()

what we have done in v2:
	
	remove most of the RCU usage, since the ctor pointer is only
	changed by BIND/UNBIND ioctl, and during that time, NIC will be
	stopped to get good cleanup(all outstanding requests are finished),
	so the ctor pointer cannot be raced into wrong situation.

	Remove the struct vhost_notifier with struct kiocb.
	Let vhost-net backend to alloc/free the kiocb and transfer them
	via sendmsg/recvmsg.

	use get_user_pages_fast() and set_page_dirty_lock() when read.

	Add some comments for netdev_mp_port_prep() and handle_mpassthru().

what we have done in v3:
	the async write logging is rewritten 
	a drafted synchronous write function for qemu live migration
	a limit for locked pages from get_user_pages_fast() to prevent DoS
	by using RLIMIT_MEMLOCK
	

what we have done in v4:
	add iocb completion callback from vhost-net to queue iocb in mp device
	replace vq->receiver by mp_sock_data_ready()
	remove stuff in mp device which access structures from vhost-net
	modify skb_reserve() to ignore host NIC driver reserved space
	rebase to the latest vhost tree
	split large patches into small pieces, especially for net core part.
	

what we have done in v5:
	address Arnd Bergmann's comments
		-remove IFF_MPASSTHRU_EXCL flag in mp device
		-Add CONFIG_COMPAT macro
		-remove mp_release ops
	move dev_is_mpassthru() as inline func
	fix a bug in memory relinquish
	Apply to current git (2.6.34-rc6) tree.

what we have done in v6:
	move create_iocb() out of page_dtor which may happen in interrupt context
	-This removes the potential issues with locks called in interrupt context
	make the cache used by mp, vhost as static, and created/destroyed during
	modules init/exit functions.
	-This allows multiple mp guests to be created at the same time.

what we have done in v7:
	some cleanup prepared to support PS mode
			
performance:
	using netperf with GSO/TSO disabled, 10G NIC, 
	disabled packet split mode, with raw socket case compared to vhost.

	bandwidth will be from 1.1Gbps to 1.7Gbps
	CPU % from 120%-140% to 140%-160%

	We have retested the performance based on 2.6.34-rc6 in the above situation.
				BW		CPU %
	vhost			1.4Gbps		120% ~ 130%
	vhost + zero-copy	2.7Gbps		160% ~ 180%

^ permalink raw reply

* [RFC PATCH v7 18/19] Add a kconfig entry and make entry for mp device.
From: xiaohui.xin @ 2010-06-05 10:14 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1275732899-5423-17-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 drivers/vhost/Kconfig  |   10 ++++++++++
 drivers/vhost/Makefile |    2 ++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index e4e2fd1..a6b8cbf 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
 	  To compile this driver as a module, choose M here: the module will
 	  be called vhost_net.
 
+config MEDIATE_PASSTHRU
+	tristate "mediate passthru network driver (EXPERIMENTAL)"
+	depends on VHOST_NET
+	---help---
+	  zerocopy network I/O support, we call it as mediate passthru to
+	  be distiguish with hardare passthru.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
-- 
1.5.4.4

^ permalink raw reply related

* [RFC PATCH v7 16/19] Manipulate external buffers in mp device.
From: xiaohui.xin @ 2010-06-05 10:14 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xiaohui Xin
In-Reply-To: <1275732899-5423-15-git-send-email-xiaohui.xin@intel.com>

From: Xiaohui Xin<xiaohui.xin@intel.com>

Where external buffers come from and how they are destroyed.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 drivers/vhost/mpassthru.c |  253 ++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 251 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index 25e2f3e..8c48898 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -161,6 +161,39 @@ static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
 	return ret;
 }
 
+/* The main function to allocate external buffers */
+static struct skb_external_page *page_ctor(struct mpassthru_port *port,
+		struct sk_buff *skb, int npages)
+{
+	int i;
+	unsigned long flags;
+	struct page_ctor *ctor;
+	struct page_info *info = NULL;
+
+	ctor = container_of(port, struct page_ctor, port);
+
+	spin_lock_irqsave(&ctor->read_lock, flags);
+	if (!list_empty(&ctor->readq)) {
+		info = list_first_entry(&ctor->readq, struct page_info, list);
+		list_del(&info->list);
+	}
+	spin_unlock_irqrestore(&ctor->read_lock, flags);
+	if (!info)
+		return NULL;
+
+	for (i = 0; i < info->pnum; i++) {
+		get_page(info->pages[i]);
+		info->frag[i].page = info->pages[i];
+		info->frag[i].page_offset = i ? 0 : info->offset;
+		info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
+			port->data_len;
+	}
+	info->skb = skb;
+	info->ext_page.frags = info->frag;
+	info->ext_page.ushinfo = &info->ushinfo;
+	return &info->ext_page;
+}
+
 static int page_ctor_attach(struct mp_struct *mp)
 {
 	int rc;
@@ -186,7 +219,7 @@ static int page_ctor_attach(struct mp_struct *mp)
 
 	dev_hold(dev);
 	ctor->dev = dev;
-	ctor->port.ctor = NULL;
+	ctor->port.ctor = page_ctor;
 	ctor->port.sock = &mp->socket;
 	ctor->lock_pages = 0;
 	rc = netdev_mp_port_attach(dev, &ctor->port);
@@ -252,11 +285,66 @@ static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
 	return 0;
 }
 
+static void relinquish_resource(struct page_ctor *ctor)
+{
+	if (!(ctor->dev->flags & IFF_UP) &&
+			!(ctor->wq_len + ctor->rq_len))
+		printk(KERN_INFO "relinquish_resource\n");
+}
+
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+	struct page_info *info = (struct page_info *)(iocb->private);
+	int i;
+
+	if (info->flags == INFO_READ) {
+		for (i = 0; i < info->pnum; i++) {
+			if (info->pages[i]) {
+				set_page_dirty_lock(info->pages[i]);
+				put_page(info->pages[i]);
+			}
+		}
+		info->skb->destructor = NULL;
+		kfree_skb(info->skb);
+		info->ctor->rq_len--;
+	} else
+		info->ctor->wq_len--;
+	/* Decrement the number of locked pages */
+	info->ctor->lock_pages -= info->pnum;
+	kmem_cache_free(ext_page_info_cache, info);
+	relinquish_resource(info->ctor);
+
+	return;
+}
+
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+	struct kiocb *iocb = NULL;
+
+	iocb = info->iocb;
+	if (!iocb)
+		return iocb;
+	iocb->ki_flags = 0;
+	iocb->ki_users = 1;
+	iocb->ki_key = 0;
+	iocb->ki_ctx = NULL;
+	iocb->ki_cancel = NULL;
+	iocb->ki_retry = NULL;
+	iocb->ki_iovec = NULL;
+	iocb->ki_eventfd = NULL;
+	iocb->ki_pos = info->desc_pos;
+	iocb->ki_nbytes = size;
+	iocb->ki_dtor(iocb);
+	iocb->private = (void *)info;
+	iocb->ki_dtor = mp_ki_dtor;
+
+	return iocb;
+}
+
 static int page_ctor_detach(struct mp_struct *mp)
 {
 	struct page_ctor *ctor;
 	struct page_info *info;
-	struct kiocb *iocb = NULL;
 	int i;
 
 	/* locked by mp_mutex */
@@ -268,11 +356,17 @@ static int page_ctor_detach(struct mp_struct *mp)
 		for (i = 0; i < info->pnum; i++)
 			if (info->pages[i])
 				put_page(info->pages[i]);
+		create_iocb(info, 0);
+		ctor->rq_len--;
 		kmem_cache_free(ext_page_info_cache, info);
 	}
+
+	relinquish_resource(ctor);
+
 	set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
 			   ctor->o_rlim.rlim_cur,
 			   ctor->o_rlim.rlim_max);
+
 	netdev_mp_port_detach(ctor->dev);
 	dev_put(ctor->dev);
 
@@ -320,6 +414,161 @@ static void mp_put(struct mp_file *mfile)
 		mp_detach(mfile->mp);
 }
 
+/* The callback to destruct the external buffers or skb */
+static void page_dtor(struct skb_external_page *ext_page)
+{
+	struct page_info *info;
+	struct page_ctor *ctor;
+	struct sock *sk;
+	struct sk_buff *skb;
+	unsigned long flags;
+
+	if (!ext_page)
+		return;
+	info = container_of(ext_page, struct page_info, ext_page);
+	if (!info)
+		return;
+	ctor = info->ctor;
+	skb = info->skb;
+
+	if ((info->flags == INFO_READ) && info->skb)
+		info->skb->head = NULL;
+
+	/* If the info->total is 0, make it to be reused */
+	if (!info->total) {
+		spin_lock_irqsave(&ctor->read_lock, flags);
+		list_add(&info->list, &ctor->readq);
+		spin_unlock_irqrestore(&ctor->read_lock, flags);
+		return;
+	}
+
+	if (info->flags == INFO_READ)
+		return;
+
+	/* For transmit, we should wait for the DMA finish by hardware.
+	 * Queue the notifier to wake up the backend driver
+	 */
+
+	create_iocb(info, info->total);
+
+	sk = ctor->port.sock->sk;
+	sk->sk_write_space(sk);
+
+	return;
+}
+
+/* For small exteranl buffers transmit, we don't need to call
+ * get_user_pages().
+ */
+static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
+		struct kiocb *iocb, int total)
+{
+	struct page_info *info =
+		kmem_cache_zalloc(ext_page_info_cache, GFP_KERNEL);
+
+	if (!info)
+		return NULL;
+	info->total = total;
+	info->ext_page.dtor = page_dtor;
+	info->ctor = ctor;
+	info->flags = INFO_WRITE;
+	info->iocb = iocb;
+	return info;
+}
+
+/* The main function to transform the guest user space address
+ * to host kernel address via get_user_pages(). Thus the hardware
+ * can do DMA directly to the external buffer address.
+ */
+static struct page_info *alloc_page_info(struct page_ctor *ctor,
+		struct kiocb *iocb, struct iovec *iov,
+		int count, struct frag *frags,
+		int npages, int total)
+{
+	int rc;
+	int i, j, n = 0;
+	int len;
+	unsigned long base, lock_limit;
+	struct page_info *info = NULL;
+
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+
+	if (ctor->lock_pages + count > lock_limit && npages) {
+		printk(KERN_INFO "exceed the locked memory rlimit.");
+		return NULL;
+	}
+
+	info = kmem_cache_zalloc(ext_page_info_cache, GFP_KERNEL);
+
+	if (!info)
+		return NULL;
+
+	for (i = j = 0; i < count; i++) {
+		base = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+
+		if (!len)
+			continue;
+		n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+
+		rc = get_user_pages_fast(base, n, npages ? 1 : 0,
+				&info->pages[j]);
+		if (rc != n)
+			goto failed;
+
+		while (n--) {
+			frags[j].offset = base & ~PAGE_MASK;
+			frags[j].size = min_t(int, len,
+					PAGE_SIZE - frags[j].offset);
+			len -= frags[j].size;
+			base += frags[j].size;
+			j++;
+		}
+	}
+
+#ifdef CONFIG_HIGHMEM
+	if (npages && !(dev->features & NETIF_F_HIGHDMA)) {
+		for (i = 0; i < j; i++) {
+			if (PageHighMem(info->pages[i]))
+				goto failed;
+		}
+	}
+#endif
+
+	info->total = total;
+	info->ext_page.dtor = page_dtor;
+	info->ctor = ctor;
+	info->pnum = j;
+	info->iocb = iocb;
+	if (!npages)
+		info->flags = INFO_WRITE;
+	if (info->flags == INFO_READ) {
+		info->ext_page.start = (u8 *)(((unsigned long)
+				(pfn_to_kaddr(page_to_pfn(info->pages[0]))) +
+				frags[0].offset));
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+		info->ext_page.size = SKB_DATA_ALIGN(
+				iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD);
+#else
+		info->ext_page.size = SKB_DATA_ALIGN(
+				iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD) -
+			NET_IP_ALIGN - NET_SKB_PAD;
+#endif
+	}
+	/* increment the number of locked pages */
+	ctor->lock_pages += j;
+	return info;
+
+failed:
+	for (i = 0; i < j; i++)
+		put_page(info->pages[i]);
+
+	kmem_cache_free(ext_page_info_cache, info);
+
+	return NULL;
+}
+
 /* Ops structure to mimic raw sockets with mp device */
 static const struct proto_ops mp_socket_ops = {
 };
-- 
1.5.4.4


^ permalink raw reply related

* [RFC PATCH v7 15/19] Add basic funcs and ioctl to mp device.
From: xiaohui.xin @ 2010-06-05 10:14 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1275732899-5423-14-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

The ioctl is used by mp device to bind an underlying
NIC, it will query hardware capability and declare the
NIC to use external buffers.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---

        memory leak fixed,
        kconfig made,
        do_unbind() made,
        mp_chr_ioctl() cleanup

        by Jeff Dike <jdike@linux.intel.com>


 drivers/vhost/mpassthru.c |  681 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 681 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..25e2f3e
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,681 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME        "mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include <linux/compat.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/aio.h>
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/miscdevice.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/crc32.h>
+#include <linux/nsproxy.h>
+#include <linux/uaccess.h>
+#include <linux/virtio_net.h>
+#include <linux/mpassthru.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+	u16     offset;
+	u16     size;
+};
+
+struct page_info {
+	struct list_head        list;
+	int                     header;
+	/* indicate the actual length of bytes
+	 * send/recv in the external buffers
+	 */
+	int                     total;
+	int                     offset;
+	struct page             *pages[MAX_SKB_FRAGS+1];
+	struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+	struct sk_buff          *skb;
+	struct page_ctor        *ctor;
+
+	/* The pointer relayed to skb, to indicate
+	 * it's a external allocated skb or kernel
+	 */
+	struct skb_external_page    ext_page;
+	struct skb_shared_info  ushinfo;
+
+#define INFO_READ                      0
+#define INFO_WRITE                     1
+	unsigned                flags;
+	unsigned                pnum;
+
+	/* It's meaningful for receive, means
+	 * the max length allowed
+	 */
+	size_t                  len;
+
+	/* The fields after that is for backend
+	 * driver, now for vhost-net.
+	 */
+
+	struct kiocb            *iocb;
+	unsigned int            desc_pos;
+	struct iovec            hdr[MAX_SKB_FRAGS + 2];
+	struct iovec            iov[MAX_SKB_FRAGS + 2];
+};
+
+static struct kmem_cache *ext_page_info_cache;
+
+struct page_ctor {
+	struct list_head        readq;
+	int 			wq_len;
+	int 			rq_len;
+	spinlock_t      	read_lock;
+	/* record the locked pages */
+	int			lock_pages;
+	struct rlimit		o_rlim;
+	struct net_device   	*dev;
+	struct mpassthru_port	port;
+};
+
+struct mp_struct {
+	struct mp_file   	*mfile;
+	struct net_device       *dev;
+	struct page_ctor	*ctor;
+	struct socket           socket;
+
+#ifdef MPASSTHRU_DEBUG
+	int debug;
+#endif
+};
+
+struct mp_file {
+	atomic_t count;
+	struct mp_struct *mp;
+	struct net *net;
+};
+
+struct mp_sock {
+	struct sock            	sk;
+	struct mp_struct       	*mp;
+};
+
+static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
+{
+	int ret = 0;
+
+	rtnl_lock();
+	ret = dev_change_flags(dev, flags);
+	rtnl_unlock();
+
+	if (ret < 0)
+		printk(KERN_ERR "failed to change dev state of %s", dev->name);
+
+	return ret;
+}
+
+static int page_ctor_attach(struct mp_struct *mp)
+{
+	int rc;
+	struct page_ctor *ctor;
+	struct net_device *dev = mp->dev;
+
+	/* locked by mp_mutex */
+	if (rcu_dereference(mp->ctor))
+		return -EBUSY;
+
+	ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
+	if (!ctor)
+		return -ENOMEM;
+	rc = netdev_mp_port_prep(dev, &ctor->port);
+	if (rc)
+		goto fail;
+
+	INIT_LIST_HEAD(&ctor->readq);
+	spin_lock_init(&ctor->read_lock);
+
+	ctor->rq_len = 0;
+	ctor->wq_len = 0;
+
+	dev_hold(dev);
+	ctor->dev = dev;
+	ctor->port.ctor = NULL;
+	ctor->port.sock = &mp->socket;
+	ctor->lock_pages = 0;
+	rc = netdev_mp_port_attach(dev, &ctor->port);
+	if (rc)
+		goto fail;
+
+	/* locked by mp_mutex */
+	rcu_assign_pointer(mp->ctor, ctor);
+
+	/* XXX:Need we do set_offload here ? */
+
+	return 0;
+
+fail:
+	kfree(ctor);
+	dev_put(dev);
+
+	return rc;
+}
+
+struct page_info *info_dequeue(struct page_ctor *ctor)
+{
+	unsigned long flags;
+	struct page_info *info = NULL;
+	spin_lock_irqsave(&ctor->read_lock, flags);
+	if (!list_empty(&ctor->readq)) {
+		info = list_first_entry(&ctor->readq,
+				struct page_info, list);
+		list_del(&info->list);
+	}
+	spin_unlock_irqrestore(&ctor->read_lock, flags);
+	return info;
+}
+
+static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
+			      unsigned long cur, unsigned long max)
+{
+	struct rlimit new_rlim, *old_rlim;
+	int retval;
+
+	if (resource != RLIMIT_MEMLOCK)
+		return -EINVAL;
+	new_rlim.rlim_cur = cur;
+	new_rlim.rlim_max = max;
+
+	old_rlim = current->signal->rlim + resource;
+
+	/* remember the old rlimit value when backend enabled */
+	ctor->o_rlim.rlim_cur = old_rlim->rlim_cur;
+	ctor->o_rlim.rlim_max = old_rlim->rlim_max;
+
+	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
+			!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	retval = security_task_setrlimit(resource, &new_rlim);
+	if (retval)
+		return retval;
+
+	task_lock(current->group_leader);
+	*old_rlim = new_rlim;
+	task_unlock(current->group_leader);
+	return 0;
+}
+
+static int page_ctor_detach(struct mp_struct *mp)
+{
+	struct page_ctor *ctor;
+	struct page_info *info;
+	struct kiocb *iocb = NULL;
+	int i;
+
+	/* locked by mp_mutex */
+	ctor = rcu_dereference(mp->ctor);
+	if (!ctor)
+		return -ENODEV;
+
+	while ((info = info_dequeue(ctor))) {
+		for (i = 0; i < info->pnum; i++)
+			if (info->pages[i])
+				put_page(info->pages[i]);
+		kmem_cache_free(ext_page_info_cache, info);
+	}
+	set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
+			   ctor->o_rlim.rlim_cur,
+			   ctor->o_rlim.rlim_max);
+	netdev_mp_port_detach(ctor->dev);
+	dev_put(ctor->dev);
+
+	/* locked by mp_mutex */
+	rcu_assign_pointer(mp->ctor, NULL);
+	synchronize_rcu();
+
+	kfree(ctor);
+	return 0;
+}
+
+static void __mp_detach(struct mp_struct *mp)
+{
+	mp->mfile = NULL;
+
+	mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
+	page_ctor_detach(mp);
+	mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+
+	/* Drop the extra count on the net device */
+	dev_put(mp->dev);
+}
+
+static DEFINE_MUTEX(mp_mutex);
+
+static void mp_detach(struct mp_struct *mp)
+{
+	mutex_lock(&mp_mutex);
+	__mp_detach(mp);
+	mutex_unlock(&mp_mutex);
+}
+
+static struct mp_struct *mp_get(struct mp_file *mfile)
+{
+	struct mp_struct *mp = NULL;
+	if (atomic_inc_not_zero(&mfile->count))
+		mp = mfile->mp;
+
+	return mp;
+}
+
+static void mp_put(struct mp_file *mfile)
+{
+	if (atomic_dec_and_test(&mfile->count))
+		mp_detach(mfile->mp);
+}
+
+/* Ops structure to mimic raw sockets with mp device */
+static const struct proto_ops mp_socket_ops = {
+};
+
+static struct proto mp_proto = {
+	.name           = "mp",
+	.owner          = THIS_MODULE,
+	.obj_size       = sizeof(struct mp_sock),
+};
+
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+	struct mp_file *mfile;
+	cycle_kernel_lock();
+	DBG1(KERN_INFO "mp: mp_chr_open\n");
+
+	mfile = kzalloc(sizeof(*mfile), GFP_KERNEL);
+	if (!mfile)
+		return -ENOMEM;
+	atomic_set(&mfile->count, 0);
+	mfile->mp = NULL;
+	mfile->net = get_net(current->nsproxy->net_ns);
+	file->private_data = mfile;
+	return 0;
+}
+
+static int mp_attach(struct mp_struct *mp, struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+	int err;
+
+	netif_tx_lock_bh(mp->dev);
+
+	err = -EINVAL;
+
+	if (mfile->mp)
+		goto out;
+
+	err = -EBUSY;
+	if (mp->mfile)
+		goto out;
+
+	err = 0;
+	mfile->mp = mp;
+	mp->mfile = mfile;
+	mp->socket.file = file;
+	dev_hold(mp->dev);
+	sock_hold(mp->socket.sk);
+	atomic_inc(&mfile->count);
+
+out:
+	netif_tx_unlock_bh(mp->dev);
+	return err;
+}
+
+static int do_unbind(struct mp_file *mfile)
+{
+	struct mp_struct *mp = mp_get(mfile);
+
+	if (!mp)
+		return -EINVAL;
+
+	mp_detach(mp);
+	sock_put(mp->socket.sk);
+	mp_put(mfile);
+	return 0;
+}
+
+static long mp_chr_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp;
+	struct net_device *dev;
+	void __user* argp = (void __user *)arg;
+	struct ifreq ifr;
+	struct sock *sk;
+	int ret;
+
+	ret = -EINVAL;
+
+	switch (cmd) {
+	case MPASSTHRU_BINDDEV:
+		ret = -EFAULT;
+		if (copy_from_user(&ifr, argp, sizeof ifr))
+			break;
+
+		ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+		ret = -ENODEV;
+		dev = dev_get_by_name(mfile->net, ifr.ifr_name);
+		if (!dev)
+			break;
+
+		mutex_lock(&mp_mutex);
+
+		ret = -EBUSY;
+
+		/* the device can be only bind once */
+		if (dev_is_mpassthru(dev))
+			goto err_dev_put;
+
+		mp = mfile->mp;
+		if (mp)
+			goto err_dev_put;
+
+		mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+		if (!mp) {
+			ret = -ENOMEM;
+			goto err_dev_put;
+		}
+		mp->dev = dev;
+		ret = -ENOMEM;
+
+		sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
+		if (!sk)
+			goto err_free_mp;
+
+		init_waitqueue_head(&mp->socket.wait);
+		mp->socket.ops = &mp_socket_ops;
+		sock_init_data(&mp->socket, sk);
+		sk->sk_sndbuf = INT_MAX;
+		container_of(sk, struct mp_sock, sk)->mp = mp;
+
+		sk->sk_destruct = NULL;
+		sk->sk_data_ready = NULL;
+		sk->sk_write_space = NULL;
+		sk->sk_state_change = NULL;
+		ret = mp_attach(mp, file);
+		if (ret < 0)
+			goto err_free_sk;
+
+		ret = page_ctor_attach(mp);
+		if (ret < 0)
+			goto err_free_sk;
+
+		mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+out:
+		mutex_unlock(&mp_mutex);
+		break;
+err_free_sk:
+		sk_free(sk);
+err_free_mp:
+		kfree(mp);
+err_dev_put:
+		dev_put(dev);
+		goto out;
+
+	case MPASSTHRU_UNBINDDEV:
+		ret = do_unbind(mfile);
+		break;
+
+	default:
+		break;
+	}
+	return ret;
+}
+
+static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp = mp_get(mfile);
+	struct sock *sk;
+	unsigned int mask = 0;
+
+	if (!mp)
+		return POLLERR;
+
+	sk = mp->socket.sk;
+
+	poll_wait(file, &mp->socket.wait, wait);
+
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		mask |= POLLIN | POLLRDNORM;
+
+	if (sock_writeable(sk) ||
+		(!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+			 sock_writeable(sk)))
+		mask |= POLLOUT | POLLWRNORM;
+
+	if (mp->dev->reg_state != NETREG_REGISTERED)
+		mask = POLLERR;
+
+	mp_put(mfile);
+	return mask;
+}
+
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long count, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct mp_struct *mp = mp_get(file->private_data);
+	struct sock *sk = mp->socket.sk;
+	struct sk_buff *skb;
+	int len, err;
+	ssize_t result = 0;
+
+	if (!mp)
+		return -EBADFD;
+
+	/* currently, async is not supported.
+	 * but we may support real async aio from user application,
+	 * maybe qemu virtio-net backend.
+	 */
+	if (!is_sync_kiocb(iocb))
+		return -EFAULT;
+
+	len = iov_length(iov, count);
+
+	if (unlikely(len) < ETH_HLEN)
+		return -EINVAL;
+
+	skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+				  file->f_flags & O_NONBLOCK, &err);
+
+	if (!skb)
+		return -EFAULT;
+
+	skb_reserve(skb, NET_IP_ALIGN);
+	skb_put(skb, len);
+
+	if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+		kfree_skb(skb);
+		return -EAGAIN;
+	}
+
+	skb->protocol = eth_type_trans(skb, mp->dev);
+	skb->dev = mp->dev;
+
+	dev_queue_xmit(skb);
+
+	mp_put(file->private_data);
+	return result;
+}
+
+static int mp_chr_close(struct inode *inode, struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+
+	/*
+	 * Ignore return value since an error only means there was nothing to
+	 * do
+	 */
+	do_unbind(mfile);
+
+	put_net(mfile->net);
+	kfree(mfile);
+
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+static long mp_chr_compat_ioctl(struct file *f, unsigned int ioctl,
+				unsigned long arg)
+{
+	return mp_chr_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations mp_fops = {
+	.owner  = THIS_MODULE,
+	.llseek = no_llseek,
+	.write  = do_sync_write,
+	.aio_write = mp_chr_aio_write,
+	.poll   = mp_chr_poll,
+	.unlocked_ioctl = mp_chr_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = mp_chr_compat_ioctl,
+#endif
+	.open   = mp_chr_open,
+	.release = mp_chr_close,
+};
+
+static struct miscdevice mp_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "mp",
+	.nodename = "net/mp",
+	.fops = &mp_fops,
+};
+
+static int mp_device_event(struct notifier_block *unused,
+		unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct mpassthru_port *port;
+	struct mp_struct *mp = NULL;
+	struct socket *sock = NULL;
+
+	port = dev->mp_port;
+	if (port == NULL)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		sock = dev->mp_port->sock;
+		mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+		do_unbind(mp->mfile);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block mp_notifier_block __read_mostly = {
+	.notifier_call  = mp_device_event,
+};
+
+static int mp_init(void)
+{
+	int err = 0;
+
+	ext_page_info_cache = kmem_cache_create("skb_page_info",
+						sizeof(struct page_info),
+						0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!ext_page_info_cache)
+		return -ENOMEM;
+
+	err = misc_register(&mp_miscdev);
+	if (err) {
+		printk(KERN_ERR "mp: Can't register misc device\n");
+		kmem_cache_destroy(ext_page_info_cache);
+	} else {
+		printk(KERN_INFO "Registering mp misc device - minor = %d\n",
+				mp_miscdev.minor);
+		register_netdevice_notifier(&mp_notifier_block);
+	}
+	return err;
+}
+
+void mp_exit(void)
+{
+	unregister_netdevice_notifier(&mp_notifier_block);
+	misc_deregister(&mp_miscdev);
+	kmem_cache_destroy(ext_page_info_cache);
+}
+
+/* Get an underlying socket object from mp file.  Returns error unless file is
+ * attached to a device.  The returned object works like a packet socket, it
+ * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
+ * holding a reference to the file for as long as the socket is in use. */
+struct socket *mp_get_socket(struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp;
+
+	if (file->f_op != &mp_fops)
+		return ERR_PTR(-EINVAL);
+	mp = mp_get(mfile);
+	if (!mp)
+		return ERR_PTR(-EBADFD);
+	mp_put(mfile);
+	return &mp->socket;
+}
+EXPORT_SYMBOL_GPL(mp_get_socket);
+
+module_init(mp_init);
+module_exit(mp_exit);
+MODULE_AUTHOR(DRV_COPYRIGHT);
+MODULE_DESCRIPTION(DRV_DESCRIPTION);
+MODULE_LICENSE("GPL v2");
-- 
1.5.4.4

^ permalink raw reply related

* [RFC PATCH v7 13/19] To skip GRO if buffer is external currently.
From: xiaohui.xin @ 2010-06-05 10:14 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1275732899-5423-12-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 net/core/dev.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index dc2f225..6c6b2fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2787,6 +2787,10 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	if (skb_is_gso(skb) || skb_has_frags(skb))
 		goto normal;
 
+	/* currently GRO is not supported by mediate passthru */
+	if (dev_is_mpassthru(skb->dev))
+		goto normal;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
-- 
1.5.4.4


^ permalink raw reply related

* [RFC PATCH v7 12/19] Add a hook to intercept external buffers from NIC driver.
From: xiaohui.xin @ 2010-06-05 10:14 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1275732899-5423-11-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

The hook is called in netif_receive_skb().

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 net/core/dev.c |   35 +++++++++++++++++++++++++++++++++++
 1 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 37b389a..dc2f225 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2548,6 +2548,37 @@ err:
 EXPORT_SYMBOL(netdev_mp_port_prep);
 #endif
 
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+/* Hook to intercept mediate passthru (zero-copy) packets: an skb from an
+ * mpassthru device is queued on its mp_port socket's receive queue and NULL
+ * is returned to the caller; any other skb is returned unchanged. */
+static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
+					       struct packet_type **pt_prev,
+					       int *ret,
+					       struct net_device *orig_dev)
+{
+	struct mpassthru_port *mp_port = NULL;
+	struct sock *sk = NULL;
+
+	if (!dev_is_mpassthru(skb->dev))
+		return skb;	/* not ours: let the normal receive path continue */
+	mp_port = skb->dev->mp_port;
+
+	if (*pt_prev) {	/* hand the skb to the pending packet_type before taking it */
+		*ret = deliver_skb(skb, *pt_prev, orig_dev);
+		*pt_prev = NULL;
+	}
+
+	sk = mp_port->sock->sk;
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_state_change(sk);	/* NOTE(review): notifies via sk_state_change, not sk_data_ready -- confirm intended */
+
+	return NULL;	/* skb consumed; caller must stop processing it */
+}
+#else /* mediate passthru not configured: the hook passes skb straight through */
+#define handle_mpassthru(skb, pt_prev, ret, orig_dev)     (skb)
+#endif /* CONFIG_MEDIATE_PASSTHRU || CONFIG_MEDIATE_PASSTHRU_MODULE */
+
 /**
  *	netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
@@ -2629,6 +2660,10 @@ int netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
+	/* To intercept mediate passthru(zero-copy) packets here */
+	skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
+	if (!skb)
+		goto out;
 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
 		goto out;
-- 
1.5.4.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox