Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 4/5] macvtap: Rewrite macvtap_newlink so the error handling works.
From: Eric W. Biederman @ 2011-10-20 14:28 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma
In-Reply-To: <m1hb33u4ab.fsf_-_@fess.ebiederm.org>


Place macvlan_common_newlink at the end of macvtap_newlink because
failing in newlink after registering your network device is not
supported.

Move device_create into a netdevice creation notifier.   The network device
notifier is the only hook that is called after the network device has been
registered with the device layer and before register_netdevice returns
success.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/macvtap.c |   73 +++++++++++++++++++++++++++++++++----------------
 1 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 515aa87..25689e9 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -280,34 +280,16 @@ static int macvtap_newlink(struct net *src_net,
 			   struct nlattr *tb[],
 			   struct nlattr *data[])
 {
-	struct device *classdev;
-	dev_t devt;
-	int err;
-
-	err = macvlan_common_newlink(src_net, dev, tb, data,
-				     macvtap_receive, macvtap_forward);
-	if (err)
-		goto out;
-
-	devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
-
-	classdev = device_create(macvtap_class, &dev->dev, devt,
-				 dev, "tap%d", dev->ifindex);
-	if (IS_ERR(classdev)) {
-		err = PTR_ERR(classdev);
-		macvtap_del_queues(dev);
-	}
-
-out:
-	return err;
+	/* Don't put anything that may fail after macvlan_common_newlink
+	 * because we can't undo what it does.
+	 */
+	return macvlan_common_newlink(src_net, dev, tb, data,
+				      macvtap_receive, macvtap_forward);
 }
 
 static void macvtap_dellink(struct net_device *dev,
 			    struct list_head *head)
 {
-	device_destroy(macvtap_class,
-		       MKDEV(MAJOR(macvtap_major), dev->ifindex));
-
 	macvtap_del_queues(dev);
 	macvlan_dellink(dev, head);
 }
@@ -975,6 +957,42 @@ struct socket *macvtap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(macvtap_get_socket);
 
+static int macvtap_device_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct device *classdev;
+	dev_t devt;
+
+	if (dev->rtnl_link_ops != &macvtap_link_ops)
+		return NOTIFY_DONE;
+
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		/* Create the device node here after the network device has
+		 * been registered but before register_netdevice has
+		 * finished running.
+		 */
+		devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+		classdev = device_create(macvtap_class, &dev->dev, devt,
+					 dev, "tap%d", dev->ifindex);
+		if (IS_ERR(classdev))
+			return notifier_from_errno(PTR_ERR(classdev));
+		break;
+	case NETDEV_UNREGISTER:
+		devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+		device_destroy(macvtap_class, devt);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block macvtap_notifier_block __read_mostly = {
+	.notifier_call	= macvtap_device_event,
+};
+
 static int macvtap_init(void)
 {
 	int err;
@@ -995,12 +1013,18 @@ static int macvtap_init(void)
 		goto out3;
 	}
 
-	err = macvlan_link_register(&macvtap_link_ops);
+	err = register_netdevice_notifier(&macvtap_notifier_block);
 	if (err)
 		goto out4;
 
+	err = macvlan_link_register(&macvtap_link_ops);
+	if (err)
+		goto out5;
+
 	return 0;
 
+out5:
+	unregister_netdevice_notifier(&macvtap_notifier_block);
 out4:
 	class_unregister(macvtap_class);
 out3:
@@ -1015,6 +1039,7 @@ module_init(macvtap_init);
 static void macvtap_exit(void)
 {
 	rtnl_link_unregister(&macvtap_link_ops);
+	unregister_netdevice_notifier(&macvtap_notifier_block);
 	class_unregister(macvtap_class);
 	cdev_del(&macvtap_cdev);
 	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 3/5] macvtap: Don't leak unreceived packets when we delete a macvtap device.
From: Eric W. Biederman @ 2011-10-20 14:27 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma
In-Reply-To: <m1mxcvu4bk.fsf_-_@fess.ebiederm.org>


To avoid leaking packets in the receive queue, add a socket destructor
that will run whenever we destroy a macvtap socket.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/macvtap.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 1d9c9c2..515aa87 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -339,6 +339,11 @@ static void macvtap_sock_write_space(struct sock *sk)
 		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
 }
 
+static void macvtap_sock_destruct(struct sock *sk)
+{
+	skb_queue_purge(&sk->sk_receive_queue);
+}
+
 static int macvtap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
@@ -369,6 +374,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	q->sock.ops = &macvtap_socket_ops;
 	sock_init_data(&q->sock, &q->sk);
 	q->sk.sk_write_space = macvtap_sock_write_space;
+	q->sk.sk_destruct = macvtap_sock_destruct;
 	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
 	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
 
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 2/5] macvtap: Fix macvtap_open races in the zero copy enable code.
From: Eric W. Biederman @ 2011-10-20 14:26 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma
In-Reply-To: <m1sjmnu4cm.fsf@fess.ebiederm.org>


To see if it is appropriate to enable the macvtap zero copy feature
don't test the lowerdev network device flags.   Instead test the
macvtap network device flags which are a direct copy of the lowerdev
flags.  This is important because nothing holds a reference to lowerdev
and on a very bad day lowerdev could be a pointer to stale memory.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/macvtap.c |   11 +++++------
 1 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 70aa628..1d9c9c2 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -343,7 +343,6 @@ static int macvtap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
 	struct net_device *dev = dev_get_by_index(net, iminor(inode));
-	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct macvtap_queue *q;
 	int err;
 
@@ -376,12 +375,12 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	/*
 	 * so far only KVM virtio_net uses macvtap, enable zero copy between
 	 * guest kernel and host kernel when lower device supports zerocopy
+	 *
+	 * The macvlan supports zerocopy iff the lower device supports zero
+	 * copy so we don't have to look at the lower device directly.
 	 */
-	if (vlan) {
-		if ((vlan->lowerdev->features & NETIF_F_HIGHDMA) &&
-		    (vlan->lowerdev->features & NETIF_F_SG))
-			sock_set_flag(&q->sk, SOCK_ZEROCOPY);
-	}
+	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
+		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
 
 	err = macvtap_set_queue(dev, file, q);
 	if (err)
-- 
1.7.2.5

^ permalink raw reply related

* Re: [patch] pktgen: bug when calling ndelay in x86 architectures
From: Daniel Turull @ 2011-10-20 14:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ben Hutchings, David Miller, netdev, Robert Olsson,
	Voravit Tanyingyong, Jens Laas
In-Reply-To: <1319118253.7735.9.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

On 10/20/2011 03:44 PM, Eric Dumazet wrote:
> Le jeudi 20 octobre 2011 à 15:22 +0200, Daniel Turull a écrit :
>> Hi,
>>
>> I tested the patch and it works well.
>>
> 
> Thanks !
> 
> 
>>
>> I think if we increase the constant to 1ms, we will reduce the jitter if we have
>> a rate between 1kpps and 10 kpps, but I guess is not a big deal.
>>
> 
> 
>> I've plot this new graph with this patch:
>> http://tslab.ssvl.kth.se/pktgen/img/inter_eric1.eps
> 
> Unfortunately, the sender cpu might be preempted by timer irq or other
> expensive irq, so the Min/Max values are not very different I guess.
> 
> I dont understand your Min values.
> 
> At 100 pps, how is it possible to have a Min value of ~5000 ns ?
> 
> 
My assumption is that for low rate, the min value is caused in the 
beginning of the test. When we start the transmission in pktgen_run(),
we set the pkt_dev->next_tx to the current time but there are
more operations to do, so the first transmission is a bit delayed. 
Even more if the cpu is preempted.
For the second packet, we are taking the pkt_dev->next_tx as a reference
and add the delay, in order to decide when to send.
So, my guess is that the first packet is delayed
and then we send the second packet only after a short time, in order
to keep the average rate in the transmission.

Daniel

^ permalink raw reply

* [PATCH 1/5] macvtap: Close a race between macvtap_open and macvtap_dellink.
From: Eric W. Biederman @ 2011-10-20 14:26 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma
In-Reply-To: <m1wrbzu4ec.fsf@fess.ebiederm.org>


There is a small window in macvtap_open between looking up a
networking device and calling macvtap_set_queue in which
macvtap_del_queues can be called from macvtap_dellink.   After
calling macvtap_del_queues it is totally incorrect to
allow macvtap_set_queue to proceed so prevent success by
reporting that all of the available queues are in use.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/macvtap.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 3da5578..70aa628 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -231,6 +231,8 @@ static void macvtap_del_queues(struct net_device *dev)
 		}
 	}
 	BUG_ON(vlan->numvtaps != 0);
+	/* guarantee that any future macvtap_set_queue will fail */
+	vlan->numvtaps = MAX_MACVTAP_QUEUES;
 	spin_unlock(&macvtap_lock);
 
 	synchronize_rcu();
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 0/5] macvtap fixes.
From: Eric W. Biederman @ 2011-10-20 14:24 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma


This series of patches fixes a series of minor bugs in the macvtap code.

The fixes to handle failures in newlink and the change in how we handle
minor device number allocations are particularly significant.

Eric W. Biederman (5):
      macvtap: Close a race between macvtap_open and macvtap_dellink.
      macvtap: Fix macvtap_open races in the zero copy enable code.
      macvtap: Don't leak unreceived packets when we delete a macvtap device.
      macvtap: Rewrite macvtap_newlink so the error handling works.
      macvtap: Fix the minor device number allocation

^ permalink raw reply

* Re: [PATCH 9/9] make net/core/scm.c uid comparisons user namespace aware
From: Serge E. Hallyn @ 2011-10-20 14:24 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, akpm, oleg, richard, mikevs, segoon, gregkh,
	dhowells, eparis, Serge E. Hallyn, netdev
In-Reply-To: <m1fwinvl8u.fsf@fess.ebiederm.org>

Quoting Eric W. Biederman (ebiederm@xmission.com):
> "Serge E. Hallyn" <serge@hallyn.com> writes:
> 
> > Quoting Eric W. Biederman (ebiederm@xmission.com):
> >> Serge Hallyn <serge@hallyn.com> writes:
> >> 
> >> > From: "Serge E. Hallyn" <serge.hallyn@canonical.com>
> >> >
> >> > Currently uids are compared without regard for the user namespace.
> >> > Fix that to prevent tasks in a different user namespace from
> >> > wrongly matching on SCM_CREDENTIALS.
> >> >
> >> > In the past, either your uids had to match, or you had to have
> >> > CAP_SETXID.  In a namespaced world, you must either (both be in the
> >> > same user namespace and have your uids match), or you must have
> >> > CAP_SETXID targeted at the other user namespace.  The latter can
> >> > happen for instance if uid 500 created a new user namespace and
> >> > now interacts with uid 0 in it.
> >> 
> >> Serge this approach is wrong.
> >
> > Thanks for looking, Eric.
> >
> >> Because we pass the cred and the pid through the socket socket itself
> >> is just a conduit and should be ignored in this context.
> >
> > Ok, that makes sense, but
> >
> >> The only interesting test should be are you allowed to impersonate other
> >> users in your current userk namespace.
> >
> > Why in your current user namespace?  Shouldn't it be in the
> > target user ns?  I understand it could be wrong to tie the
> > user ns owning the socket to the target userns (though I still
> > kind of like it), but just because I have CAP_SETUID in my
> > own user_ns doesn't mean I should be able to pose as another
> > uid in your user_ns.
> 
> First and foremost it is important that you be able if you have the
> capability to impersonate other users in your current user namespace.
> That is what the capability actually controls.
> 
> None of this allows you to impersonate any user in any other user
> namespace.  The translation between users prevents that.
> 
> > (Now I also see that cred_to_ucred() translates to the current
> > user_ns, so that should have been a hint to me before about
> > your intent, but I'm not convinced I agree with your intent).
> >
> > And you do the same with the pid.  Why is that a valid assumption?
> 
> Yes.  Basically all the code is allow you to impersonate people you
> would have been able to impersonate before.  If your target is in
> another namespace you can not fool them.
> 
> With pids the logic should be a lot clearer.  Pretend to be a pid you can
> see in your current pid namespace.  Lookup and convert to struct pid aka
> the namespace agnostic object.  On output return the pid value that
> the target process will know you as.
> 
> Ultimately I think we need a ns_capable for the current user namespace
> instead of a global one.  But I don't see any rush to introduce
> ns_capable here.

I think I agree - I was mistakenly thinking that without this patch
there is an opportunity for a less privileged task in child user ns
to impersonate, but that's not possible, so let's drop this patch
for now!

thanks,
-serge

^ permalink raw reply

* Re: [PATCH 9/9] make net/core/scm.c uid comparisons user namespace aware
From: Serge E. Hallyn @ 2011-10-20 14:14 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Serge E. Hallyn, linux-kernel, akpm, oleg, richard, mikevs,
	segoon, gregkh, dhowells, eparis, netdev
In-Reply-To: <m1fwinvl8u.fsf@fess.ebiederm.org>

Quoting Eric W. Biederman (ebiederm@xmission.com):
> "Serge E. Hallyn" <serge@hallyn.com> writes:
> 
> > Quoting Eric W. Biederman (ebiederm@xmission.com):
> >> Serge Hallyn <serge@hallyn.com> writes:
> >> 
> >> > From: "Serge E. Hallyn" <serge.hallyn@canonical.com>
> >> >
> >> > Currently uids are compared without regard for the user namespace.
> >> > Fix that to prevent tasks in a different user namespace from
> >> > wrongly matching on SCM_CREDENTIALS.
> >> >
> >> > In the past, either your uids had to match, or you had to have
> >> > CAP_SETXID.  In a namespaced world, you must either (both be in the
> >> > same user namespace and have your uids match), or you must have
> >> > CAP_SETXID targeted at the other user namespace.  The latter can
> >> > happen for instance if uid 500 created a new user namespace and
> >> > now interacts with uid 0 in it.
> >> 
> >> Serge this approach is wrong.
> >
> > Thanks for looking, Eric.
> >
> >> Because we pass the cred and the pid through the socket socket itself
> >> is just a conduit and should be ignored in this context.
> >
> > Ok, that makes sense, but
> >
> >> The only interesting test should be are you allowed to impersonate other
> >> users in your current userk namespace.
> >
> > Why in your current user namespace?  Shouldn't it be in the
> > target user ns?  I understand it could be wrong to tie the
> > user ns owning the socket to the target userns (though I still
> > kind of like it), but just because I have CAP_SETUID in my
> > own user_ns doesn't mean I should be able to pose as another
> > uid in your user_ns.
> 
> First and foremost it is important that you be able if you have the
> capability to impersonate other users in your current user namespace.
> That is what the capability actually controls.
> 
> None of this allows you to impersonate any user in any other user
> namespace.  The translation between users prevents that.
> 
> > (Now I also see that cred_to_ucred() translates to the current
> > user_ns, so that should have been a hint to me before about
> > your intent, but I'm not convinced I agree with your intent).
> >
> > And you do the same with the pid.  Why is that a valid assumption?
> 
> Yes.  Basically all the code is allow you to impersonate people you
> would have been able to impersonate before.  If your target is in
> another namespace you can not fool them.
> 
> With pids the logic should be a lot clearer.  Pretend to be a pid you can
> see in your current pid namespace.  Lookup and convert to struct pid aka
> the namespace agnostic object.  On output return the pid value that

No.  That conversion is happening before the user-specified pid is
set.

> the target process will know you as.
> 
> Ultimately I think we need a ns_capable for the current user namespace
> instead of a global one.  But I don't see any rush to introduce
> ns_capable here.
> 
> Eric
> 

^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Dmitry Eremin-Solenikov @ 2011-10-20 13:54 UTC (permalink / raw)
  To: Alexander Smirnov
  Cc: eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	kernel-janitors-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q
In-Reply-To: <20111020111718.GA32181-AUGNqIMGY+aR2kOLt6zJ8ErlnG4Plg33XqFh9Ls21Oc@public.gmane.org>

On 10/20/2011 03:17 PM, Alexander Smirnov wrote:
 > Hello everybody,
 >
 > below is the patch which adds support for fragmentation in 6LoWPAN

Thanks for the patch!

 > point to point networks. This activity needs because of difference
 > in MTU: 1280 ipv6 and 128 ieee802.15.4

127.

 >
 > This patch is just a draft. Could anyone please look at
 > it and let me know your opinion.
 >
 > The most doubtful moments for me are:
 > 1. Should the list 'frag_list' and the mutex 'flist_lock' be
 > included into dev private data?

I'd also think about being lock-free here via using RCU.

 > 2. Can I use 'dev_queue_xmit' to send fragments to queue?

Yes.

It seems I see the source of your problems. You try to fragment skb from 
the header_create function. It is not designed for this task. Please, 
don't do this! You are not the owner of the skb ATM. You can't just drop 
it from that function. Strictly speaking you can't be sure that this skb 
will really hit the device queue.

You really should push this part into queue processing on the device.


 > From 48472bae269b7b1a4047967ec21eadb217c4fd6d Mon Sep 17 00:00:00 2001
 > From: Alexander Smirnov <alex.bluesman.smirnov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
 > Date: Thu, 20 Oct 2011 15:02:36 +0400
 > Subject: [PATCH] 6LoWPAN fragmentation support
 >
 > Signed-off-by: Alexander Smirnov <alex.bluesman.smirnov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
 > ---
 >  net/ieee802154/6lowpan.c |  286 
+++++++++++++++++++++++++++++++++++++++++++++-
 >  net/ieee802154/6lowpan.h |    3 +
 >  2 files changed, 288 insertions(+), 1 deletions(-)
 >
 > diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
 > index 96877bd..1923ec7 100644
 > --- a/net/ieee802154/6lowpan.c
 > +++ b/net/ieee802154/6lowpan.c
 > @@ -113,6 +113,24 @@ struct lowpan_dev_record {
 >      struct list_head list;
 >  };
 >
 > +struct lowpan_fragment {
 > +    u8 in_progress;            /* assembling is in progress */
 > +    struct sk_buff *skb;        /* skb to be assembled */
 > +    u8 *data;            /* data to be stored */
 > +    struct mutex lock;        /* concurency lock */
 > +    u16 length;            /* frame length to be assemled */
 > +    u32 bytes_rcv;            /* bytes received */
 > +    u16 tag;            /* current fragment tag */
 > +    struct timer_list timer;    /* assembling timer */
 > +    struct list_head list;        /* fragments list handler */
 > +};
 > +
 > +static unsigned short fragment_tag;

What is this? Is it a part of 6lowpan standard? There is a long history 
behind being able to predict various packet/stream parameters. Please 
rethink and adjust this.

Ideally (if it's not contra the standard) this could be a part of a hash 
(probably even 16 bits from CRC32 could work) calculated from a set of 
values like jiffies, cpu number, some other variables.

 > +
 > +/* TODO: bind mutex and list to device */
 > +static LIST_HEAD(lowpan_fragments);
 > +struct mutex flist_lock;
 > +
 >  static inline struct
 >  lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
 >  {
 > @@ -244,6 +262,18 @@ static u8 lowpan_fetch_skb_u8(struct sk_buff *skb)
 >      return ret;
 >  }
 >
 > +static u16 lowpan_fetch_skb_u16(struct sk_buff *skb)
 > +{
 > +    u16 ret;
 > +
 > +    BUG_ON(skb->len < 2);
 > +
 > +    ret = skb->data[0] | (skb->data[1] << 8);
 > +    skb_pull(skb, 2);
 > +    return ret;
 > +}
 > +
 > +static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct 
net_device *dev);
 >  static int lowpan_header_create(struct sk_buff *skb,
 >                 struct net_device *dev,
 >                 unsigned short type, const void *_daddr,
 > @@ -467,9 +497,102 @@ static int lowpan_header_create(struct sk_buff 
*skb,
 >          memcpy(&(sa.hwaddr), saddr, 8);
 >
 >          mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA;
 > +
 > +        /* frame fragmentation */
 > +
 > +        /*
 > +         * if payload + mac header doesn't fit MTU-sized frame
 > +         * we need to fragment it.
 > +         */
 > +        if (skb->len > (127 - 24)) { /* MTU - MAC_HEADER_LENGTH */

Magic constants. And the statement will have to be adjusted  after 
adding security handling. Does 6lowpan specify the maximum fragment 
size? IIRC there is a setting in the standard which describes exactly
what the maximum data size should be: either 127 - max_header - 
max_security_header or just 'data + MPDU headers should fit into 127'.

Could you please recheck this in both standards?

 > +            struct sk_buff *fr_skb;
 > +            u16 b_sent = 0;
 > +            unsigned short payload_len = skb->len;
 > +            int stat = 0;
 > +
 > +            pr_debug("%s: the frame is too big (0x%x),"
 > +                 "fragmentation needed, using tag = 0x%x\n",
 > +                 __func__, payload_len, fragment_tag);
 > +
 > +            fr_skb = skb_copy(skb, GFP_KERNEL);
 > +            if (!fr_skb)
 > +                goto error;
 > +
 > +            /* 40-bit - fragment dispatch size */
 > +            head = kzalloc(5, GFP_KERNEL);

No real need to kzalloc. Could you please allocate it on stack?

 > +            if (!head)
 > +                goto error;
 > +
 > +            /* first fagment header */
 > +            head[0] = LOWPAN_DISPATCH_FRAG1 | (payload_len & 0x7);
 > +            head[1] = (payload_len >> 3) & 0xff;
 > +            head[2] = fragment_tag & 0xff;
 > +            head[3] = fragment_tag >> 8;
 > +

This is not atomic!!! You should get the fragment tag value once for the 
whole skb and then use the same value in the whole function.

 > +
 > +            lowpan_raw_dump_inline(__func__, "first header",
 > +                            head, 4);
 > +
 > +            memcpy(skb_push(fr_skb, 4), head, 4);

And what if there is no 4-byte space for the header?

 > +            skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
 > +
 > +            dev_hard_header(fr_skb, lowpan_dev_info(dev)->real_dev,
 > +                type, (void *)&da, (void *)&sa, fr_skb->len);
 > +
 > +            /* send fragment to dev queue */
 > +            dev_queue_xmit(fr_skb);
 > +
 > +            /* next fragments headers */
 > +            head[0] |= 0x20;

Magic value

 > +
 > +            lowpan_raw_dump_inline(__func__, "next headers",
 > +                            head, 5);
 > +
 > +            while (b_sent < payload_len) {
 > +                /* not the first fragment */
 > +                if (b_sent)
 > +                    skb_pull(skb, LOWPAN_FRAG_SIZE);

Are you the owner of the original skb here? Seems you are not. So you 
can't change the original skb.

 > +
 > +                pr_debug("%s: preparing fragment %d\n",
 > +                    __func__, b_sent / LOWPAN_FRAG_SIZE);
 > +
 > +                /*
 > +                 * create copy of current buffer and trim it
 > +                 * down to fragment size
 > +                 */
 > +                fr_skb = skb_copy(skb, GFP_KERNEL);
 > +                if (!fr_skb)
 > +                    goto error;
 > +
 > +                skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
 > +
 > +                /* add fragment header */
 > +                head[4] = b_sent / 8;
 > +                memcpy(skb_push(fr_skb, 5), head, 5);
 > +
 > +                b_sent += LOWPAN_FRAG_SIZE;
 > +
 > +                lowpan_raw_dump_table(__func__,
 > +                   "fragment data", fr_skb->data, fr_skb->len);
 > +
 > +                stat = dev_hard_header(fr_skb,
 > +                    lowpan_dev_info(dev)->real_dev, type,
 > +                    (void *)&da, (void *)&sa, fr_skb->len);
 > +
 > +                dev_queue_xmit(fr_skb);
 > +            }

I don't like this piece of code. Please refactor it to a separate 
function that can send both first and next fragments.

Also I don't see a point in copying original skb again and again. It 
would be wiser to allocate fragment skb's via dev_alloc_skb, reserve 
header space, push fragmentation header, then memcpy the rest of the data.
Or you can use non-linear skb's referencing fragments from the data part 
of the original skb.


 > +
 > +            /* TODO: what's the correct way to skip default skb? */
 > +
 > +            fragment_tag++;

Hmmm.

 > +            return stat;
 > +        } else
 >          return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev,
 >                  type, (void *)&da, (void *)&sa, skb->len);
 >      }
 > +error:
 > +    kfree_skb(skb);
 > +    return -ENOMEM;
 >  }
 >
 >  static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr)
 > @@ -511,6 +634,23 @@ static int lowpan_skb_deliver(struct sk_buff 
*skb, struct ipv6hdr *hdr)
 >      return stat;
 >  }
 >
 > +static void lowpan_fragment_timer_expired(unsigned long tag)
 > +{
 > +    struct lowpan_fragment *entry, *tmp;
 > +
 > +    pr_debug("%s: timer expired for frame with tag %lu\n", __func__, 
tag);
 > +
 > +    mutex_lock(&flist_lock);
 > +    list_for_each_entry_safe(entry, tmp, &lowpan_fragments, list)
 > +        if (entry->tag == tag) {
 > +            list_del(&entry->list);
 > +            kfree(entry->data);
 > +            kfree(entry);
 > +            break;
 > +        }
 > +    mutex_unlock(&flist_lock);
 > +}

Rather than using a timer here, I'd use a delayed job (that can drop 
several fragmented packets at once, if its execution is delayed) plus 
a sorted fragments list to ease the calculation of the next fragment 
timeout wipe.

 > +
 >  static int
 >  lowpan_process_data(struct sk_buff *skb)
 >  {
 > @@ -525,6 +665,139 @@ lowpan_process_data(struct sk_buff *skb)
 >      if (skb->len < 2)
 >          goto drop;
 >      iphc0 = lowpan_fetch_skb_u8(skb);
 > +
 > +    /* fragments assmebling */
 > +    switch (iphc0 & 0xf8) {
 > +    /* first fragment of the frame */
 > +    case LOWPAN_DISPATCH_FRAG1:
 > +    {
 > +        struct lowpan_fragment *entry, *frame;
 > +        u16 tag;
 > +
 > +        lowpan_raw_dump_inline(__func__, "first frame fragment header",
 > +                                skb->data, 3);
 > +
 > +        tmp = lowpan_fetch_skb_u8(skb);
 > +        tag = lowpan_fetch_skb_u16(skb);
 > +
 > +        /*
 > +         * check if frame assembling with the same tag is
 > +         * already in progress
 > +         */
 > +        rcu_read_lock();
 > +        list_for_each_entry_rcu(entry, &lowpan_fragments, list)
 > +            if (entry->tag == tag) {
 > +                pr_debug("%s ERROR: frame with this tag is"
 > +                     "alredy in assembling", __func__);
 > +                goto drop_rcu;
 > +            }
 > +        rcu_read_unlock();

I'm not quite sure that your RCU/locking usage is correct.

 > +
 > +        /* alloc new frame structure */
 > +        frame = kzalloc(sizeof(struct lowpan_fragment), GFP_KERNEL);
 > +        if (!frame)
 > +            goto drop;
 > +
 > +        INIT_LIST_HEAD(&frame->list);
 > +
 > +        frame->bytes_rcv = 0;
 > +        frame->length = (iphc0 & 7) | (tmp << 3);
 > +        frame->tag = tag;
 > +        /* allocate buffer for frame assembling */
 > +        frame->data = kzalloc(frame->length, GFP_KERNEL);

Why not allocate an skb here? You can do all fragments processing on the 
top of one skb + ranges handling.

BTW: Did you study the skb reassembly code of IPv4?

 > +        if (!frame->data) {
 > +            kfree(frame);
 > +            goto drop;
 > +        }
 > +
 > +        pr_debug("%s: frame to be assembled: length = 0x%x, "
 > +             "tag = 0x%x\n", __func__, frame->length, frame->tag);
 > +
 > +        init_timer(&frame->timer);
 > +        /* (number of fragments) * (fragment processing time-out) */
 > +        frame->timer.expires = jiffies +
 > +          (frame->length / LOWPAN_FRAG_SIZE + 1) * LOWPAN_FRAG_TIMEOUT;
 > +        frame->timer.data = tag;
 > +        frame->timer.function = lowpan_fragment_timer_expired;
 > +
 > +        add_timer(&frame->timer);
 > +
 > +        mutex_lock(&flist_lock);
 > +        list_add_tail(&frame->list, &lowpan_fragments);
 > +        mutex_unlock(&flist_lock);
 > +
 > +        return kfree_skb(skb), 0;
 > +    }
 > +    /* second and next fragment of the frame */
 > +    case LOWPAN_DISPATCH_FRAGN:
 > +    {
 > +        u16 tag;
 > +        struct lowpan_fragment *entry, *t;
 > +
 > +        lowpan_raw_dump_inline(__func__, "next fragment header",
 > +                    skb->data, 4);
 > +
 > +        lowpan_fetch_skb_u8(skb); /* skip frame length byte */
 > +        tag = lowpan_fetch_skb_u16(skb);
 > +
 > +        rcu_read_lock();
 > +        list_for_each_entry_rcu(entry, &lowpan_fragments, list)
 > +            if (entry->tag == tag)
 > +                break;
 > +        rcu_read_unlock();
 > +
 > +        if (entry->tag != tag) {
 > +            pr_debug("%s ERROR: no frame structure found for this"
 > +                 "fragment", __func__);
 > +            goto drop;
 > +        }

Can you be sure that you won't receive fragments out of order? No, you 
can not!

 > +
 > +        tmp = lowpan_fetch_skb_u8(skb); /* fetch offset */
 > +
 > +        lowpan_raw_dump_table(__func__, "next fragment payload",
 > +                    skb->data, skb->len);
 > +
 > +        /* if payload fits buffer, copy it */
 > +        if ((tmp * 8 + skb->len) <= entry->length) /* TODO: likely? */
 > +            memcpy(entry->data + tmp * 8, skb->data, skb->len);
 > +        else
 > +            goto drop;
 > +
 > +        entry->bytes_rcv += skb->len;
 > +
 > +        pr_debug("%s: frame length = 0x%x, bytes received = 0x%x\n",
 > +             __func__, entry->length, entry->bytes_rcv);
 > +
 > +        /* frame assembling complete */
 > +        if (entry->bytes_rcv == entry->length) {
 > +            struct sk_buff *tmp = skb;


WTF?

 > +
 > +            mutex_lock(&flist_lock);
 > +            list_for_each_entry_safe(entry, t, &lowpan_fragments, list)
 > +                if (entry->tag == tag) {
 > +                    list_del(&entry->list);
 > +                    /* copy and clear skb */
 > +                    skb = skb_copy_expand(skb, entry->length, 
skb_tailroom(skb), GFP_KERNEL);
 > +                    skb_pull(skb, skb->len);
 > +                    /* copy new data to skb */
 > +                    memcpy(skb_push(skb, entry->length), 
entry->data, entry->length);
 > +                    kfree_skb(tmp);
 > +                    del_timer(&entry->timer);
 > +                    kfree(entry->data);
 > +                    kfree(entry);

This is not the optimal way to code this. Consider reading about string 
concatenation in Java or Python.

 > +
 > +                    iphc0 = lowpan_fetch_skb_u8(skb);
 > +                    break;
 > +                }
 > +            mutex_unlock(&flist_lock);
 > +            break;
 > +        }
 > +        return kfree_skb(skb), 0;
 > +    }
 > +    default:
 > +        break;
 > +    }
 > +
 >      iphc1 = lowpan_fetch_skb_u8(skb);
 >
 >      _saddr = mac_cb(skb)->sa.hwaddr;
 > @@ -674,6 +947,8 @@ lowpan_process_data(struct sk_buff *skb)
 >      lowpan_raw_dump_table(__func__, "raw header dump", (u8 *)&hdr,
 >                              sizeof(hdr));
 >      return lowpan_skb_deliver(skb, &hdr);
 > +drop_rcu:
 > +    rcu_read_unlock();
 >  drop:
 >      kfree(skb);
 >      return -EINVAL;
 > @@ -765,8 +1040,15 @@ static int lowpan_rcv(struct sk_buff *skb, 
struct net_device *dev,
 >          goto drop;
 >
 >      /* check that it's our buffer */
 > -    if ((skb->data[0] & 0xe0) == 0x60)
 > +    switch (skb->data[0] & 0xe0) {
 > +    case 0x60:        /* ipv6 datagram */
 > +    case 0xc0:        /* first fragment header */
 > +    case 0xe0:        /* next fragments headers */
 >          lowpan_process_data(skb);
 > +        break;
 > +    default:
 > +        break;
 > +    }
 >
 >      return NET_RX_SUCCESS;
 >
 > @@ -793,6 +1075,8 @@ static int lowpan_newlink(struct net *src_net, 
struct net_device *dev,
 >      lowpan_dev_info(dev)->real_dev = real_dev;
 >      mutex_init(&lowpan_dev_info(dev)->dev_list_mtx);
 >
 > +    mutex_init(&flist_lock);
 > +
 >      entry = kzalloc(sizeof(struct lowpan_dev_record), GFP_KERNEL);
 >      if (!entry)
 >          return -ENOMEM;
 > diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h
 > index 5d8cf80..e8e57c4 100644
 > --- a/net/ieee802154/6lowpan.h
 > +++ b/net/ieee802154/6lowpan.h
 > @@ -159,6 +159,9 @@
 >  #define LOWPAN_DISPATCH_FRAG1    0xc0 /* 11000xxx */
 >  #define LOWPAN_DISPATCH_FRAGN    0xe0 /* 11100xxx */
 >
 > +#define LOWPAN_FRAG_SIZE    40        /* fragment payload size */
 > +#define LOWPAN_FRAG_TIMEOUT    (HZ * 2)    /* processing time: 2 sec */

Is it a standard-defined interval?


-- 
With best wishes
Dmitry

------------------------------------------------------------------------------
The demand for IT networking professionals continues to grow, and the
demand for specialized networking skills is growing even more rapidly.
Take a complimentary Learning@Cisco Self-Assessment and learn 
about Cisco certifications, training, and career opportunities. 
http://p.sf.net/sfu/cisco-dev2dev

^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Eric Dumazet @ 2011-10-20 13:51 UTC (permalink / raw)
  To: Dmitry Eremin-Solenikov
  Cc: Alexander Smirnov, davem, slapin, linux-zigbee-devel, netdev,
	jonsmirl
In-Reply-To: <4EA01E77.8040300@gmail.com>

Le jeudi 20 octobre 2011 à 17:13 +0400, Dmitry Eremin-Solenikov a
écrit :
> Hi, Alexander, colleagues,
> 
> On 10/20/2011 04:50 PM, Alexander Smirnov wrote:
> > Hi Eric,
> >
> > thank you for the replies. And another question I forgot to ask:
> >
> > when I send fragments, I still have original skb buffer. What should I
> > do with it, is there any
> > "proper/good" ways to drop it? Because I've already fragmented it and
> > do not need to send
> > original skb to queue.
> 
> You might want to check the TCP/IP fragmentation code path. I think you 
> can drop it with kfree_skb, but I ain't sure ATM.

In the TCP/IP frag code path, we own the skb and can do many things,
like using the skb to store one of the fragments.

In a driver ndo_start_xmit(), things are a bit different.

Special care must be taken if skb_cloned(skb) is true...

^ permalink raw reply

* Re: [patch] pktgen: bug when calling ndelay in x86 architectures
From: Eric Dumazet @ 2011-10-20 13:44 UTC (permalink / raw)
  To: Daniel Turull
  Cc: Ben Hutchings, David Miller, netdev, Robert Olsson,
	Voravit Tanyingyong, Jens Laas
In-Reply-To: <4EA020B3.4030208@gmail.com>

Le jeudi 20 octobre 2011 à 15:22 +0200, Daniel Turull a écrit :
> Hi,
> 
> I tested the patch and it works well.
> 

Thanks !


> 
> I think if we increase the constant to 1ms, we will reduce the jitter if we have
> a rate between 1kpps and 10 kpps, but I guess is not a big deal.
> 


> I've plot this new graph with this patch:
> http://tslab.ssvl.kth.se/pktgen/img/inter_eric1.eps

Unfortunately, the sender cpu might be preempted by timer irq or other
expensive irq, so the Min/Max values are not very different I guess.

I don't understand your Min values.

At 100 pps, how is it possible to have a Min value of ~5000 ns ?

^ permalink raw reply

* Re: [PATCH 9/9] make net/core/scm.c uid comparisons user namespace aware
From: Eric W. Biederman @ 2011-10-20 13:35 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: linux-kernel, akpm, oleg, richard, mikevs, segoon, gregkh,
	dhowells, eparis, Serge E. Hallyn, netdev
In-Reply-To: <20111020125801.GA1315@hallyn.com>

"Serge E. Hallyn" <serge@hallyn.com> writes:

> Quoting Eric W. Biederman (ebiederm@xmission.com):
>> Serge Hallyn <serge@hallyn.com> writes:
>> 
>> > From: "Serge E. Hallyn" <serge.hallyn@canonical.com>
>> >
>> > Currently uids are compared without regard for the user namespace.
>> > Fix that to prevent tasks in a different user namespace from
>> > wrongly matching on SCM_CREDENTIALS.
>> >
>> > In the past, either your uids had to match, or you had to have
>> > CAP_SETXID.  In a namespaced world, you must either (both be in the
>> > same user namespace and have your uids match), or you must have
>> > CAP_SETXID targeted at the other user namespace.  The latter can
>> > happen for instance if uid 500 created a new user namespace and
>> > now interacts with uid 0 in it.
>> 
>> Serge this approach is wrong.
>
> Thanks for looking, Eric.
>
>> Because we pass the cred and the pid through the socket socket itself
>> is just a conduit and should be ignored in this context.
>
> Ok, that makes sense, but
>
>> The only interesting test should be are you allowed to impersonate other
>> users in your current userk namespace.
>
> Why in your current user namespace?  Shouldn't it be in the
> target user ns?  I understand it could be wrong to tie the
> user ns owning the socket to the target userns (though I still
> kind of like it), but just because I have CAP_SETUID in my
> own user_ns doesn't mean I should be able to pose as another
> uid in your user_ns.

First and foremost, it is important that you be able, if you have the
capability, to impersonate other users in your current user namespace.
That is what the capability actually controls.

None of this allows you to impersonate any user in any other user
namespace.  The translation between users prevents that.

> (Now I also see that cred_to_ucred() translates to the current
> user_ns, so that should have been a hint to me before about
> your intent, but I'm not convinced I agree with your intent).
>
> And you do the same with the pid.  Why is that a valid assumption?

Yes.  Basically all the code does is allow you to impersonate people you
would have been able to impersonate before.  If your target is in
another namespace you can not fool them.

With pids the logic should be a lot clearer.  Pretend to be a pid you can
see in your current pid namespace.  Lookup and convert to struct pid aka
the namespace agnostic object.  On output return the pid value that
the target process will know you as.

Ultimately I think we need a ns_capable for the current user namespace
instead of a global one.  But I don't see any rush to introduce
ns_capable here.

Eric

^ permalink raw reply

* Re: [patch] pktgen: bug when calling ndelay in x86 architectures
From: Daniel Turull @ 2011-10-20 13:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ben Hutchings, David Miller, netdev, Robert Olsson,
	Voravit Tanyingyong, Jens Laas
In-Reply-To: <1319019235.3103.10.camel@edumazet-laptop>

Hi,

I tested the patch and it works well.


On 10/19/2011 12:13 PM, Eric Dumazet wrote:
> Le mercredi 19 octobre 2011 à 11:33 +0200, Daniel Turull a écrit :
>> Hi,
>> then if we want to use the spin more often.
>> maybe we can increase the constant from 100000 (0.1ms) to 1000000 (1ms)?
>> How was the current value chosen?
>>
> 
> Based on user needs ;)

I think if we increase the constant to 1ms, we will reduce the jitter if we have
a rate between 1kpps and 10 kpps, but I guess it is not a big deal.

I've plotted this new graph with this patch:
http://tslab.ssvl.kth.se/pktgen/img/inter_eric1.eps

> 
>> I did some measurements of the inter-arrival time between packets
>> and with bigger values the maximal is reduced in the rates between
>> 2kpps and 20kpps.
>>
> 
> ndelay()/udelay() have some inaccuracies, for 'long' values, because of
> rounding errors.
> 
> If we spin, just call ktime_now() in a loop until spin_until is
> reached...
> 
> That way you get max possible resolution, given kernel time service
> constraints.
> 
> Untested patch :
> 
> diff --git a/net/core/pktgen.c b/net/core/pktgen.c
> index 38d6577..5c7e900 100644
> --- a/net/core/pktgen.c
> +++ b/net/core/pktgen.c
> @@ -2145,9 +2145,11 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
>  	}
>  
>  	start_time = ktime_now();
> -	if (remaining < 100000)
> -		ndelay(remaining);	/* really small just spin */
> -	else {
> +	if (remaining < 100000) {
> +		do {
> +			end_time = ktime_now();
> +		} while (ktime_lt(end_time, spin_until));
> +	} else {
>  		/* see do_nanosleep */
>  		hrtimer_init_sleeper(&t, current);
>  		do {
> @@ -2162,8 +2164,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
>  			hrtimer_cancel(&t.timer);
>  		} while (t.task && pkt_dev->running && !signal_pending(current));
>  		__set_current_state(TASK_RUNNING);
> +		end_time = ktime_now();
>  	}
> -	end_time = ktime_now();
>  
>  	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
>  	pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
> 
> 

Daniel

^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Dmitry Eremin-Solenikov @ 2011-10-20 13:13 UTC (permalink / raw)
  To: Alexander Smirnov
  Cc: Eric Dumazet, davem, slapin, linux-zigbee-devel, netdev, jonsmirl
In-Reply-To: <CAJmB2rCe3BJKD07TOSyAT0vbDq_K1VHLOtECqeOMzaTsg3DokA@mail.gmail.com>

Hi, Alexander, colleagues,

On 10/20/2011 04:50 PM, Alexander Smirnov wrote:
> Hi Eric,
>
> thank you for the replies. And another question I forgot to ask:
>
> when I send fragments, I still have original skb buffer. What should I
> do with it, is there any
> "proper/good" ways to drop it? Because I've already fragmented it and
> do not need to send
> original skb to queue.

You might want to check the TCP/IP fragmentation code path. I think you 
can drop it with kfree_skb, but I ain't sure ATM.

>
> Thank you,
> Alexander

P.S. Top posting is really bad style. And it's not that welcome on the 
MLs.

-- 
With best wishes
Dmitry

^ permalink raw reply

* Requesting for your partnership
From: Park Zihao @ 2011-10-20 12:42 UTC (permalink / raw)




I am Mr Park Zihao, an Account Officer with the International bank of Taipei,I need your assistance in a business deal and you will be paid 30% for your Management Fees". Please reply for details

^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Eric Dumazet @ 2011-10-20 13:11 UTC (permalink / raw)
  To: Alexander Smirnov
  Cc: davem, dbaryshkov, slapin, linux-zigbee-devel, netdev, jonsmirl
In-Reply-To: <CAJmB2rCe3BJKD07TOSyAT0vbDq_K1VHLOtECqeOMzaTsg3DokA@mail.gmail.com>

Le jeudi 20 octobre 2011 à 16:50 +0400, Alexander Smirnov a écrit :
> Hi Eric,
> 
> thank you for the replies. And another question I forgot to ask:
> 
> when I send fragments, I still have original skb buffer. What should I
> do with it, is there any
> "proper/good" ways to drop it? Because I've already fragmented it and
> do not need to send
> original skb to queue.

I don't quite understand. Once your xmits are done, you must free the
original skb.

^ permalink raw reply

* Re: [PATCH 9/9] make net/core/scm.c uid comparisons user namespace aware
From: Serge E. Hallyn @ 2011-10-20 12:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, akpm, oleg, richard, mikevs, segoon, gregkh,
	dhowells, eparis, Serge E. Hallyn, netdev
In-Reply-To: <m1sjmpytpf.fsf@fess.ebiederm.org>

Quoting Eric W. Biederman (ebiederm@xmission.com):
> Serge Hallyn <serge@hallyn.com> writes:
> 
> > From: "Serge E. Hallyn" <serge.hallyn@canonical.com>
> >
> > Currently uids are compared without regard for the user namespace.
> > Fix that to prevent tasks in a different user namespace from
> > wrongly matching on SCM_CREDENTIALS.
> >
> > In the past, either your uids had to match, or you had to have
> > CAP_SETXID.  In a namespaced world, you must either (both be in the
> > same user namespace and have your uids match), or you must have
> > CAP_SETXID targeted at the other user namespace.  The latter can
> > happen for instance if uid 500 created a new user namespace and
> > now interacts with uid 0 in it.
> 
> Serge this approach is wrong.

Thanks for looking, Eric.

> Because we pass the cred and the pid through the socket socket itself
> is just a conduit and should be ignored in this context.

Ok, that makes sense, but

> The only interesting test should be are you allowed to impersonate other
> users in your current userk namespace.

Why in your current user namespace?  Shouldn't it be in the
target user ns?  I understand it could be wrong to tie the
user ns owning the socket to the target userns (though I still
kind of like it), but just because I have CAP_SETUID in my
own user_ns doesn't mean I should be able to pose as another
uid in your user_ns.

(Now I also see that cred_to_ucred() translates to the current
user_ns, so that should have been a hint to me before about
your intent, but I'm not convinced I agree with your intent).

And you do the same with the pid.  Why is that a valid assumption?

(I've got that feeling that I'll feel like a dunce once you explain :)

> So it should be possible to simplify the entire patch to just:
>  static __inline__ int scm_check_creds(struct ucred *creds)
>  {
>  	const struct cred *cred = current_cred();
> +	struct user_namespace *ns = cred->user_ns;
> 
> -	if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) &&
> -	    ((creds->uid == cred->uid   || creds->uid == cred->euid ||
> -	      creds->uid == cred->suid) || capable(CAP_SETUID)) &&
> -	    ((creds->gid == cred->gid   || creds->gid == cred->egid ||
> -	      creds->gid == cred->sgid) || capable(CAP_SETGID))) {
> +	if ((creds->pid == task_tgid_vnr(current) || ns_capable(ns, CAP_SYS_ADMIN)) &&
> +	    ((creds->uid == cred->uid   || creds->uid == cred->euid ||
> +	      creds->uid == cred->suid) || ns_capable(ns, CAP_SETUID)) &&
> +	    ((creds->gid == cred->gid   || creds->gid == cred->egid ||
> +	      creds->gid == cred->sgid) || ns_capable(ns, CAP_SETGID))) {
>   	       return 0;
>   	}
>   	return -EPERM;
>   }

^ permalink raw reply

* Re: [RFC PATCH 0/5] SUNRPC: "RPC pipefs per network namespace" preparations
From: Stanislav Kinsbursky @ 2011-10-20 12:56 UTC (permalink / raw)
  To: bfields@fieldses.org
  Cc: Trond.Myklebust@netapp.com, linux-nfs@vger.kernel.org,
	Pavel Emelianov, neilb@suse.de, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, davem@davemloft.net,
	devel@openvz.org
In-Reply-To: <20111020123242.GN5444@fieldses.org>

20.10.2011 16:32, bfields@fieldses.org пишет:
> On Thu, Oct 20, 2011 at 03:06:46PM +0400, Stanislav Kinsbursky wrote:
>> Guys, please, spend some of your expensive time to review this patch-set briefly.
>
> I'll try to take a look soon, but I'm travelling tomorrow through the
> 31st, and things will be a little hectic.
>

Thanks for your time, Bruce.

> Just one quick comment:
>
>>> The only problem about I'm not sure how to solve properly yet, is auth gss
>>> pipes creations operations. Hoping for some help with it.
>
> I suspect one reason it may be a little complicated is the
> upcall-version switching.  The old version is deprecated, and there's no
> need to support the combination of the old version with the a new
> feature like containers.  And now that it's been there a while the
> version-switching code already achieved its goal of avoiding a flag day.
> So, one approach might be:
>
> 	- move all the code for the old gss upcall and for the version
> 	  switching under a new CONFIG_DEPRECATED_GSS, or similar.
> 	- print a warning if the old stuff is used, and plan to rip it
> 	  out completely in a future kernel version.
> 	- do something that works just in the !CONFIG_DEPRECATED_GSS
> 	  case.
>

Thanks for this comment. I'll check the code for the problem you mentioned here.
But I was actually talking about another thing.
Currently we create the pipe in gss without any checks, since we assume that the 
pipefs client dir has already been created.
But with the approach presented in this patch set, pipes and dirs will be created 
only when pipefs is mounted from user-space. I.e. clients with gss auth may 
already be present, and some callback is required for creating gss pipes.
This approach also assumes the existence of gss auth without a pipe.

> Would that help?
>
> --b.


-- 
Best regards,
Stanislav Kinsbursky

^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Alexander Smirnov @ 2011-10-20 12:50 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, dbaryshkov, slapin, linux-zigbee-devel, netdev, jonsmirl
In-Reply-To: <1319114385.3781.34.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

Hi Eric,

thank you for the replies. And another question I forgot to ask:

when I send fragments, I still have original skb buffer. What should I
do with it, is there any
"proper/good" ways to drop it? Because I've already fragmented it and
do not need to send
original skb to queue.

Thank you,
Alexander

2011/10/20 Eric Dumazet <eric.dumazet@gmail.com>:
> Le jeudi 20 octobre 2011 à 15:17 +0400, Alexander Smirnov a écrit :
>> Hello everybody,
>>
>> below is the patch which adds support for fragmentation in 6LoWPAN
>> point to point networks. This activity needs because of difference
>> in MTU: 1280 ipv6 and 128 ieee802.15.4
>>
>> This patch is just a draft. Could anyone please look at
>> it and let me know your opinion.
>>
>
> I removed janitor list, since this patch is certainly not a janitor one.
>
>> The most doubtful moments for me are:
>> 1. Should the list 'frag_list' and the mutex 'flist_lock' be
>> included into dev private data?
>
>        The mutex is wrong, you need a spinlock since run from softirq handler.
>        Allocations should use GFP_ATOMIC for same reason.
>
>> 2. Can I use 'dev_queue_xmit' to send fragments to queue?
>
>        Well, it is not very clean, but it seems there is no alternative
>
>> 3. Creating new 'skb' instead of copying and modifying main one.
>
>        You cant do that without making sure you own the skb and its data.
>        Think about a sniffer running...
>
>
> 4) No limitation on number of in-flight fragments.
> You can consume lot of ram and have a list with 65536 elements...
>
>
>
>
>
>>  net/ieee802154/6lowpan.c |  286
>> +++++++++++++++++++++++++++++++++++++++++++++-
>>  net/ieee802154/6lowpan.h |    3 +
>>  2 files changed, 288 insertions(+), 1 deletions(-)
>>
>> diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
>> index 96877bd..1923ec7 100644
>> --- a/net/ieee802154/6lowpan.c
>> +++ b/net/ieee802154/6lowpan.c
>> @@ -113,6 +113,24 @@ struct lowpan_dev_record {
>>         struct list_head list;
>>  };
>>
>> +struct lowpan_fragment {
>> +       u8 in_progress;                 /* assembling is in progress
>> */
>> +       struct sk_buff *skb;            /* skb to be assembled */
>> +       u8 *data;                       /* data to be stored */
>> +       struct mutex lock;              /* concurency lock */
>> +       u16 length;                     /* frame length to be assemled
>> */
>> +       u32 bytes_rcv;                  /* bytes received */
>> +       u16 tag;                        /* current fragment tag */
>> +       struct timer_list timer;        /* assembling timer */
>> +       struct list_head list;          /* fragments list handler
>> */
>> +};
>> +
>> +static unsigned short fragment_tag;
>> +
>> +/* TODO: bind mutex and list to device */
>> +static LIST_HEAD(lowpan_fragments);
>> +struct mutex flist_lock;
>> +
>>  static inline struct
>>  lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
>>  {
>> @@ -244,6 +262,18 @@ static u8 lowpan_fetch_skb_u8(struct sk_buff
>> *skb)
>>         return ret;
>>  }
>>
>> +static u16 lowpan_fetch_skb_u16(struct sk_buff *skb)
>> +{
>> +       u16 ret;
>> +
>> +       BUG_ON(skb->len < 2);
>>
>
>        Hmm, check pskb_may_pull(skb, 2), or in caller.
>
>        skb->len >= 2 doesnt mean you can access to skb->data[0] and
> skb->data[1] : Data might be on a fragment, not on skb head.
>
>> +
>> +       ret = skb->data[0] | (skb->data[1] << 8);
>> +       skb_pull(skb, 2);
>> +       return ret;
>> +}
>> +
>> +static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device
>> *dev);
>>  static int lowpan_header_create(struct sk_buff *skb,
>>                            struct net_device *dev,
>>                            unsigned short type, const void *_daddr,
>> @@ -467,9 +497,102 @@ static int lowpan_header_create(struct sk_buff
>> *skb,
>>                 memcpy(&(sa.hwaddr), saddr, 8);
>>
>>                 mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA;
>> +
>> +               /* frame fragmentation */
>> +
>> +               /*
>> +                * if payload + mac header doesn't fit MTU-sized frame
>> +                * we need to fragment it.
>> +                */
>> +               if (skb->len > (127 - 24)) { /* MTU -
>> MAC_HEADER_LENGTH */
>> +                       struct sk_buff *fr_skb;
>> +                       u16 b_sent = 0;
>> +                       unsigned short payload_len = skb->len;
>> +                       int stat = 0;
>> +
>> +                       pr_debug("%s: the frame is too big (0x%x),"
>> +                                "fragmentation needed, using tag = 0x
>> %x\n",
>> +                                __func__, payload_len, fragment_tag);
>> +
>> +                       fr_skb = skb_copy(skb, GFP_KERNEL);
>>
>
>                        GFP_ATOMIC
>>
> And I wonder why you skb_copy(). You are not allowed to change skb like
> that. ( when you later skb_push(fr_skb, 4), you are modifying this skb
> data too...)
>>
>> +                       if (!fr_skb)
>> +                               goto error;
>> +
>> +                       /* 40-bit - fragment dispatch size */
>> +                       head = kzalloc(5, GFP_KERNEL);
>
>                        GFP_ATOMIC
>
>
>> +                       if (!head)
>> +                               goto error;
>> +
>> +                       /* first fagment header */
>> +                       head[0] = LOWPAN_DISPATCH_FRAG1 | (payload_len
>> & 0x7);
>> +                       head[1] = (payload_len >> 3) & 0xff;
>> +                       head[2] = fragment_tag & 0xff;
>> +                       head[3] = fragment_tag >> 8;
>> +
>> +
>> +                       lowpan_raw_dump_inline(__func__, "first
>> header",
>> +                                                       head, 4);
>> +
>> +                       memcpy(skb_push(fr_skb, 4), head, 4);
>> +                       skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
>> +
>> +                       dev_hard_header(fr_skb,
>> lowpan_dev_info(dev)->real_dev,
>> +                               type, (void *)&da, (void *)&sa,
>> fr_skb->len);
>> +
>> +                       /* send fragment to dev queue */
>> +                       dev_queue_xmit(fr_skb);
>> +
>> +                       /* next fragments headers */
>> +                       head[0] |= 0x20;
>> +
>> +                       lowpan_raw_dump_inline(__func__, "next
>> headers",
>> +                                                       head, 5);
>> +
>> +                       while (b_sent < payload_len) {
>> +                               /* not the first fragment */
>> +                               if (b_sent)
>> +                                       skb_pull(skb,
>> LOWPAN_FRAG_SIZE);
>> +
>> +                               pr_debug("%s: preparing fragment %d
>> \n",
>> +                                   __func__, b_sent /
>> LOWPAN_FRAG_SIZE);
>> +
>> +                               /*
>> +                                * create copy of current buffer and
>> trim it
>> +                                * down to fragment size
>> +                                */
>> +                               fr_skb = skb_copy(skb, GFP_KERNEL);
>> +                               if (!fr_skb)
>> +                                       goto error;
>> +
>> +                               skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
>> +
>> +                               /* add fragment header */
>> +                               head[4] = b_sent / 8;
>> +                               memcpy(skb_push(fr_skb, 5), head, 5);
>> +
>> +                               b_sent += LOWPAN_FRAG_SIZE;
>> +
>> +                               lowpan_raw_dump_table(__func__,
>> +                                  "fragment data", fr_skb->data,
>> fr_skb->len);
>> +
>> +                               stat = dev_hard_header(fr_skb,
>> +                                       lowpan_dev_info(dev)->real_dev, type,
>> +                                       (void *)&da, (void *)&sa,
>> fr_skb->len);
>> +
>> +                               dev_queue_xmit(fr_skb);
>> +                       }
>> +
>> +                       /* TODO: what's the correct way to skip
>> default skb? */
>> +
>> +                       fragment_tag++;
>> +                       return stat;
>> +               } else
>>                 return dev_hard_header(skb,
>> lowpan_dev_info(dev)->real_dev,
>>                                 type, (void *)&da, (void *)&sa,
>> skb->len);
>>         }
>> +error:
>> +       kfree_skb(skb);
>> +       return -ENOMEM;
>>  }
>>
>>  static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr
>> *hdr)
>> @@ -511,6 +634,23 @@ static int lowpan_skb_deliver(struct sk_buff
>> *skb, struct ipv6hdr *hdr)
>>         return stat;
>>  }
>>
>> +static void lowpan_fragment_timer_expired(unsigned long tag)
>> +{
>> +       struct lowpan_fragment *entry, *tmp;
>> +
>> +       pr_debug("%s: timer expired for frame with tag %lu\n",
>> __func__, tag);
>> +
>> +       mutex_lock(&flist_lock);
>>
>>
>        A mutex_lock() is not allowed in this context (softirq).
> You must use a spinlock.
>>
>>
>> +       list_for_each_entry_safe(entry, tmp, &lowpan_fragments, list)
>> +               if (entry->tag == tag) {
>>
>>
>        Since you have a timer per entry, instead of doing a lookup to find
> 'tag', you could just say 'tag' is the pointer to your "struct
> lowpan_fragment"
>
>>
>> +                       list_del(&entry->list);
>> +                       kfree(entry->data);
>> +                       kfree(entry);
>> +                       break;
>> +               }
>> +       mutex_unlock(&flist_lock);
>> +}
>>
>
>        struct lowpan_fragment *entry = (struct lowpan_fragment *)tag;
>        spin_lock();
>        list_del(&entry->list);
>        kfree(entry->data);
>        kfree(entry);
>        spin_unlock();
>>
>> +
>>  static int
>>  lowpan_process_data(struct sk_buff *skb)
>>  {
>> @@ -525,6 +665,139 @@ lowpan_process_data(struct sk_buff *skb)
>>         if (skb->len < 2)
>>                 goto drop;
>>         iphc0 = lowpan_fetch_skb_u8(skb);
>> +
>> +       /* fragments assmebling */
>> +       switch (iphc0 & 0xf8) {
>
>        0xf8 means ? Please use a macro or something...
>>
>> +       /* first fragment of the frame */
>> +       case LOWPAN_DISPATCH_FRAG1:
>> +       {
>> +               struct lowpan_fragment *entry, *frame;
>> +               u16 tag;
>> +
>> +               lowpan_raw_dump_inline(__func__, "first frame fragment
>> header",
>> +                                                               skb->data, 3);
>> +
>> +               tmp = lowpan_fetch_skb_u8(skb);
>> +               tag = lowpan_fetch_skb_u16(skb);
>> +
>> +               /*
>> +                * check if frame assembling with the same tag is
>> +                * already in progress
>> +                */
>> +               rcu_read_lock();
>> +               list_for_each_entry_rcu(entry, &lowpan_fragments,
>> list)
>> +                       if (entry->tag == tag) {
>> +                               pr_debug("%s ERROR: frame with this
>> tag is"
>> +                                        "alredy in assembling",
>> __func__);
>> +                               goto drop_rcu;
>> +                       }
>> +               rcu_read_unlock();
>> +
>> +               /* alloc new frame structure */
>> +               frame = kzalloc(sizeof(struct lowpan_fragment),
>> GFP_KERNEL);
>>
>>
>        GFP_ATOMIC
>>
>> +               if (!frame)
>> +                       goto drop;
>> +
>> +               INIT_LIST_HEAD(&frame->list);
>> +
>> +               frame->bytes_rcv = 0;
>> +               frame->length = (iphc0 & 7) | (tmp << 3);
>> +               frame->tag = tag;
>> +               /* allocate buffer for frame assembling */
>> +               frame->data = kzalloc(frame->length, GFP_KERNEL);
>>
>>
>                GFP_ATOMIC
>
>> +               if (!frame->data) {
>> +                       kfree(frame);
>> +                       goto drop;
>> +               }
>> +
>> +               pr_debug("%s: frame to be assembled: length = 0x%x, "
>> +                        "tag = 0x%x\n", __func__, frame->length,
>> frame->tag);
>> +
>> +               init_timer(&frame->timer);
>> +               /* (number of fragments) * (fragment processing
>> time-out) */
>> +               frame->timer.expires = jiffies +
>> +                 (frame->length / LOWPAN_FRAG_SIZE + 1) *
>> LOWPAN_FRAG_TIMEOUT;
>> +               frame->timer.data = tag;
>> +               frame->timer.function = lowpan_fragment_timer_expired;
>> +
>> +               add_timer(&frame->timer);
>> +
>> +               mutex_lock(&flist_lock);
>> +               list_add_tail(&frame->list, &lowpan_fragments);
>> +               mutex_unlock(&flist_lock);
>> +
>> +               return kfree_skb(skb), 0;
>> +       }
>> +       /* second and next fragment of the frame */
>> +       case LOWPAN_DISPATCH_FRAGN:
>> +       {
>> +               u16 tag;
>> +               struct lowpan_fragment *entry, *t;
>> +
>> +               lowpan_raw_dump_inline(__func__, "next fragment
>> header",
>> +                                       skb->data, 4);
>> +
>> +               lowpan_fetch_skb_u8(skb); /* skip frame length byte */
>> +               tag = lowpan_fetch_skb_u16(skb);
>> +
>> +               rcu_read_lock();
>> +               list_for_each_entry_rcu(entry, &lowpan_fragments,
>> list)
>> +                       if (entry->tag == tag)
>> +                               break;
>> +               rcu_read_unlock();
>> +
>> +               if (entry->tag != tag) {
>> +                       pr_debug("%s ERROR: no frame structure found
>> for this"
>> +                                "fragment", __func__);
>> +                       goto drop;
>> +               }
>> +
>> +               tmp = lowpan_fetch_skb_u8(skb); /* fetch offset */
>> +
>> +               lowpan_raw_dump_table(__func__, "next fragment
>> payload",
>> +                                       skb->data, skb->len);
>> +
>> +               /* if payload fits buffer, copy it */
>> +               if ((tmp * 8 + skb->len) <= entry->length) /* TODO:
>> likely? */
>> +                       memcpy(entry->data + tmp * 8, skb->data,
>> skb->len);
>> +               else
>> +                       goto drop;
>> +
>> +               entry->bytes_rcv += skb->len;
>> +
>> +               pr_debug("%s: frame length = 0x%x, bytes received = 0x
>> %x\n",
>> +                        __func__, entry->length, entry->bytes_rcv);
>> +
>> +               /* frame assembling complete */
>> +               if (entry->bytes_rcv == entry->length) {
>> +                       struct sk_buff *tmp = skb;
>> +
>> +                       mutex_lock(&flist_lock);
>> +                       list_for_each_entry_safe(entry, t,
>> &lowpan_fragments, list)
>> +                               if (entry->tag == tag) {
>> +                                       list_del(&entry->list);
>> +                                       /* copy and clear skb */
>> +                                       skb = skb_copy_expand(skb,
>> entry->length, skb_tailroom(skb), GFP_KERNEL);
>> +                                       skb_pull(skb, skb->len);
>> +                                       /* copy new data to skb */
>> +                                       memcpy(skb_push(skb,
>> entry->length), entry->data, entry->length);
>> +                                       kfree_skb(tmp);
>> +                                       del_timer(&entry->timer);
>> +                                       kfree(entry->data);
>> +                                       kfree(entry);
>> +
>> +                                       iphc0 =
>> lowpan_fetch_skb_u8(skb);
>> +                                       break;
>> +                               }
>> +                       mutex_unlock(&flist_lock);
>> +                       break;
>> +               }
>> +               return kfree_skb(skb), 0;
>> +       }
>> +       default:
>> +               break;
>> +       }
>> +
>>         iphc1 = lowpan_fetch_skb_u8(skb);
>>
>>         _saddr = mac_cb(skb)->sa.hwaddr;
>> @@ -674,6 +947,8 @@ lowpan_process_data(struct sk_buff *skb)
>>         lowpan_raw_dump_table(__func__, "raw header dump", (u8 *)&hdr,
>>                                                         sizeof(hdr));
>>         return lowpan_skb_deliver(skb, &hdr);
>> +drop_rcu:
>> +       rcu_read_unlock();
>>  drop:
>>         kfree(skb);
>>         return -EINVAL;
>> @@ -765,8 +1040,15 @@ static int lowpan_rcv(struct sk_buff *skb,
>> struct net_device *dev,
>>                 goto drop;
>>
>>         /* check that it's our buffer */
>> -       if ((skb->data[0] & 0xe0) == 0x60)
>> +       switch (skb->data[0] & 0xe0) {
>> +       case 0x60:              /* ipv6 datagram */
>> +       case 0xc0:              /* first fragment header */
>> +       case 0xe0:              /* next fragments headers */
>>                 lowpan_process_data(skb);
>> +               break;
>> +       default:
>> +               break;
>> +       }
>>
>>         return NET_RX_SUCCESS;
>>
>> @@ -793,6 +1075,8 @@ static int lowpan_newlink(struct net *src_net,
>> struct net_device *dev,
>>         lowpan_dev_info(dev)->real_dev = real_dev;
>>         mutex_init(&lowpan_dev_info(dev)->dev_list_mtx);
>>
>> +       mutex_init(&flist_lock);
>>
>>
>        Doing this init each time a link is setup is wrong.
>        Do it once.
>>
>> +
>>         entry = kzalloc(sizeof(struct lowpan_dev_record), GFP_KERNEL);
>>         if (!entry)
>>                 return -ENOMEM;
>> diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h
>> index 5d8cf80..e8e57c4 100644
>> --- a/net/ieee802154/6lowpan.h
>> +++ b/net/ieee802154/6lowpan.h
>> @@ -159,6 +159,9 @@
>>  #define LOWPAN_DISPATCH_FRAG1  0xc0 /* 11000xxx */
>>  #define LOWPAN_DISPATCH_FRAGN  0xe0 /* 11100xxx */
>>
>> +#define LOWPAN_FRAG_SIZE       40              /* fragment payload
>> size */
>> +#define LOWPAN_FRAG_TIMEOUT    (HZ * 2)        /* processing time: 2
>> sec */
>> +
>>  /*
>>   * Values of fields within the IPHC encoding first byte
>>   * (C stands for compressed and I for inline)
>> --
>> 1.7.2.5
>>
>
>
>

^ permalink raw reply

* (unknown)
From: Western Union @ 2011-10-20 12:34 UTC (permalink / raw)



You've won $85,000USD by IMF via western union.Confirm with name,age,occupation,
country

^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Eric Dumazet @ 2011-10-20 12:39 UTC (permalink / raw)
  To: Alexander Smirnov
  Cc: davem, dbaryshkov, slapin, linux-zigbee-devel, netdev, jonsmirl
In-Reply-To: <20111020111718.GA32181@avtobot.ww600.siemens.net>

Le jeudi 20 octobre 2011 à 15:17 +0400, Alexander Smirnov a écrit :
> Hello everybody,
> 
> below is the patch which adds support for fragmentation in 6LoWPAN
> point to point networks. This activity needs because of difference
> in MTU: 1280 ipv6 and 128 ieee802.15.4
> 
> This patch is just a draft. Could anyone please look at
> it and let me know your opinion.
> 

I removed janitor list, since this patch is certainly not a janitor one.

> The most doubtful moments for me are:
> 1. Should the list 'frag_list' and the mutex 'flist_lock' be
> included into dev private data?

	The mutex is wrong; you need a spinlock, since this runs from a softirq handler.
	Allocations should use GFP_ATOMIC for the same reason.

> 2. Can I use 'dev_queue_xmit' to send fragments to queue?

	Well, it is not very clean, but it seems there is no alternative.

> 3. Creating new 'skb' instead of copying and modifying main one.

	You can't do that without making sure you own the skb and its data.
	Think about a sniffer running...


4) There is no limit on the number of in-flight fragments.
You can consume a lot of RAM and end up with a list of 65536 elements...





>  net/ieee802154/6lowpan.c |  286
> +++++++++++++++++++++++++++++++++++++++++++++-
>  net/ieee802154/6lowpan.h |    3 +
>  2 files changed, 288 insertions(+), 1 deletions(-)
> 
> diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
> index 96877bd..1923ec7 100644
> --- a/net/ieee802154/6lowpan.c
> +++ b/net/ieee802154/6lowpan.c
> @@ -113,6 +113,24 @@ struct lowpan_dev_record {
>         struct list_head list;
>  };
>  
> +struct lowpan_fragment {
> +       u8 in_progress;                 /* assembling is in progress
> */
> +       struct sk_buff *skb;            /* skb to be assembled */
> +       u8 *data;                       /* data to be stored */
> +       struct mutex lock;              /* concurency lock */
> +       u16 length;                     /* frame length to be assemled
> */
> +       u32 bytes_rcv;                  /* bytes received */
> +       u16 tag;                        /* current fragment tag */
> +       struct timer_list timer;        /* assembling timer */
> +       struct list_head list;          /* fragments list handler
> */    
> +};
> +
> +static unsigned short fragment_tag;
> +
> +/* TODO: bind mutex and list to device */
> +static LIST_HEAD(lowpan_fragments);
> +struct mutex flist_lock;
> +
>  static inline struct
>  lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
>  {
> @@ -244,6 +262,18 @@ static u8 lowpan_fetch_skb_u8(struct sk_buff
> *skb)
>         return ret;
>  }
>  
> +static u16 lowpan_fetch_skb_u16(struct sk_buff *skb)
> +{
> +       u16 ret;
> +
> +       BUG_ON(skb->len < 2);
> 

	Hmm, check pskb_may_pull(skb, 2) here, or in the caller.

	skb->len >= 2 doesn't mean you can access skb->data[0] and
skb->data[1]: the data might be in a fragment, not in the skb head.

> +
> +       ret = skb->data[0] | (skb->data[1] << 8);
> +       skb_pull(skb, 2);
> +       return ret;
> +}
> +
> +static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device
> *dev);
>  static int lowpan_header_create(struct sk_buff *skb,
>                            struct net_device *dev,
>                            unsigned short type, const void *_daddr,
> @@ -467,9 +497,102 @@ static int lowpan_header_create(struct sk_buff
> *skb,
>                 memcpy(&(sa.hwaddr), saddr, 8);
>  
>                 mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA;
> +
> +               /* frame fragmentation */
> +
> +               /*
> +                * if payload + mac header doesn't fit MTU-sized frame
> +                * we need to fragment it.
> +                */
> +               if (skb->len > (127 - 24)) { /* MTU -
> MAC_HEADER_LENGTH */
> +                       struct sk_buff *fr_skb;
> +                       u16 b_sent = 0;
> +                       unsigned short payload_len = skb->len;
> +                       int stat = 0;
> +
> +                       pr_debug("%s: the frame is too big (0x%x),"
> +                                "fragmentation needed, using tag = 0x
> %x\n",
> +                                __func__, payload_len, fragment_tag);
> +
> +                       fr_skb = skb_copy(skb, GFP_KERNEL);
> 

			GFP_ATOMIC
> 
And I wonder why you use skb_copy(). You are not allowed to change the skb like
that. (When you later skb_push(fr_skb, 4), you are modifying this skb's
data too...)
> 
> +                       if (!fr_skb)
> +                               goto error;
> +
> +                       /* 40-bit - fragment dispatch size */
> +                       head = kzalloc(5, GFP_KERNEL);

			GFP_ATOMIC


> +                       if (!head)
> +                               goto error;
> +
> +                       /* first fagment header */
> +                       head[0] = LOWPAN_DISPATCH_FRAG1 | (payload_len
> & 0x7);
> +                       head[1] = (payload_len >> 3) & 0xff;
> +                       head[2] = fragment_tag & 0xff;
> +                       head[3] = fragment_tag >> 8;
> +
> +
> +                       lowpan_raw_dump_inline(__func__, "first
> header",
> +                                                       head, 4);
> +
> +                       memcpy(skb_push(fr_skb, 4), head, 4);
> +                       skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
> +
> +                       dev_hard_header(fr_skb,
> lowpan_dev_info(dev)->real_dev,
> +                               type, (void *)&da, (void *)&sa,
> fr_skb->len);
> +
> +                       /* send fragment to dev queue */
> +                       dev_queue_xmit(fr_skb);
> +
> +                       /* next fragments headers */
> +                       head[0] |= 0x20;
> +
> +                       lowpan_raw_dump_inline(__func__, "next
> headers",
> +                                                       head, 5);
> +
> +                       while (b_sent < payload_len) {
> +                               /* not the first fragment */
> +                               if (b_sent)
> +                                       skb_pull(skb,
> LOWPAN_FRAG_SIZE);
> +
> +                               pr_debug("%s: preparing fragment %d
> \n",
> +                                   __func__, b_sent /
> LOWPAN_FRAG_SIZE);
> +
> +                               /*
> +                                * create copy of current buffer and
> trim it
> +                                * down to fragment size
> +                                */
> +                               fr_skb = skb_copy(skb, GFP_KERNEL);
> +                               if (!fr_skb)
> +                                       goto error;
> +
> +                               skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
> +
> +                               /* add fragment header */
> +                               head[4] = b_sent / 8;
> +                               memcpy(skb_push(fr_skb, 5), head, 5);
> +
> +                               b_sent += LOWPAN_FRAG_SIZE;
> +
> +                               lowpan_raw_dump_table(__func__,
> +                                  "fragment data", fr_skb->data,
> fr_skb->len);
> +
> +                               stat = dev_hard_header(fr_skb,
> +                                       lowpan_dev_info(dev)->real_dev, type,
> +                                       (void *)&da, (void *)&sa,
> fr_skb->len);
> +
> +                               dev_queue_xmit(fr_skb);
> +                       }
> +
> +                       /* TODO: what's the correct way to skip
> default skb? */
> +
> +                       fragment_tag++;
> +                       return stat;
> +               } else
>                 return dev_hard_header(skb,
> lowpan_dev_info(dev)->real_dev,
>                                 type, (void *)&da, (void *)&sa,
> skb->len);
>         }
> +error:
> +       kfree_skb(skb);
> +       return -ENOMEM;
>  }
>  
>  static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr
> *hdr)
> @@ -511,6 +634,23 @@ static int lowpan_skb_deliver(struct sk_buff
> *skb, struct ipv6hdr *hdr)
>         return stat;
>  }
>  
> +static void lowpan_fragment_timer_expired(unsigned long tag)
> +{
> +       struct lowpan_fragment *entry, *tmp;
> +
> +       pr_debug("%s: timer expired for frame with tag %lu\n",
> __func__, tag);
> +
> +       mutex_lock(&flist_lock);
> 
> 
	A mutex_lock() is not allowed in this context (softirq).
You must use a spinlock.
> 
> 
> +       list_for_each_entry_safe(entry, tmp, &lowpan_fragments, list)
> +               if (entry->tag == tag) {
> 
> 
	Since you have a timer per entry, instead of doing a lookup to find
'tag', you could just make 'tag' the pointer to your "struct
lowpan_fragment".

> 
> +                       list_del(&entry->list);
> +                       kfree(entry->data);
> +                       kfree(entry);
> +                       break;
> +               }
> +       mutex_unlock(&flist_lock);
> +}
> 

	struct lowpan_fragment *entry = (struct lowpan_fragment *)tag;
	spin_lock();
	list_del(&entry->list);
	kfree(entry->data);
	kfree(entry);
	spin_unlock();
> 
> +
>  static int
>  lowpan_process_data(struct sk_buff *skb)
>  {
> @@ -525,6 +665,139 @@ lowpan_process_data(struct sk_buff *skb)
>         if (skb->len < 2)
>                 goto drop;
>         iphc0 = lowpan_fetch_skb_u8(skb);
> +
> +       /* fragments assmebling */
> +       switch (iphc0 & 0xf8) {

	What does 0xf8 mean? Please use a macro or something...
> 
> +       /* first fragment of the frame */
> +       case LOWPAN_DISPATCH_FRAG1:
> +       {
> +               struct lowpan_fragment *entry, *frame;
> +               u16 tag;
> +
> +               lowpan_raw_dump_inline(__func__, "first frame fragment
> header",
> +                                                               skb->data, 3);
> +
> +               tmp = lowpan_fetch_skb_u8(skb);
> +               tag = lowpan_fetch_skb_u16(skb);
> +
> +               /*
> +                * check if frame assembling with the same tag is
> +                * already in progress
> +                */
> +               rcu_read_lock();
> +               list_for_each_entry_rcu(entry, &lowpan_fragments,
> list)
> +                       if (entry->tag == tag) {
> +                               pr_debug("%s ERROR: frame with this
> tag is"
> +                                        "alredy in assembling",
> __func__);
> +                               goto drop_rcu;
> +                       }
> +               rcu_read_unlock();
> +
> +               /* alloc new frame structure */
> +               frame = kzalloc(sizeof(struct lowpan_fragment),
> GFP_KERNEL);
> 
> 
   	GFP_ATOMIC
> 
> +               if (!frame)
> +                       goto drop;
> +
> +               INIT_LIST_HEAD(&frame->list);
> +
> +               frame->bytes_rcv = 0;
> +               frame->length = (iphc0 & 7) | (tmp << 3);
> +               frame->tag = tag;
> +               /* allocate buffer for frame assembling */
> +               frame->data = kzalloc(frame->length, GFP_KERNEL);
> 
> 
		GFP_ATOMIC

> +               if (!frame->data) {
> +                       kfree(frame);
> +                       goto drop;
> +               }
> +
> +               pr_debug("%s: frame to be assembled: length = 0x%x, "
> +                        "tag = 0x%x\n", __func__, frame->length,
> frame->tag);
> +
> +               init_timer(&frame->timer);
> +               /* (number of fragments) * (fragment processing
> time-out) */
> +               frame->timer.expires = jiffies +
> +                 (frame->length / LOWPAN_FRAG_SIZE + 1) *
> LOWPAN_FRAG_TIMEOUT;
> +               frame->timer.data = tag;
> +               frame->timer.function = lowpan_fragment_timer_expired;
> +
> +               add_timer(&frame->timer);
> +
> +               mutex_lock(&flist_lock);
> +               list_add_tail(&frame->list, &lowpan_fragments);
> +               mutex_unlock(&flist_lock);
> +
> +               return kfree_skb(skb), 0;
> +       }
> +       /* second and next fragment of the frame */
> +       case LOWPAN_DISPATCH_FRAGN:
> +       {
> +               u16 tag;
> +               struct lowpan_fragment *entry, *t;
> +
> +               lowpan_raw_dump_inline(__func__, "next fragment
> header",
> +                                       skb->data, 4);
> +
> +               lowpan_fetch_skb_u8(skb); /* skip frame length byte */
> +               tag = lowpan_fetch_skb_u16(skb);
> +
> +               rcu_read_lock();
> +               list_for_each_entry_rcu(entry, &lowpan_fragments,
> list)
> +                       if (entry->tag == tag)
> +                               break;
> +               rcu_read_unlock();
> +
> +               if (entry->tag != tag) {
> +                       pr_debug("%s ERROR: no frame structure found
> for this"
> +                                "fragment", __func__);
> +                       goto drop;
> +               }
> +
> +               tmp = lowpan_fetch_skb_u8(skb); /* fetch offset */
> +
> +               lowpan_raw_dump_table(__func__, "next fragment
> payload",
> +                                       skb->data, skb->len);
> +
> +               /* if payload fits buffer, copy it */
> +               if ((tmp * 8 + skb->len) <= entry->length) /* TODO:
> likely? */
> +                       memcpy(entry->data + tmp * 8, skb->data,
> skb->len);
> +               else
> +                       goto drop;
> +
> +               entry->bytes_rcv += skb->len;
> +
> +               pr_debug("%s: frame length = 0x%x, bytes received = 0x
> %x\n",
> +                        __func__, entry->length, entry->bytes_rcv);
> +
> +               /* frame assembling complete */
> +               if (entry->bytes_rcv == entry->length) {
> +                       struct sk_buff *tmp = skb;
> +
> +                       mutex_lock(&flist_lock);
> +                       list_for_each_entry_safe(entry, t,
> &lowpan_fragments, list)
> +                               if (entry->tag == tag) {
> +                                       list_del(&entry->list);
> +                                       /* copy and clear skb */
> +                                       skb = skb_copy_expand(skb,
> entry->length, skb_tailroom(skb), GFP_KERNEL);
> +                                       skb_pull(skb, skb->len);
> +                                       /* copy new data to skb */
> +                                       memcpy(skb_push(skb,
> entry->length), entry->data, entry->length);
> +                                       kfree_skb(tmp);
> +                                       del_timer(&entry->timer);
> +                                       kfree(entry->data);
> +                                       kfree(entry);
> +
> +                                       iphc0 =
> lowpan_fetch_skb_u8(skb);
> +                                       break;
> +                               }
> +                       mutex_unlock(&flist_lock);
> +                       break;
> +               }
> +               return kfree_skb(skb), 0;
> +       }
> +       default:
> +               break;
> +       }
> +
>         iphc1 = lowpan_fetch_skb_u8(skb);
>  
>         _saddr = mac_cb(skb)->sa.hwaddr;
> @@ -674,6 +947,8 @@ lowpan_process_data(struct sk_buff *skb)
>         lowpan_raw_dump_table(__func__, "raw header dump", (u8 *)&hdr,
>                                                         sizeof(hdr));
>         return lowpan_skb_deliver(skb, &hdr);
> +drop_rcu:
> +       rcu_read_unlock();
>  drop:
>         kfree(skb);
>         return -EINVAL;
> @@ -765,8 +1040,15 @@ static int lowpan_rcv(struct sk_buff *skb,
> struct net_device *dev,
>                 goto drop;
>  
>         /* check that it's our buffer */
> -       if ((skb->data[0] & 0xe0) == 0x60)
> +       switch (skb->data[0] & 0xe0) {
> +       case 0x60:              /* ipv6 datagram */
> +       case 0xc0:              /* first fragment header */
> +       case 0xe0:              /* next fragments headers */
>                 lowpan_process_data(skb);
> +               break;
> +       default:
> +               break;
> +       }
>  
>         return NET_RX_SUCCESS;
>  
> @@ -793,6 +1075,8 @@ static int lowpan_newlink(struct net *src_net,
> struct net_device *dev,
>         lowpan_dev_info(dev)->real_dev = real_dev;
>         mutex_init(&lowpan_dev_info(dev)->dev_list_mtx);
>  
> +       mutex_init(&flist_lock);
> 
> 
	Doing this init each time a link is set up is wrong.
	Do it once.
> 
> +
>         entry = kzalloc(sizeof(struct lowpan_dev_record), GFP_KERNEL);
>         if (!entry)
>                 return -ENOMEM;
> diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h
> index 5d8cf80..e8e57c4 100644
> --- a/net/ieee802154/6lowpan.h
> +++ b/net/ieee802154/6lowpan.h
> @@ -159,6 +159,9 @@
>  #define LOWPAN_DISPATCH_FRAG1  0xc0 /* 11000xxx */
>  #define LOWPAN_DISPATCH_FRAGN  0xe0 /* 11100xxx */
>  
> +#define LOWPAN_FRAG_SIZE       40              /* fragment payload
> size */
> +#define LOWPAN_FRAG_TIMEOUT    (HZ * 2)        /* processing time: 2
> sec */
> +
>  /*
>   * Values of fields within the IPHC encoding first byte
>   * (C stands for compressed and I for inline)
> -- 
> 1.7.2.5
> 

^ permalink raw reply

* Re: [RFC PATCH 0/5] SUNRPC: "RPC pipefs per network namespace" preparations
From: bfields-uC3wQj2KruNg9hUCZPvPmw @ 2011-10-20 12:32 UTC (permalink / raw)
  To: Stanislav Kinsbursky
  Cc: Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Pavel Emelianov, neilb-l3A5Bk7waGM@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org,
	devel-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org
In-Reply-To: <4EA000C6.1040502-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

On Thu, Oct 20, 2011 at 03:06:46PM +0400, Stanislav Kinsbursky wrote:
> Guys, please, spend some of your expensive time to review this patch-set briefly.

I'll try to take a look soon, but I'm travelling tomorrow through the
31st, and things will be a little hectic.

Just one quick comment:

> >The only problem about I'm not sure how to solve properly yet, is auth gss
> >pipes creations operations. Hoping for some help with it.

I suspect one reason it may be a little complicated is the
upcall-version switching.  The old version is deprecated, and there's no
need to support the combination of the old version with a new
feature like containers.  And now that it's been there a while, the
version-switching code has already achieved its goal of avoiding a flag day.
So, one approach might be:

	- move all the code for the old gss upcall and for the version
	  switching under a new CONFIG_DEPRECATED_GSS, or similar.
	- print a warning if the old stuff is used, and plan to rip it
	  out completely in a future kernel version.
	- do something that works just in the !CONFIG_DEPRECATED_GSS
	  case.

Would that help?

--b.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] ipvs: Fix compilation error in ip_vs.h for ip_vs_confirm_conntrack function.
From: Krzysztof Wilczynski @ 2011-10-20 12:18 UTC (permalink / raw)
  To: Simon Horman; +Cc: Patrick McHardy, netdev

This is to address the following error during the compilation:

  In file included from kernel/sysctl_binary.c:6:
  include/net/ip_vs.h:1406: error: expected identifier or ‘(’ before ‘{’ token
  make[1]: *** [kernel/sysctl_binary.o] Error 1
  make[1]: *** Waiting for unfinished jobs....

That manifests itself when CONFIG_IP_VS_NFCT is undefined in .config file.

Signed-off-by: Krzysztof Wilczynski <krzysztof.wilczynski@linux.com>
---
 include/net/ip_vs.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 139784e..de527d1 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1395,7 +1395,7 @@ static inline void ip_vs_update_conntrack(struct sk_buff *skb,
 {
 }
 
-static inline int ip_vs_confirm_conntrack(struct sk_buff *skb);
+static inline int ip_vs_confirm_conntrack(struct sk_buff *skb)
 {
 	return NF_ACCEPT;
 }
-- 
1.7.7

^ permalink raw reply related

* [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Alexander Smirnov @ 2011-10-20 11:17 UTC (permalink / raw)
  To: davem
  Cc: dbaryshkov, slapin, linux-zigbee-devel, netdev, eric.dumazet,
	kernel-janitors, jonsmirl, alex.bluesman.smirnov

[-- Attachment #1: Type: text/plain, Size: 567 bytes --]

Hello everybody,

below is the patch which adds support for fragmentation in 6LoWPAN
point to point networks. This is needed because of the difference
in MTU: 1280 for IPv6 and 128 for IEEE 802.15.4.

This patch is just a draft. Could anyone please look at
it and let me know your opinion.

The most doubtful moments for me are:
1. Should the list 'frag_list' and the mutex 'flist_lock' be
included into dev private data?
2. Can I use 'dev_queue_xmit' to send fragments to queue?
3. Creating new 'skb' instead of copying and modifying main one.

With best regards,
Alexander


[-- Attachment #2: 0001-6LoWPAN-fragmentation-support.patch --]
[-- Type: text/plain, Size: 10375 bytes --]

>From 48472bae269b7b1a4047967ec21eadb217c4fd6d Mon Sep 17 00:00:00 2001
From: Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
Date: Thu, 20 Oct 2011 15:02:36 +0400
Subject: [PATCH] 6LoWPAN fragmentation support

Signed-off-by: Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
---
 net/ieee802154/6lowpan.c |  286 +++++++++++++++++++++++++++++++++++++++++++++-
 net/ieee802154/6lowpan.h |    3 +
 2 files changed, 288 insertions(+), 1 deletions(-)

diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index 96877bd..1923ec7 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -113,6 +113,24 @@ struct lowpan_dev_record {
 	struct list_head list;
 };
 
+struct lowpan_fragment {
+	u8 in_progress;			/* assembling is in progress */
+	struct sk_buff *skb;		/* skb to be assembled */
+	u8 *data;			/* data to be stored */
+	struct mutex lock;		/* concurency lock */
+	u16 length;			/* frame length to be assemled */
+	u32 bytes_rcv;			/* bytes received */
+	u16 tag;			/* current fragment tag */
+	struct timer_list timer;	/* assembling timer */
+	struct list_head list;		/* fragments list handler */	
+};
+
+static unsigned short fragment_tag;
+
+/* TODO: bind mutex and list to device */
+static LIST_HEAD(lowpan_fragments);
+struct mutex flist_lock;
+
 static inline struct
 lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
 {
@@ -244,6 +262,18 @@ static u8 lowpan_fetch_skb_u8(struct sk_buff *skb)
 	return ret;
 }
 
+static u16 lowpan_fetch_skb_u16(struct sk_buff *skb)
+{
+	u16 ret;
+
+	BUG_ON(skb->len < 2);
+
+	ret = skb->data[0] | (skb->data[1] << 8);
+	skb_pull(skb, 2);
+	return ret;
+}
+
+static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev);
 static int lowpan_header_create(struct sk_buff *skb,
 			   struct net_device *dev,
 			   unsigned short type, const void *_daddr,
@@ -467,9 +497,102 @@ static int lowpan_header_create(struct sk_buff *skb,
 		memcpy(&(sa.hwaddr), saddr, 8);
 
 		mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA;
+
+		/* frame fragmentation */
+
+		/*
+		 * if payload + mac header doesn't fit MTU-sized frame
+		 * we need to fragment it.
+		 */
+		if (skb->len > (127 - 24)) { /* MTU - MAC_HEADER_LENGTH */
+			struct sk_buff *fr_skb;
+			u16 b_sent = 0;
+			unsigned short payload_len = skb->len;
+			int stat = 0;
+
+			pr_debug("%s: the frame is too big (0x%x),"
+				 "fragmentation needed, using tag = 0x%x\n",
+				 __func__, payload_len, fragment_tag);
+
+			fr_skb = skb_copy(skb, GFP_KERNEL);
+			if (!fr_skb)
+				goto error;
+
+			/* 40-bit - fragment dispatch size */
+			head = kzalloc(5, GFP_KERNEL);
+			if (!head)
+				goto error;
+
+			/* first fagment header */
+			head[0] = LOWPAN_DISPATCH_FRAG1 | (payload_len & 0x7);
+			head[1] = (payload_len >> 3) & 0xff;
+			head[2] = fragment_tag & 0xff;
+			head[3] = fragment_tag >> 8;
+
+
+			lowpan_raw_dump_inline(__func__, "first header",
+							head, 4);
+
+			memcpy(skb_push(fr_skb, 4), head, 4);
+			skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
+
+			dev_hard_header(fr_skb, lowpan_dev_info(dev)->real_dev,
+				type, (void *)&da, (void *)&sa, fr_skb->len);
+
+			/* send fragment to dev queue */
+			dev_queue_xmit(fr_skb);
+
+			/* next fragments headers */
+			head[0] |= 0x20;
+
+			lowpan_raw_dump_inline(__func__, "next headers",
+							head, 5);
+
+			while (b_sent < payload_len) {
+				/* not the first fragment */
+				if (b_sent)
+					skb_pull(skb, LOWPAN_FRAG_SIZE);
+
+				pr_debug("%s: preparing fragment %d\n",
+				    __func__, b_sent / LOWPAN_FRAG_SIZE);
+
+				/*
+				 * create copy of current buffer and trim it
+				 * down to fragment size
+				 */
+				fr_skb = skb_copy(skb, GFP_KERNEL);
+				if (!fr_skb)
+					goto error;
+
+				skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
+
+				/* add fragment header */
+				head[4] = b_sent / 8;
+				memcpy(skb_push(fr_skb, 5), head, 5);
+
+				b_sent += LOWPAN_FRAG_SIZE;
+
+				lowpan_raw_dump_table(__func__,
+				   "fragment data", fr_skb->data, fr_skb->len);
+
+				stat = dev_hard_header(fr_skb,
+					lowpan_dev_info(dev)->real_dev, type,
+					(void *)&da, (void *)&sa, fr_skb->len);
+
+				dev_queue_xmit(fr_skb);
+			}
+
+			/* TODO: what's the correct way to skip default skb? */
+
+			fragment_tag++;
+			return stat;
+		} else
 		return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev,
 				type, (void *)&da, (void *)&sa, skb->len);
 	}
+error:
+	kfree_skb(skb);
+	return -ENOMEM;
 }
 
 static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr)
@@ -511,6 +634,23 @@ static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr)
 	return stat;
 }
 
+static void lowpan_fragment_timer_expired(unsigned long tag)
+{
+	struct lowpan_fragment *entry, *tmp;
+
+	pr_debug("%s: timer expired for frame with tag %lu\n", __func__, tag);
+
+	mutex_lock(&flist_lock);
+	list_for_each_entry_safe(entry, tmp, &lowpan_fragments, list)
+		if (entry->tag == tag) {
+			list_del(&entry->list);
+			kfree(entry->data);
+			kfree(entry);
+			break;
+		}
+	mutex_unlock(&flist_lock);
+}
+
 static int
 lowpan_process_data(struct sk_buff *skb)
 {
@@ -525,6 +665,139 @@ lowpan_process_data(struct sk_buff *skb)
 	if (skb->len < 2)
 		goto drop;
 	iphc0 = lowpan_fetch_skb_u8(skb);
+
+	/* fragments assmebling */
+	switch (iphc0 & 0xf8) {
+	/* first fragment of the frame */
+	case LOWPAN_DISPATCH_FRAG1:
+	{
+		struct lowpan_fragment *entry, *frame;
+		u16 tag;
+
+		lowpan_raw_dump_inline(__func__, "first frame fragment header",
+								skb->data, 3);
+
+		tmp = lowpan_fetch_skb_u8(skb);
+		tag = lowpan_fetch_skb_u16(skb);
+
+		/*
+		 * check if frame assembling with the same tag is
+		 * already in progress
+		 */
+		rcu_read_lock();
+		list_for_each_entry_rcu(entry, &lowpan_fragments, list)
+			if (entry->tag == tag) {
+				pr_debug("%s ERROR: frame with this tag is"
+					 "alredy in assembling", __func__);
+				goto drop_rcu;
+			}
+		rcu_read_unlock();
+
+		/* alloc new frame structure */
+		frame = kzalloc(sizeof(struct lowpan_fragment), GFP_KERNEL);
+		if (!frame)
+			goto drop;
+
+		INIT_LIST_HEAD(&frame->list);
+
+		frame->bytes_rcv = 0;
+		frame->length = (iphc0 & 7) | (tmp << 3);
+		frame->tag = tag;
+		/* allocate buffer for frame assembling */
+		frame->data = kzalloc(frame->length, GFP_KERNEL);
+		if (!frame->data) {
+			kfree(frame);
+			goto drop;
+		}
+
+		pr_debug("%s: frame to be assembled: length = 0x%x, "
+			 "tag = 0x%x\n", __func__, frame->length, frame->tag);
+
+		init_timer(&frame->timer);
+		/* (number of fragments) * (fragment processing time-out) */
+		frame->timer.expires = jiffies +
+		  (frame->length / LOWPAN_FRAG_SIZE + 1) * LOWPAN_FRAG_TIMEOUT;
+		frame->timer.data = tag;
+		frame->timer.function = lowpan_fragment_timer_expired;
+
+		add_timer(&frame->timer);
+
+		mutex_lock(&flist_lock);
+		list_add_tail(&frame->list, &lowpan_fragments);
+		mutex_unlock(&flist_lock);
+
+		return kfree_skb(skb), 0;
+	}
+	/* second and next fragment of the frame */
+	case LOWPAN_DISPATCH_FRAGN:
+	{
+		u16 tag;
+		struct lowpan_fragment *entry, *t;
+
+		lowpan_raw_dump_inline(__func__, "next fragment header",
+					skb->data, 4);
+
+		lowpan_fetch_skb_u8(skb); /* skip frame length byte */
+		tag = lowpan_fetch_skb_u16(skb);
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(entry, &lowpan_fragments, list)
+			if (entry->tag == tag)
+				break;
+		rcu_read_unlock();
+
+		if (entry->tag != tag) {
+			pr_debug("%s ERROR: no frame structure found for this"
+				 "fragment", __func__);
+			goto drop;
+		}
+
+		tmp = lowpan_fetch_skb_u8(skb); /* fetch offset */
+
+		lowpan_raw_dump_table(__func__, "next fragment payload",
+					skb->data, skb->len);
+
+		/* if payload fits buffer, copy it */
+		if ((tmp * 8 + skb->len) <= entry->length) /* TODO: likely? */
+			memcpy(entry->data + tmp * 8, skb->data, skb->len);
+		else
+			goto drop;
+
+		entry->bytes_rcv += skb->len;
+
+		pr_debug("%s: frame length = 0x%x, bytes received = 0x%x\n",
+			 __func__, entry->length, entry->bytes_rcv);
+
+		/* frame assembling complete */
+		if (entry->bytes_rcv == entry->length) {
+			struct sk_buff *tmp = skb;
+
+			mutex_lock(&flist_lock);
+			list_for_each_entry_safe(entry, t, &lowpan_fragments, list)
+				if (entry->tag == tag) {
+					list_del(&entry->list);
+					/* copy and clear skb */
+					skb = skb_copy_expand(skb, entry->length, skb_tailroom(skb), GFP_KERNEL);
+					skb_pull(skb, skb->len);
+					/* copy new data to skb */
+					memcpy(skb_push(skb, entry->length), entry->data, entry->length);
+					kfree_skb(tmp);
+					del_timer(&entry->timer);
+					kfree(entry->data);
+					kfree(entry);
+
+					iphc0 = lowpan_fetch_skb_u8(skb);
+					break;
+				}
+			mutex_unlock(&flist_lock);
+			break;
+		}
+		return kfree_skb(skb), 0;
+	}
+	default:
+		break;
+	}
+
 	iphc1 = lowpan_fetch_skb_u8(skb);
 
 	_saddr = mac_cb(skb)->sa.hwaddr;
@@ -674,6 +947,8 @@ lowpan_process_data(struct sk_buff *skb)
 	lowpan_raw_dump_table(__func__, "raw header dump", (u8 *)&hdr,
 							sizeof(hdr));
 	return lowpan_skb_deliver(skb, &hdr);
+drop_rcu:
+	rcu_read_unlock();
 drop:
 	kfree(skb);
 	return -EINVAL;
@@ -765,8 +1040,15 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto drop;
 
 	/* check that it's our buffer */
-	if ((skb->data[0] & 0xe0) == 0x60)
+	switch (skb->data[0] & 0xe0) {
+	case 0x60:		/* ipv6 datagram */
+	case 0xc0:		/* first fragment header */
+	case 0xe0:		/* next fragments headers */
 		lowpan_process_data(skb);
+		break;
+	default:
+		break;
+	}
 
 	return NET_RX_SUCCESS;
 
@@ -793,6 +1075,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev,
 	lowpan_dev_info(dev)->real_dev = real_dev;
 	mutex_init(&lowpan_dev_info(dev)->dev_list_mtx);
 
+	mutex_init(&flist_lock);
+
 	entry = kzalloc(sizeof(struct lowpan_dev_record), GFP_KERNEL);
 	if (!entry)
 		return -ENOMEM;
diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h
index 5d8cf80..e8e57c4 100644
--- a/net/ieee802154/6lowpan.h
+++ b/net/ieee802154/6lowpan.h
@@ -159,6 +159,9 @@
 #define LOWPAN_DISPATCH_FRAG1	0xc0 /* 11000xxx */
 #define LOWPAN_DISPATCH_FRAGN	0xe0 /* 11100xxx */
 
+#define LOWPAN_FRAG_SIZE	40		/* fragment payload size */
+#define LOWPAN_FRAG_TIMEOUT	(HZ * 2)	/* processing time: 2 sec */
+
 /*
  * Values of fields within the IPHC encoding first byte
  * (C stands for compressed and I for inline)
-- 
1.7.2.5


^ permalink raw reply related

* Re: [RFC PATCH 0/5] SUNRPC: "RPC pipefs per network namespace" preparations
From: Stanislav Kinsbursky @ 2011-10-20 11:06 UTC (permalink / raw)
  To: Trond.Myklebust@netapp.com
  Cc: linux-nfs@vger.kernel.org, Pavel Emelianov, neilb@suse.de,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	bfields@fieldses.org, davem@davemloft.net, devel@openvz.org
In-Reply-To: <20111017120629.4541.67395.stgit@localhost6.localdomain6>

Guys, please spend some of your valuable time to briefly review this patch-set.
It is not meant to be committed; it is just a presentation of the idea.
I really need some opinions about it, since all my further work around RPC pipefs
depends on it.
IOW, I need to know whether anyone has anything against this idea.
Trond, please respond: does this idea suit you in general or not?

17.10.2011 17:10, Stanislav Kinsbursky пишет:
> Hello to everyone.
> Making the RPC pipefs file system work per network namespace context is required
> prior to any NFS modifications.
> This is one way to do it. I would really appreciate any comments.
>
> There are several statements about how to make RPC pipefs work per network
> namespace context.
> Here they are:
> 1) RPC pipefs should be mounted per network namespace context.
> 2) The RPC pipefs superblock should hold the network namespace while active.
> 3) RPC pipefs lookup and readdir should be performed in the network namespace
> context in which it was mounted. IOW, a user-space process working in another
> network namespace context should see the RPC pipefs dentries from the network
> namespace context in which this mount point was created (as was done for sysfs).
>
> These statements lead to some restrictions which we must follow during
> implementation. Here they are:
> 1) The RPC pipefs mount can't be performed in kernel context, since the new superblock
> will hold a network namespace reference and it's impossible to tell when
> and how we have to release this mount point. IOW, rpc_get_mount() and
> rpc_put_mount() have to be removed.
> 2) RPC pipefs should provide some new helpers to look up the directory dentry for
> those modules which create pipes, because without an RPC pipefs mount point
> a general lookup can't be performed.
> 3) These methods must guarantee that the pipefs superblock will be active during
> pipe creation and destruction.
>
> So, here is the idea of making RPC pipefs work per network namespace context:
> 1) The RPC pipefs superblock should hold the network namespace context while active.
> 2) RPC pipefs should send notification events on superblock creation and
> destruction.
> 3) RPC pipefs should provide a "lookup dentry by name" method for notification
> subscribers.
> 4) RPC pipefs should place a superblock reference on the current network namespace
> context on creation and remove it on destruction.
> 5) RPC pipefs should provide a safe "lookup dentry by name" method for per-net
> operations, which guarantees that the superblock is active while
> per-net operations are being performed.
> 6) Client and cache directory creation and destruction should also be performed
> on superblock creation and destruction notification events. Note: generic
> creation (as done now) can fail (if no superblock has been created yet).
> 7) Pipe creation and destruction should be performed on superblock creation
> and destruction events. Pipe operations should also be performed during
> per-net operations, and in this case they could fail (for the same reason as
> in the statement above).
>
> This patch-set implements the first 5 points and thus doesn't affect the current RPC
> pipefs logic.
>
> The only problem that I'm not yet sure how to solve properly is the auth gss
> pipe creation operations. I'm hoping for some help with it.
>
>
> The following series consists of:
>
> ---
>
> Stanislav Kinsbursky (5):
>        SUNRPC: hold current network namespace while pipefs superblock is active
>        SUNRPC: send notification events on pipefs sb creation and destruction
>        SUNRPC: pipefs dentry lookup helper introduced
>        SUNRPC: put pipefs superblock link on network namespace
>        SUNRPC: pipefs per-net operations helper introduced
>
>
>   include/linux/sunrpc/rpc_pipe_fs.h |   16 ++++++
>   net/sunrpc/netns.h                 |    3 +
>   net/sunrpc/rpc_pipe.c              |  103 ++++++++++++++++++++++++++++++++++++
>   net/sunrpc/sunrpc_syms.c           |    1
>   4 files changed, 122 insertions(+), 1 deletions(-)
>


-- 
Best regards,
Stanislav Kinsbursky

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox