* Re: tun: Use netif_receive_skb instead of netif_rx
From: Herbert Xu @ 2010-05-21 1:16 UTC (permalink / raw)
To: Neil Horman; +Cc: Eric Dumazet, David Miller, bmb, tgraf, nhorman, netdev
In-Reply-To: <20100521010211.GA23671@gondor.apana.org.au>
On Fri, May 21, 2010 at 11:02:11AM +1000, Herbert Xu wrote:
>
> That's what I meant above. My patch will make tun.c do the
> classid update every time it sends out a packet.
Here it is:
tun: Update classid on packet injection
This patch makes tun update its socket classid every time we
inject a packet into the network stack. This is so that any
updates made by the admin to the process writing packets to
tun take effect.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 4326520..a8a9aa8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -525,6 +525,8 @@ static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
struct sk_buff *skb;
int err;
+ sock_update_classid(sk);
+
/* Under a page? Don't bother with paged skb. */
if (prepad + len < PAGE_SIZE || !linear)
linear = len;
diff --git a/net/core/sock.c b/net/core/sock.c
index 8f7fdf8..4969bd1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1055,6 +1055,7 @@ void sock_update_classid(struct sock *sk)
if (classid && classid != sk->sk_classid)
sk->sk_classid = classid;
}
+EXPORT_SYMBOL(sock_update_classid);
#endif
/**
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply related
* gro: Fix bogus gso_size on the first fraglist entry
From: Herbert Xu @ 2010-05-21 2:46 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Igor Zhang
Hi:
gro: Fix bogus gso_size on the first fraglist entry
When GRO produces fraglist entries, and the resulting skb hits
an interface that is incapable of TSO but capable of FRAGLIST,
we end up producing a bogus packet with gso_size non-zero.
This was reported in the field with older versions of KVM that
did not set the TSO bits on tuntap.
This patch fixes that.
Reported-by: Igor Zhang <yugzhang@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 93c4e06..cad8e97 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2729,6 +2729,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
skb_shinfo(nskb)->frag_list = p;
skb_shinfo(nskb)->gso_size = pinfo->gso_size;
+ pinfo->gso_size = 0;
skb_header_release(p);
nskb->prev = p;
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply related
* [PATCH v2] net: fix problem in dequeuing from input_pkt_queue
From: Tom Herbert @ 2010-05-21 4:37 UTC (permalink / raw)
To: davem; +Cc: eric.dumazet, xiaosuo, netdev
Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
The primary issue is that the queue head must be incremented only
after a packet has been processed, that is only after
__netif_receive_skb has been called. This is needed for the mechanism
to prevent OOO packet in RFS. Also when flushing the input_pkt_queue
and process_queue, the process queue should be done first to prevent
OOO packets.
Because the input_pkt_queue has been effectively split into two queues,
the calculation of the tail ptr is no longer correct. The correct value
would be head+input_pkt_queue->len+process_queue->len. To avoid
this calculation we added an explicit input_queue_tail in softnet_data.
The tail value is simply incremented when queuing to input_pkt_queue.
Signed-off-by: Tom Herbert <therbert@google.com>
---
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c3487a6..726b3cb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1403,17 +1403,25 @@ struct softnet_data {
struct softnet_data *rps_ipi_next;
unsigned int cpu;
unsigned int input_queue_head;
+ unsigned int input_queue_tail;
#endif
unsigned dropped;
struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
-static inline void input_queue_head_add(struct softnet_data *sd,
- unsigned int len)
+static inline void input_queue_head_incr(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- sd->input_queue_head += len;
+ sd->input_queue_head++;
+#endif
+}
+
+static inline void input_queue_tail_incr_save(struct softnet_data *sd,
+ unsigned int *qtail)
+{
+#ifdef CONFIG_RPS
+ *qtail = ++sd->input_queue_tail;
#endif
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 6c82065..0aab66d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2426,10 +2426,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
-#ifdef CONFIG_RPS
- *qtail = sd->input_queue_head +
- skb_queue_len(&sd->input_pkt_queue);
-#endif
+ input_queue_tail_incr_save(sd, qtail);
rps_unlock(sd);
local_irq_restore(flags);
return NET_RX_SUCCESS;
@@ -2964,7 +2961,7 @@ static void flush_backlog(void *arg)
if (skb->dev == dev) {
__skb_unlink(skb, &sd->input_pkt_queue);
kfree_skb(skb);
- input_queue_head_add(sd, 1);
+ input_queue_head_incr(sd);
}
}
rps_unlock(sd);
@@ -2973,6 +2970,7 @@ static void flush_backlog(void *arg)
if (skb->dev == dev) {
__skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
+ input_queue_head_incr(sd);
}
}
}
@@ -3328,18 +3326,20 @@ static int process_backlog(struct napi_struct *napi, int quota)
while ((skb = __skb_dequeue(&sd->process_queue))) {
local_irq_enable();
__netif_receive_skb(skb);
- if (++work >= quota)
- return work;
local_irq_disable();
+ input_queue_head_incr(sd);
+ if (++work >= quota) {
+ local_irq_enable();
+ return work;
+ }
}
rps_lock(sd);
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen) {
- input_queue_head_add(sd, qlen);
+ if (qlen)
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
- }
+
if (qlen < quota - work) {
/*
* Inline a custom version of __napi_complete().
@@ -5679,12 +5679,14 @@ static int dev_cpu_callback(struct notifier_block *nfb,
local_irq_enable();
/* Process offline CPU's input_pkt_queue */
- while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
+ while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx(skb);
- input_queue_head_add(oldsd, 1);
+ input_queue_head_incr(oldsd);
}
- while ((skb = __skb_dequeue(&oldsd->process_queue)))
+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx(skb);
+ input_queue_head_incr(oldsd);
+ }
return NOTIFY_OK;
}
^ permalink raw reply related
* Re: linux-next: build failure after merge of the suspend tree
From: David Miller @ 2010-05-21 5:46 UTC (permalink / raw)
To: sfr; +Cc: linville, torvalds, rjw, linux-next, linux-kernel, Helmut.Schaa,
netdev
In-Reply-To: <20100521102913.ae4e8cd2.sfr@canb.auug.org.au>
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 21 May 2010 10:29:13 +1000
> On Sat, 8 May 2010 04:13:24 +0200 "Rafael J. Wysocki" <rjw@sisk.pl> wrote:
>>
>> On Friday 07 May 2010, Stephen Rothwell wrote:
>> Both trees are based on Linus' current and I don't see a good way of fixing
>> this issue in any of them individually.
>
> The suspend tree has been merged into Linus' tree, so this patch is
> needed in the net tree before it is merged (or as part of the merge).
Since the net tree is still on its way to Linus, we'll just have to
wait for him to do that merge.
Then we can sort this out. I don't want to touch a tree that is
already on its way.
Thanks Stephen.
^ permalink raw reply
* Re: tun: Use netif_receive_skb instead of netif_rx
From: David Miller @ 2010-05-21 5:49 UTC (permalink / raw)
To: nhorman; +Cc: herbert, eric.dumazet, bmb, tgraf, nhorman, netdev
In-Reply-To: <20100521003939.GA2223@localhost.localdomain>
From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 20 May 2010 20:39:39 -0400
> On Fri, May 21, 2010 at 09:16:30AM +1000, Herbert Xu wrote:
>> On Thu, May 20, 2010 at 01:29:18PM -0400, Neil Horman wrote:
>> >
>> > So, I'm testing this patch out now, and unfortunately it doesn't seem to be
>> > working. Every frame seems to be holding a classid of 0. Trying to figure out
>> > why now.
>>
>> Not very surprising since tun.c doesn't go through the normal
>> socket interface. I'll send an additional patch for that.
>>
> I don't think that's it. I think it's a chicken and egg situation. I think the
> problem is that tasks can't be assigned to cgroups until they're created, and in
> that time a sock can be created. It's a natural race. If you create a socket
> before you assign it to a cgroup, that socket retains a classid of zero. I'm
> going to try to modify the patch to update sockets owned by tasks when the cgroup
> is assigned.
Neil, you must not be using Herbert's most recent patch.
Either that or you haven't even read it.
Herbert's most recent patch doesn't create this chicken and egg
problem you mention because it explicitly watches for cgroupid changes
at all socket I/O operations including sendmsg() and recvmsg(). And
if it sees a different cgroupid at a socket I/O call, it updates the
cgroupid value in the socket.
So you very much can change the cgroup of the process mid-socket
ownership and it will work.
The only problem is, as Herbert stated, tun. Because it does its
networking I/O directly by calling netif_receive_skb() so it won't
hit any of Herbert's cgroup check points.
^ permalink raw reply
* Re: linux-next: build failure after merge of the suspend tree
From: Eric Dumazet @ 2010-05-21 5:51 UTC (permalink / raw)
To: David Miller
Cc: sfr, linville, torvalds, rjw, linux-next, linux-kernel,
Helmut.Schaa, netdev
In-Reply-To: <20100520.224644.21290162.davem@davemloft.net>
Le jeudi 20 mai 2010 à 22:46 -0700, David Miller a écrit :
> From: Stephen Rothwell <sfr@canb.auug.org.au>
> Date: Fri, 21 May 2010 10:29:13 +1000
>
> > On Sat, 8 May 2010 04:13:24 +0200 "Rafael J. Wysocki" <rjw@sisk.pl> wrote:
> >>
> >> On Friday 07 May 2010, Stephen Rothwell wrote:
> >> Both trees are based on Linus' current and I don't see a good way of fixing
> >> this issue in any of them individually.
> >
> > The suspend tree has been merged into Linus' tree, so this patch is
> > needed in the net tree before it is merged (or as part of the merge).
>
> Since the net tree is still on its way to Linus, we'll just have to
> wait for him to do that merge.
>
> Then we can sort this out. I don't want to touch a tree that is
> already on its way.
Linus merged your tree David.
^ permalink raw reply
* Re: linux-next: build failure after merge of the suspend tree
From: David Miller @ 2010-05-21 5:56 UTC (permalink / raw)
To: eric.dumazet
Cc: sfr, linville, torvalds, rjw, linux-next, linux-kernel,
Helmut.Schaa, netdev
In-Reply-To: <1274421117.4977.9.camel@edumazet-laptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 21 May 2010 07:51:57 +0200
> Linus merged your tree David.
This must have happened in the past hour :-)
Great, I can sort this now.
^ permalink raw reply
* net-2.6 and net-next-2.6 rebased...
From: David Miller @ 2010-05-21 6:00 UTC (permalink / raw)
To: netdev
I have rebased these two trees now that Linus has pulled in the
net-next-2.6 merge.
We will take fixes and things like new drivers, device ID additions
etc. into net-2.6
After 2.6.35-rc1 is released, I will start adding cleanups and new
feature changes into net-next-2.6
Thanks.
^ permalink raw reply
* Re: [net-next PATCH v2] ixgbe:add support for a new 82599 10G Base-T device
From: David Miller @ 2010-05-21 6:07 UTC (permalink / raw)
To: jeffrey.t.kirsher; +Cc: netdev, gospo, mallikarjuna.chilakala
In-Reply-To: <20100519224607.2802.52952.stgit@localhost.localdomain>
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Wed, 19 May 2010 15:46:09 -0700
> From: Mallikarjuna R Chilakala <mallikarjuna.chilakala@intel.com>
>
> This adds support for a new copper device for 82599, device id 0x151c.
> This 82599 10GBase-T device uses the PHY's internal temperature sensor
> to guard against over-temp conditions. In this scenario the PHY will be
> put in a low power mode and link will no longer be able to transmit or
> receive any data. When this occurs, the over-temp interrupt is latched
> and driver logs this error message. A HW reset or power cycle is
> required to clear this status.
>
> Signed-off-by: Mallikarjuna R Chilakala <mallikarjuna.chilakala@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Applied, thanks.
^ permalink raw reply
* Re: gro: Fix bogus gso_size on the first fraglist entry
From: David Miller @ 2010-05-21 6:08 UTC (permalink / raw)
To: herbert; +Cc: netdev, yugzhang
In-Reply-To: <20100521024629.GA24700@gondor.apana.org.au>
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 21 May 2010 12:46:29 +1000
> gro: Fix bogus gso_size on the first fraglist entry
>
> When GRO produces fraglist entries, and the resulting skb hits
> an interface that is incapable of TSO but capable of FRAGLIST,
> we end up producing a bogus packet with gso_size non-zero.
>
> This was reported in the field with older versions of KVM that
> did not set the TSO bits on tuntap.
>
> This patch fixes that.
>
> Reported-by: Igor Zhang <yugzhang@redhat.com>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Applied and queued up for -stable, thanks!
^ permalink raw reply
* Re: [PATCH 0/3] netfilter: netfilter fixes
From: David Miller @ 2010-05-21 6:12 UTC (permalink / raw)
To: kaber; +Cc: netfilter-devel, netdev
In-Reply-To: <1274371246-26760-1-git-send-email-kaber@trash.net>
From: kaber@trash.net
Date: Thu, 20 May 2010 18:00:43 +0200
> following are three fixes for netfilter:
>
> - handling of non-linear skbs in the SIP conntrack helper. This fixes tracking
> failures when running on the same machine as a SIP application that is
> producing non-linear skbs. Long term this should be fixed by using the string
> search API, but that is a bigger piece of work.
>
> - fix for a race condition in nf_conntrack that might lead to conntrack entries
> marked as dead entering the hash tables, blocking new connections with similar
> keys.
>
> - a fix for an incorrect comment about the checkentry return conventions.
>
> Please apply or pull from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git master
Pulled, thanks Patrick.
^ permalink raw reply
* Re: [PATCH] sh_eth: Fix memleak in sh_mdio_release
From: David Miller @ 2010-05-21 6:14 UTC (permalink / raw)
To: iwamatsu; +Cc: dkirjanov, shimoda.yoshihiro, morimoto.kuninori, netdev
In-Reply-To: <AANLkTikXzLHCwIFashAm1Z22Mw6n0TxC0AcBc4ER7zI6@mail.gmail.com>
From: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
Date: Fri, 21 May 2010 07:12:21 +0900
> 2010/5/20 Denis Kirjanov <dkirjanov@hera.kernel.org>:
>> Allocated memory for IRQs should be freed when releasing the mii_bus
>>
>> Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
...
> Acked-by: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
Applied, thanks everyone.
^ permalink raw reply
* linux-next: manual merge of the driver-core tree with the net tree
From: Stephen Rothwell @ 2010-05-21 6:21 UTC (permalink / raw)
To: Greg KH
Cc: linux-next, linux-kernel, Tom Herbert, David Miller, netdev,
Eric W. Biederman
Hi Greg,
Today's linux-next merge of the driver-core tree got a conflict in
net/core/net-sysfs.c between commits
0a9627f2649a02bea165cfd529d7bcb625c2fcad ("rps: Receive Packet Steering")
and fec5e652e58fa6017b2c9e06466cb2a6538de5b4 ("rfs: Receive Flow
Steering") from the net tree and commits
bc28c84244da26bafb0d3bce95ef45212b31c6b8 ("net/sysfs: Fix the bitrot in
network device kobject namespace support") and
83dc0fbf37495691219d019ec16b40d8592d2956 ("net: Expose all network
devices in a namespaces in sysfs") from the driver-core tree.
I fixed it up (I think - see below) and can carry the fix as necessary.
--
Cheers,
Stephen Rothwell sfr@canb.auug.org.au
diff --cc net/core/net-sysfs.c
index c57c4b2,46add45..0000000
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@@ -14,10 -14,11 +14,12 @@@
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/slab.h>
+ #include <linux/nsproxy.h>
#include <net/sock.h>
+ #include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/wireless.h>
+#include <linux/vmalloc.h>
#include <net/wext.h>
#include "net-sysfs.h"
@@@ -467,307 -468,40 +469,339 @@@ static struct attribute_group wireless_
.attrs = wireless_attrs,
};
#endif
+
+#ifdef CONFIG_RPS
+/*
+ * RX queue sysfs structures and functions.
+ */
+struct rx_queue_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr, char *buf);
+ ssize_t (*store)(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_rx_queue_attr(_attr) container_of(_attr, \
+ struct rx_queue_attribute, attr)
+
+#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
+
+static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(queue, attribute, buf, count);
+}
+
+static struct sysfs_ops rx_queue_sysfs_ops = {
+ .show = rx_queue_attr_show,
+ .store = rx_queue_attr_store,
+};
+
+static ssize_t show_rps_map(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attribute, char *buf)
+{
+ struct rps_map *map;
+ cpumask_var_t mask;
+ size_t len = 0;
+ int i;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rcu_read_lock();
+ map = rcu_dereference(queue->rps_map);
+ if (map)
+ for (i = 0; i < map->len; i++)
+ cpumask_set_cpu(map->cpus[i], mask);
+
+ len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+ if (PAGE_SIZE - len < 3) {
+ rcu_read_unlock();
+ free_cpumask_var(mask);
+ return -EINVAL;
+ }
+ rcu_read_unlock();
+
+ free_cpumask_var(mask);
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
+static void rps_map_release(struct rcu_head *rcu)
+{
+ struct rps_map *map = container_of(rcu, struct rps_map, rcu);
+
+ kfree(map);
+}
+
+static ssize_t store_rps_map(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct rps_map *old_map, *map;
+ cpumask_var_t mask;
+ int err, cpu, i;
+ static DEFINE_SPINLOCK(rps_map_lock);
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
+ map = kzalloc(max_t(unsigned,
+ RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
+ GFP_KERNEL);
+ if (!map) {
+ free_cpumask_var(mask);
+ return -ENOMEM;
+ }
+
+ i = 0;
+ for_each_cpu_and(cpu, mask, cpu_online_mask)
+ map->cpus[i++] = cpu;
+
+ if (i)
+ map->len = i;
+ else {
+ kfree(map);
+ map = NULL;
+ }
+
+ spin_lock(&rps_map_lock);
+ old_map = queue->rps_map;
+ rcu_assign_pointer(queue->rps_map, map);
+ spin_unlock(&rps_map_lock);
+
+ if (old_map)
+ call_rcu(&old_map->rcu, rps_map_release);
+
+ free_cpumask_var(mask);
+ return len;
+}
+
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr,
+ char *buf)
+{
+ struct rps_dev_flow_table *flow_table;
+ unsigned int val = 0;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(queue->rps_flow_table);
+ if (flow_table)
+ val = flow_table->mask + 1;
+ rcu_read_unlock();
+
+ return sprintf(buf, "%u\n", val);
+}
+
+static void rps_dev_flow_table_release_work(struct work_struct *work)
+{
+ struct rps_dev_flow_table *table = container_of(work,
+ struct rps_dev_flow_table, free_work);
+
+ vfree(table);
+}
+
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+ struct rps_dev_flow_table *table = container_of(rcu,
+ struct rps_dev_flow_table, rcu);
+
+ INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
+ schedule_work(&table->free_work);
+}
+
+static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int count;
+ char *endp;
+ struct rps_dev_flow_table *table, *old_table;
+ static DEFINE_SPINLOCK(rps_dev_flow_lock);
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ count = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ return -EINVAL;
+
+ if (count) {
+ int i;
+
+ if (count > 1<<30) {
+ /* Enforce a limit to prevent overflow */
+ return -EINVAL;
+ }
+ count = roundup_pow_of_two(count);
+ table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
+ if (!table)
+ return -ENOMEM;
+
+ table->mask = count - 1;
+ for (i = 0; i < count; i++)
+ table->flows[i].cpu = RPS_NO_CPU;
+ } else
+ table = NULL;
+
+ spin_lock(&rps_dev_flow_lock);
+ old_table = queue->rps_flow_table;
+ rcu_assign_pointer(queue->rps_flow_table, table);
+ spin_unlock(&rps_dev_flow_lock);
+
+ if (old_table)
+ call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+
+ return len;
+}
+
+static struct rx_queue_attribute rps_cpus_attribute =
+ __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+
+
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
+ __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+ show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+
+static struct attribute *rx_queue_default_attrs[] = {
+ &rps_cpus_attribute.attr,
+ &rps_dev_flow_table_cnt_attribute.attr,
+ NULL
+};
+
+static void rx_queue_release(struct kobject *kobj)
+{
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+ struct netdev_rx_queue *first = queue->first;
+
+ if (queue->rps_map)
+ call_rcu(&queue->rps_map->rcu, rps_map_release);
+
+ if (queue->rps_flow_table)
+ call_rcu(&queue->rps_flow_table->rcu,
+ rps_dev_flow_table_release);
+
+ if (atomic_dec_and_test(&first->count))
+ kfree(first);
+}
+
+static struct kobj_type rx_queue_ktype = {
+ .sysfs_ops = &rx_queue_sysfs_ops,
+ .release = rx_queue_release,
+ .default_attrs = rx_queue_default_attrs,
+};
+
+static int rx_queue_add_kobject(struct net_device *net, int index)
+{
+ struct netdev_rx_queue *queue = net->_rx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error = 0;
+
+ kobj->kset = net->queues_kset;
+ error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
+ "rx-%u", index);
+ if (error) {
+ kobject_put(kobj);
+ return error;
+ }
+
+ kobject_uevent(kobj, KOBJ_ADD);
+
+ return error;
+}
+
+static int rx_queue_register_kobjects(struct net_device *net)
+{
+ int i;
+ int error = 0;
+
+ net->queues_kset = kset_create_and_add("queues",
+ NULL, &net->dev.kobj);
+ if (!net->queues_kset)
+ return -ENOMEM;
+ for (i = 0; i < net->num_rx_queues; i++) {
+ error = rx_queue_add_kobject(net, i);
+ if (error)
+ break;
+ }
+
+ if (error)
+ while (--i >= 0)
+ kobject_put(&net->_rx[i].kobj);
+
+ return error;
+}
+
+static void rx_queue_remove_kobjects(struct net_device *net)
+{
+ int i;
+
+ for (i = 0; i < net->num_rx_queues; i++)
+ kobject_put(&net->_rx[i].kobj);
+ kset_unregister(net->queues_kset);
+}
+#endif /* CONFIG_RPS */
#endif /* CONFIG_SYSFS */
+ static const void *net_current_ns(void)
+ {
+ return current->nsproxy->net_ns;
+ }
+
+ static const void *net_initial_ns(void)
+ {
+ return &init_net;
+ }
+
+ static const void *net_netlink_ns(struct sock *sk)
+ {
+ return sock_net(sk);
+ }
+
+ static struct kobj_ns_type_operations net_ns_type_operations = {
+ .type = KOBJ_NS_TYPE_NET,
+ .current_ns = net_current_ns,
+ .netlink_ns = net_netlink_ns,
+ .initial_ns = net_initial_ns,
+ };
+
+ static void net_kobj_ns_exit(struct net *net)
+ {
+ kobj_ns_exit(KOBJ_NS_TYPE_NET, net);
+ }
+
+ static struct pernet_operations kobj_net_ops = {
+ .exit = net_kobj_ns_exit,
+ };
+
+
#ifdef CONFIG_HOTPLUG
static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
{
@@@ -826,13 -566,6 +866,10 @@@ void netdev_unregister_kobject(struct n
kobject_get(&dev->kobj);
- if (!net_eq(dev_net(net), &init_net))
- return;
-
+#ifdef CONFIG_RPS
+ rx_queue_remove_kobjects(net);
+#endif
+
device_del(dev);
}
@@@ -841,8 -574,8 +878,9 @@@ int netdev_register_kobject(struct net_
{
struct device *dev = &(net->dev);
const struct attribute_group **groups = net->sysfs_groups;
+ int error = 0;
+ device_initialize(dev);
dev->class = &net_class;
dev->platform_data = net;
dev->groups = groups;
@@@ -865,22 -598,7 +903,19 @@@
#endif
#endif /* CONFIG_SYSFS */
- if (!net_eq(dev_net(net), &init_net))
- return 0;
-
- return device_add(dev);
+ error = device_add(dev);
+ if (error)
+ return error;
+
+#ifdef CONFIG_RPS
+ error = rx_queue_register_kobjects(net);
+ if (error) {
+ device_del(dev);
+ return error;
+ }
+#endif
+
+ return error;
}
int netdev_class_create_file(struct class_attribute *class_attr)
^ permalink raw reply
* Re: [PATCH net-next-2.6] can: SJA1000 add missing spin_lock_init()
From: David Miller @ 2010-05-21 6:21 UTC (permalink / raw)
To: socketcan; +Cc: netdev, socketcan-core, sam
In-Reply-To: <4BF415EE.8090307@hartkopp.net>
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Wed, 19 May 2010 18:46:38 +0200
> As remarked by Sam Ravnborg the spin_lock variable, that has been introduced
> in commit 57c8a456640fa3ca777652f11f2db4179a3e66b6 ("can: Fix SJA1000 command
> register writes on SMP systems") has not been initialized properly.
>
> This patch adds the initialization to allow spinlock debugging.
>
> Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Applied, thanks!
^ permalink raw reply
* Re: linux-next: manual merge of the driver-core tree with the net tree
From: Stephen Rothwell @ 2010-05-21 6:38 UTC (permalink / raw)
To: Greg KH
Cc: linux-next, linux-kernel, Tom Herbert, David Miller, netdev,
Eric W. Biederman
In-Reply-To: <20100521162134.6b6326bd.sfr@canb.auug.org.au>
[-- Attachment #1: Type: text/plain, Size: 943 bytes --]
Hi Greg,
On Fri, 21 May 2010 16:21:34 +1000 Stephen Rothwell <sfr@canb.auug.org.au> wrote:
>
> Today's linux-next merge of the driver-core tree got a conflict in
> net/core/net-sysfs.c between commits
> 0a9627f2649a02bea165cfd529d7bcb625c2fcad ("rps: Receive Packet Steering")
> and fec5e652e58fa6017b2c9e06466cb2a6538de5b4 ("rfs: Receive Flow
> Steering") from the net tree and commits
> bc28c84244da26bafb0d3bce95ef45212b31c6b8 ("net/sysfs: Fix the bitrot in
> network device kobject namespace support") and
> 83dc0fbf37495691219d019ec16b40d8592d2956 ("net: Expose all network
> devices in a namespaces in sysfs") from the driver-core tree.
>
> I fixed it up (I think - see below) and can carry the fix as necessary.
The net tree has been merged by Linus, so this should be fixable in the
driver-core tree now.
--
Cheers,
Stephen Rothwell sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/
[-- Attachment #2: Type: application/pgp-signature, Size: 198 bytes --]
^ permalink raw reply
* Re: linux-next: manual merge of the driver-core tree with the net tree
From: Eric W. Biederman @ 2010-05-21 6:46 UTC (permalink / raw)
To: Stephen Rothwell
Cc: Greg KH, linux-next, linux-kernel, Tom Herbert, David Miller,
netdev
In-Reply-To: <20100521162134.6b6326bd.sfr@canb.auug.org.au>
Stephen Rothwell <sfr@canb.auug.org.au> writes:
> Hi Greg,
>
> Today's linux-next merge of the driver-core tree got a conflict in
> net/core/net-sysfs.c between commits
> 0a9627f2649a02bea165cfd529d7bcb625c2fcad ("rps: Receive Packet Steering")
> and fec5e652e58fa6017b2c9e06466cb2a6538de5b4 ("rfs: Receive Flow
> Steering") from the net tree and commits
> bc28c84244da26bafb0d3bce95ef45212b31c6b8 ("net/sysfs: Fix the bitrot in
> network device kobject namespace support") and
> 83dc0fbf37495691219d019ec16b40d8592d2956 ("net: Expose all network
> devices in a namespaces in sysfs") from the driver-core tree.
>
> I fixed it up (I think - see below) and can carry the fix as necessary.
It looks right, except perhaps the RPS code looks like it will cause a
build failure with sysfs disabled, but that has nothing to do with your
changes. I don't see any real conflicts here, just two patches passing
very close to each other.
Thanks for the heads up.
Eric
^ permalink raw reply
* Re: linux-next: manual merge of the driver-core tree with the net tree
From: Stephen Rothwell @ 2010-05-21 6:49 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Greg KH, linux-next, linux-kernel, Tom Herbert, David Miller,
netdev
In-Reply-To: <m1tyq1wy4h.fsf@fess.ebiederm.org>
[-- Attachment #1: Type: text/plain, Size: 494 bytes --]
Hi Eric,
On Thu, 20 May 2010 23:46:22 -0700 ebiederm@xmission.com (Eric W. Biederman) wrote:
>
> It looks right, except perhaps the RPS code looks like it will cause a
> build failure with sysfs disabled, but that has nothing to do with your
> changes. I don't see any real conflicts here, just two patches passing
> very close to each other.
Thanks for the confirmation.
--
Cheers,
Stephen Rothwell sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/
[-- Attachment #2: Type: application/pgp-signature, Size: 198 bytes --]
^ permalink raw reply
* Re: linux-next: manual merge of the driver-core tree with the net tree
From: David Miller @ 2010-05-21 6:50 UTC (permalink / raw)
To: ebiederm; +Cc: sfr, greg, linux-next, linux-kernel, therbert, netdev
In-Reply-To: <m1tyq1wy4h.fsf@fess.ebiederm.org>
From: ebiederm@xmission.com (Eric W. Biederman)
Date: Thu, 20 May 2010 23:46:22 -0700
> It looks right, except perhaps the RPS code looks like it will cause a
> build failure with sysfs disabled, but that has nothing to do with your
> changes.
CONFIG_RPS depends upon CONFIG_SMP && CONFIG_SYSFS, so no that build
failure is not possible.
^ permalink raw reply
* Re: [PATCH v2] net: fix problem in dequeuing from input_pkt_queue
From: Eric Dumazet @ 2010-05-21 6:55 UTC (permalink / raw)
To: Tom Herbert; +Cc: davem, xiaosuo, netdev
In-Reply-To: <alpine.DEB.1.00.1005202101290.25742@pokey.mtv.corp.google.com>
Le jeudi 20 mai 2010 à 21:37 -0700, Tom Herbert a écrit :
> Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
> The primary issue is that the queue head must be incremented only
> after a packet has been processed, that is only after
> __netif_receive_skb has been called. This is needed for the mechanism
> to prevent OOO packet in RFS. Also when flushing the input_pkt_queue
> and process_queue, the process queue should be done first to prevent
> OOO packets.
>
> Because the input_pkt_queue has been effectively split into two queues,
> the calculation of the tail ptr is no longer correct. The correct value
> would be head+input_pkt_queue->len+process_queue->len. To avoid
> this calculation we added an explicit input_queue_tail in softnet_data.
> The tail value is simply incremented when queuing to input_pkt_queue.
>
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
^ permalink raw reply
* Re: [PATCH v2] net: fix problem in dequeuing from input_pkt_queue
From: David Miller @ 2010-05-21 7:38 UTC (permalink / raw)
To: eric.dumazet; +Cc: therbert, xiaosuo, netdev
In-Reply-To: <1274424946.2439.7.camel@edumazet-laptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 21 May 2010 08:55:46 +0200
> Le jeudi 20 mai 2010 à 21:37 -0700, Tom Herbert a écrit :
>> Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
>> The primary issue is that the queue head must be incremented only
>> after a packet has been processed, that is only after
>> __netif_receive_skb has been called. This is needed for the mechanism
>> to prevent OOO packet in RFS. Also when flushing the input_pkt_queue
>> and process_queue, the process queue should be done first to prevent
>> OOO packets.
>>
>> Because the input_pkt_queue has been effectively split into two queues,
>> the calculation of the tail ptr is no longer correct. The correct value
>> would be head+input_pkt_queue->len+process_queue->len. To avoid
>> this calculation we added an explicit input_queue_tail in softnet_data.
>> The tail value is simply incremented when queuing to input_pkt_queue.
>>
>> Signed-off-by: Tom Herbert <therbert@google.com>
>> ---
>
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Applied, thanks everyone.
^ permalink raw reply
* Re: linux-next: manual merge of the driver-core tree with the net tree
From: Eric W. Biederman @ 2010-05-21 7:43 UTC (permalink / raw)
To: David Miller; +Cc: sfr, greg, linux-next, linux-kernel, therbert, netdev
In-Reply-To: <20100520.235008.97331614.davem@davemloft.net>
David Miller <davem@davemloft.net> writes:
> From: ebiederm@xmission.com (Eric W. Biederman)
> Date: Thu, 20 May 2010 23:46:22 -0700
>
>> It looks right, except perhaps the RPS code looks like it will cause a
>> build failure with sysfs disabled, but that has nothing to do with your
>> changes.
>
> CONFIG_RPS depends upon CONFIG_SMP && CONFIG_SYSFS, so no that build
> failure is not possible.
That's the bit I'm missing. My apologies if I caused any unnecessary worry.
Eric
^ permalink raw reply
* [RFC][PATCH v6 01/19] Add a new structure for skb buffer from external.
From: xiaohui.xin @ 2010-05-21 9:30 UTC (permalink / raw)
To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1274434235-5929-1-git-send-email-xiaohui.xin@intel.com>
From: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
include/linux/skbuff.h | 12 ++++++++++++
1 files changed, 12 insertions(+), 0 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 124f90c..cf309c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -203,6 +203,18 @@ struct skb_shared_info {
void * destructor_arg;
};
+/* The structure is for a skb which skb->data may point to
+ * an external buffer, which is not allocated from kernel space.
+ * Since the buffer is external, the shinfo and frags are
+ * external too. It also contains a destructor for itself.
+ */
+struct skb_external_page {
+ u8 *start;
+ int size;
+ struct skb_frag_struct *frags;
+ struct skb_shared_info *ushinfo;
+ void (*dtor)(struct skb_external_page *);
+};
/* We divide dataref into two halves. The higher 16 bits hold references
* to the payload part of skb->data. The lower 16 bits hold references to
* the entire skb->data. A clone of a headerless skb holds the length of
--
1.5.4.4
^ permalink raw reply related
* [RFC][PATCH v6 02/19] Add a new struct for device to manipulate external buffer.
From: xiaohui.xin @ 2010-05-21 9:30 UTC (permalink / raw)
To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1274434235-5929-2-git-send-email-xiaohui.xin@intel.com>
From: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
include/linux/netdevice.h | 19 ++++++++++++++++++-
1 files changed, 18 insertions(+), 1 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa8b476..bae725c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,6 +530,22 @@ struct netdev_queue {
unsigned long tx_dropped;
} ____cacheline_aligned_in_smp;
+/* Add a structure to struct net_device; the new field is
+ * named mp_port. It's for mediate passthru (zero-copy).
+ * It contains the capability of the net device driver,
+ * a socket, and an external buffer creator. "External" means
+ * the skb buffer belonging to the device may not be allocated
+ * from kernel space.
+ */
+struct mpassthru_port {
+ int hdr_len;
+ int data_len;
+ int npages;
+ unsigned flags;
+ struct socket *sock;
+ struct skb_external_page *(*ctor)(struct mpassthru_port *,
+ struct sk_buff *, int);
+};
/*
* This structure defines the management hooks for network devices.
@@ -952,7 +968,8 @@ struct net_device {
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port *garp_port;
-
+ /* mpassthru */
+ struct mpassthru_port *mp_port;
/* class/net/name entry */
struct device dev;
/* space for optional device, statistics, and wireless sysfs groups */
--
1.5.4.4
^ permalink raw reply related
* [RFC][PATCH v6 05/19] Add a function make external buffer owner to query capability.
From: xiaohui.xin @ 2010-05-21 9:30 UTC (permalink / raw)
To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1274434235-5929-5-git-send-email-xiaohui.xin@intel.com>
From: Xin Xiaohui <xiaohui.xin@intel.com>
The external buffer owner can use the functions to get
the capability of the underlying NIC driver.
Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhaonew@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
include/linux/netdevice.h | 2 +
net/core/dev.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 53 insertions(+), 0 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 183c786..31d9c4a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1599,6 +1599,8 @@ extern gro_result_t napi_gro_frags(struct napi_struct *napi);
extern int netdev_mp_port_attach(struct net_device *dev,
struct mpassthru_port *port);
extern void netdev_mp_port_detach(struct net_device *dev);
+extern int netdev_mp_port_prep(struct net_device *dev,
+ struct mpassthru_port *port);
static inline void napi_free_frags(struct napi_struct *napi)
{
diff --git a/net/core/dev.c b/net/core/dev.c
index ecbb6b1..37b389a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2497,6 +2497,57 @@ void netdev_mp_port_detach(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_mp_port_detach);
+/* To support mediate passthru(zero-copy) with NIC driver,
+ * we'd better query NIC driver for the capability it can
+ * provide, especially for packet split mode, now we only
+ * query for the header size, and the payload a descriptor
+ * may carry. If a driver does not use the API to export,
+ * then we may try to use a default value, currently,
+ * we use the default value from an IGB driver. Now,
+ * it's only called by mpassthru device.
+ */
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+int netdev_mp_port_prep(struct net_device *dev,
+ struct mpassthru_port *port)
+{
+ int rc;
+ int npages, data_len;
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ /* needed by packet split */
+
+ if (ops->ndo_mp_port_prep) {
+ rc = ops->ndo_mp_port_prep(dev, port);
+ if (rc)
+ return rc;
+ } else {
+ /* If the NIC driver did not report this,
+ * then we try to use default value.
+ */
+ port->hdr_len = 128;
+ port->data_len = 2048;
+ port->npages = 1;
+ }
+
+ if (port->hdr_len <= 0)
+ goto err;
+
+ npages = port->npages;
+ data_len = port->data_len;
+ if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+ (data_len < PAGE_SIZE * (npages - 1) ||
+ data_len > PAGE_SIZE * npages))
+ goto err;
+
+ return 0;
+err:
+ dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_mp_port_prep);
+#endif
+
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
--
1.5.4.4
^ permalink raw reply related
* [RFC][PATCH v6 08/19] Make __alloc_skb() to get external buffer.
From: xiaohui.xin @ 2010-05-21 9:30 UTC (permalink / raw)
To: netdev, kvm, linux-kernel, mst, mingo, davem, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1274434235-5929-8-git-send-email-xiaohui.xin@intel.com>
From: Xin Xiaohui <xiaohui.xin@intel.com>
Add a dev parameter to __alloc_skb(), skb->data
points to external buffer, recompute skb->head,
maintain shinfo of the external buffer, record
external buffer info into destructor_arg field.
Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
__alloc_skb() cleanup by
Jeff Dike <jdike@linux.intel.com>
include/linux/skbuff.h | 7 ++++---
net/core/skbuff.c | 43 +++++++++++++++++++++++++++++++++++++------
2 files changed, 41 insertions(+), 9 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 281a1c0..5ff8c27 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -442,17 +442,18 @@ extern void kfree_skb(struct sk_buff *skb);
extern void consume_skb(struct sk_buff *skb);
extern void __kfree_skb(struct sk_buff *skb);
extern struct sk_buff *__alloc_skb(unsigned int size,
- gfp_t priority, int fclone, int node);
+ gfp_t priority, int fclone,
+ int node, struct net_device *dev);
static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
{
- return __alloc_skb(size, priority, 0, -1);
+ return __alloc_skb(size, priority, 0, -1, NULL);
}
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
gfp_t priority)
{
- return __alloc_skb(size, priority, 1, -1);
+ return __alloc_skb(size, priority, 1, -1, NULL);
}
extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fbdb1f1..38d19d0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -161,7 +161,8 @@ EXPORT_SYMBOL(skb_under_panic);
* @fclone: allocate from fclone cache instead of head cache
* and allocate a cloned (child) skb
* @node: numa node to allocate memory on
- *
+ * @dev: the device that owns the skb if the skb tries to get an
+ * external buffer, otherwise NULL.
* Allocate a new &sk_buff. The returned buffer has no headroom and a
* tail room of size bytes. The object has a reference count of one.
* The return is the buffer. On a failure the return is %NULL.
@@ -170,12 +171,13 @@ EXPORT_SYMBOL(skb_under_panic);
* %GFP_ATOMIC.
*/
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int fclone, int node)
+ int fclone, int node, struct net_device *dev)
{
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
- u8 *data;
+ u8 *data = NULL;
+ struct skb_external_page *ext_page = NULL;
cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
@@ -185,8 +187,23 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
goto out;
size = SKB_DATA_ALIGN(size);
- data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
- gfp_mask, node);
+
+ /* If the device wants to do mediate passthru(zero-copy),
+ * the skb may try to get external buffers from outside.
+ * If that fails, fall back to allocating buffers from the kernel.
+ */
+ if (dev && dev->mp_port) {
+ ext_page = netdev_alloc_external_page(dev, skb, size);
+ if (ext_page) {
+ data = ext_page->start;
+ size = ext_page->size;
+ }
+ }
+
+ if (!data)
+ data = kmalloc_node_track_caller(
+ size + sizeof(struct skb_shared_info),
+ gfp_mask, node);
if (!data)
goto nodata;
@@ -208,6 +225,15 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->mac_header = ~0U;
#endif
+ /* If the skb gets external buffers successfully, since the shinfo is
+ * at the end of the buffer, we retain the shinfo in case we
+ * need it later.
+ */
+ if (ext_page) {
+ skb->head = skb->data - NET_IP_ALIGN - NET_SKB_PAD;
+ memcpy(ext_page->ushinfo, skb_shinfo(skb),
+ sizeof(struct skb_shared_info));
+ }
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
@@ -231,6 +257,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
child->fclone = SKB_FCLONE_UNAVAILABLE;
}
+ /* Record the external buffer info in this field. It's not so good,
+ * but we cannot find another place easily.
+ */
+ shinfo->destructor_arg = ext_page;
+
out:
return skb;
nodata:
@@ -259,7 +290,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
struct sk_buff *skb;
- skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node, dev);
if (likely(skb)) {
skb_reserve(skb, NET_SKB_PAD);
skb->dev = dev;
--
1.5.4.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox