* [PATCH 1/6] [6LoWPAN] add fragmentation support
From: Alexander Smirnov @ 2011-11-10 17:38 UTC (permalink / raw)
To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <20111110173258.GA8056-AUGNqIMGY+bGcXpsla5Oef8+0UxHXcjY@public.gmane.org>
This patch adds support for frame fragmentation.
Signed-off-by: Alexander Smirnov <alex.bluesman.smirnov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
include/net/ieee802154.h | 6 +
net/ieee802154/6lowpan.c | 260 +++++++++++++++++++++++++++++++++++++++++++++-
net/ieee802154/6lowpan.h | 18 +++
3 files changed, 280 insertions(+), 4 deletions(-)
diff --git a/include/net/ieee802154.h b/include/net/ieee802154.h
index d52685d..ee59f8b 100644
--- a/include/net/ieee802154.h
+++ b/include/net/ieee802154.h
@@ -21,11 +21,14 @@
* Maxim Gorbachyov <maxim.gorbachev-kv7WeFo6aLtBDgjK7y7TUQ@public.gmane.org>
* Maxim Osipov <maxim.osipov-kv7WeFo6aLtBDgjK7y7TUQ@public.gmane.org>
* Dmitry Eremin-Solenikov <dbaryshkov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
+ * Alexander Smirnov <alex.bluesman.smirnov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
*/
#ifndef NET_IEEE802154_H
#define NET_IEEE802154_H
+#define IEEE802154_MTU 127
+
#define IEEE802154_FC_TYPE_BEACON 0x0 /* Frame is beacon */
#define IEEE802154_FC_TYPE_DATA 0x1 /* Frame is data */
#define IEEE802154_FC_TYPE_ACK 0x2 /* Frame is acknowledgment */
@@ -56,6 +59,9 @@
(((x) & IEEE802154_FC_DAMODE_MASK) >> IEEE802154_FC_DAMODE_SHIFT)
+/* MAC footer size */
+#define IEEE802154_MFR_SIZE 2 /* 2 octets */
+
/* MAC's Command Frames Identifiers */
#define IEEE802154_CMD_ASSOCIATION_REQ 0x01
#define IEEE802154_CMD_ASSOCIATION_RESP 0x02
diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index 19d6aef..7d4cb58 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -113,6 +113,20 @@ struct lowpan_dev_record {
struct list_head list;
};
+struct lowpan_fragment {
+ struct sk_buff *skb; /* skb to be assembled */
+ spinlock_t lock; /* concurency lock */
+ u16 length; /* length to be assemled */
+ u32 bytes_rcv; /* bytes received */
+ u16 tag; /* current fragment tag */
+ struct timer_list timer; /* assembling timer */
+ struct list_head list; /* fragments list */
+};
+
+static unsigned short fragment_tag;
+static LIST_HEAD(lowpan_fragments);
+spinlock_t flist_lock;
+
static inline struct
lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
{
@@ -244,6 +258,17 @@ static u8 lowpan_fetch_skb_u8(struct sk_buff *skb)
return ret;
}
+static u16 lowpan_fetch_skb_u16(struct sk_buff *skb)
+{
+ u16 ret;
+
+ BUG_ON(!pskb_may_pull(skb, 2));
+
+ ret = skb->data[0] | (skb->data[1] << 8);
+ skb_pull(skb, 2);
+ return ret;
+}
+
static int lowpan_header_create(struct sk_buff *skb,
struct net_device *dev,
unsigned short type, const void *_daddr,
@@ -467,6 +492,7 @@ static int lowpan_header_create(struct sk_buff *skb,
memcpy(&(sa.hwaddr), saddr, 8);
mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA;
+
return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev,
type, (void *)&da, (void *)&sa, skb->len);
}
@@ -511,6 +537,21 @@ static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr)
return stat;
}
+static void lowpan_fragment_timer_expired(unsigned long entry_addr)
+{
+ struct lowpan_fragment *entry = (struct lowpan_fragment *)entry_addr;
+
+ pr_debug("%s: timer expired for frame with tag %d\n", __func__,
+ entry->tag);
+
+ spin_lock(&flist_lock);
+ list_del(&entry->list);
+ spin_unlock(&flist_lock);
+
+ dev_kfree_skb(entry->skb);
+ kfree(entry);
+}
+
static int
lowpan_process_data(struct sk_buff *skb)
{
@@ -525,6 +566,107 @@ lowpan_process_data(struct sk_buff *skb)
if (skb->len < 2)
goto drop;
iphc0 = lowpan_fetch_skb_u8(skb);
+
+ /* fragments assembling */
+ switch (iphc0 & LOWPAN_DISPATCH_MASK) {
+ case LOWPAN_DISPATCH_FRAG1:
+ case LOWPAN_DISPATCH_FRAGN:
+ {
+ struct lowpan_fragment *frame;
+ u8 len, offset;
+ u16 tag;
+ bool found = false;
+
+ len = lowpan_fetch_skb_u8(skb); /* frame length */
+ tag = lowpan_fetch_skb_u16(skb);
+
+ /*
+ * check if frame assembling with the same tag is
+ * already in progress
+ */
+ spin_lock(&flist_lock);
+
+ list_for_each_entry(frame, &lowpan_fragments, list)
+ if (frame->tag == tag) {
+ found = true;
+ break;
+ }
+
+ /* alloc new frame structure */
+ if (!found) {
+ frame = kzalloc(sizeof(struct lowpan_fragment),
+ GFP_ATOMIC);
+ if (!frame)
+ goto unlock_and_drop;
+
+ INIT_LIST_HEAD(&frame->list);
+
+ frame->length = (iphc0 & 7) | (len << 3);
+ frame->tag = tag;
+
+ /* allocate buffer for frame assembling */
+ frame->skb = alloc_skb(frame->length +
+ sizeof(struct ipv6hdr), GFP_ATOMIC);
+
+ if (!frame->skb) {
+ kfree(frame);
+ goto unlock_and_drop;
+ }
+
+ frame->skb->priority = skb->priority;
+ frame->skb->dev = skb->dev;
+
+ /* reserve headroom for uncompressed ipv6 header */
+ skb_reserve(frame->skb, sizeof(struct ipv6hdr));
+ skb_put(frame->skb, frame->length);
+
+ init_timer(&frame->timer);
+ /* time out is the same as for ipv6 - 60 sec */
+ frame->timer.expires = jiffies + LOWPAN_FRAG_TIMEOUT;
+ frame->timer.data = (unsigned long)frame;
+ frame->timer.function = lowpan_fragment_timer_expired;
+
+ add_timer(&frame->timer);
+
+ list_add_tail(&frame->list, &lowpan_fragments);
+ }
+
+ if ((iphc0 & LOWPAN_DISPATCH_MASK) == LOWPAN_DISPATCH_FRAG1)
+ goto unlock_and_drop;
+
+ offset = lowpan_fetch_skb_u8(skb); /* fetch offset */
+
+ /* if payload fits buffer, copy it */
+ if (likely((offset * 8 + skb->len) <= frame->length))
+ skb_copy_to_linear_data_offset(frame->skb, offset * 8,
+ skb->data, skb->len);
+ else
+ goto unlock_and_drop;
+
+ frame->bytes_rcv += skb->len;
+
+ /* frame assembling complete */
+ if ((frame->bytes_rcv == frame->length) &&
+ frame->timer.expires > jiffies) {
+ /* if timer haven't expired - first of all delete it */
+ del_timer(&frame->timer);
+ list_del(&frame->list);
+ spin_unlock(&flist_lock);
+
+ dev_kfree_skb(skb);
+ skb = frame->skb;
+ kfree(frame);
+ iphc0 = lowpan_fetch_skb_u8(skb);
+ break;
+ }
+ spin_unlock(&flist_lock);
+
+ return kfree_skb(skb), 0;
+ }
+ default:
+ break;
+ }
+
iphc1 = lowpan_fetch_skb_u8(skb);
_saddr = mac_cb(skb)->sa.hwaddr;
@@ -674,6 +816,9 @@ lowpan_process_data(struct sk_buff *skb)
lowpan_raw_dump_table(__func__, "raw header dump", (u8 *)&hdr,
sizeof(hdr));
return lowpan_skb_deliver(skb, &hdr);
+
+unlock_and_drop:
+ spin_unlock(&flist_lock);
drop:
kfree_skb(skb);
return -EINVAL;
@@ -692,18 +837,118 @@ static int lowpan_set_address(struct net_device *dev, void *p)
return 0;
}
+static int lowpan_get_mac_header_length(struct sk_buff *skb)
+{
+ /*
+ * Currently long addressing mode is supported only, so the overall
+ * header size is 21:
+ * FC SeqNum DPAN DA SA Sec
+ * 2 + 1 + 2 + 8 + 8 + 0 = 21
+ */
+ return 21;
+}
+
+static int
+lowpan_fragment_xmit(struct sk_buff *skb, u8 *head,
+ int mlen, int plen, int offset)
+{
+ struct sk_buff *frag;
+ int hlen, ret;
+
+ /* if payload length is zero, therefore it's a first fragment */
+ hlen = (plen == 0 ? LOWPAN_FRAG1_HEAD_SIZE : LOWPAN_FRAGN_HEAD_SIZE);
+
+ lowpan_raw_dump_inline(__func__, "6lowpan fragment header", head, hlen);
+
+ frag = dev_alloc_skb(hlen + mlen + plen + IEEE802154_MFR_SIZE);
+ if (!frag)
+ return -ENOMEM;
+
+ frag->priority = skb->priority;
+ frag->dev = skb->dev;
+
+ /* copy header, MFR and payload */
+ memcpy(skb_put(frag, mlen), skb->data, mlen);
+ memcpy(skb_put(frag, hlen), head, hlen);
+
+ if (plen)
+ skb_copy_from_linear_data_offset(skb, offset + mlen,
+ skb_put(frag, plen), plen);
+
+ lowpan_raw_dump_table(__func__, " raw fragment dump", frag->data,
+ frag->len);
+
+ ret = dev_queue_xmit(frag);
+
+ if (ret < 0)
+ dev_kfree_skb(frag);
+
+ return ret;
+}
+
+static int
+lowpan_skb_fragmentation(struct sk_buff *skb)
+{
+ int err, header_length, payload_length, tag, offset = 0;
+ u8 head[5];
+
+ header_length = lowpan_get_mac_header_length(skb);
+ payload_length = skb->len - header_length;
+ tag = fragment_tag++;
+
+ /* first fragment header */
+ head[0] = LOWPAN_DISPATCH_FRAG1 | (payload_length & 0x7);
+ head[1] = (payload_length >> 3) & 0xff;
+ head[2] = tag & 0xff;
+ head[3] = tag >> 8;
+
+ err = lowpan_fragment_xmit(skb, head, header_length, 0, 0);
+
+ /* next fragment header */
+ head[0] &= ~LOWPAN_DISPATCH_FRAG1;
+ head[0] |= LOWPAN_DISPATCH_FRAGN;
+
+ while ((payload_length - offset > 0) && (err >= 0)) {
+ int len = LOWPAN_FRAG_SIZE;
+
+ head[4] = offset / 8;
+
+ if (payload_length - offset < len)
+ len = payload_length - offset;
+
+ err = lowpan_fragment_xmit(skb, head, header_length,
+ len, offset);
+ offset += len;
+ }
+
+ return err;
+}
+
static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev)
{
- int err = 0;
+ int err = -1;
pr_debug("(%s): package xmit\n", __func__);
skb->dev = lowpan_dev_info(dev)->real_dev;
if (skb->dev == NULL) {
pr_debug("(%s) ERROR: no real wpan device found\n", __func__);
- dev_kfree_skb(skb);
- } else
+ goto error;
+ }
+
+ if (skb->len <= IEEE802154_MTU) {
err = dev_queue_xmit(skb);
+ goto out;
+ }
+
+ pr_debug("(%s): frame is too big, fragmentation is needed\n",
+ __func__);
+ err = lowpan_skb_fragmentation(skb);
+error:
+ dev_kfree_skb(skb);
+out:
+ if (err < 0)
+ pr_debug("(%s): ERROR: xmit failed\n", __func__);
return (err < 0 ? NETDEV_TX_BUSY : NETDEV_TX_OK);
}
@@ -765,8 +1010,15 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev,
goto drop;
/* check that it's our buffer */
- if ((skb->data[0] & 0xe0) == 0x60)
+ switch (skb->data[0] & 0xe0) {
+ case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */
+ case LOWPAN_DISPATCH_FRAG1: /* first fragment header */
+ case LOWPAN_DISPATCH_FRAGN: /* next fragments headers */
lowpan_process_data(skb);
+ break;
+ default:
+ break;
+ }
return NET_RX_SUCCESS;
diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h
index 5d8cf80..5d2e5a0 100644
--- a/net/ieee802154/6lowpan.h
+++ b/net/ieee802154/6lowpan.h
@@ -159,6 +159,24 @@
#define LOWPAN_DISPATCH_FRAG1 0xc0 /* 11000xxx */
#define LOWPAN_DISPATCH_FRAGN 0xe0 /* 11100xxx */
+#define LOWPAN_DISPATCH_MASK 0xf8 /* 11111000 */
+
+#define LOWPAN_FRAG_TIMEOUT (HZ * 60) /* time-out 60 sec */
+
+#define LOWPAN_FRAG1_HEAD_SIZE 0x4
+#define LOWPAN_FRAGN_HEAD_SIZE 0x5
+
+/*
+ * According IEEE802.15.4 standard:
+ * - MTU is 127 octets
+ * - maximum MHR size is 37 octets
+ * - MFR size is 2 octets
+ *
+ * so minimal payload size that we may guarantee is:
+ * MTU - MHR - MFR = 88 octets
+ */
+#define LOWPAN_FRAG_SIZE 88
+
/*
* Values of fields within the IPHC encoding first byte
* (C stands for compressed and I for inline)
--
1.7.2.5
------------------------------------------------------------------------------
RSA(R) Conference 2012
Save $700 by Nov 18
Register now
http://p.sf.net/sfu/rsa-sfdev2dev1
^ permalink raw reply related
* [PATCH 3/6] set proper netdev flags
From: Alexander Smirnov @ 2011-11-10 17:39 UTC (permalink / raw)
To: davem; +Cc: dbaryshkov, linux-zigbee-devel, netdev, Alexander Smirnov
In-Reply-To: <20111110173258.GA8056@avtobot.cybertron>
This patch fixes settings for device initialization which makes possible to
use NDISC and TCP.
Signed-off-by: Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
Acked-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
---
net/ieee802154/6lowpan.c | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)
diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index af5553e..39ec6e0 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -973,13 +973,12 @@ static void lowpan_setup(struct net_device *dev)
dev->addr_len = IEEE802154_ADDR_LEN;
memset(dev->broadcast, 0xff, IEEE802154_ADDR_LEN);
dev->type = ARPHRD_IEEE802154;
- dev->features = NETIF_F_NO_CSUM;
/* Frame Control + Sequence Number + Address fields + Security Header */
dev->hard_header_len = 2 + 1 + 20 + 14;
dev->needed_tailroom = 2; /* FCS */
dev->mtu = 1281;
dev->tx_queue_len = 0;
- dev->flags = IFF_NOARP | IFF_BROADCAST;
+ dev->flags = IFF_BROADCAST | IFF_MULTICAST;
dev->watchdog_timeo = 0;
dev->netdev_ops = &lowpan_netdev_ops;
--
1.7.2.5
^ permalink raw reply related
* [PATCH series][6LoWPAN][net-next]
From: Alexander Smirnov @ 2011-11-10 17:32 UTC (permalink / raw)
To: davem
Cc: dbaryshkov, netdev, linux-zigbee-devel, alex.bluesman.smirnov,
jonsmirl
Hello all,
The following patch series adds both major feature and minor fixes.
Please find detailed description in each of the patches.
Just to summarize current development status:
By using MAC and PHY layers support from
"http://sourceforge.net/projects/linux-zigbee"
project now it's possible to run generic network applications (ssh, iperf) over
ieee802.15.4 network.
iperf shows about ~40Kbits/sec for TCP
With best regards,
Alexander
^ permalink raw reply
* Re: creating netdev queues on the fly?
From: Eric Dumazet @ 2011-11-10 17:00 UTC (permalink / raw)
To: Johannes Berg; +Cc: netdev, linux-wireless
In-Reply-To: <1320942369.3967.127.camel-8upI4CBIZJIJvtFkdXX2HixXY32XiHfO@public.gmane.org>
Le jeudi 10 novembre 2011 à 17:26 +0100, Johannes Berg a écrit :
> On Thu, 2011-11-10 at 15:35 +0100, Eric Dumazet wrote:
>
> > > So to get to my question: What if we could create netdev queues on the
> > > fly?
>
> > In term of qdisc management I believe its a bit complex if we start to
> > dynamically add netdev queues :)
>
> Yeah I was thinking that would be the problematic part.
>
> > My first idea would be to extend Qdisc management so that a device can
> > callback qdisc when a frame is finaly delivered / consumed / discarded.
> >
> > We currently only have qdisc->enqueue() and qdisc->dequeue(), we could
> > add qdisc->deliver_callback(skb)
> >
> > You keep devices as they are, with a netdevqueue per hardware queue.
> >
> > Then, using a Qdisc like existing ones, but with a limit of
> > outstanding(given to device but not yet consumed) packets per class.
> >
> > external tc classifier would deliver a hash/index depending on remote
> > station.
>
> So basically you'd have a class for each station? Or a class for each
> station/AC? That's a lot of classes rather than queues, are they
> pre-allocated, or does it not matter? Overall that sounds like a good
> approach.
They could be statically allocated in a stochastic fashion (Sorry, SFQ
is one of my favorite qdisc, very light in memory/cpu and yet powerful)
net/sched/sch_sfq.c
Of course, HFSC / DRR are considered more flexible, but eat _much_ more
memory.
http://people.netfilter.org/kaber/shaping
You declare a hash divisor of 1024 to map one station to one class
(Might have hash collision of course...)
Each active slot contains a local queue to one remote station.
Each queue can be pfifo or whatever smart qdisc (sfq, or a more flexible
qdisc)
Then you use a filter to select the appropriate class :
tc filter add dev ${dev} protocol all pref 1 parent $handle handle 1 \
flow hash keys remote_station divisor 1024
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [patch net-next V7] net: introduce ethernet teaming device
From: Jiri Pirko @ 2011-11-10 16:59 UTC (permalink / raw)
To: Eric Dumazet
Cc: netdev, davem, bhutchings, shemminger, fubar, andy, tgraf,
ebiederm, mirqus, kaber, greearb, jesse, fbl, benjamin.poirier,
jzupka, ivecera
In-Reply-To: <1320940930.10042.2.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>
Thu, Nov 10, 2011 at 05:02:10PM CET, eric.dumazet@gmail.com wrote:
>Le jeudi 10 novembre 2011 à 16:41 +0100, Jiri Pirko a écrit :
>> This patch introduces new network device called team. It supposes to be
>> very fast, simple, userspace-driven alternative to existing bonding
>> driver.
>>
>
>> +static bool team_dummy_transmit(struct team *team, struct sk_buff *skb)
>> +{
>> + dev_kfree_skb(skb);
>> + return false;
>> +}
>> +
>
>ndo_start_xmit() method can be called with IRQ disabled (netpoll ?), so
>dev_kfree_skb_any() is better ?
>
For example bonding is using dev_kfree_skb in xmit as well.
I propose this to be eventually addressed in follow-up patch.
>
>
^ permalink raw reply
* Re: [PATCH v2] drivers/net/usb/asix: resync from vendor's copy
From: Grant Grundler @ 2011-11-10 16:54 UTC (permalink / raw)
To: Mark Lord
Cc: David Miller, netdev, linux-kernel, Ben Hutchings, Michal Marek,
Freddy Xin, ASIX Allan Email [office]
In-Reply-To: <4EBBD936.40705@teksavvy.com>
+ASIX (Freddy Xin and Allan)
On Thu, Nov 10, 2011 at 6:01 AM, Mark Lord <kernel@teksavvy.com> wrote:
> On 11-11-09 12:31 PM, Mark Lord wrote:
>> Second pass (for review) at updating the in-kernel asix usb/network driver
>> from the v4.1.0 vendor GPL version of the driver, obtained from here:
>>
>> http://www.asix.com.tw/download.php?sub=searchresult&PItemID=84&download=driver
>>
>> The original vendor copy used a local "axusbnet" middleware (rather than "usbnet").
>> I've converted it back to using "usbnet", made a ton of cosmetic changes
>> to get it to pass checkpatch.pl, and removed a small amount of code duplication.
>>
>> The tx/rx checksum code has been updated per Ben's comments,
>> and the duplicated MII_* definitions have been removed.
>> I've changed the version string to be "4.1.0-kernel",
>> to reflect the vendor's code version while also distinguishing
>> this port from the original vendor code.
>>
>> It can use more work going forward, but it is important to get it upstream
>> sooner than later -- the current in-kernel driver fails with many devices,
>> both old and new. This updated version works with everything I have available
>> to test with, and also handles suspend / resume (unlike the in-kernel one).
>>
>> Signed-off-by: Mark Lord <mlord@pobox.com>
> ...
>
> Okay, the vendor has told me to cease development on this now.
> They prefer instead to feed small parts of their mainline driver
> to Grant Grundler as issues arise, rather than to get the whole
> thing upstream.
Hi Mark,
I'm very surprise a second time now. It's partly my own fault since
I'm not following netdev. Until you contacted me privately 8 days
ago, I didn't know that you had been working with ASIX on asix.c. That
was the first surprise.
The second surprise is ASIX suggested I'm going to be the conduit for
all their patches upstream. I never committed to that. I was very
clear that I would help them get started by feeding 3 small patches to
"upstream". I've fulfilled that commitment (3 fixes plus one cleanup).
I have one more patch (work in progress:
https://gerrit.chromium.org/gerrit/11485 ) I'd like to see go in and
then they have to start "catching their own fish".
The statement above implies Google has an economic interest in their
products and AFAIK that's not true. We do not buy their product in
volume. Getting this working has sort of been my 20% project over the
past couple of months. It would have been cheaper buy USB ethernet
dongles from a vendor with working Linux support and *give* them to
our partners who need them.
Lastly, offlist I have expressed my opinion about 5K line patches to
you and to ASIX. I'm not allowed to push such large patches into
Chromium.org (upstream first!) and I was certain on their own, ASIX
wouldn't get that upstream in one shot. No one knows them. No one
trusts them. Asix never mentioned that they had already hired you to
do it (and I am pretty sure you could have). So I feel a bit blind
sided (as you probably do to) by how events are unfolding here. ASIX
may still need help to *continue* upstreaming additional functionality
(e.g. CSO/TSO) and I will remind ASIX of that in future
communications.
*sigh*
cheers,
grant
> So be it.
>
> Cheers
>
^ permalink raw reply
* Re: [PATCH net-next] bnx2x: reduce skb truesize by 50%
From: Eric Dumazet @ 2011-11-10 16:45 UTC (permalink / raw)
To: eilong
Cc: David Miller, bhutchings@solarflare.com, pstaszewski@itcare.pl,
netdev@vger.kernel.org
In-Reply-To: <1320942423.307.10.camel@lb-tlvb-eilong.il.broadcom.com>
Le jeudi 10 novembre 2011 à 18:27 +0200, Eilon Greenstein a écrit :
> On Thu, 2011-11-10 at 07:27 -0800, Eric Dumazet wrote:
> > Le jeudi 10 novembre 2011 à 17:05 +0200, Eilon Greenstein a écrit :
> > > > --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> > > > +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> > > > @@ -1185,9 +1185,14 @@ struct bnx2x {
> > > > #define ETH_MAX_PACKET_SIZE 1500
> > > > #define ETH_MAX_JUMBO_PACKET_SIZE 9600
> > > >
> > > > - /* Max supported alignment is 256 (8 shift) */
> > > > -#define BNX2X_RX_ALIGN_SHIFT ((L1_CACHE_SHIFT < 8) ? \
> > > > - L1_CACHE_SHIFT : 8)
> > > > +/* Max supported alignment is 256 (8 shift)
> > > > + * It should ideally be min(L1_CACHE_SHIFT, 8)
> > > > + * Choosing 5 (32 bytes) permits to get skb heads of 2048 bytes
> > > > + * instead of 4096 bytes.
> > > > + * With SLUB/SLAB allocators, data will be cache line aligned anyway.
> > > > + */
> > > > +#define BNX2X_RX_ALIGN_SHIFT 5
> > > > +
> > >
> > > Hi Eric,
> > >
> > > This can seriously hurt the PCI utilization. So in scenarios in which
> > > the PCI is the bottle neck, you will see performance degradation. We are
> > > looking at alternatives to reduce the allocation, but it is taking a
> > > while. Please hold off with this patch.
> >
> > What do you mean exactly ?
> >
> > This patch doesnt change skb->data alignment, its still 64 bytes
> > aligned. (cqe_fp->placement_offset == 2). PCI utilization is the same.
> >
> > Only SLOB could get a misalignement, but who uses SLOB for performance ?
>
> Obviously you are right... But the FW is configured to the wrong
> alignment and that will affect the end alignment (padding) which is
> significant in small packets scenarios where the PCI is the bottle neck.
Yes, I fully understand.
>
> > Alternative would be to check why hardware need 2*L1_CACHE_BYTES extra
> > room for alignment... Normaly it could be 1*L1_CACHE_BYTES ?
>
> Again - you are a mind reader :) This is what we are looking into right
> now. The problem is that `if` the buffer is not aligned (SLOB) we can
> overstep the allocated boundaries by configuring the FW to align.
>
> > /* FW use 2 Cache lines Alignment for start packet and size */
> > -#define BNX2X_FW_RX_ALIGN (2 << BNX2X_RX_ALIGN_SHIFT)
> > +#define BNX2X_FW_RX_ALIGN (1 << BNX2X_RX_ALIGN_SHIFT)
> >
> >
I did a SLOB test (and my patch included as well)
skb->len=66 pad=26 wkb->data=0xffff8801194da048 truesize=2304
So skb->data + pad -> 0xffff8801194da062 : So a 32bytes alignement + 2
bytes to align IP header. (BTW we dont really need it, NET_IP_ALIGN is
now 0 on most x86 platforms ?)
In the end, we get 98 bytes of 'skb reserve', and also 64 bytes of extra
headroom _after_ the end of full frame.
In my understanding, hardware alignement should be between 0 and 63, not
0 and 127.
So maybe only BNX2X_FW_RX_ALIGN is twice the needed amount.
^ permalink raw reply
* [PATCH] net, bridge: print log message after state changed
From: Holger Brunck @ 2011-11-10 16:18 UTC (permalink / raw)
To: netdev
Cc: Wolfgang Fritz, Holger Brunck, Stephen Hemminger, David S. Miller,
bridge
From: Wolfgang Fritz <wolfgang.fritz@keymile.com>
Function br_log_state writes log message "... entering XXXX state" so it
should be called after the state has changed and not before.
Signed-off-by: Wolfgang Fritz <wolfgang.fritz@keymile.com>
Signed-off-by: Holger Brunck <holger.brunck@keymile.com>
cc: Stephen Hemminger <shemminger@linux-foundation.org>
cc: "David S. Miller" <davem@davemloft.net>
cc: bridge@lists.linux-foundation.org
---
net/bridge/br_stp_if.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 19308e3..e91be40 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -98,8 +98,6 @@ void br_stp_disable_port(struct net_bridge_port *p)
struct net_bridge *br = p->br;
int wasroot;
- br_log_state(p);
-
wasroot = br_is_root_bridge(br);
br_become_designated_port(p);
p->state = BR_STATE_DISABLED;
@@ -121,6 +119,8 @@ void br_stp_disable_port(struct net_bridge_port *p)
if (br_is_root_bridge(br) && !wasroot)
br_become_root_bridge(br);
+
+ br_log_state(p);
}
static void br_stp_start(struct net_bridge *br)
--
1.7.1
^ permalink raw reply related
* Re: [PATCH net-next] bnx2x: reduce skb truesize by 50%
From: Eilon Greenstein @ 2011-11-10 16:27 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, bhutchings@solarflare.com, pstaszewski@itcare.pl,
netdev@vger.kernel.org
In-Reply-To: <1320938878.2310.15.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>
On Thu, 2011-11-10 at 07:27 -0800, Eric Dumazet wrote:
> Le jeudi 10 novembre 2011 à 17:05 +0200, Eilon Greenstein a écrit :
> > > --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> > > +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> > > @@ -1185,9 +1185,14 @@ struct bnx2x {
> > > #define ETH_MAX_PACKET_SIZE 1500
> > > #define ETH_MAX_JUMBO_PACKET_SIZE 9600
> > >
> > > - /* Max supported alignment is 256 (8 shift) */
> > > -#define BNX2X_RX_ALIGN_SHIFT ((L1_CACHE_SHIFT < 8) ? \
> > > - L1_CACHE_SHIFT : 8)
> > > +/* Max supported alignment is 256 (8 shift)
> > > + * It should ideally be min(L1_CACHE_SHIFT, 8)
> > > + * Choosing 5 (32 bytes) permits to get skb heads of 2048 bytes
> > > + * instead of 4096 bytes.
> > > + * With SLUB/SLAB allocators, data will be cache line aligned anyway.
> > > + */
> > > +#define BNX2X_RX_ALIGN_SHIFT 5
> > > +
> >
> > Hi Eric,
> >
> > This can seriously hurt the PCI utilization. So in scenarios in which
> > the PCI is the bottle neck, you will see performance degradation. We are
> > looking at alternatives to reduce the allocation, but it is taking a
> > while. Please hold off with this patch.
>
> What do you mean exactly ?
>
> This patch doesnt change skb->data alignment, its still 64 bytes
> aligned. (cqe_fp->placement_offset == 2). PCI utilization is the same.
>
> Only SLOB could get a misalignement, but who uses SLOB for performance ?
Obviously you are right... But the FW is configured to the wrong
alignment and that will affect the end alignment (padding) which is
significant in small packets scenarios where the PCI is the bottle neck.
> Alternative would be to check why hardware need 2*L1_CACHE_BYTES extra
> room for alignment... Normaly it could be 1*L1_CACHE_BYTES ?
Again - you are a mind reader :) This is what we are looking into right
now. The problem is that `if` the buffer is not aligned (SLOB) we can
overstep the allocated boundaries by configuring the FW to align.
> /* FW use 2 Cache lines Alignment for start packet and size */
> -#define BNX2X_FW_RX_ALIGN (2 << BNX2X_RX_ALIGN_SHIFT)
> +#define BNX2X_FW_RX_ALIGN (1 << BNX2X_RX_ALIGN_SHIFT)
>
>
>
>
^ permalink raw reply
* Re: creating netdev queues on the fly?
From: Johannes Berg @ 2011-11-10 16:26 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, linux-wireless
In-Reply-To: <1320935713.2310.9.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>
On Thu, 2011-11-10 at 15:35 +0100, Eric Dumazet wrote:
> > So to get to my question: What if we could create netdev queues on the
> > fly?
> In term of qdisc management I believe its a bit complex if we start to
> dynamically add netdev queues :)
Yeah I was thinking that would be the problematic part.
> My first idea would be to extend Qdisc management so that a device can
> callback qdisc when a frame is finaly delivered / consumed / discarded.
>
> We currently only have qdisc->enqueue() and qdisc->dequeue(), we could
> add qdisc->deliver_callback(skb)
>
> You keep devices as they are, with a netdevqueue per hardware queue.
>
> Then, using a Qdisc like existing ones, but with a limit of
> outstanding(given to device but not yet consumed) packets per class.
>
> external tc classifier would deliver a hash/index depending on remote
> station.
So basically you'd have a class for each station? Or a class for each
station/AC? That's a lot of classes rather than queues, are they
pre-allocated, or does it not matter? Overall that sounds like a good
approach.
> As a bonus you can get all the existing rate estimators / QOS /
> shapers ...
Yeah ... I guess I need to start learning tc :)
johannes
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [patch net-next V7] net: introduce ethernet teaming device
From: Eric Dumazet @ 2011-11-10 16:02 UTC (permalink / raw)
To: Jiri Pirko
Cc: netdev, davem, bhutchings, shemminger, fubar, andy, tgraf,
ebiederm, mirqus, kaber, greearb, jesse, fbl, benjamin.poirier,
jzupka, ivecera
In-Reply-To: <1320939698-1062-1-git-send-email-jpirko@redhat.com>
Le jeudi 10 novembre 2011 à 16:41 +0100, Jiri Pirko a écrit :
> This patch introduces new network device called team. It supposes to be
> very fast, simple, userspace-driven alternative to existing bonding
> driver.
>
> +static bool team_dummy_transmit(struct team *team, struct sk_buff *skb)
> +{
> + dev_kfree_skb(skb);
> + return false;
> +}
> +
ndo_start_xmit() method can be called with IRQ disabled (netpoll ?), so
dev_kfree_skb_any() is better ?
^ permalink raw reply
* [PATCH] bridge: Fix potential deadlock on br->multicast_lock
From: Andrew Vagin @ 2011-11-10 15:48 UTC (permalink / raw)
To: Stephen Hemminger
Cc: David S. Miller, bridge, netdev, linux-kernel, devel, avagin
multicast_lock is taken in softirq context, so we should use
spin_lock_bh() in userspace.
call-chain in softirq context:
run_timer_softirq()
br_multicast_query_expired()
call-chain in userspace:
sysfs_write_file()
store_multicast_snooping()
br_multicast_toggle()
Signed-off-by: Andrew Vagin <avagin@openvz.org>
---
net/bridge/br_multicast.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 995cbe0..2eefe27 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1770,7 +1770,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
int err = 0;
struct net_bridge_mdb_htable *mdb;
- spin_lock(&br->multicast_lock);
+ spin_lock_bh(&br->multicast_lock);
if (br->multicast_disabled == !val)
goto unlock;
@@ -1806,7 +1806,7 @@ rollback:
}
unlock:
- spin_unlock(&br->multicast_lock);
+ spin_unlock_bh(&br->multicast_lock);
return err;
}
--
1.7.1
^ permalink raw reply related
* Re:
From: Steve Wilson @ 2011-11-10 15:38 UTC (permalink / raw)
To: mail1
http://docs.google.com/demo/edit?id=scABPBj9J1C9e_ISwvPh5xoST&hl=en&dt=document#document
^ permalink raw reply
* [patch net-next V7] net: introduce ethernet teaming device
From: Jiri Pirko @ 2011-11-10 15:41 UTC (permalink / raw)
To: netdev
Cc: davem, eric.dumazet, bhutchings, shemminger, fubar, andy, tgraf,
ebiederm, mirqus, kaber, greearb, jesse, fbl, benjamin.poirier,
jzupka, ivecera
This patch introduces new network device called team. It supposes to be
very fast, simple, userspace-driven alternative to existing bonding
driver.
Userspace library called libteam with couple of demo apps is available
here:
https://github.com/jpirko/libteam
Note it's still in its dipers atm.
team<->libteam use generic netlink for communication. That and rtnl
suppose to be the only way to configure team device, no sysfs etc.
Python binding of libteam was recently introduced.
Daemon providing arpmon/miimon active-backup functionality will be
introduced shortly. All what's necessary is already implemented in
kernel team driver.
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
v6->v7:
- transmit and receive functions are not checked in hot paths.
That also resolves memory leak on transmit when no port is
present
v5->v6:
- changed couple of _rcu calls to non _rcu ones in non-readers
v4->v5:
- team_change_mtu() uses team->lock while travesing though port
list
- mac address changes are moved completely to jurisdiction of
userspace daemon. This way the daemon can do FOM1, FOM2 and
possibly other weird things with mac addresses.
Only round-robin mode sets up all ports to bond's address then
enslaved.
- Extended Kconfig text
v3->v4:
- remove redundant synchronize_rcu from __team_change_mode()
- revert "set and clear of mode_ops happens per pointer, not per
byte"
- extend comment of function __team_change_mode()
v2->v3:
- team_change_mtu() uses rcu version of list traversal to unwind
- set and clear of mode_ops happens per pointer, not per byte
- port hashlist changed to be embedded into team structure
- error branch in team_port_enter() does cleanup now
- fixed rtln->rtnl
v1->v2:
- modes are made as modules. Makes team more modular and
extendable.
- several commenters' nitpicks found on v1 were fixed
- several other bugs were fixed.
- note I ignored Eric's comment about roundrobin port selector
as Eric's way may be easily implemented as another mode (mode
"random") in future.
---
Documentation/networking/team.txt | 2 +
MAINTAINERS | 7 +
drivers/net/Kconfig | 2 +
drivers/net/Makefile | 1 +
drivers/net/team/Kconfig | 43 +
drivers/net/team/Makefile | 7 +
drivers/net/team/team.c | 1581 +++++++++++++++++++++++++++++
drivers/net/team/team_mode_activebackup.c | 137 +++
drivers/net/team/team_mode_roundrobin.c | 107 ++
include/linux/Kbuild | 1 +
include/linux/if.h | 1 +
include/linux/if_team.h | 242 +++++
12 files changed, 2131 insertions(+), 0 deletions(-)
create mode 100644 Documentation/networking/team.txt
create mode 100644 drivers/net/team/Kconfig
create mode 100644 drivers/net/team/Makefile
create mode 100644 drivers/net/team/team.c
create mode 100644 drivers/net/team/team_mode_activebackup.c
create mode 100644 drivers/net/team/team_mode_roundrobin.c
create mode 100644 include/linux/if_team.h
diff --git a/Documentation/networking/team.txt b/Documentation/networking/team.txt
new file mode 100644
index 0000000..5a01368
--- /dev/null
+++ b/Documentation/networking/team.txt
@@ -0,0 +1,2 @@
+Team devices are driven from userspace via libteam library which is here:
+ https://github.com/jpirko/libteam
diff --git a/MAINTAINERS b/MAINTAINERS
index 4808256..8d94169 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6484,6 +6484,13 @@ W: http://tcp-lp-mod.sourceforge.net/
S: Maintained
F: net/ipv4/tcp_lp.c
+TEAM DRIVER
+M: Jiri Pirko <jpirko@redhat.com>
+L: netdev@vger.kernel.org
+S: Supported
+F: drivers/net/team/
+F: include/linux/if_team.h
+
TEGRA SUPPORT
M: Colin Cross <ccross@android.com>
M: Olof Johansson <olof@lixom.net>
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 583f66c..b3020be 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -125,6 +125,8 @@ config IFB
'ifb1' etc.
Look at the iproute2 documentation directory for usage etc
+source "drivers/net/team/Kconfig"
+
config MACVLAN
tristate "MAC-VLAN support (EXPERIMENTAL)"
depends on EXPERIMENTAL
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index fa877cd..4e4ebfe 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_NET) += Space.o loopback.o
obj-$(CONFIG_NETCONSOLE) += netconsole.o
obj-$(CONFIG_PHYLIB) += phy/
obj-$(CONFIG_RIONET) += rionet.o
+obj-$(CONFIG_NET_TEAM) += team/
obj-$(CONFIG_TUN) += tun.o
obj-$(CONFIG_VETH) += veth.o
obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
diff --git a/drivers/net/team/Kconfig b/drivers/net/team/Kconfig
new file mode 100644
index 0000000..248a144
--- /dev/null
+++ b/drivers/net/team/Kconfig
@@ -0,0 +1,43 @@
+menuconfig NET_TEAM
+ tristate "Ethernet team driver support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ ---help---
+ This allows one to create virtual interfaces that teams together
+ multiple ethernet devices.
+
+ Team devices can be added using the "ip" command from the
+ iproute2 package:
+
+ "ip link add link [ address MAC ] [ NAME ] type team"
+
+ To compile this driver as a module, choose M here: the module
+ will be called team.
+
+if NET_TEAM
+
+config NET_TEAM_MODE_ROUNDROBIN
+ tristate "Round-robin mode support"
+ depends on NET_TEAM
+ ---help---
+ Basic mode where port used for transmitting packets is selected in
+ round-robin fashion using packet counter.
+
+ All added ports are setup to have bond's mac address.
+
+ To compile this team mode as a module, choose M here: the module
+ will be called team_mode_roundrobin.
+
+config NET_TEAM_MODE_ACTIVEBACKUP
+ tristate "Active-backup mode support"
+ depends on NET_TEAM
+ ---help---
+ Only one port is active at a time and the rest of ports are used
+ for backup.
+
+ Mac addresses of ports are not modified. Userspace is responsible
+ to do so.
+
+ To compile this team mode as a module, choose M here: the module
+ will be called team_mode_activebackup.
+
+endif # NET_TEAM
diff --git a/drivers/net/team/Makefile b/drivers/net/team/Makefile
new file mode 100644
index 0000000..85f2028
--- /dev/null
+++ b/drivers/net/team/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the network team driver
+#
+
+obj-$(CONFIG_NET_TEAM) += team.o
+obj-$(CONFIG_NET_TEAM_MODE_ROUNDROBIN) += team_mode_roundrobin.o
+obj-$(CONFIG_NET_TEAM_MODE_ACTIVEBACKUP) += team_mode_activebackup.o
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
new file mode 100644
index 0000000..72853b3
--- /dev/null
+++ b/drivers/net/team/team.c
@@ -0,0 +1,1581 @@
+/*
+ * net/drivers/team/team.c - Network team device driver
+ * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+#include <linux/errno.h>
+#include <linux/ctype.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/socket.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/rtnetlink.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <linux/if_team.h>
+
+#define DRV_NAME "team"
+
+
+/**********
+ * Helpers
+ **********/
+
+#define team_port_exists(dev) (dev->priv_flags & IFF_TEAM_PORT)
+
+static struct team_port *team_port_get_rcu(const struct net_device *dev)
+{
+ struct team_port *port = rcu_dereference(dev->rx_handler_data);
+
+ return team_port_exists(dev) ? port : NULL;
+}
+
+static struct team_port *team_port_get_rtnl(const struct net_device *dev)
+{
+ struct team_port *port = rtnl_dereference(dev->rx_handler_data);
+
+ return team_port_exists(dev) ? port : NULL;
+}
+
+/*
+ * Since the ability to change mac address for open port device is tested in
+ * team_port_add, this function can be called without control of return value
+ */
+static int __set_port_mac(struct net_device *port_dev,
+ const unsigned char *dev_addr)
+{
+ struct sockaddr addr;
+
+ memcpy(addr.sa_data, dev_addr, ETH_ALEN);
+ addr.sa_family = ARPHRD_ETHER;
+ return dev_set_mac_address(port_dev, &addr);
+}
+
+int team_port_set_orig_mac(struct team_port *port)
+{
+ return __set_port_mac(port->dev, port->orig.dev_addr);
+}
+
+int team_port_set_team_mac(struct team_port *port)
+{
+ return __set_port_mac(port->dev, port->team->dev->dev_addr);
+}
+EXPORT_SYMBOL(team_port_set_team_mac);
+
+
+/*******************
+ * Options handling
+ *******************/
+
+void team_options_register(struct team *team, struct team_option *option,
+ size_t option_count)
+{
+ int i;
+
+ for (i = 0; i < option_count; i++, option++)
+ list_add_tail(&option->list, &team->option_list);
+}
+EXPORT_SYMBOL(team_options_register);
+
+static void __team_options_change_check(struct team *team,
+ struct team_option *changed_option);
+
+static void __team_options_unregister(struct team *team,
+ struct team_option *option,
+ size_t option_count)
+{
+ int i;
+
+ for (i = 0; i < option_count; i++, option++)
+ list_del(&option->list);
+}
+
+void team_options_unregister(struct team *team, struct team_option *option,
+ size_t option_count)
+{
+ __team_options_unregister(team, option, option_count);
+ __team_options_change_check(team, NULL);
+}
+EXPORT_SYMBOL(team_options_unregister);
+
+static int team_option_get(struct team *team, struct team_option *option,
+ void *arg)
+{
+ return option->getter(team, arg);
+}
+
+static int team_option_set(struct team *team, struct team_option *option,
+ void *arg)
+{
+ int err;
+
+ err = option->setter(team, arg);
+ if (err)
+ return err;
+
+ __team_options_change_check(team, option);
+ return err;
+}
+
+/****************
+ * Mode handling
+ ****************/
+
+static LIST_HEAD(mode_list);
+static DEFINE_SPINLOCK(mode_list_lock);
+
+static struct team_mode *__find_mode(const char *kind)
+{
+ struct team_mode *mode;
+
+ list_for_each_entry(mode, &mode_list, list) {
+ if (strcmp(mode->kind, kind) == 0)
+ return mode;
+ }
+ return NULL;
+}
+
+static bool is_good_mode_name(const char *name)
+{
+ while (*name != '\0') {
+ if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+ return false;
+ name++;
+ }
+ return true;
+}
+
+int team_mode_register(struct team_mode *mode)
+{
+ int err = 0;
+
+ if (!is_good_mode_name(mode->kind) ||
+ mode->priv_size > TEAM_MODE_PRIV_SIZE)
+ return -EINVAL;
+ spin_lock(&mode_list_lock);
+ if (__find_mode(mode->kind)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+ list_add_tail(&mode->list, &mode_list);
+unlock:
+ spin_unlock(&mode_list_lock);
+ return err;
+}
+EXPORT_SYMBOL(team_mode_register);
+
+int team_mode_unregister(struct team_mode *mode)
+{
+ spin_lock(&mode_list_lock);
+ list_del_init(&mode->list);
+ spin_unlock(&mode_list_lock);
+ return 0;
+}
+EXPORT_SYMBOL(team_mode_unregister);
+
+static struct team_mode *team_mode_get(const char *kind)
+{
+ struct team_mode *mode;
+
+ spin_lock(&mode_list_lock);
+ mode = __find_mode(kind);
+ if (!mode) {
+ spin_unlock(&mode_list_lock);
+ request_module("team-mode-%s", kind);
+ spin_lock(&mode_list_lock);
+ mode = __find_mode(kind);
+ }
+ if (mode)
+ if (!try_module_get(mode->owner))
+ mode = NULL;
+
+ spin_unlock(&mode_list_lock);
+ return mode;
+}
+
+static void team_mode_put(const struct team_mode *mode)
+{
+ module_put(mode->owner);
+}
+
+static bool team_dummy_transmit(struct team *team, struct sk_buff *skb)
+{
+ dev_kfree_skb(skb);
+ return false;
+}
+
+rx_handler_result_t team_dummy_receive(struct team *team,
+ struct team_port *port,
+ struct sk_buff *skb)
+{
+ return RX_HANDLER_ANOTHER;
+}
+
+static void team_adjust_ops(struct team *team)
+{
+ /*
+ * To avoid checks in rx/tx skb paths, ensure here that non-null and
+ * correct ops are always set.
+ */
+
+ if (list_empty(&team->port_list) ||
+ !team->mode || !team->mode->ops->transmit)
+ team->ops.transmit = team_dummy_transmit;
+ else
+ team->ops.transmit = team->mode->ops->transmit;
+
+ if (list_empty(&team->port_list) ||
+ !team->mode || !team->mode->ops->receive)
+ team->ops.receive = team_dummy_receive;
+ else
+ team->ops.receive = team->mode->ops->receive;
+}
+
+/*
+ * We can benefit from the fact that it's ensured no port is present
+ * at the time of mode change. Therefore no packets are in fly so there's no
+ * need to set mode operations in any special way.
+ */
+static int __team_change_mode(struct team *team,
+ const struct team_mode *new_mode)
+{
+ /* Check if mode was previously set and do cleanup if so */
+ if (team->mode) {
+ void (*exit_op)(struct team *team) = team->ops.exit;
+
+ /* Clear ops area so no callback is called any longer */
+ memset(&team->ops, 0, sizeof(struct team_mode_ops));
+ team_adjust_ops(team);
+
+ if (exit_op)
+ exit_op(team);
+ team_mode_put(team->mode);
+ team->mode = NULL;
+ /* zero private data area */
+ memset(&team->mode_priv, 0,
+ sizeof(struct team) - offsetof(struct team, mode_priv));
+ }
+
+ if (!new_mode)
+ return 0;
+
+ if (new_mode->ops->init) {
+ int err;
+
+ err = new_mode->ops->init(team);
+ if (err)
+ return err;
+ }
+
+ team->mode = new_mode;
+ memcpy(&team->ops, new_mode->ops, sizeof(struct team_mode_ops));
+ team_adjust_ops(team);
+
+ return 0;
+}
+
+static int team_change_mode(struct team *team, const char *kind)
+{
+ struct team_mode *new_mode;
+ struct net_device *dev = team->dev;
+ int err;
+
+ if (!list_empty(&team->port_list)) {
+ netdev_err(dev, "No ports can be present during mode change\n");
+ return -EBUSY;
+ }
+
+ if (team->mode && strcmp(team->mode->kind, kind) == 0) {
+ netdev_err(dev, "Unable to change to the same mode the team is in\n");
+ return -EINVAL;
+ }
+
+ new_mode = team_mode_get(kind);
+ if (!new_mode) {
+ netdev_err(dev, "Mode \"%s\" not found\n", kind);
+ return -EINVAL;
+ }
+
+ err = __team_change_mode(team, new_mode);
+ if (err) {
+ netdev_err(dev, "Failed to change to mode \"%s\"\n", kind);
+ team_mode_put(new_mode);
+ return err;
+ }
+
+ netdev_info(dev, "Mode changed to \"%s\"\n", kind);
+ return 0;
+}
+
+
+/************************
+ * Rx path frame handler
+ ************************/
+
+/* note: already called with rcu_read_lock */
+static rx_handler_result_t team_handle_frame(struct sk_buff **pskb)
+{
+ struct sk_buff *skb = *pskb;
+ struct team_port *port;
+ struct team *team;
+ rx_handler_result_t res;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return RX_HANDLER_CONSUMED;
+
+ *pskb = skb;
+
+ port = team_port_get_rcu(skb->dev);
+ team = port->team;
+
+ res = team->ops.receive(team, port, skb);
+ if (res == RX_HANDLER_ANOTHER) {
+ struct team_pcpu_stats *pcpu_stats;
+
+ pcpu_stats = this_cpu_ptr(team->pcpu_stats);
+ u64_stats_update_begin(&pcpu_stats->syncp);
+ pcpu_stats->rx_packets++;
+ pcpu_stats->rx_bytes += skb->len;
+ if (skb->pkt_type == PACKET_MULTICAST)
+ pcpu_stats->rx_multicast++;
+ u64_stats_update_end(&pcpu_stats->syncp);
+
+ skb->dev = team->dev;
+ } else {
+ this_cpu_inc(team->pcpu_stats->rx_dropped);
+ }
+
+ return res;
+}
+
+
+/****************
+ * Port handling
+ ****************/
+
+static bool team_port_find(const struct team *team,
+ const struct team_port *port)
+{
+ struct team_port *cur;
+
+ list_for_each_entry(cur, &team->port_list, list)
+ if (cur == port)
+ return true;
+ return false;
+}
+
+/*
+ * Add/delete port to the team port list. Write guarded by rtnl_lock.
+ * Takes care of correct port->index setup (might be racy).
+ */
+static void team_port_list_add_port(struct team *team,
+ struct team_port *port)
+{
+ port->index = team->port_count++;
+ hlist_add_head_rcu(&port->hlist,
+ team_port_index_hash(team, port->index));
+ list_add_tail_rcu(&port->list, &team->port_list);
+}
+
+static void __reconstruct_port_hlist(struct team *team, int rm_index)
+{
+ int i;
+ struct team_port *port;
+
+ for (i = rm_index + 1; i < team->port_count; i++) {
+ port = team_get_port_by_index(team, i);
+ hlist_del_rcu(&port->hlist);
+ port->index--;
+ hlist_add_head_rcu(&port->hlist,
+ team_port_index_hash(team, port->index));
+ }
+}
+
+static void team_port_list_del_port(struct team *team,
+ struct team_port *port)
+{
+ int rm_index = port->index;
+
+ hlist_del_rcu(&port->hlist);
+ list_del_rcu(&port->list);
+ __reconstruct_port_hlist(team, rm_index);
+ team->port_count--;
+}
+
+#define TEAM_VLAN_FEATURES (NETIF_F_ALL_CSUM | NETIF_F_SG | \
+ NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
+ NETIF_F_HIGHDMA | NETIF_F_LRO)
+
+static void __team_compute_features(struct team *team)
+{
+ struct team_port *port;
+ u32 vlan_features = TEAM_VLAN_FEATURES;
+ unsigned short max_hard_header_len = ETH_HLEN;
+
+ list_for_each_entry(port, &team->port_list, list) {
+ vlan_features = netdev_increment_features(vlan_features,
+ port->dev->vlan_features,
+ TEAM_VLAN_FEATURES);
+
+ if (port->dev->hard_header_len > max_hard_header_len)
+ max_hard_header_len = port->dev->hard_header_len;
+ }
+
+ team->dev->vlan_features = vlan_features;
+ team->dev->hard_header_len = max_hard_header_len;
+
+ netdev_change_features(team->dev);
+}
+
+static void team_compute_features(struct team *team)
+{
+ spin_lock(&team->lock);
+ __team_compute_features(team);
+ spin_unlock(&team->lock);
+}
+
+static int team_port_enter(struct team *team, struct team_port *port)
+{
+ int err = 0;
+
+ dev_hold(team->dev);
+ port->dev->priv_flags |= IFF_TEAM_PORT;
+ if (team->ops.port_enter) {
+ err = team->ops.port_enter(team, port);
+ if (err) {
+ netdev_err(team->dev, "Device %s failed to enter team mode\n",
+ port->dev->name);
+ goto err_port_enter;
+ }
+ }
+
+ return 0;
+
+err_port_enter:
+ port->dev->priv_flags &= ~IFF_TEAM_PORT;
+ dev_put(team->dev);
+
+ return err;
+}
+
+static void team_port_leave(struct team *team, struct team_port *port)
+{
+ if (team->ops.port_leave)
+ team->ops.port_leave(team, port);
+ port->dev->priv_flags &= ~IFF_TEAM_PORT;
+ dev_put(team->dev);
+}
+
+static void __team_port_change_check(struct team_port *port, bool linkup);
+
+static int team_port_add(struct team *team, struct net_device *port_dev)
+{
+ struct net_device *dev = team->dev;
+ struct team_port *port;
+ char *portname = port_dev->name;
+ int err;
+
+ if (port_dev->flags & IFF_LOOPBACK ||
+ port_dev->type != ARPHRD_ETHER) {
+ netdev_err(dev, "Device %s is of an unsupported type\n",
+ portname);
+ return -EINVAL;
+ }
+
+ if (team_port_exists(port_dev)) {
+ netdev_err(dev, "Device %s is already a port "
+ "of a team device\n", portname);
+ return -EBUSY;
+ }
+
+ if (port_dev->flags & IFF_UP) {
+ netdev_err(dev, "Device %s is up. Set it down before adding it as a team port\n",
+ portname);
+ return -EBUSY;
+ }
+
+ port = kzalloc(sizeof(struct team_port), GFP_KERNEL);
+ if (!port)
+ return -ENOMEM;
+
+ port->dev = port_dev;
+ port->team = team;
+
+ port->orig.mtu = port_dev->mtu;
+ err = dev_set_mtu(port_dev, dev->mtu);
+ if (err) {
+ netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err);
+ goto err_set_mtu;
+ }
+
+ memcpy(port->orig.dev_addr, port_dev->dev_addr, ETH_ALEN);
+
+ err = team_port_enter(team, port);
+ if (err) {
+ netdev_err(dev, "Device %s failed to enter team mode\n",
+ portname);
+ goto err_port_enter;
+ }
+
+ err = dev_open(port_dev);
+ if (err) {
+ netdev_dbg(dev, "Device %s opening failed\n",
+ portname);
+ goto err_dev_open;
+ }
+
+ err = netdev_set_master(port_dev, dev);
+ if (err) {
+ netdev_err(dev, "Device %s failed to set master\n", portname);
+ goto err_set_master;
+ }
+
+ err = netdev_rx_handler_register(port_dev, team_handle_frame,
+ port);
+ if (err) {
+ netdev_err(dev, "Device %s failed to register rx_handler\n",
+ portname);
+ goto err_handler_register;
+ }
+
+ team_port_list_add_port(team, port);
+ team_adjust_ops(team);
+ __team_compute_features(team);
+ __team_port_change_check(port, !!netif_carrier_ok(port_dev));
+
+ netdev_info(dev, "Port device %s added\n", portname);
+
+ return 0;
+
+err_handler_register:
+ netdev_set_master(port_dev, NULL);
+
+err_set_master:
+ dev_close(port_dev);
+
+err_dev_open:
+ team_port_leave(team, port);
+ team_port_set_orig_mac(port);
+
+err_port_enter:
+ dev_set_mtu(port_dev, port->orig.mtu);
+
+err_set_mtu:
+ kfree(port);
+
+ return err;
+}
+
+static int team_port_del(struct team *team, struct net_device *port_dev)
+{
+ struct net_device *dev = team->dev;
+ struct team_port *port;
+ char *portname = port_dev->name;
+
+ port = team_port_get_rtnl(port_dev);
+ if (!port || !team_port_find(team, port)) {
+ netdev_err(dev, "Device %s does not act as a port of this team\n",
+ portname);
+ return -ENOENT;
+ }
+
+ __team_port_change_check(port, false);
+ team_port_list_del_port(team, port);
+ team_adjust_ops(team);
+ netdev_rx_handler_unregister(port_dev);
+ netdev_set_master(port_dev, NULL);
+ dev_close(port_dev);
+ team_port_leave(team, port);
+ team_port_set_orig_mac(port);
+ dev_set_mtu(port_dev, port->orig.mtu);
+ synchronize_rcu();
+ kfree(port);
+ netdev_info(dev, "Port device %s removed\n", portname);
+ __team_compute_features(team);
+
+ return 0;
+}
+
+
+/*****************
+ * Net device ops
+ *****************/
+
+static const char team_no_mode_kind[] = "*NOMODE*";
+
+static int team_mode_option_get(struct team *team, void *arg)
+{
+ const char **str = arg;
+
+ *str = team->mode ? team->mode->kind : team_no_mode_kind;
+ return 0;
+}
+
+static int team_mode_option_set(struct team *team, void *arg)
+{
+ const char **str = arg;
+
+ return team_change_mode(team, *str);
+}
+
+static struct team_option team_options[] = {
+ {
+ .name = "mode",
+ .type = TEAM_OPTION_TYPE_STRING,
+ .getter = team_mode_option_get,
+ .setter = team_mode_option_set,
+ },
+};
+
+static int team_init(struct net_device *dev)
+{
+ struct team *team = netdev_priv(dev);
+ int i;
+
+ team->dev = dev;
+ spin_lock_init(&team->lock);
+
+ team->pcpu_stats = alloc_percpu(struct team_pcpu_stats);
+ if (!team->pcpu_stats)
+ return -ENOMEM;
+
+ for (i = 0; i < TEAM_PORT_HASHENTRIES; i++)
+ INIT_HLIST_HEAD(&team->port_hlist[i]);
+ INIT_LIST_HEAD(&team->port_list);
+
+ team_adjust_ops(team);
+
+ INIT_LIST_HEAD(&team->option_list);
+ team_options_register(team, team_options, ARRAY_SIZE(team_options));
+ netif_carrier_off(dev);
+
+ return 0;
+}
+
+static void team_uninit(struct net_device *dev)
+{
+ struct team *team = netdev_priv(dev);
+ struct team_port *port;
+ struct team_port *tmp;
+
+ spin_lock(&team->lock);
+ list_for_each_entry_safe(port, tmp, &team->port_list, list)
+ team_port_del(team, port->dev);
+
+ __team_change_mode(team, NULL); /* cleanup */
+ __team_options_unregister(team, team_options, ARRAY_SIZE(team_options));
+ spin_unlock(&team->lock);
+}
+
+static void team_destructor(struct net_device *dev)
+{
+ struct team *team = netdev_priv(dev);
+
+ free_percpu(team->pcpu_stats);
+ free_netdev(dev);
+}
+
+static int team_open(struct net_device *dev)
+{
+ netif_carrier_on(dev);
+ return 0;
+}
+
+static int team_close(struct net_device *dev)
+{
+ netif_carrier_off(dev);
+ return 0;
+}
+
+/*
+ * note: already called with rcu_read_lock
+ */
+static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct team *team = netdev_priv(dev);
+ bool tx_success = false;
+ unsigned int len = skb->len;
+
+ tx_success = team->ops.transmit(team, skb);
+ if (tx_success) {
+ struct team_pcpu_stats *pcpu_stats;
+
+ pcpu_stats = this_cpu_ptr(team->pcpu_stats);
+ u64_stats_update_begin(&pcpu_stats->syncp);
+ pcpu_stats->tx_packets++;
+ pcpu_stats->tx_bytes += len;
+ u64_stats_update_end(&pcpu_stats->syncp);
+ } else {
+ this_cpu_inc(team->pcpu_stats->tx_dropped);
+ }
+
+ return NETDEV_TX_OK;
+}
+
+static void team_change_rx_flags(struct net_device *dev, int change)
+{
+ struct team *team = netdev_priv(dev);
+ struct team_port *port;
+ int inc;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(port, &team->port_list, list) {
+ if (change & IFF_PROMISC) {
+ inc = dev->flags & IFF_PROMISC ? 1 : -1;
+ dev_set_promiscuity(port->dev, inc);
+ }
+ if (change & IFF_ALLMULTI) {
+ inc = dev->flags & IFF_ALLMULTI ? 1 : -1;
+ dev_set_allmulti(port->dev, inc);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static void team_set_rx_mode(struct net_device *dev)
+{
+ struct team *team = netdev_priv(dev);
+ struct team_port *port;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(port, &team->port_list, list) {
+ dev_uc_sync(port->dev, dev);
+ dev_mc_sync(port->dev, dev);
+ }
+ rcu_read_unlock();
+}
+
+static int team_set_mac_address(struct net_device *dev, void *p)
+{
+ struct team *team = netdev_priv(dev);
+ struct team_port *port;
+ struct sockaddr *addr = p;
+
+ memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
+ rcu_read_lock();
+ list_for_each_entry_rcu(port, &team->port_list, list)
+ if (team->ops.port_change_mac)
+ team->ops.port_change_mac(team, port);
+ rcu_read_unlock();
+ return 0;
+}
+
+static int team_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct team *team = netdev_priv(dev);
+ struct team_port *port;
+ int err;
+
+ /*
+ * Alhough this is reader, it's guarded by team lock. It's not possible
+ * to traverse list in reverse under rcu_read_lock
+ */
+ spin_lock(&team->lock);
+ list_for_each_entry(port, &team->port_list, list) {
+ err = dev_set_mtu(port->dev, new_mtu);
+ if (err) {
+ netdev_err(dev, "Device %s failed to change mtu",
+ port->dev->name);
+ goto unwind;
+ }
+ }
+ spin_unlock(&team->lock);
+
+ dev->mtu = new_mtu;
+
+ return 0;
+
+unwind:
+ list_for_each_entry_continue_reverse(port, &team->port_list, list)
+ dev_set_mtu(port->dev, dev->mtu);
+ spin_unlock(&team->lock);
+
+ return err;
+}
+
+static struct rtnl_link_stats64 *
+team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+ struct team *team = netdev_priv(dev);
+ struct team_pcpu_stats *p;
+ u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes;
+ u32 rx_dropped = 0, tx_dropped = 0;
+ unsigned int start;
+ int i;
+
+ for_each_possible_cpu(i) {
+ p = per_cpu_ptr(team->pcpu_stats, i);
+ do {
+ start = u64_stats_fetch_begin_bh(&p->syncp);
+ rx_packets = p->rx_packets;
+ rx_bytes = p->rx_bytes;
+ rx_multicast = p->rx_multicast;
+ tx_packets = p->tx_packets;
+ tx_bytes = p->tx_bytes;
+ } while (u64_stats_fetch_retry_bh(&p->syncp, start));
+
+ stats->rx_packets += rx_packets;
+ stats->rx_bytes += rx_bytes;
+ stats->multicast += rx_multicast;
+ stats->tx_packets += tx_packets;
+ stats->tx_bytes += tx_bytes;
+ /*
+ * rx_dropped & tx_dropped are u32, updated
+ * without syncp protection.
+ */
+ rx_dropped += p->rx_dropped;
+ tx_dropped += p->tx_dropped;
+ }
+ stats->rx_dropped = rx_dropped;
+ stats->tx_dropped = tx_dropped;
+ return stats;
+}
+
+static void team_vlan_rx_add_vid(struct net_device *dev, uint16_t vid)
+{
+ struct team *team = netdev_priv(dev);
+ struct team_port *port;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(port, &team->port_list, list) {
+ const struct net_device_ops *ops = port->dev->netdev_ops;
+
+ ops->ndo_vlan_rx_add_vid(port->dev, vid);
+ }
+ rcu_read_unlock();
+}
+
+static void team_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid)
+{
+ struct team *team = netdev_priv(dev);
+ struct team_port *port;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(port, &team->port_list, list) {
+ const struct net_device_ops *ops = port->dev->netdev_ops;
+
+ ops->ndo_vlan_rx_kill_vid(port->dev, vid);
+ }
+ rcu_read_unlock();
+}
+
+static int team_add_slave(struct net_device *dev, struct net_device *port_dev)
+{
+ struct team *team = netdev_priv(dev);
+ int err;
+
+ spin_lock(&team->lock);
+ err = team_port_add(team, port_dev);
+ spin_unlock(&team->lock);
+ return err;
+}
+
+static int team_del_slave(struct net_device *dev, struct net_device *port_dev)
+{
+ struct team *team = netdev_priv(dev);
+ int err;
+
+ spin_lock(&team->lock);
+ err = team_port_del(team, port_dev);
+ spin_unlock(&team->lock);
+ return err;
+}
+
+static const struct net_device_ops team_netdev_ops = {
+ .ndo_init = team_init,
+ .ndo_uninit = team_uninit,
+ .ndo_open = team_open,
+ .ndo_stop = team_close,
+ .ndo_start_xmit = team_xmit,
+ .ndo_change_rx_flags = team_change_rx_flags,
+ .ndo_set_rx_mode = team_set_rx_mode,
+ .ndo_set_mac_address = team_set_mac_address,
+ .ndo_change_mtu = team_change_mtu,
+ .ndo_get_stats64 = team_get_stats64,
+ .ndo_vlan_rx_add_vid = team_vlan_rx_add_vid,
+ .ndo_vlan_rx_kill_vid = team_vlan_rx_kill_vid,
+ .ndo_add_slave = team_add_slave,
+ .ndo_del_slave = team_del_slave,
+};
+
+
+/***********************
+ * rt netlink interface
+ ***********************/
+
+static void team_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->netdev_ops = &team_netdev_ops;
+ dev->destructor = team_destructor;
+ dev->tx_queue_len = 0;
+ dev->flags |= IFF_MULTICAST;
+ dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+
+ /*
+ * Indicate we support unicast address filtering. That way core won't
+ * bring us to promisc mode in case a unicast addr is added.
+ * Let this up to underlay drivers.
+ */
+ dev->priv_flags |= IFF_UNICAST_FLT;
+
+ dev->features |= NETIF_F_LLTX;
+ dev->features |= NETIF_F_GRO;
+ dev->hw_features = NETIF_F_HW_VLAN_TX |
+ NETIF_F_HW_VLAN_RX |
+ NETIF_F_HW_VLAN_FILTER;
+
+ dev->features |= dev->hw_features;
+}
+
+static int team_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ int err;
+
+ if (tb[IFLA_ADDRESS] == NULL)
+ random_ether_addr(dev->dev_addr);
+
+ err = register_netdevice(dev);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int team_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+ return 0;
+}
+
+static struct rtnl_link_ops team_link_ops __read_mostly = {
+ .kind = DRV_NAME,
+ .priv_size = sizeof(struct team),
+ .setup = team_setup,
+ .newlink = team_newlink,
+ .validate = team_validate,
+};
+
+
+/***********************************
+ * Generic netlink custom interface
+ ***********************************/
+
+static struct genl_family team_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .name = TEAM_GENL_NAME,
+ .version = TEAM_GENL_VERSION,
+ .maxattr = TEAM_ATTR_MAX,
+ .netnsok = true,
+};
+
+static const struct nla_policy team_nl_policy[TEAM_ATTR_MAX + 1] = {
+ [TEAM_ATTR_UNSPEC] = { .type = NLA_UNSPEC, },
+ [TEAM_ATTR_TEAM_IFINDEX] = { .type = NLA_U32 },
+ [TEAM_ATTR_LIST_OPTION] = { .type = NLA_NESTED },
+ [TEAM_ATTR_LIST_PORT] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+team_nl_option_policy[TEAM_ATTR_OPTION_MAX + 1] = {
+ [TEAM_ATTR_OPTION_UNSPEC] = { .type = NLA_UNSPEC, },
+ [TEAM_ATTR_OPTION_NAME] = {
+ .type = NLA_STRING,
+ .len = TEAM_STRING_MAX_LEN,
+ },
+ [TEAM_ATTR_OPTION_CHANGED] = { .type = NLA_FLAG },
+ [TEAM_ATTR_OPTION_TYPE] = { .type = NLA_U8 },
+ [TEAM_ATTR_OPTION_DATA] = {
+ .type = NLA_BINARY,
+ .len = TEAM_STRING_MAX_LEN,
+ },
+};
+
+static int team_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ int err;
+
+ msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq,
+ &team_nl_family, 0, TEAM_CMD_NOOP);
+ if (IS_ERR(hdr)) {
+ err = PTR_ERR(hdr);
+ goto err_msg_put;
+ }
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_unicast(genl_info_net(info), msg, info->snd_pid);
+
+err_msg_put:
+ nlmsg_free(msg);
+
+ return err;
+}
+
+/*
+ * Netlink cmd functions should be locked by following two functions.
+ * To ensure team_uninit would not be called in between, hold rcu_read_lock
+ * all the time.
+ */
+static struct team *team_nl_team_get(struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ int ifindex;
+ struct net_device *dev;
+ struct team *team;
+
+ if (!info->attrs[TEAM_ATTR_TEAM_IFINDEX])
+ return NULL;
+
+ ifindex = nla_get_u32(info->attrs[TEAM_ATTR_TEAM_IFINDEX]);
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev || dev->netdev_ops != &team_netdev_ops) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ team = netdev_priv(dev);
+ spin_lock(&team->lock);
+ return team;
+}
+
+static void team_nl_team_put(struct team *team)
+{
+ spin_unlock(&team->lock);
+ rcu_read_unlock();
+}
+
+static int team_nl_send_generic(struct genl_info *info, struct team *team,
+ int (*fill_func)(struct sk_buff *skb,
+ struct genl_info *info,
+ int flags, struct team *team))
+{
+ struct sk_buff *skb;
+ int err;
+
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ err = fill_func(skb, info, NLM_F_ACK, team);
+ if (err < 0)
+ goto err_fill;
+
+ err = genlmsg_unicast(genl_info_net(info), skb, info->snd_pid);
+ return err;
+
+err_fill:
+ nlmsg_free(skb);
+ return err;
+}
+
+static int team_nl_fill_options_get_changed(struct sk_buff *skb,
+ u32 pid, u32 seq, int flags,
+ struct team *team,
+ struct team_option *changed_option)
+{
+ struct nlattr *option_list;
+ void *hdr;
+ struct team_option *option;
+
+ hdr = genlmsg_put(skb, pid, seq, &team_nl_family, flags,
+ TEAM_CMD_OPTIONS_GET);
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
+
+ NLA_PUT_U32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex);
+ option_list = nla_nest_start(skb, TEAM_ATTR_LIST_OPTION);
+ if (!option_list)
+ return -EMSGSIZE;
+
+ list_for_each_entry(option, &team->option_list, list) {
+ struct nlattr *option_item;
+ long arg;
+
+ option_item = nla_nest_start(skb, TEAM_ATTR_ITEM_OPTION);
+ if (!option_item)
+ goto nla_put_failure;
+ NLA_PUT_STRING(skb, TEAM_ATTR_OPTION_NAME, option->name);
+ if (option == changed_option)
+ NLA_PUT_FLAG(skb, TEAM_ATTR_OPTION_CHANGED);
+ switch (option->type) {
+ case TEAM_OPTION_TYPE_U32:
+ NLA_PUT_U8(skb, TEAM_ATTR_OPTION_TYPE, NLA_U32);
+ team_option_get(team, option, &arg);
+ NLA_PUT_U32(skb, TEAM_ATTR_OPTION_DATA, arg);
+ break;
+ case TEAM_OPTION_TYPE_STRING:
+ NLA_PUT_U8(skb, TEAM_ATTR_OPTION_TYPE, NLA_STRING);
+ team_option_get(team, option, &arg);
+ NLA_PUT_STRING(skb, TEAM_ATTR_OPTION_DATA,
+ (char *) arg);
+ break;
+ default:
+ BUG();
+ }
+ nla_nest_end(skb, option_item);
+ }
+
+ nla_nest_end(skb, option_list);
+ return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int team_nl_fill_options_get(struct sk_buff *skb,
+ struct genl_info *info, int flags,
+ struct team *team)
+{
+ return team_nl_fill_options_get_changed(skb, info->snd_pid,
+ info->snd_seq, NLM_F_ACK,
+ team, NULL);
+}
+
+static int team_nl_cmd_options_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct team *team;
+ int err;
+
+ team = team_nl_team_get(info);
+ if (!team)
+ return -EINVAL;
+
+ err = team_nl_send_generic(info, team, team_nl_fill_options_get);
+
+ team_nl_team_put(team);
+
+ return err;
+}
+
+static int team_nl_cmd_options_set(struct sk_buff *skb, struct genl_info *info)
+{
+ struct team *team;
+ int err = 0;
+ int i;
+ struct nlattr *nl_option;
+
+ team = team_nl_team_get(info);
+ if (!team)
+ return -EINVAL;
+
+ err = -EINVAL;
+ if (!info->attrs[TEAM_ATTR_LIST_OPTION]) {
+ err = -EINVAL;
+ goto team_put;
+ }
+
+ nla_for_each_nested(nl_option, info->attrs[TEAM_ATTR_LIST_OPTION], i) {
+ struct nlattr *mode_attrs[TEAM_ATTR_OPTION_MAX + 1];
+ enum team_option_type opt_type;
+ struct team_option *option;
+ char *opt_name;
+ bool opt_found = false;
+
+ if (nla_type(nl_option) != TEAM_ATTR_ITEM_OPTION) {
+ err = -EINVAL;
+ goto team_put;
+ }
+ err = nla_parse_nested(mode_attrs, TEAM_ATTR_OPTION_MAX,
+ nl_option, team_nl_option_policy);
+ if (err)
+ goto team_put;
+ if (!mode_attrs[TEAM_ATTR_OPTION_NAME] ||
+ !mode_attrs[TEAM_ATTR_OPTION_TYPE] ||
+ !mode_attrs[TEAM_ATTR_OPTION_DATA]) {
+ err = -EINVAL;
+ goto team_put;
+ }
+ switch (nla_get_u8(mode_attrs[TEAM_ATTR_OPTION_TYPE])) {
+ case NLA_U32:
+ opt_type = TEAM_OPTION_TYPE_U32;
+ break;
+ case NLA_STRING:
+ opt_type = TEAM_OPTION_TYPE_STRING;
+ break;
+ default:
+ goto team_put;
+ }
+
+ opt_name = nla_data(mode_attrs[TEAM_ATTR_OPTION_NAME]);
+ list_for_each_entry(option, &team->option_list, list) {
+ long arg;
+ struct nlattr *opt_data_attr;
+
+ if (option->type != opt_type ||
+ strcmp(option->name, opt_name))
+ continue;
+ opt_found = true;
+ opt_data_attr = mode_attrs[TEAM_ATTR_OPTION_DATA];
+ switch (opt_type) {
+ case TEAM_OPTION_TYPE_U32:
+ arg = nla_get_u32(opt_data_attr);
+ break;
+ case TEAM_OPTION_TYPE_STRING:
+ arg = (long) nla_data(opt_data_attr);
+ break;
+ default:
+ BUG();
+ }
+ err = team_option_set(team, option, &arg);
+ if (err)
+ goto team_put;
+ }
+ if (!opt_found) {
+ err = -ENOENT;
+ goto team_put;
+ }
+ }
+
+team_put:
+ team_nl_team_put(team);
+
+ return err;
+}
+
+static int team_nl_fill_port_list_get_changed(struct sk_buff *skb,
+ u32 pid, u32 seq, int flags,
+ struct team *team,
+ struct team_port *changed_port)
+{
+ struct nlattr *port_list;
+ void *hdr;
+ struct team_port *port;
+
+ hdr = genlmsg_put(skb, pid, seq, &team_nl_family, flags,
+ TEAM_CMD_PORT_LIST_GET);
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
+
+ NLA_PUT_U32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex);
+ port_list = nla_nest_start(skb, TEAM_ATTR_LIST_PORT);
+ if (!port_list)
+ return -EMSGSIZE;
+
+ list_for_each_entry(port, &team->port_list, list) {
+ struct nlattr *port_item;
+
+ port_item = nla_nest_start(skb, TEAM_ATTR_ITEM_PORT);
+ if (!port_item)
+ goto nla_put_failure;
+ NLA_PUT_U32(skb, TEAM_ATTR_PORT_IFINDEX, port->dev->ifindex);
+ if (port == changed_port)
+ NLA_PUT_FLAG(skb, TEAM_ATTR_PORT_CHANGED);
+ if (port->linkup)
+ NLA_PUT_FLAG(skb, TEAM_ATTR_PORT_LINKUP);
+ NLA_PUT_U32(skb, TEAM_ATTR_PORT_SPEED, port->speed);
+ NLA_PUT_U8(skb, TEAM_ATTR_PORT_DUPLEX, port->duplex);
+ nla_nest_end(skb, port_item);
+ }
+
+ nla_nest_end(skb, port_list);
+ return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int team_nl_fill_port_list_get(struct sk_buff *skb,
+ struct genl_info *info, int flags,
+ struct team *team)
+{
+ return team_nl_fill_port_list_get_changed(skb, info->snd_pid,
+ info->snd_seq, NLM_F_ACK,
+ team, NULL);
+}
+
+static int team_nl_cmd_port_list_get(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct team *team;
+ int err;
+
+ team = team_nl_team_get(info);
+ if (!team)
+ return -EINVAL;
+
+ err = team_nl_send_generic(info, team, team_nl_fill_port_list_get);
+
+ team_nl_team_put(team);
+
+ return err;
+}
+
+static struct genl_ops team_nl_ops[] = {
+ {
+ .cmd = TEAM_CMD_NOOP,
+ .doit = team_nl_cmd_noop,
+ .policy = team_nl_policy,
+ },
+ {
+ .cmd = TEAM_CMD_OPTIONS_SET,
+ .doit = team_nl_cmd_options_set,
+ .policy = team_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = TEAM_CMD_OPTIONS_GET,
+ .doit = team_nl_cmd_options_get,
+ .policy = team_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = TEAM_CMD_PORT_LIST_GET,
+ .doit = team_nl_cmd_port_list_get,
+ .policy = team_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+static struct genl_multicast_group team_change_event_mcgrp = {
+ .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME,
+};
+
+static int team_nl_send_event_options_get(struct team *team,
+ struct team_option *changed_option)
+{
+ struct sk_buff *skb;
+ int err;
+ struct net *net = dev_net(team->dev);
+
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ err = team_nl_fill_options_get_changed(skb, 0, 0, 0, team,
+ changed_option);
+ if (err < 0)
+ goto err_fill;
+
+ err = genlmsg_multicast_netns(net, skb, 0, team_change_event_mcgrp.id,
+ GFP_KERNEL);
+ return err;
+
+err_fill:
+ nlmsg_free(skb);
+ return err;
+}
+
+static int team_nl_send_event_port_list_get(struct team_port *port)
+{
+ struct sk_buff *skb;
+ int err;
+ struct net *net = dev_net(port->team->dev);
+
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ err = team_nl_fill_port_list_get_changed(skb, 0, 0, 0,
+ port->team, port);
+ if (err < 0)
+ goto err_fill;
+
+ err = genlmsg_multicast_netns(net, skb, 0, team_change_event_mcgrp.id,
+ GFP_KERNEL);
+ return err;
+
+err_fill:
+ nlmsg_free(skb);
+ return err;
+}
+
+static int team_nl_init(void)
+{
+ int err;
+
+ err = genl_register_family_with_ops(&team_nl_family, team_nl_ops,
+ ARRAY_SIZE(team_nl_ops));
+ if (err)
+ return err;
+
+ err = genl_register_mc_group(&team_nl_family, &team_change_event_mcgrp);
+ if (err)
+ goto err_change_event_grp_reg;
+
+ return 0;
+
+err_change_event_grp_reg:
+ genl_unregister_family(&team_nl_family);
+
+ return err;
+}
+
+static void team_nl_fini(void)
+{
+ genl_unregister_family(&team_nl_family);
+}
+
+
+/******************
+ * Change checkers
+ ******************/
+
+static void __team_options_change_check(struct team *team,
+ struct team_option *changed_option)
+{
+ int err;
+
+ err = team_nl_send_event_options_get(team, changed_option);
+ if (err)
+ netdev_warn(team->dev, "Failed to send options change via netlink\n");
+}
+
+/* rtnl lock is held */
+static void __team_port_change_check(struct team_port *port, bool linkup)
+{
+ int err;
+
+ if (port->linkup == linkup)
+ return;
+
+ port->linkup = linkup;
+ if (linkup) {
+ struct ethtool_cmd ecmd;
+
+ err = __ethtool_get_settings(port->dev, &ecmd);
+ if (!err) {
+ port->speed = ethtool_cmd_speed(&ecmd);
+ port->duplex = ecmd.duplex;
+ goto send_event;
+ }
+ }
+ port->speed = 0;
+ port->duplex = 0;
+
+send_event:
+ err = team_nl_send_event_port_list_get(port);
+ if (err)
+ netdev_warn(port->team->dev, "Failed to send port change of device %s via netlink\n",
+ port->dev->name);
+
+}
+
+static void team_port_change_check(struct team_port *port, bool linkup)
+{
+ struct team *team = port->team;
+
+ spin_lock(&team->lock);
+ __team_port_change_check(port, linkup);
+ spin_unlock(&team->lock);
+}
+
+/************************************
+ * Net device notifier event handler
+ ************************************/
+
+static int team_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = (struct net_device *) ptr;
+ struct team_port *port;
+
+ port = team_port_get_rtnl(dev);
+ if (!port)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ if (netif_carrier_ok(dev))
+ team_port_change_check(port, true);
+ case NETDEV_DOWN:
+ team_port_change_check(port, false);
+ case NETDEV_CHANGE:
+ if (netif_running(port->dev))
+ team_port_change_check(port,
+ !!netif_carrier_ok(port->dev));
+ break;
+ case NETDEV_UNREGISTER:
+ team_del_slave(port->team->dev, dev);
+ break;
+ case NETDEV_FEAT_CHANGE:
+ team_compute_features(port->team);
+ break;
+ case NETDEV_CHANGEMTU:
+ /* Forbid to change mtu of underlaying device */
+ return NOTIFY_BAD;
+ case NETDEV_PRE_TYPE_CHANGE:
+ /* Forbid to change type of underlaying device */
+ return NOTIFY_BAD;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block team_notifier_block __read_mostly = {
+ .notifier_call = team_device_event,
+};
+
+
+/***********************
+ * Module init and exit
+ ***********************/
+
+static int __init team_module_init(void)
+{
+ int err;
+
+ register_netdevice_notifier(&team_notifier_block);
+
+ err = rtnl_link_register(&team_link_ops);
+ if (err)
+ goto err_rtnl_reg;
+
+ err = team_nl_init();
+ if (err)
+ goto err_nl_init;
+
+ return 0;
+
+err_nl_init:
+ rtnl_link_unregister(&team_link_ops);
+
+err_rtnl_reg:
+ unregister_netdevice_notifier(&team_notifier_block);
+
+ return err;
+}
+
+static void __exit team_module_exit(void)
+{
+ team_nl_fini();
+ rtnl_link_unregister(&team_link_ops);
+ unregister_netdevice_notifier(&team_notifier_block);
+}
+
+module_init(team_module_init);
+module_exit(team_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jpirko@redhat.com>");
+MODULE_DESCRIPTION("Ethernet team device driver");
+MODULE_ALIAS_RTNL_LINK(DRV_NAME);
diff --git a/drivers/net/team/team_mode_activebackup.c b/drivers/net/team/team_mode_activebackup.c
new file mode 100644
index 0000000..b939a2c
--- /dev/null
+++ b/drivers/net/team/team_mode_activebackup.c
@@ -0,0 +1,137 @@
+/*
+ * net/drivers/team/team_mode_activebackup.c - Active-backup mode for team
+ * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <net/rtnetlink.h>
+#include <linux/if_team.h>
+
+struct ab_priv {
+ struct team_port __rcu *active_port;
+};
+
+static struct ab_priv *ab_priv(struct team *team)
+{
+ return (struct ab_priv *) &team->mode_priv;
+}
+
+static rx_handler_result_t ab_receive(struct team *team, struct team_port *port,
+ struct sk_buff *skb) {
+ struct team_port *active_port;
+
+ active_port = rcu_dereference(ab_priv(team)->active_port);
+ if (active_port != port)
+ return RX_HANDLER_EXACT;
+ return RX_HANDLER_ANOTHER;
+}
+
+static bool ab_transmit(struct team *team, struct sk_buff *skb)
+{
+ struct team_port *active_port;
+
+ active_port = rcu_dereference(ab_priv(team)->active_port);
+ if (unlikely(!active_port))
+ goto drop;
+ skb->dev = active_port->dev;
+ if (dev_queue_xmit(skb))
+ return false;
+ return true;
+
+drop:
+ dev_kfree_skb(skb);
+ return false;
+}
+
+static void ab_port_leave(struct team *team, struct team_port *port)
+{
+ if (ab_priv(team)->active_port == port)
+ rcu_assign_pointer(ab_priv(team)->active_port, NULL);
+}
+
+static int ab_active_port_get(struct team *team, void *arg)
+{
+ u32 *ifindex = arg;
+
+ *ifindex = 0;
+ if (ab_priv(team)->active_port)
+ *ifindex = ab_priv(team)->active_port->dev->ifindex;
+ return 0;
+}
+
+static int ab_active_port_set(struct team *team, void *arg)
+{
+ u32 *ifindex = arg;
+ struct team_port *port;
+
+ list_for_each_entry_rcu(port, &team->port_list, list) {
+ if (port->dev->ifindex == *ifindex) {
+ rcu_assign_pointer(ab_priv(team)->active_port, port);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static struct team_option ab_options[] = {
+ {
+ .name = "activeport",
+ .type = TEAM_OPTION_TYPE_U32,
+ .getter = ab_active_port_get,
+ .setter = ab_active_port_set,
+ },
+};
+
+int ab_init(struct team *team)
+{
+ team_options_register(team, ab_options, ARRAY_SIZE(ab_options));
+ return 0;
+}
+
+void ab_exit(struct team *team)
+{
+ team_options_unregister(team, ab_options, ARRAY_SIZE(ab_options));
+}
+
+static const struct team_mode_ops ab_mode_ops = {
+ .init = ab_init,
+ .exit = ab_exit,
+ .receive = ab_receive,
+ .transmit = ab_transmit,
+ .port_leave = ab_port_leave,
+};
+
+static struct team_mode ab_mode = {
+ .kind = "activebackup",
+ .owner = THIS_MODULE,
+ .priv_size = sizeof(struct ab_priv),
+ .ops = &ab_mode_ops,
+};
+
+static int __init ab_init_module(void)
+{
+ return team_mode_register(&ab_mode);
+}
+
+static void __exit ab_cleanup_module(void)
+{
+ team_mode_unregister(&ab_mode);
+}
+
+module_init(ab_init_module);
+module_exit(ab_cleanup_module);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jpirko@redhat.com>");
+MODULE_DESCRIPTION("Active-backup mode for team");
+MODULE_ALIAS("team-mode-activebackup");
diff --git a/drivers/net/team/team_mode_roundrobin.c b/drivers/net/team/team_mode_roundrobin.c
new file mode 100644
index 0000000..0374052
--- /dev/null
+++ b/drivers/net/team/team_mode_roundrobin.c
@@ -0,0 +1,107 @@
+/*
+ * net/drivers/team/team_mode_roundrobin.c - Round-robin mode for team
+ * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/if_team.h>
+
+struct rr_priv {
+ unsigned int sent_packets;
+};
+
+static struct rr_priv *rr_priv(struct team *team)
+{
+ return (struct rr_priv *) &team->mode_priv;
+}
+
+static struct team_port *__get_first_port_up(struct team *team,
+ struct team_port *port)
+{
+ struct team_port *cur;
+
+ if (port->linkup)
+ return port;
+ cur = port;
+ list_for_each_entry_continue_rcu(cur, &team->port_list, list)
+ if (cur->linkup)
+ return cur;
+ list_for_each_entry_rcu(cur, &team->port_list, list) {
+ if (cur == port)
+ break;
+ if (cur->linkup)
+ return cur;
+ }
+ return NULL;
+}
+
+static bool rr_transmit(struct team *team, struct sk_buff *skb)
+{
+ struct team_port *port;
+ int port_index;
+
+ port_index = rr_priv(team)->sent_packets++ % team->port_count;
+ port = team_get_port_by_index_rcu(team, port_index);
+ port = __get_first_port_up(team, port);
+ if (unlikely(!port))
+ goto drop;
+ skb->dev = port->dev;
+ if (dev_queue_xmit(skb))
+ return false;
+ return true;
+
+drop:
+ dev_kfree_skb(skb);
+ return false;
+}
+
+static int rr_port_enter(struct team *team, struct team_port *port)
+{
+ return team_port_set_team_mac(port);
+}
+
+static void rr_port_change_mac(struct team *team, struct team_port *port)
+{
+ team_port_set_team_mac(port);
+}
+
+static const struct team_mode_ops rr_mode_ops = {
+ .transmit = rr_transmit,
+ .port_enter = rr_port_enter,
+ .port_change_mac = rr_port_change_mac,
+};
+
+static struct team_mode rr_mode = {
+ .kind = "roundrobin",
+ .owner = THIS_MODULE,
+ .priv_size = sizeof(struct rr_priv),
+ .ops = &rr_mode_ops,
+};
+
+static int __init rr_init_module(void)
+{
+ return team_mode_register(&rr_mode);
+}
+
+static void __exit rr_cleanup_module(void)
+{
+ team_mode_unregister(&rr_mode);
+}
+
+module_init(rr_init_module);
+module_exit(rr_cleanup_module);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jpirko@redhat.com>");
+MODULE_DESCRIPTION("Round-robin mode for team");
+MODULE_ALIAS("team-mode-roundrobin");
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 619b565..0b091b3 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -185,6 +185,7 @@ header-y += if_pppol2tp.h
header-y += if_pppox.h
header-y += if_slip.h
header-y += if_strip.h
+header-y += if_team.h
header-y += if_tr.h
header-y += if_tun.h
header-y += if_tunnel.h
diff --git a/include/linux/if.h b/include/linux/if.h
index db20bd4..06b6ef6 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -79,6 +79,7 @@
#define IFF_TX_SKB_SHARING 0x10000 /* The interface supports sharing
* skbs on transmit */
#define IFF_UNICAST_FLT 0x20000 /* Supports unicast filtering */
+#define IFF_TEAM_PORT 0x40000 /* device used as team port */
#define IF_GET_IFACE 0x0001 /* for querying only */
#define IF_GET_PROTO 0x0002
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
new file mode 100644
index 0000000..14f6388
--- /dev/null
+++ b/include/linux/if_team.h
@@ -0,0 +1,242 @@
+/*
+ * include/linux/if_team.h - Network team device driver header
+ * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _LINUX_IF_TEAM_H_
+#define _LINUX_IF_TEAM_H_
+
+#ifdef __KERNEL__
+
+struct team_pcpu_stats {
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 rx_multicast;
+ u64 tx_packets;
+ u64 tx_bytes;
+ struct u64_stats_sync syncp;
+ u32 rx_dropped;
+ u32 tx_dropped;
+};
+
+struct team;
+
+struct team_port {
+ struct net_device *dev;
+ struct hlist_node hlist; /* node in hash list */
+ struct list_head list; /* node in ordinary list */
+ struct team *team;
+ int index;
+
+ /*
+ * A place for storing original values of the device before it
+ * become a port.
+ */
+ struct {
+ unsigned char dev_addr[MAX_ADDR_LEN];
+ unsigned int mtu;
+ } orig;
+
+ bool linkup;
+ u32 speed;
+ u8 duplex;
+
+ struct rcu_head rcu;
+};
+
+struct team_mode_ops {
+ int (*init)(struct team *team);
+ void (*exit)(struct team *team);
+ rx_handler_result_t (*receive)(struct team *team,
+ struct team_port *port,
+ struct sk_buff *skb);
+ bool (*transmit)(struct team *team, struct sk_buff *skb);
+ int (*port_enter)(struct team *team, struct team_port *port);
+ void (*port_leave)(struct team *team, struct team_port *port);
+ void (*port_change_mac)(struct team *team, struct team_port *port);
+};
+
+enum team_option_type {
+ TEAM_OPTION_TYPE_U32,
+ TEAM_OPTION_TYPE_STRING,
+};
+
+struct team_option {
+ struct list_head list;
+ const char *name;
+ enum team_option_type type;
+ int (*getter)(struct team *team, void *arg);
+ int (*setter)(struct team *team, void *arg);
+};
+
+struct team_mode {
+ struct list_head list;
+ const char *kind;
+ struct module *owner;
+ size_t priv_size;
+ const struct team_mode_ops *ops;
+};
+
+#define TEAM_PORT_HASHBITS 4
+#define TEAM_PORT_HASHENTRIES (1 << TEAM_PORT_HASHBITS)
+
+#define TEAM_MODE_PRIV_LONGS 4
+#define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS)
+
+struct team {
+ struct net_device *dev; /* associated netdevice */
+ struct team_pcpu_stats __percpu *pcpu_stats;
+
+ spinlock_t lock; /* used for overall locking, e.g. port lists write */
+
+ /*
+ * port lists with port count
+ */
+ int port_count;
+ struct hlist_head port_hlist[TEAM_PORT_HASHENTRIES];
+ struct list_head port_list;
+
+ struct list_head option_list;
+
+ const struct team_mode *mode;
+ struct team_mode_ops ops;
+ long mode_priv[TEAM_MODE_PRIV_LONGS];
+};
+
+static inline struct hlist_head *team_port_index_hash(struct team *team,
+ int port_index)
+{
+ return &team->port_hlist[port_index & (TEAM_PORT_HASHENTRIES - 1)];
+}
+
+static inline struct team_port *team_get_port_by_index(struct team *team,
+ int port_index)
+{
+ struct hlist_node *p;
+ struct team_port *port;
+ struct hlist_head *head = team_port_index_hash(team, port_index);
+
+ hlist_for_each_entry(port, p, head, hlist)
+ if (port->index == port_index)
+ return port;
+ return NULL;
+}
+static inline struct team_port *team_get_port_by_index_rcu(struct team *team,
+ int port_index)
+{
+ struct hlist_node *p;
+ struct team_port *port;
+ struct hlist_head *head = team_port_index_hash(team, port_index);
+
+ hlist_for_each_entry_rcu(port, p, head, hlist)
+ if (port->index == port_index)
+ return port;
+ return NULL;
+}
+
+extern int team_port_set_team_mac(struct team_port *port);
+extern void team_options_register(struct team *team,
+ struct team_option *option,
+ size_t option_count);
+extern void team_options_unregister(struct team *team,
+ struct team_option *option,
+ size_t option_count);
+extern int team_mode_register(struct team_mode *mode);
+extern int team_mode_unregister(struct team_mode *mode);
+
+#endif /* __KERNEL__ */
+
+#define TEAM_STRING_MAX_LEN 32
+
+/**********************************
+ * NETLINK_GENERIC netlink family.
+ **********************************/
+
+enum {
+ TEAM_CMD_NOOP,
+ TEAM_CMD_OPTIONS_SET,
+ TEAM_CMD_OPTIONS_GET,
+ TEAM_CMD_PORT_LIST_GET,
+
+ __TEAM_CMD_MAX,
+ TEAM_CMD_MAX = (__TEAM_CMD_MAX - 1),
+};
+
+enum {
+ TEAM_ATTR_UNSPEC,
+ TEAM_ATTR_TEAM_IFINDEX, /* u32 */
+ TEAM_ATTR_LIST_OPTION, /* nest */
+ TEAM_ATTR_LIST_PORT, /* nest */
+
+ __TEAM_ATTR_MAX,
+ TEAM_ATTR_MAX = __TEAM_ATTR_MAX - 1,
+};
+
+/* Nested layout of get/set msg:
+ *
+ * [TEAM_ATTR_LIST_OPTION]
+ * [TEAM_ATTR_ITEM_OPTION]
+ * [TEAM_ATTR_OPTION_*], ...
+ * [TEAM_ATTR_ITEM_OPTION]
+ * [TEAM_ATTR_OPTION_*], ...
+ * ...
+ * [TEAM_ATTR_LIST_PORT]
+ * [TEAM_ATTR_ITEM_PORT]
+ * [TEAM_ATTR_PORT_*], ...
+ * [TEAM_ATTR_ITEM_PORT]
+ * [TEAM_ATTR_PORT_*], ...
+ * ...
+ */
+
+enum {
+ TEAM_ATTR_ITEM_OPTION_UNSPEC,
+ TEAM_ATTR_ITEM_OPTION, /* nest */
+
+ __TEAM_ATTR_ITEM_OPTION_MAX,
+ TEAM_ATTR_ITEM_OPTION_MAX = __TEAM_ATTR_ITEM_OPTION_MAX - 1,
+};
+
+enum {
+ TEAM_ATTR_OPTION_UNSPEC,
+ TEAM_ATTR_OPTION_NAME, /* string */
+ TEAM_ATTR_OPTION_CHANGED, /* flag */
+ TEAM_ATTR_OPTION_TYPE, /* u8 */
+ TEAM_ATTR_OPTION_DATA, /* dynamic */
+
+ __TEAM_ATTR_OPTION_MAX,
+ TEAM_ATTR_OPTION_MAX = __TEAM_ATTR_OPTION_MAX - 1,
+};
+
+enum {
+ TEAM_ATTR_ITEM_PORT_UNSPEC,
+ TEAM_ATTR_ITEM_PORT, /* nest */
+
+ __TEAM_ATTR_ITEM_PORT_MAX,
+ TEAM_ATTR_ITEM_PORT_MAX = __TEAM_ATTR_ITEM_PORT_MAX - 1,
+};
+
+enum {
+ TEAM_ATTR_PORT_UNSPEC,
+ TEAM_ATTR_PORT_IFINDEX, /* u32 */
+ TEAM_ATTR_PORT_CHANGED, /* flag */
+ TEAM_ATTR_PORT_LINKUP, /* flag */
+ TEAM_ATTR_PORT_SPEED, /* u32 */
+ TEAM_ATTR_PORT_DUPLEX, /* u8 */
+
+ __TEAM_ATTR_PORT_MAX,
+ TEAM_ATTR_PORT_MAX = __TEAM_ATTR_PORT_MAX - 1,
+};
+
+/*
+ * NETLINK_GENERIC related info
+ */
+#define TEAM_GENL_NAME "team"
+#define TEAM_GENL_VERSION 0x1
+#define TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME "change_event"
+
+#endif /* _LINUX_IF_TEAM_H_ */
--
1.7.6
^ permalink raw reply related
* Re: [PATCH net-next] bnx2x: reduce skb truesize by 50%
From: Eric Dumazet @ 2011-11-10 15:27 UTC (permalink / raw)
To: eilong
Cc: David Miller, bhutchings@solarflare.com, pstaszewski@itcare.pl,
netdev@vger.kernel.org
In-Reply-To: <1320937526.307.0.camel@lb-tlvb-eilong.il.broadcom.com>
Le jeudi 10 novembre 2011 à 17:05 +0200, Eilon Greenstein a écrit :
> On Wed, 2011-11-09 at 16:29 -0800, Eric Dumazet wrote:
> > Le mercredi 09 novembre 2011 à 23:03 +0100, Eric Dumazet a écrit :
> >
> > > BTW, on my bnx2x adapter, even small UDP frames use more than PAGE_SIZE
> > > bytes :
> > >
> > > skb->truesize=4352 len=26 (payload only)
> > >
> >
> > > I wonder if we shouldnt increase SK_MEM_QUANTUM a bit to avoid
> > > ping/pong...
> > >
> > > -#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
> > > +#define SK_MEM_QUANTUM ((int)PAGE_SIZE * 2)
> > >
> >
> > Following patch also helps a lot, even with only two cpus (one handling
> > device interrupts, one running the application thread)
> >
> > [PATCH net-next] bnx2x: reduce skb truesize by ~50%
> >
> > bnx2x uses following formula to compute its rx_buf_sz :
> >
> > dev->mtu + 2*L1_CACHE_BYTES + 14 + 8 + 8
> >
> > Then core network adds NET_SKB_PAD and SKB_DATA_ALIGN(sizeof(struct
> > skb_shared_info))
> >
> > Final allocated size for skb head on x86_64 (L1_CACHE_BYTES = 64,
> > MTU=1500) : 2112 bytes : SLUB/SLAB round this to 4096 bytes.
> >
> > Since skb truesize is then bigger than SK_MEM_QUANTUM, we have lot of
> > false sharing because of mem_reclaim in UDP stack.
> >
> > One possible way to half truesize is to lower the need by 64 bytes (2112
> > -> 2048 bytes)
> >
> > This way, skb->truesize is lower than SK_MEM_QUANTUM and we get better
> > performance.
> >
> > (760.000 pps on a rx UDP monothread benchmark, instead of 720.000 pps)
> >
> >
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> > CC: Eilon Greenstein <eilong@broadcom.com>
> > ---
> > drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 11 ++++++++---
> > 1 file changed, 8 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> > index aec7212..ebbdc55 100644
> > --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> > +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> > @@ -1185,9 +1185,14 @@ struct bnx2x {
> > #define ETH_MAX_PACKET_SIZE 1500
> > #define ETH_MAX_JUMBO_PACKET_SIZE 9600
> >
> > - /* Max supported alignment is 256 (8 shift) */
> > -#define BNX2X_RX_ALIGN_SHIFT ((L1_CACHE_SHIFT < 8) ? \
> > - L1_CACHE_SHIFT : 8)
> > +/* Max supported alignment is 256 (8 shift)
> > + * It should ideally be min(L1_CACHE_SHIFT, 8)
> > + * Choosing 5 (32 bytes) permits to get skb heads of 2048 bytes
> > + * instead of 4096 bytes.
> > + * With SLUB/SLAB allocators, data will be cache line aligned anyway.
> > + */
> > +#define BNX2X_RX_ALIGN_SHIFT 5
> > +
>
> Hi Eric,
>
> This can seriously hurt the PCI utilization. So in scenarios in which
> the PCI is the bottle neck, you will see performance degradation. We are
> looking at alternatives to reduce the allocation, but it is taking a
> while. Please hold off with this patch.
What do you mean exactly ?
This patch doesnt change skb->data alignment, its still 64 bytes
aligned. (cqe_fp->placement_offset == 2). PCI utilization is the same.
Only SLOB could get a misalignement, but who uses SLOB for performance ?
Alternative would be to check why hardware need 2*L1_CACHE_BYTES extra
room for alignment... Normaly it could be 1*L1_CACHE_BYTES ?
/* FW use 2 Cache lines Alignment for start packet and size */
-#define BNX2X_FW_RX_ALIGN (2 << BNX2X_RX_ALIGN_SHIFT)
+#define BNX2X_FW_RX_ALIGN (1 << BNX2X_RX_ALIGN_SHIFT)
^ permalink raw reply
* Re: creating netdev queues on the fly?
From: Dave Taht @ 2011-11-10 15:25 UTC (permalink / raw)
To: Denys Fedoryshchenko; +Cc: Helmut Schaa, Johannes Berg, netdev, linux-wireless
In-Reply-To: <7be02f26a67fac4c7448a74f1f17aa01@visp.net.lb>
On Thu, Nov 10, 2011 at 3:55 PM, Denys Fedoryshchenko <denys@wbc.net.sa> wrote:
> On Thu, 10 Nov 2011 15:40:01 +0100, Helmut Schaa wrote:
>>>
>>> I think this might also make implementing reservation (tspec) easier.
>>> Not sure if anyone wants/needs that though.
>>
>> Wouldn't it be possible to implement something like this as a qdisc on top
>> of
>> mq that makes use of the current tx rate per station to distribute
>> the airtime
>> equitably?
>>
>> Of course this would require the qdisc to know the tx rate a priori but
>> for
>> mac80211 drivers we could just use last_tx_rate as an estimate ...
Need last_aggregation_depth bytes and packets and last feasible
versions of those for an estimate with n. completion rate, perhaps...
>>
>> Helmut
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
> Maybe someone will make something like "tfifo" in future :)
Did some of it last weekend. Code doesn't work yet - I mixed in too
many of the ideas mentioned in
my earlier, longer, more difficult to read missive. Time based queue
limits are an important part of the answer, however.
Our current fixed packet queue limits are merely a proxy for time. At
100Mbit ethernet, they were 100ms - which is conus distances...
and I'd hope that someone more knowledgable than I at these layers
take all this on...
Two notes:
1) Getting 'time' from the kernel is expensive. And: System time wanders.
mac80211 Wifi devices however do export a get_tsf function which could
be used as a relative-to-the-queue clock - and we actually don't need
accuracy down to the level of get_tsf (25ns)
2) You need to get a timestamp on entry to the first queue and check
against the allowable latency on exit from the last. So to construct a
tc chain you'd want a tfifo (timestamp on entry), fifot (check
timestamp against limit on dequeue), and for the simplest of
applications : tfifot (timestamp on entry, check on exit)
> And when clients are connected, each have his own queue.
Needed, yes. Particularly with mixed g/n, but also to optimize
aggregation AND fairness.
>
> then, for example qdisc add dev wlan0 parent 1:10 handle 10 tfifo limit
> 100ms
> If packet are older than 100ms will be dropped, or new packets are not
> added, if
> there is packet older than 100ms are not sent yet.
Aside from me calling it 'tfifot last weekend, yes. Though I prefer
using 'timelimit' as the key name, to distinguish between previous
overloaded uses of the word limit.
>
> I am not sure that bandwidth will be distributed fairly, it is different
> question,
> probably each queue should have some "limited chunk of time" to send data.
Round robin with random starting points for each cycle through the
stations in what I was calling a 'grouper' in the previous mail.
> And again, 802.11a/b/g at least are half-duplex and CSMA, and without
> polling/TDMA or CTS/RTS tricks
> it will be complicated to give guaranteed chunks of time.
I *think* but am not sure that what I have described interacts with
the EDCA reasonably well.
>
> P.S. That's just a dream :)
>
> ---
> Network engineer
> Denys Fedoryshchenko
>
> P.O.Box 41553 Jeddah 21531
> Tel: 920023422
> Fax: +966 26501784
> E-Mail: denys@wbc.net.sa
> --
> To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
FR Tel: 0638645374
http://www.bufferbloat.net
^ permalink raw reply
* Re: [RFC] The Linux kernel IPv6 stack don't follow the RFC 4942 recommendation
From: François-Xavier Le Bail @ 2011-11-10 15:23 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev@vger.kernel.org
In-Reply-To: <1320929648.97649.YahooMailNeo@web126007.mail.ne1.yahoo.com>
----- Original Message -----
> From: François-Xavier Le Bail <fx.lebail@yahoo.com>
> To: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: "netdev@vger.kernel.org" <netdev@vger.kernel.org>
> Sent: Thursday, November 10, 2011 1:54 PM
> Subject: Re: [RFC] The Linux kernel IPv6 stack don't follow the RFC 4942 recommendation
>
> ----- Original Message -----
>> From: Eric Dumazet <eric.dumazet@gmail.com>
>> To: François-Xavier Le Bail <fx.lebail@yahoo.com>
>> Cc: "netdev@vger.kernel.org" <netdev@vger.kernel.org>
>> Sent: Thursday, November 10, 2011 12:27 PM
>> Subject: Re: [RFC] The Linux kernel IPv6 stack don't follow the RFC
> 4942 recommendation
>>
>> Le jeudi 10 novembre 2011 à 02:58 -0800, François-Xavier Le Bail a
>> écrit :
>>> ----- Original Message -----
>>>
>>> > From: Eric Dumazet <eric.dumazet@gmail.com>
>>> > To: François-Xavier Le Bail <fx.lebail@yahoo.com>
>>> > Cc: "netdev@vger.kernel.org"
> <netdev@vger.kernel.org>
>>> > Sent: Saturday, November 5, 2011 10:30 AM
>>> > Subject: Re: [RFC] The Linux kernel IPv6 stack don't follow
> the
>> RFC 4942 recommendation
>>> >
>>> > Le samedi 05 novembre 2011 à 01:39 -0700, François-Xavier Le Bail
> a
>>> > écrit :
>>> >
>>> >>
>>> >> I will study and test these options for my application
> server
>>> >
>>> > Here is a sample of use of the IPv4 part, an udpecho service that
> use
>>> > IP_PKTINFO and IP_RECVTOS/IP_TOS to be able to use multihomed
> machine,
>>> > and reflect TOS field as well.
>>> > [. . .]
>>>
>>> Hi,
>>>
>>> I have updated the code for IPv6.
>>>
>>> When a UDP client send to an unicast address on a multihomed Linux
> 3.0.0
>> host, from another host, it's OK.
>>> For example :
>>> setup 2001::1 on eth0, 2a01::1 on eth1.
>>> send to 2001::1, recv from 2001::1.
>>> send to 2a01::1, recv from 2a01::1.
>>>
>>> When the UDP client send to an Subnet-Router anycast address on a
>> multihomed Linux 3.0.0 host, from another host, it's KO.
>>> send to 2001:: or 2a01::, the udpecho server display "sendmsg:
> Invalid
>> argument".
>>>
>>> Any idea ?
>>
>> Could you describe the setup of this machine ?
>>
>> ip -6 addr
>> ip -6 ro
>
> The server has ipv6 forwarding on.
>
> # ip -6 a
> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 16436
> inet6 ::1/128 scope host
> valid_lft forever preferred_lft forever
> 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qlen 1000
> inet6 2a01::1/64 scope global
> valid_lft forever preferred_lft forever
> inet6 fe80::20c:29ff:fecc:bc43/64 scope link
> valid_lft forever preferred_lft forever
> 3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qlen 1000
> inet6 2001::1/64 scope global
> valid_lft forever preferred_lft forever
> inet6 fe80::20c:29ff:fecc:bc4d/64 scope link
> valid_lft forever preferred_lft forever
>
> # ip -6 r
> 2001::/64 dev eth1 proto kernel metric 256
> 2a01::/64 dev eth0 proto kernel metric 256
> fe80::/64 dev eth1 proto kernel metric 256
> fe80::/64 dev eth0 proto kernel metric 256
> default via 2001::2 dev eth1 metric 1024
>
> 2001::2 is the address of the other (client) host.
I put a printk in addrconf.c at the beginning of ipv6_get_saddr_eval function.
In th OK case : nothing is print.
In the KO case : many printk messages.
Any problem in RFC 3484 code ?
Thanks,
Francois-Xavier
^ permalink raw reply
* [PATCH net-next 13/13] bnx2x: update driver version to 1.70.35-0
From: Dmitry Kravkov @ 2011-11-10 15:14 UTC (permalink / raw)
To: davem, netdev; +Cc: Dmitry Kravkov, Eilon Greenstein
In-Reply-To: <1320938054-31288-1-git-send-email-dmitry@broadcom.com>
Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 78613ef..4b90b51 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -23,8 +23,8 @@
* (you will need to reboot afterwards) */
/* #define BNX2X_STOP_ON_ERROR */
-#define DRV_MODULE_VERSION "1.70.30-0"
-#define DRV_MODULE_RELDATE "2011/10/25"
+#define DRV_MODULE_VERSION "1.70.35-0"
+#define DRV_MODULE_RELDATE "2011/11/10"
#define BNX2X_BC_VER 0x040200
#if defined(CONFIG_DCB)
--
1.7.7.2
^ permalink raw reply related
* [PATCH net-next 12/13] bnx2x: Remove on-stack napi struct variable
From: Dmitry Kravkov @ 2011-11-10 15:14 UTC (permalink / raw)
To: davem, netdev; +Cc: Ariel Elior, Eilon Greenstein
In-Reply-To: <1320938054-31288-1-git-send-email-dmitry@broadcom.com>
From: Ariel Elior <ariele@broadcom.com>
Signed-off-by: Ariel Elior <ariele@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 12 +++++++-----
1 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index e9a91a3..13dad92 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -79,19 +79,21 @@ static inline void bnx2x_bz_fp(struct bnx2x *bp, int index)
* @to: destination FP index
*
* Makes sure the contents of the bp->fp[to].napi is kept
- * intact.
+ * intact. This is done by first copying the napi struct from
+ * the target to the source, and then mem copying the entire
+ * source onto the target
*/
static inline void bnx2x_move_fp(struct bnx2x *bp, int from, int to)
{
struct bnx2x_fastpath *from_fp = &bp->fp[from];
struct bnx2x_fastpath *to_fp = &bp->fp[to];
- struct napi_struct orig_napi = to_fp->napi;
+
+ /* Copy the NAPI object as it has been already initialized */
+ from_fp->napi = to_fp->napi;
+
/* Move bnx2x_fastpath contents */
memcpy(to_fp, from_fp, sizeof(*to_fp));
to_fp->index = to;
-
- /* Restore the NAPI object as it has been already initialized */
- to_fp->napi = orig_napi;
}
int load_count[2][3] = { {0} }; /* per-path: 0-common, 1-port0, 2-port1 */
--
1.7.7.2
^ permalink raw reply related
* [PATCH net-next 11/13] bnx2x: prevent race in statistics flow
From: Dmitry Kravkov @ 2011-11-10 15:14 UTC (permalink / raw)
To: davem, netdev; +Cc: Dmitry Kravkov, Eilon Greenstein
In-Reply-To: <1320938054-31288-1-git-send-email-dmitry@broadcom.com>
The race may cause access of registers while MAC hw block is
in reset state. As a result syslog will show error messages.
We can prevent this by using state from local variable.
Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c | 4 +++-
1 files changed, 3 insertions(+), 1 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
index 02ac6a7..3034f0e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
@@ -1349,12 +1349,14 @@ void bnx2x_stats_handle(struct bnx2x *bp, enum bnx2x_stats_event event)
enum bnx2x_stats_state state;
if (unlikely(bp->panic))
return;
- bnx2x_stats_stm[bp->stats_state][event].action(bp);
+
spin_lock_bh(&bp->stats_lock);
state = bp->stats_state;
bp->stats_state = bnx2x_stats_stm[state][event].next_state;
spin_unlock_bh(&bp->stats_lock);
+ bnx2x_stats_stm[state][event].action(bp);
+
if ((event != STATS_EVENT_UPDATE) || netif_msg_timer(bp))
DP(BNX2X_MSG_STATS, "state %d -> event %d -> state %d\n",
state, event, bp->stats_state);
--
1.7.7.2
^ permalink raw reply related
* [PATCH net-next 10/13] bnx2x: add fan failure event handling
From: Dmitry Kravkov @ 2011-11-10 15:14 UTC (permalink / raw)
To: davem, netdev; +Cc: Ariel Elior, Dmitry Kravkov, Eilon Greenstein
In-Reply-To: <1320938054-31288-1-git-send-email-dmitry@broadcom.com>
From: Ariel Elior <ariele@broadcom.com>
Shut down the device in case of fan failure to prevent HW damage.
Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 3 ++
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 24 +++++++++++++++++++++-
2 files changed, 26 insertions(+), 1 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 23e91f4..78613ef 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1141,6 +1141,7 @@ struct bnx2x_fw_stats_data {
enum {
BNX2X_SP_RTNL_SETUP_TC,
BNX2X_SP_RTNL_TX_TIMEOUT,
+ BNX2X_SP_RTNL_FAN_FAILURE,
};
@@ -2048,6 +2049,8 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg, u32 expected, int ms,
#define BNX2X_VPD_LEN 128
#define VENDOR_ID_LEN 4
+int bnx2x_close(struct net_device *dev);
+
/* Congestion management fairness mode */
#define CMNG_FNS_NONE 0
#define CMNG_FNS_MINMAX 1
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 5b44b85..1f65391 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -3303,6 +3303,17 @@ static inline void bnx2x_fan_failure(struct bnx2x *bp)
netdev_err(bp->dev, "Fan Failure on Network Controller has caused"
" the driver to shutdown the card to prevent permanent"
" damage. Please contact OEM Support for assistance\n");
+
+ /*
+ * Scheudle device reset (unload)
+ * This is due to some boards consuming sufficient power when driver is
+ * up to overheat if fan fails.
+ */
+ smp_mb__before_clear_bit();
+ set_bit(BNX2X_SP_RTNL_FAN_FAILURE, &bp->sp_rtnl_state);
+ smp_mb__after_clear_bit();
+ schedule_delayed_work(&bp->sp_rtnl_task, 0);
+
}
static inline void bnx2x_attn_int_deasserted0(struct bnx2x *bp, u32 attn)
@@ -8507,6 +8518,17 @@ sp_rtnl_not_reset:
if (test_and_clear_bit(BNX2X_SP_RTNL_SETUP_TC, &bp->sp_rtnl_state))
bnx2x_setup_tc(bp->dev, bp->dcbx_port_params.ets.num_of_cos);
+ /*
+ * in case of fan failure we need to reset id if the "stop on error"
+ * debug flag is set, since we trying to prevent permanent overheating
+ * damage
+ */
+ if (test_and_clear_bit(BNX2X_SP_RTNL_FAN_FAILURE, &bp->sp_rtnl_state)) {
+ DP(BNX2X_MSG_SP, "fan failure detected. Unloading driver");
+ netif_device_detach(bp->dev);
+ bnx2x_close(bp->dev);
+ }
+
sp_rtnl_exit:
rtnl_unlock();
}
@@ -9992,7 +10014,7 @@ static int bnx2x_open(struct net_device *dev)
}
/* called with rtnl_lock */
-static int bnx2x_close(struct net_device *dev)
+int bnx2x_close(struct net_device *dev)
{
struct bnx2x *bp = netdev_priv(dev);
--
1.7.7.2
^ permalink raw reply related
* [PATCH net-next 09/13] bnx2x: add pri_map module parameter
From: Dmitry Kravkov @ 2011-11-10 15:14 UTC (permalink / raw)
To: davem, netdev; +Cc: Ariel Elior, Dmitry Kravkov, Eilon Greenstein
In-Reply-To: <1320938054-31288-1-git-send-email-dmitry@broadcom.com>
From: Ariel Elior <ariele@broadcom.com>
The optional parameter pri_map is used to map the
skb->priority to a Class Of Service (CoS) in the HW.
This 32 bit parameter is evaluated by the driver as 8
values of 4 bits each. Each nibble sets the desired
HW queue number for that priority.
Also:
on the 5771x family this feature is unavailable (a single COS services all).
on the 57712 family two classes of service are available.
on the 578xx family three classes of service are availabe.
configuring priorities to unavailable COSs will log an error and default to
COS 0.
Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 26 ++++++++++++++++++++++
1 files changed, 26 insertions(+), 0 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 967c41b..5b44b85 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -92,6 +92,10 @@ MODULE_FIRMWARE(FW_FILE_NAME_E1);
MODULE_FIRMWARE(FW_FILE_NAME_E1H);
MODULE_FIRMWARE(FW_FILE_NAME_E2);
+static uint pri_map;
+module_param(pri_map, uint, 0);
+MODULE_PARM_DESC(pri_map, " Priority to HW queue mapping");
+
static int multi_mode = 1;
module_param(multi_mode, int, 0);
MODULE_PARM_DESC(multi_mode, " Multi queue mode "
@@ -9682,6 +9686,25 @@ static int __devinit bnx2x_get_hwinfo(struct bnx2x *bp)
return rc;
}
+static void bnx2x_init_multi_cos(struct bnx2x *bp)
+{
+ int pri, cos;
+ for (pri = 0; pri < BNX2X_MAX_PRIORITY; pri++) {
+ cos = ((pri_map & (0xf << pri*4)) >> pri*4);
+ if (cos < bp->max_cos) {
+ bp->prio_to_cos[pri] = cos;
+ DP(BNX2X_MSG_SP, "configuring priority %d to cos %d",
+ pri, cos);
+ } else {
+ netdev_err(bp->dev,
+ "Illegal COS (%d) for priority %d "
+ "Max COS allowed is %d "
+ "defaulting to 0\n", cos, pri, bp->max_cos - 1);
+ bp->prio_to_cos[pri] = 0;
+ }
+ }
+}
+
static void __devinit bnx2x_read_fwinfo(struct bnx2x *bp)
{
int cnt, i, block_end, rodi;
@@ -10832,6 +10855,9 @@ static int __devinit bnx2x_init_one(struct pci_dev *pdev,
*/
bnx2x_set_int_mode(bp);
+ /* configure priority to cos map according to pri_map module param */
+ bnx2x_init_multi_cos(bp);
+
/* Add all NAPI objects */
bnx2x_add_all_napi(bp);
--
1.7.7.2
^ permalink raw reply related
* [PATCH net-next 08/13] bnx2x: remove unused #define
From: Dmitry Kravkov @ 2011-11-10 15:14 UTC (permalink / raw)
To: davem, netdev; +Cc: Dmitry Kravkov, Eilon Greenstein
In-Reply-To: <1320938054-31288-1-git-send-email-dmitry@broadcom.com>
Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 7 -------
1 files changed, 0 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index b78c384..23e91f4 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1984,13 +1984,6 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg, u32 expected, int ms,
#define HW_PRTY_ASSERT_SET_4 (AEU_INPUTS_ATTN_BITS_PGLUE_PARITY_ERROR | \
AEU_INPUTS_ATTN_BITS_ATC_PARITY_ERROR)
-#define RSS_FLAGS(bp) \
- (TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_CAPABILITY | \
- TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_TCP_CAPABILITY | \
- TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV6_CAPABILITY | \
- TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV6_TCP_CAPABILITY | \
- (bp->multi_mode << \
- TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_MODE_SHIFT))
#define MULTI_MASK 0x7f
--
1.7.7.2
^ permalink raw reply related
* [PATCH net-next 06/13] bnx2x: DCBX: use #define instead of magic
From: Dmitry Kravkov @ 2011-11-10 15:14 UTC (permalink / raw)
To: davem, netdev; +Cc: Dmitry Kravkov, Eilon Greenstein
In-Reply-To: <1320938054-31288-1-git-send-email-dmitry@broadcom.com>
Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c | 6 +++---
drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.h | 4 +++-
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c
index a0598fd..5051cf3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.c
@@ -874,7 +874,7 @@ static void bnx2x_dcbx_admin_mib_updated_params(struct bnx2x *bp,
/*For IEEE admin_recommendation_bw_precentage
*For IEEE admin_recommendation_ets_pg */
af->pfc.pri_en_bitmap = (u8)dp->admin_pfc_bitmap;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < DCBX_CONFIG_MAX_APP_PROTOCOL; i++) {
if (dp->admin_priority_app_table[i].valid) {
struct bnx2x_admin_priority_app_table *table =
dp->admin_priority_app_table;
@@ -2249,7 +2249,7 @@ static int bnx2x_set_admin_app_up(struct bnx2x *bp, u8 idtype, u16 idval, u8 up)
int i, ff;
/* iterate over the app entries looking for idtype and idval */
- for (i = 0, ff = -1; i < 4; i++) {
+ for (i = 0, ff = -1; i < DCBX_CONFIG_MAX_APP_PROTOCOL; i++) {
struct bnx2x_admin_priority_app_table *app_ent =
&bp->dcbx_config_params.admin_priority_app_table[i];
if (bnx2x_admin_app_is_equal(app_ent, idtype, idval))
@@ -2258,7 +2258,7 @@ static int bnx2x_set_admin_app_up(struct bnx2x *bp, u8 idtype, u16 idval, u8 up)
if (ff < 0 && !app_ent->valid)
ff = i;
}
- if (i < 4)
+ if (i < DCBX_CONFIG_MAX_APP_PROTOCOL)
/* if found overwrite up */
bp->dcbx_config_params.
admin_priority_app_table[i].priority = up;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.h
index 2c6a3bc..2ab9254 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_dcb.h
@@ -90,6 +90,7 @@ struct bnx2x_admin_priority_app_table {
u32 app_id;
};
+#define DCBX_CONFIG_MAX_APP_PROTOCOL 4
struct bnx2x_config_dcbx_params {
u32 overwrite_settings;
u32 admin_dcbx_version;
@@ -109,7 +110,8 @@ struct bnx2x_config_dcbx_params {
u32 admin_recommendation_bw_precentage[8];
u32 admin_recommendation_ets_pg[8];
u32 admin_pfc_bitmap;
- struct bnx2x_admin_priority_app_table admin_priority_app_table[4];
+ struct bnx2x_admin_priority_app_table
+ admin_priority_app_table[DCBX_CONFIG_MAX_APP_PROTOCOL];
u32 admin_default_priority;
};
--
1.7.7.2
^ permalink raw reply related
* [PATCH net-next 07/13] bnx2x: simplify definition of RX_SGE_MASK_LEN and use it.
From: Dmitry Kravkov @ 2011-11-10 15:14 UTC (permalink / raw)
To: davem, netdev; +Cc: Dmitry Kravkov, Eilon Greenstein
In-Reply-To: <1320938054-31288-1-git-send-email-dmitry@broadcom.com>
Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 3 +--
drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 3 +--
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index e17a739..b78c384 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -411,8 +411,7 @@ union db_prod {
/* Number of u64 elements in SGE mask array */
-#define RX_SGE_MASK_LEN ((NUM_RX_SGE_PAGES * RX_SGE_CNT) / \
- BIT_VEC64_ELEM_SZ)
+#define RX_SGE_MASK_LEN (NUM_RX_SGE / BIT_VEC64_ELEM_SZ)
#define RX_SGE_MASK_LEN_MASK (RX_SGE_MASK_LEN - 1)
#define NEXT_SGE_MASK_ELEM(el) (((el) + 1) & RX_SGE_MASK_LEN_MASK)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index 59f1291..e8efb01 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -874,8 +874,7 @@ static inline void bnx2x_clear_sge_mask_next_elems(struct bnx2x_fastpath *fp)
static inline void bnx2x_init_sge_ring_bit_mask(struct bnx2x_fastpath *fp)
{
/* Set the mask to all 1-s: it's faster to compare to 0 than to 0xf-s */
- memset(fp->sge_mask, 0xff,
- (NUM_RX_SGE >> BIT_VEC64_ELEM_SHIFT)*sizeof(u64));
+ memset(fp->sge_mask, 0xff, RX_SGE_MASK_LEN * sizeof(u64));
/* Clear the two last indices in the page to 1:
these are the indices that correspond to the "next" element,
--
1.7.7.2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox