* Re: SRIOV as bridge Re: [PATCH net-next RESEND] net: Do not call ndo_dflt_fdb_dump if ndo_fdb_dump is defined.
From: Jamal Hadi Salim @ 2014-12-22 13:04 UTC (permalink / raw)
To: Roopa Prabhu
Cc: John Fastabend, Hubert Sokolowski, netdev@vger.kernel.org,
Vlad Yasevich, Shrijeet Mukherjee
In-Reply-To: <54980A33.6000801@mojatatu.com>
On 12/22/14 07:10, Jamal Hadi Salim wrote:
> Actually you cant list them as netdevs (please someone
> correct me if i am wrong). What you see are PCI devices.
I have been corrected.
It turns out i was wrong. The VFs infact show up as ethx devices.
So your idea of having the devices with "bridge .. self" would work
except when the device moves to a VM and you cant see them anymore...
and you want to add an fdb entry to the embedded switch..
cheers,
jamal
^ permalink raw reply
* [PATCH iproute2] tc: Show classes in tree view
From: Vadim Kochan @ 2014-12-22 12:57 UTC (permalink / raw)
To: netdev; +Cc: Vadim Kochan
From: Vadim Kochan <vadim4j@gmail.com>
Date: Tue, 16 Dec 2014 22:59:30 +0200
Added new '-t[ree]' which shows classes dependency
in the tree view. Meanwhile only generic stats info
is supported.
e.g.:
$ tc/tc -t class show dev tap0
+---(1:2) htb rate 6Mbit ceil 6Mbit burst 15Kb cburst 1599b
| +---(1:40) htb prio 0 rate 5Mbit ceil 5Mbit burst 15Kb cburst 1600b
| +---(1:50) htb rate 3Mbit ceil 6Mbit burst 15Kb cburst 1599b
| | +---(1:51) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
| |
| +---(1:60) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
|
+---(1:1) htb rate 6Mbit ceil 6Mbit burst 15Kb cburst 1599b
+---(1:10) htb prio 0 rate 5Mbit ceil 5Mbit burst 15Kb cburst 1600b
+---(1:20) htb prio 0 rate 3Mbit ceil 6Mbit burst 15Kb cburst 1599b
+---(1:30) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
$ tc/tc -t -s class show dev tap0
+---(1:2) htb rate 6Mbit ceil 6Mbit burst 15Kb cburst 1599b
| | Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| | rate 0bit 0pps backlog 0b 0p requeues 0
| |
| +---(1:40) htb prio 0 rate 5Mbit ceil 5Mbit burst 15Kb cburst 1600b
| | Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| | rate 0bit 0pps backlog 0b 0p requeues 0
| |
| +---(1:50) htb rate 3Mbit ceil 6Mbit burst 15Kb cburst 1599b
| | | Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| | | rate 0bit 0pps backlog 0b 0p requeues 0
| | |
| | +---(1:51) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
| | Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| | rate 0bit 0pps backlog 0b 0p requeues 0
| |
| +---(1:60) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
| Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| rate 0bit 0pps backlog 0b 0p requeues 0
|
+---(1:1) htb rate 6Mbit ceil 6Mbit burst 15Kb cburst 1599b
| Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| rate 0bit 0pps backlog 0b 0p requeues 0
|
+---(1:10) htb prio 0 rate 5Mbit ceil 5Mbit burst 15Kb cburst 1600b
| Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| rate 0bit 0pps backlog 0b 0p requeues 0
|
+---(1:20) htb prio 0 rate 3Mbit ceil 6Mbit burst 15Kb cburst 1599b
| Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| rate 0bit 0pps backlog 0b 0p requeues 0
|
+---(1:30) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
rate 0bit 0pps backlog 0b 0p requeues 0
Signed-off-by: Vadim Kochan <vadim4j@gmail.com>
---
Changes RFC -> PATCH:
#1 get rid of INIT_HLIST_NODE
#2 added sample output to commit message
#3 use "show_tree=1" instead of "show_tree++"
#4 no need update include/hlist.h (because of #1)
#5 changed a little tree output: parentheses around class id instead of qdisc name
tc/tc.c | 5 +-
tc/tc_class.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
tc/tc_common.h | 2 +
3 files changed, 169 insertions(+), 3 deletions(-)
diff --git a/tc/tc.c b/tc/tc.c
index 9b50e74..30950a6 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -34,8 +34,9 @@ int show_stats = 0;
int show_details = 0;
int show_raw = 0;
int show_pretty = 0;
-int batch_mode = 0;
+int show_tree = 0;
+int batch_mode = 0;
int resolve_hosts = 0;
int use_iec = 0;
int force = 0;
@@ -278,6 +279,8 @@ int main(int argc, char **argv)
++show_raw;
} else if (matches(argv[1], "-pretty") == 0) {
++show_pretty;
+ } else if (matches(argv[1], "-tree") == 0) {
+ show_tree = 1;
} else if (matches(argv[1], "-Version") == 0) {
printf("tc utility, iproute2-ss%s\n", SNAPSHOT);
return 0;
diff --git a/tc/tc_class.c b/tc/tc_class.c
index e56bf07..a7b3ecd 100644
--- a/tc/tc_class.c
+++ b/tc/tc_class.c
@@ -24,6 +24,22 @@
#include "utils.h"
#include "tc_util.h"
#include "tc_common.h"
+#include "hlist.h"
+
+struct cls_node {
+ struct hlist_node hlist;
+ __u32 handle;
+ __u32 parent;
+ int level;
+ struct cls_node *cls_parent;
+ struct cls_node *cls_right;
+ struct rtattr *attr;
+ int attr_len;
+ int childs_count;
+};
+
+static struct hlist_head cls_list = {};
+static struct hlist_head root_cls_list = {};
static void usage(void);
@@ -148,13 +164,149 @@ int filter_ifindex;
__u32 filter_qdisc;
__u32 filter_classid;
+static void tree_cls_add(__u32 parent, __u32 handle, struct rtattr *attr, int len)
+{
+ struct cls_node *cls = malloc(sizeof(struct cls_node));
+
+ memset(cls, 0, sizeof(*cls));
+ cls->handle = handle;
+ cls->parent = parent;
+ cls->attr = malloc(len);
+ cls->attr_len = len;
+
+ memcpy(cls->attr, attr, len);
+
+ if (parent == TC_H_ROOT)
+ hlist_add_head(&cls->hlist, &root_cls_list);
+ else
+ hlist_add_head(&cls->hlist, &cls_list);
+}
+
+static void tree_cls_indent(char *buf, struct cls_node *cls, int is_newline,
+ int add_spaces)
+{
+ char spaces[100] = {0};
+
+ while (cls && cls->cls_parent) {
+ cls->cls_parent->cls_right = cls;
+ cls = cls->cls_parent;
+ }
+ while (cls && cls->cls_right)
+ {
+ if (cls->hlist.next)
+ strcat(buf, "| ");
+ else
+ strcat(buf, " ");
+
+ cls = cls->cls_right;
+ }
+
+ if (is_newline) {
+ if (cls->hlist.next && cls->childs_count)
+ strcat(buf, "| |");
+ else if (cls->hlist.next)
+ strcat(buf, "| ");
+ else if (cls->childs_count)
+ strcat(buf, " |");
+ else if (!cls->hlist.next)
+ strcat(buf, " ");
+ }
+ if (add_spaces > 0)
+ {
+ sprintf(spaces, "%-*s", add_spaces, "");
+ strcat(buf, spaces);
+ }
+}
+
+static void tree_cls_show(FILE *fp, char *buf, struct hlist_head *root_list, int level)
+{
+ struct hlist_node *n, *tmp_cls;
+ char cls_id_str[256] = {};
+ struct rtattr * tb[TCA_MAX+1] = {};
+ struct qdisc_util *q;
+ char str[100] = {};
+
+ hlist_for_each_safe(n, tmp_cls, root_list) {
+ struct hlist_node *c, *tmp_chld;
+ struct hlist_head childs = {};
+ struct cls_node *cls = container_of(n, struct cls_node, hlist);
+
+ hlist_for_each_safe(c, tmp_chld, &cls_list) {
+ struct cls_node *child = container_of(c, struct cls_node, hlist);
+
+ if (cls->handle == child->parent) {
+ hlist_del(c);
+ hlist_add_head(c, &childs);
+ cls->childs_count++;
+ child->cls_parent = cls;
+ }
+ }
+
+ tree_cls_indent(buf, cls, 0, 0);
+
+ print_tc_classid(cls_id_str, sizeof(cls_id_str), cls->handle);
+ sprintf(str, "+---(%s)", cls_id_str);
+ strcat(buf, str);
+
+ parse_rtattr(tb, TCA_MAX, cls->attr, cls->attr_len);
+
+ if (tb[TCA_KIND] == NULL) {
+ strcat(buf, " [unknown qdisc kind] ");
+ } else {
+ const char *kind = rta_getattr_str(tb[TCA_KIND]);
+
+ sprintf(str, " %s ", kind);
+ strcat(buf, str);
+ fprintf(fp, "%s", buf);
+ buf[0] = '\0';
+
+ q = get_qdisc_kind(kind);
+ if (q && q->print_copt) {
+ q->print_copt(q, fp, tb[TCA_OPTIONS]);
+ }
+ if (q && show_stats)
+ {
+ int cls_indent = strlen(q->id) - 2 +
+ strlen(cls_id_str);
+ struct rtattr *xstats = NULL;
+
+ tree_cls_indent(buf, cls, 1, cls_indent);
+
+ if (tb[TCA_STATS] || tb[TCA_STATS2]) {
+ fprintf(fp, "\n");
+ print_tcstats_attr(fp, tb, buf, &xstats);
+ buf[0] = '\0';
+ }
+ if (cls->hlist.next || cls->childs_count)
+ {
+ strcat(buf, "\n");
+ tree_cls_indent(buf, cls, 1, 0);
+ }
+ }
+ }
+ free(cls->attr);
+ fprintf(fp, "%s\n", buf);
+ buf[0] = '\0';
+
+ tree_cls_show(fp, buf, &childs, level + 1);
+ if (!cls->hlist.next) {
+ tree_cls_indent(buf, cls, 0, 0);
+ strcat(buf, "\n");
+ }
+
+ fprintf(fp, "%s", buf);
+ buf[0] = '\0';
+ free(cls);
+ }
+}
+
int print_class(const struct sockaddr_nl *who,
struct nlmsghdr *n, void *arg)
{
FILE *fp = (FILE*)arg;
struct tcmsg *t = NLMSG_DATA(n);
int len = n->nlmsg_len;
- struct rtattr * tb[TCA_MAX+1];
+ struct rtattr * tb[TCA_MAX+1] = {};
struct qdisc_util *q;
char abuf[256];
@@ -167,13 +319,18 @@ int print_class(const struct sockaddr_nl *who,
fprintf(stderr, "Wrong len %d\n", len);
return -1;
}
+
+ if (show_tree) {
+ tree_cls_add(t->tcm_parent, t->tcm_handle, TCA_RTA(t), len);
+ return 0;
+ }
+
if (filter_qdisc && TC_H_MAJ(t->tcm_handle^filter_qdisc))
return 0;
if (filter_classid && t->tcm_handle != filter_classid)
return 0;
- memset(tb, 0, sizeof(tb));
parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
if (tb[TCA_KIND] == NULL) {
@@ -236,6 +393,7 @@ static int tc_class_list(int argc, char **argv)
{
struct tcmsg t;
char d[16];
+ char buf[1024] = {0};
memset(&t, 0, sizeof(t));
t.tcm_family = AF_UNSPEC;
@@ -306,6 +464,9 @@ static int tc_class_list(int argc, char **argv)
return 1;
}
+ if (show_tree)
+ tree_cls_show(stdout, &buf[0], &root_cls_list, 0);
+
return 0;
}
diff --git a/tc/tc_common.h b/tc/tc_common.h
index 4f88856..0ee009b 100644
--- a/tc/tc_common.h
+++ b/tc/tc_common.h
@@ -19,3 +19,5 @@ extern int parse_estimator(int *p_argc, char ***p_argv, struct tc_estimator *est
struct tc_sizespec;
extern int parse_size_table(int *p_argc, char ***p_argv, struct tc_sizespec *s);
extern int check_size_table_opts(struct tc_sizespec *s);
+
+extern int show_tree;
--
2.1.3
^ permalink raw reply related
* Re: [PATCH net]tg3: tg3_disable_ints using uninitialized mailbox value to disable interrupts
From: Marcelo Ricardo Leitner @ 2014-12-22 14:06 UTC (permalink / raw)
To: Nils Holland, Prashant Sreedharan
Cc: davem, netdev, linux-pci, bhelgaas, rajatxjain, Michael Chan
In-Reply-To: <20141220221606.GA2591@teela.fritz.box>
On 20-12-2014 20:16, Nils Holland wrote:
> On Sat, Dec 20, 2014 at 12:16:17PM -0800, Prashant Sreedharan wrote:
>>
>> This driver bug was exposed because of the commit a7877b17a667 (PCI: Check only
>> the Vendor ID to identify Configuration Request Retry). Also this issue is only
>> seen in older generation chipsets like 5722 because config space write to offset
>> 0 from driver is possible.
>>
>> Fixed by initializing the interrupt mailbox registers before calling tg3_halt.
>>
>> Please queue for -stable.
>
> I gave this patch a try and can confirm what was to be expected: It
> fixes the issue and the network interface is once again working
> properly on my system. Thus, I guess the issue is adequately solved.
>
> Thanks to everyone involved, especially to Marcelo for additional
> debugging and the guys at Broadcom for the quick fix!
>
> Greetings,
> Nils
Same here, it works again.
Thanks!
Marcelo
^ permalink raw reply
* Regression with commit 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses")
From: Nicolas Dichtel @ 2014-12-22 15:17 UTC (permalink / raw)
To: netdev, Eric Dumazet
One of our engineer (Huaibin Wang <huaibin.wang@6wind.com>) has reported and
analysed this bug:
This commit introduces a regression with IPv6 + IPsec transport + TCP.
In TCP (net/ipv6/tcp_ipv6.c), xfrm6_policy_check() is called and thus, after
some intermediate functions, _decode_session6() is also called.
This function uses IP6CB() (u8 nexthdr = nh[IP6CB(skb)->nhoff]), which is wrong
becauses it has been moved to the end of TCP_SKB_CB().
Not sure what is the best way to fix this, any suggestion?
Regards,
Nicolas
^ permalink raw reply
* [PATCH net] net: ndo_gso_check() must force segmentation
From: Eric Dumazet @ 2014-12-22 15:21 UTC (permalink / raw)
To: Hayes Wang; +Cc: Tom Herbert, David Miller, netdev@vger.kernel.org, nic_swsd
In-Reply-To: <0835B3720019904CB8F7AA43166CEEB2ED5B01@RTITMBSV03.realtek.com.tw>
From: Eric Dumazet <edumazet@google.com>
If ndo_gso_check() returns true, it means a driver wants the stack
to perform software segmentation, even if device features initially
claimed hardware was able handle a TSO packet.
This means netif_needs_gso() needs to modify the features.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Hayes Wang <hayeswang@realtek.com>
Tested-by: Hayes Wang <hayeswang@realtek.com>
Fixes: 04ffcb255f22 ("net: Add ndo_gso_check")
---
drivers/net/macvtap.c | 2 +-
drivers/net/xen-netfront.c | 4 +++-
include/linux/netdevice.h | 18 ++++++++++++------
net/core/dev.c | 2 +-
4 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 7df221788cd4..0346bcfe72a5 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -314,7 +314,7 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb)
*/
if (q->flags & IFF_VNET_HDR)
features |= vlan->tap_features;
- if (netif_needs_gso(dev, skb, features)) {
+ if (netif_needs_gso(dev, skb, &features)) {
struct sk_buff *segs = __skb_gso_segment(skb, features, false);
if (IS_ERR(segs))
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 22bcb4e12e2a..9cacabaea175 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -578,6 +578,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
unsigned long flags;
struct netfront_queue *queue = NULL;
unsigned int num_queues = dev->real_num_tx_queues;
+ netdev_features_t features;
u16 queue_index;
/* Drop the packet if no queues are set up */
@@ -611,9 +612,10 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
spin_lock_irqsave(&queue->tx_lock, flags);
+ features = netif_skb_features(skb);
if (unlikely(!netif_carrier_ok(dev) ||
(slots > 1 && !xennet_can_sg(dev)) ||
- netif_needs_gso(dev, skb, netif_skb_features(skb)))) {
+ netif_needs_gso(dev, skb, &features))) {
spin_unlock_irqrestore(&queue->tx_lock, flags);
goto drop;
}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c31f74d76ebd..fb1f8c900df9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3608,13 +3608,19 @@ static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
}
static inline bool netif_needs_gso(struct net_device *dev, struct sk_buff *skb,
- netdev_features_t features)
+ netdev_features_t *features)
{
- return skb_is_gso(skb) && (!skb_gso_ok(skb, features) ||
- (dev->netdev_ops->ndo_gso_check &&
- !dev->netdev_ops->ndo_gso_check(skb, dev)) ||
- unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
- (skb->ip_summed != CHECKSUM_UNNECESSARY)));
+ if (!skb_is_gso(skb))
+ return false;
+ if (!skb_gso_ok(skb, *features))
+ return true;
+ if (dev->netdev_ops->ndo_gso_check &&
+ !dev->netdev_ops->ndo_gso_check(skb, dev)) {
+ *features &= ~NETIF_F_GSO_MASK;
+ return true;
+ }
+ return skb->ip_summed != CHECKSUM_PARTIAL &&
+ skb->ip_summed != CHECKSUM_UNNECESSARY;
}
static inline void netif_set_gso_max_size(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index f411c28d0a66..b61c26b45bb7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2668,7 +2668,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
if (skb->encapsulation)
features &= dev->hw_enc_features;
- if (netif_needs_gso(dev, skb, features)) {
+ if (netif_needs_gso(dev, skb, &features)) {
struct sk_buff *segs;
segs = skb_gso_segment(skb, features);
^ permalink raw reply related
* Re: e1000_netpoll(): disable_irq() triggers might_sleep() on linux-next
From: Bart Van Assche @ 2014-12-22 15:28 UTC (permalink / raw)
To: Sabrina Dubroca, Peter Zijlstra
Cc: Thomas Gleixner, netdev, linux-kernel, jeffrey.t.kirsher
In-Reply-To: <20141202163530.GA19420@kria>
On 12/02/14 17:35, Sabrina Dubroca wrote:
> Hello, sorry for the delay.
>
> 2014-10-29, 20:36:03 +0100, Peter Zijlstra wrote:
>> On Wed, Oct 29, 2014 at 07:33:00PM +0100, Thomas Gleixner wrote:
>>> Yuck. No. You are just papering over the problem.
>>>
>>> What happens if you add 'threadirqs' to the kernel command line? Or if
>>> the interrupt line is shared with a real threaded interrupt user?
>>>
>>> The proper solution is to have a poll_lock for e1000 which serializes
>>> the hardware interrupt against netpoll instead of using
>>> disable/enable_irq().
>>>
>>> In fact that's less expensive than the disable/enable_irq() dance and
>>> the chance of contention is pretty low. If done right it will be a
>>> NOOP for the CONFIG_NET_POLL_CONTROLLER=n case.
>>>
>>
>> OK a little something like so then I suppose.. But I suspect most all
>> the network drivers will need this and maybe more, disable_irq() is a
>> popular little thing and we 'just' changed semantics on them.
>>
>> ---
>> drivers/net/ethernet/intel/e1000/e1000.h | 2 ++
>> drivers/net/ethernet/intel/e1000/e1000_main.c | 22 +++++++++++++++++-----
>> kernel/irq/manage.c | 2 +-
>> 3 files changed, 20 insertions(+), 6 deletions(-)
>
> I've been running with variants of this patch, things seem ok.
>
> As noted earlier, there are a lot of drivers doing this disable_irq +
> irq_handler + enable_irq sequence. I found about 60.
> Many already take a lock in the interrupt handler, and look like we
> could just remove the call to disable_irq (example: cp_interrupt,
> drivers/net/ethernet/realtek/8139cp.c).
>
> Here's how I modified your patch. The locking compiles away if
> CONFIG_NET_POLL_CONTROLLER=n.
>
> I can work on converting all the drivers from disable_irq to
> netpoll_irq_lock, if that's okay with you.
>
> In igb there's also a synchronize_irq() called from the netpoll
> controller (in igb_irq_disable()), I think a similar locking scheme
> would work.
> I also saw a few disable_irq_nosync and disable_percpu_irq. These are
> okay?
>
> [ ... ]
Hello,
Earlier today I ran into the bug mentioned at the start of this thread
with kernel 3.19-rc1 and the e1000e driver. Can anyone tell me what the
latest status is ?
Thanks,
Bart.
^ permalink raw reply
* Re: [PATCH] bonding: avoid re-entry of bond_release
From: Andy Gospodarek @ 2014-12-22 15:45 UTC (permalink / raw)
To: Wengang; +Cc: Ding Tianhong, netdev
In-Reply-To: <5497D6B0.2040402@oracle.com>
On Mon, Dec 22, 2014 at 04:30:40PM +0800, Wengang wrote:
> OK. Will change as suggested and re-post.
Sounds great. Thanks for your work on this.
>
> thanks,
> wengang
>
> 于 2014年12月22日 10:05, Ding Tianhong 写道:
> >On 2014/12/22 9:09, Wengang wrote:
> >>Hi Andy and Ding,
> >>
> >>Thanks for your reviews!
> >>In the ioctl path, removing a interface that is not currently actually a slave
> >>can happen from user space(by mistake), we should avoid the noisy message.
> >>
> >>While, __bond_release_one() has another call path which is from bond_uninit().
> >>In the later case, it should be treated as an error if the interface is not with
> >>IFF_SLAVE flag. To notice that error occurred, the message is printed. I think
> >>the message is needed for this path.
> >>
> >>How do you think?
> >>
> >Just like the bond_enslave(), it is only a warning.
> >
> >Ding
> >
> >>thanks,
> >>wengang
> >>
> >>于 2014年12月21日 10:01, Ding Tianhong 写道:
> >>>On 2014/12/19 23:11, Andy Gospodarek wrote:
> >>>>On Fri, Dec 19, 2014 at 04:56:57PM +0800, Wengang Wang wrote:
> >>>>>If bond_release is run against an interface which is already detached from
> >>>>>it's master, then there is an error message shown like
> >>>>> "<master name> cannot release <slave name>".
> >>>>>
> >>>>>The call path is:
> >>>>> bond_do_ioctl()
> >>>>> bond_release()
> >>>>> __bond_release_one()
> >>>>>
> >>>>>Though it does not really harm, the message the message is misleading.
> >>>>>This patch tries to avoid the message.
> >>>>>
> >>>>>Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> >>>>>---
> >>>>> drivers/net/bonding/bond_main.c | 5 ++++-
> >>>>> 1 file changed, 4 insertions(+), 1 deletion(-)
> >>>>>
> >>>>>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> >>>>>index 184c434..4a71bbd 100644
> >>>>>--- a/drivers/net/bonding/bond_main.c
> >>>>>+++ b/drivers/net/bonding/bond_main.c
> >>>>>@@ -3256,7 +3256,10 @@ static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd
> >>>>> break;
> >>>>> case BOND_RELEASE_OLD:
> >>>>> case SIOCBONDRELEASE:
> >>>>>- res = bond_release(bond_dev, slave_dev);
> >>>>>+ if (slave_dev->flags & IFF_SLAVE)
> >>>>>+ res = bond_release(bond_dev, slave_dev);
> >>>>>+ else
> >>>>>+ res = 0;
> >>>>Functionally this patch is fine, but I would prefer that you simply
> >>>>change the check in __bond_release_one to not be so noisy. There is a
> >>>>check[1] in bond_enslave to see if a slave is already in a bond and that
> >>>>just prints a message of netdev_dbg (rather than netdev_err) and it
> >>>>seems that would be appropriate for this type of message.
> >>>>
> >>>>[1] from bond_enslave():
> >>>>
> >>>> /* already enslaved */
> >>>> if (slave_dev->flags & IFF_SLAVE) {
> >>>> netdev_dbg(bond_dev, "Error: Device was already enslaved\n");
> >>>> return -EBUSY;
> >>>> }
> >>>>
> >>>>
> >>>>> break;
> >>>>> case BOND_SETHWADDR_OLD:
> >>>>> case SIOCBONDSETHWADDR:
> >>>>>--
> >>>agree ,use netdev_dbg looks more better and enough.
> >>>
> >>>Ding
> >>>
> >>>
> >>
>
^ permalink raw reply
* Re: [PATCH net] net: ndo_gso_check() must force segmentation
From: Jesse Gross @ 2014-12-22 15:59 UTC (permalink / raw)
To: Eric Dumazet
Cc: Hayes Wang, Tom Herbert, David Miller, netdev@vger.kernel.org,
nic_swsd
In-Reply-To: <1419261697.11185.15.camel@edumazet-glaptop2.roam.corp.google.com>
On Mon, Dec 22, 2014 at 10:21 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> If ndo_gso_check() returns true, it means a driver wants the stack
> to perform software segmentation, even if device features initially
> claimed hardware was able handle a TSO packet.
>
> This means netif_needs_gso() needs to modify the features.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: Hayes Wang <hayeswang@realtek.com>
> Tested-by: Hayes Wang <hayeswang@realtek.com>
> Fixes: 04ffcb255f22 ("net: Add ndo_gso_check")
There's actually another problem with ndo_gso_check() - it doesn't
deal with other features like checksum offloading which usually have
similar constraints. As a result, the GSO code generates segments with
checksums offloading that immediately fails.
I have a patch that I was working on last night that behaves similarly
to yours but is also generalized to handle both cases. Let me send it
out now.
^ permalink raw reply
* [PATCH iproute2 v2] tc: Show classes in tree view
From: Vadim Kochan @ 2014-12-22 15:51 UTC (permalink / raw)
To: netdev; +Cc: Vadim Kochan
From: Vadim Kochan <vadim4j@gmail.com>
Added new '-t[ree]' which shows classes dependency
in the tree view. Meanwhile only generic stats info
is supported.
e.g.:
$ tc/tc -t class show dev tap0
+---(1:2) htb rate 6Mbit ceil 6Mbit burst 15Kb cburst 1599b
| +---(1:40) htb prio 0 rate 5Mbit ceil 5Mbit burst 15Kb cburst 1600b
| +---(1:50) htb rate 3Mbit ceil 6Mbit burst 15Kb cburst 1599b
| | +---(1:51) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
| |
| +---(1:60) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
|
+---(1:1) htb rate 6Mbit ceil 6Mbit burst 15Kb cburst 1599b
+---(1:10) htb prio 0 rate 5Mbit ceil 5Mbit burst 15Kb cburst 1600b
+---(1:20) htb prio 0 rate 3Mbit ceil 6Mbit burst 15Kb cburst 1599b
+---(1:30) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
$ tc/tc -t -s class show dev tap0
+---(1:2) htb rate 6Mbit ceil 6Mbit burst 15Kb cburst 1599b
| | Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| | rate 0bit 0pps backlog 0b 0p requeues 0
| |
| +---(1:40) htb prio 0 rate 5Mbit ceil 5Mbit burst 15Kb cburst 1600b
| | Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| | rate 0bit 0pps backlog 0b 0p requeues 0
| |
| +---(1:50) htb rate 3Mbit ceil 6Mbit burst 15Kb cburst 1599b
| | | Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| | | rate 0bit 0pps backlog 0b 0p requeues 0
| | |
| | +---(1:51) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
| | Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| | rate 0bit 0pps backlog 0b 0p requeues 0
| |
| +---(1:60) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
| Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| rate 0bit 0pps backlog 0b 0p requeues 0
|
+---(1:1) htb rate 6Mbit ceil 6Mbit burst 15Kb cburst 1599b
| Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| rate 0bit 0pps backlog 0b 0p requeues 0
|
+---(1:10) htb prio 0 rate 5Mbit ceil 5Mbit burst 15Kb cburst 1600b
| Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| rate 0bit 0pps backlog 0b 0p requeues 0
|
+---(1:20) htb prio 0 rate 3Mbit ceil 6Mbit burst 15Kb cburst 1599b
| Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
| rate 0bit 0pps backlog 0b 0p requeues 0
|
+---(1:30) htb prio 0 rate 1Kbit ceil 6Mbit burst 15Kb cburst 1599b
Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
rate 0bit 0pps backlog 0b 0p requeues 0
Signed-off-by: Vadim Kochan <vadim4j@gmail.com>
---
Changes v2:
Removed "Date:" from commit message which was added by mistake.
Changes RFC -> PATCH:
#1 get rid of INIT_HLIST_NODE
#2 added sample output to commit message
#3 use "show_tree=1" instead of "show_tree++"
#4 no need update include/hlist.h (because of #1)
#5 changed a little tree output: parentheses around class id instead of qdisc name
tc/tc.c | 5 +-
tc/tc_class.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
tc/tc_common.h | 2 +
3 files changed, 169 insertions(+), 3 deletions(-)
diff --git a/tc/tc.c b/tc/tc.c
index 9b50e74..30950a6 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -34,8 +34,9 @@ int show_stats = 0;
int show_details = 0;
int show_raw = 0;
int show_pretty = 0;
-int batch_mode = 0;
+int show_tree = 0;
+int batch_mode = 0;
int resolve_hosts = 0;
int use_iec = 0;
int force = 0;
@@ -278,6 +279,8 @@ int main(int argc, char **argv)
++show_raw;
} else if (matches(argv[1], "-pretty") == 0) {
++show_pretty;
+ } else if (matches(argv[1], "-tree") == 0) {
+ show_tree = 1;
} else if (matches(argv[1], "-Version") == 0) {
printf("tc utility, iproute2-ss%s\n", SNAPSHOT);
return 0;
diff --git a/tc/tc_class.c b/tc/tc_class.c
index e56bf07..a7b3ecd 100644
--- a/tc/tc_class.c
+++ b/tc/tc_class.c
@@ -24,6 +24,22 @@
#include "utils.h"
#include "tc_util.h"
#include "tc_common.h"
+#include "hlist.h"
+
+struct cls_node {
+ struct hlist_node hlist;
+ __u32 handle;
+ __u32 parent;
+ int level;
+ struct cls_node *cls_parent;
+ struct cls_node *cls_right;
+ struct rtattr *attr;
+ int attr_len;
+ int childs_count;
+};
+
+static struct hlist_head cls_list = {};
+static struct hlist_head root_cls_list = {};
static void usage(void);
@@ -148,13 +164,149 @@ int filter_ifindex;
__u32 filter_qdisc;
__u32 filter_classid;
+static void tree_cls_add(__u32 parent, __u32 handle, struct rtattr *attr, int len)
+{
+ struct cls_node *cls = malloc(sizeof(struct cls_node));
+
+ memset(cls, 0, sizeof(*cls));
+ cls->handle = handle;
+ cls->parent = parent;
+ cls->attr = malloc(len);
+ cls->attr_len = len;
+
+ memcpy(cls->attr, attr, len);
+
+ if (parent == TC_H_ROOT)
+ hlist_add_head(&cls->hlist, &root_cls_list);
+ else
+ hlist_add_head(&cls->hlist, &cls_list);
+}
+
+static void tree_cls_indent(char *buf, struct cls_node *cls, int is_newline,
+ int add_spaces)
+{
+ char spaces[100] = {0};
+
+ while (cls && cls->cls_parent) {
+ cls->cls_parent->cls_right = cls;
+ cls = cls->cls_parent;
+ }
+ while (cls && cls->cls_right)
+ {
+ if (cls->hlist.next)
+ strcat(buf, "| ");
+ else
+ strcat(buf, " ");
+
+ cls = cls->cls_right;
+ }
+
+ if (is_newline) {
+ if (cls->hlist.next && cls->childs_count)
+ strcat(buf, "| |");
+ else if (cls->hlist.next)
+ strcat(buf, "| ");
+ else if (cls->childs_count)
+ strcat(buf, " |");
+ else if (!cls->hlist.next)
+ strcat(buf, " ");
+ }
+ if (add_spaces > 0)
+ {
+ sprintf(spaces, "%-*s", add_spaces, "");
+ strcat(buf, spaces);
+ }
+}
+
+static void tree_cls_show(FILE *fp, char *buf, struct hlist_head *root_list, int level)
+{
+ struct hlist_node *n, *tmp_cls;
+ char cls_id_str[256] = {};
+ struct rtattr * tb[TCA_MAX+1] = {};
+ struct qdisc_util *q;
+ char str[100] = {};
+
+ hlist_for_each_safe(n, tmp_cls, root_list) {
+ struct hlist_node *c, *tmp_chld;
+ struct hlist_head childs = {};
+ struct cls_node *cls = container_of(n, struct cls_node, hlist);
+
+ hlist_for_each_safe(c, tmp_chld, &cls_list) {
+ struct cls_node *child = container_of(c, struct cls_node, hlist);
+
+ if (cls->handle == child->parent) {
+ hlist_del(c);
+ hlist_add_head(c, &childs);
+ cls->childs_count++;
+ child->cls_parent = cls;
+ }
+ }
+
+ tree_cls_indent(buf, cls, 0, 0);
+
+ print_tc_classid(cls_id_str, sizeof(cls_id_str), cls->handle);
+ sprintf(str, "+---(%s)", cls_id_str);
+ strcat(buf, str);
+
+ parse_rtattr(tb, TCA_MAX, cls->attr, cls->attr_len);
+
+ if (tb[TCA_KIND] == NULL) {
+ strcat(buf, " [unknown qdisc kind] ");
+ } else {
+ const char *kind = rta_getattr_str(tb[TCA_KIND]);
+
+ sprintf(str, " %s ", kind);
+ strcat(buf, str);
+ fprintf(fp, "%s", buf);
+ buf[0] = '\0';
+
+ q = get_qdisc_kind(kind);
+ if (q && q->print_copt) {
+ q->print_copt(q, fp, tb[TCA_OPTIONS]);
+ }
+ if (q && show_stats)
+ {
+ int cls_indent = strlen(q->id) - 2 +
+ strlen(cls_id_str);
+ struct rtattr *xstats = NULL;
+
+ tree_cls_indent(buf, cls, 1, cls_indent);
+
+ if (tb[TCA_STATS] || tb[TCA_STATS2]) {
+ fprintf(fp, "\n");
+ print_tcstats_attr(fp, tb, buf, &xstats);
+ buf[0] = '\0';
+ }
+ if (cls->hlist.next || cls->childs_count)
+ {
+ strcat(buf, "\n");
+ tree_cls_indent(buf, cls, 1, 0);
+ }
+ }
+ }
+ free(cls->attr);
+ fprintf(fp, "%s\n", buf);
+ buf[0] = '\0';
+
+ tree_cls_show(fp, buf, &childs, level + 1);
+ if (!cls->hlist.next) {
+ tree_cls_indent(buf, cls, 0, 0);
+ strcat(buf, "\n");
+ }
+
+ fprintf(fp, "%s", buf);
+ buf[0] = '\0';
+ free(cls);
+ }
+}
+
int print_class(const struct sockaddr_nl *who,
struct nlmsghdr *n, void *arg)
{
FILE *fp = (FILE*)arg;
struct tcmsg *t = NLMSG_DATA(n);
int len = n->nlmsg_len;
- struct rtattr * tb[TCA_MAX+1];
+ struct rtattr * tb[TCA_MAX+1] = {};
struct qdisc_util *q;
char abuf[256];
@@ -167,13 +319,18 @@ int print_class(const struct sockaddr_nl *who,
fprintf(stderr, "Wrong len %d\n", len);
return -1;
}
+
+ if (show_tree) {
+ tree_cls_add(t->tcm_parent, t->tcm_handle, TCA_RTA(t), len);
+ return 0;
+ }
+
if (filter_qdisc && TC_H_MAJ(t->tcm_handle^filter_qdisc))
return 0;
if (filter_classid && t->tcm_handle != filter_classid)
return 0;
- memset(tb, 0, sizeof(tb));
parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
if (tb[TCA_KIND] == NULL) {
@@ -236,6 +393,7 @@ static int tc_class_list(int argc, char **argv)
{
struct tcmsg t;
char d[16];
+ char buf[1024] = {0};
memset(&t, 0, sizeof(t));
t.tcm_family = AF_UNSPEC;
@@ -306,6 +464,9 @@ static int tc_class_list(int argc, char **argv)
return 1;
}
+ if (show_tree)
+ tree_cls_show(stdout, &buf[0], &root_cls_list, 0);
+
return 0;
}
diff --git a/tc/tc_common.h b/tc/tc_common.h
index 4f88856..0ee009b 100644
--- a/tc/tc_common.h
+++ b/tc/tc_common.h
@@ -19,3 +19,5 @@ extern int parse_estimator(int *p_argc, char ***p_argv, struct tc_estimator *est
struct tc_sizespec;
extern int parse_size_table(int *p_argc, char ***p_argv, struct tc_sizespec *s);
extern int check_size_table_opts(struct tc_sizespec *s);
+
+extern int show_tree;
--
2.1.3
^ permalink raw reply related
* [PATCH net] net: Generalize ndo_gso_check to ndo_features_check
From: Jesse Gross @ 2014-12-22 16:03 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Tom Herbert, Joe Stringer, Eric Dumazet
GSO isn't the only offload feature with restrictions that
potentially can't be expressed with the current features mechanism.
Checksum is another although it's a general issue that could in
theory apply to anything. Even if it may be possible to
implement these restrictions in other ways, it can result in
duplicate code or inefficient per-packet behavior.
This generalizes ndo_gso_check so that drivers can remove any
features that don't make sense for a given packet, similar to
netif_skb_features(). It also converts existing driver
restrictions to the new format, completing the work that was
done to support tunnel protocols since the issues apply to
checksums as well.
CC: Tom Herbert <therbert@google.com>
CC: Joe Stringer <joestringer@nicira.com>
CC: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Fixes: 04ffcb255f22 ("net: Add ndo_gso_check")
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 8 ++++---
drivers/net/ethernet/emulex/benet/be_main.c | 8 ++++---
drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 10 +++++----
drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 8 ++++---
include/linux/netdevice.h | 20 +++++++++--------
include/net/vxlan.h | 28 ++++++++++++++++++++----
net/core/dev.c | 28 ++++++++++++++++--------
7 files changed, 75 insertions(+), 35 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 9f5e387..72eef9f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -12553,9 +12553,11 @@ static int bnx2x_get_phys_port_id(struct net_device *netdev,
return 0;
}
-static bool bnx2x_gso_check(struct sk_buff *skb, struct net_device *dev)
+static netdev_features_t bnx2x_features_check(struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
{
- return vxlan_gso_check(skb);
+ return vxlan_features_check(skb, features);
}
static const struct net_device_ops bnx2x_netdev_ops = {
@@ -12589,7 +12591,7 @@ static const struct net_device_ops bnx2x_netdev_ops = {
#endif
.ndo_get_phys_port_id = bnx2x_get_phys_port_id,
.ndo_set_vf_link_state = bnx2x_set_vf_link_state,
- .ndo_gso_check = bnx2x_gso_check,
+ .ndo_features_check = bnx2x_features_check,
};
static int bnx2x_set_coherency_mask(struct bnx2x *bp)
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 1960731..41a0a54 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -4459,9 +4459,11 @@ done:
adapter->vxlan_port_count--;
}
-static bool be_gso_check(struct sk_buff *skb, struct net_device *dev)
+static netdev_features_t be_features_check(struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
{
- return vxlan_gso_check(skb);
+ return vxlan_features_check(skb, features);
}
#endif
@@ -4492,7 +4494,7 @@ static const struct net_device_ops be_netdev_ops = {
#ifdef CONFIG_BE2NET_VXLAN
.ndo_add_vxlan_port = be_add_vxlan_port,
.ndo_del_vxlan_port = be_del_vxlan_port,
- .ndo_gso_check = be_gso_check,
+ .ndo_features_check = be_features_check,
#endif
};
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 190cbd9..d0d6dc1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2365,9 +2365,11 @@ static void mlx4_en_del_vxlan_port(struct net_device *dev,
queue_work(priv->mdev->workqueue, &priv->vxlan_del_task);
}
-static bool mlx4_en_gso_check(struct sk_buff *skb, struct net_device *dev)
+static netdev_features_t mlx4_en_features_check(struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
{
- return vxlan_gso_check(skb);
+ return vxlan_features_check(skb, features);
}
#endif
@@ -2400,7 +2402,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
#ifdef CONFIG_MLX4_EN_VXLAN
.ndo_add_vxlan_port = mlx4_en_add_vxlan_port,
.ndo_del_vxlan_port = mlx4_en_del_vxlan_port,
- .ndo_gso_check = mlx4_en_gso_check,
+ .ndo_features_check = mlx4_en_features_check,
#endif
};
@@ -2434,7 +2436,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
#ifdef CONFIG_MLX4_EN_VXLAN
.ndo_add_vxlan_port = mlx4_en_add_vxlan_port,
.ndo_del_vxlan_port = mlx4_en_del_vxlan_port,
- .ndo_gso_check = mlx4_en_gso_check,
+ .ndo_features_check = mlx4_en_features_check,
#endif
};
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index 1aa25b1..9929b97 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -505,9 +505,11 @@ static void qlcnic_del_vxlan_port(struct net_device *netdev,
adapter->flags |= QLCNIC_DEL_VXLAN_PORT;
}
-static bool qlcnic_gso_check(struct sk_buff *skb, struct net_device *dev)
+static netdev_features_t qlcnic_features_check(struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
{
- return vxlan_gso_check(skb);
+ return vxlan_features_check(skb, features);
}
#endif
@@ -532,7 +534,7 @@ static const struct net_device_ops qlcnic_netdev_ops = {
#ifdef CONFIG_QLCNIC_VXLAN
.ndo_add_vxlan_port = qlcnic_add_vxlan_port,
.ndo_del_vxlan_port = qlcnic_del_vxlan_port,
- .ndo_gso_check = qlcnic_gso_check,
+ .ndo_features_check = qlcnic_features_check,
#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = qlcnic_poll_controller,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c31f74d..679e6e9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1012,12 +1012,15 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
* Callback to use for xmit over the accelerated station. This
* is used in place of ndo_start_xmit on accelerated net
* devices.
- * bool (*ndo_gso_check) (struct sk_buff *skb,
- * struct net_device *dev);
+ * netdev_features_t (*ndo_features_check) (struct sk_buff *skb,
+ * struct net_device *dev
+ * netdev_features_t features);
* Called by core transmit path to determine if device is capable of
- * performing GSO on a packet. The device returns true if it is
- * able to GSO the packet, false otherwise. If the return value is
- * false the stack will do software GSO.
+ * performing offload operations on a given packet. This is to give
+ * the device an opportunity to implement any restrictions that cannot
+ * be otherwise expressed by feature flags. The check is called with
+ * the set of features that the stack has calculated and it returns
+ * those the driver believes to be appropriate.
*
* int (*ndo_switch_parent_id_get)(struct net_device *dev,
* struct netdev_phys_item_id *psid);
@@ -1178,8 +1181,9 @@ struct net_device_ops {
struct net_device *dev,
void *priv);
int (*ndo_get_lock_subclass)(struct net_device *dev);
- bool (*ndo_gso_check) (struct sk_buff *skb,
- struct net_device *dev);
+ netdev_features_t (*ndo_features_check) (struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features);
#ifdef CONFIG_NET_SWITCHDEV
int (*ndo_switch_parent_id_get)(struct net_device *dev,
struct netdev_phys_item_id *psid);
@@ -3611,8 +3615,6 @@ static inline bool netif_needs_gso(struct net_device *dev, struct sk_buff *skb,
netdev_features_t features)
{
return skb_is_gso(skb) && (!skb_gso_ok(skb, features) ||
- (dev->netdev_ops->ndo_gso_check &&
- !dev->netdev_ops->ndo_gso_check(skb, dev)) ||
unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
(skb->ip_summed != CHECKSUM_UNNECESSARY)));
}
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 57cccd0..903461a 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -1,6 +1,9 @@
#ifndef __NET_VXLAN_H
#define __NET_VXLAN_H 1
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/udp.h>
@@ -51,16 +54,33 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
__be16 src_port, __be16 dst_port, __be32 vni, bool xnet);
-static inline bool vxlan_gso_check(struct sk_buff *skb)
+static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
+ netdev_features_t features)
{
- if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
+ u8 l4_hdr = 0;
+
+ if (!skb->encapsulation)
+ return features;
+
+ switch (vlan_get_protocol(skb)) {
+ case htons(ETH_P_IP):
+ l4_hdr = ip_hdr(skb)->protocol;
+ break;
+ case htons(ETH_P_IPV6):
+ l4_hdr = ipv6_hdr(skb)->nexthdr;
+ break;
+ default:
+ return features;;
+ }
+
+ if ((l4_hdr == IPPROTO_UDP) &&
(skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
skb->inner_protocol != htons(ETH_P_TEB) ||
(skb_inner_mac_header(skb) - skb_transport_header(skb) !=
sizeof(struct udphdr) + sizeof(struct vxlanhdr))))
- return false;
+ return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK);
- return true;
+ return features;
}
/* IP header + UDP + VXLAN + Ethernet header */
diff --git a/net/core/dev.c b/net/core/dev.c
index f411c28..fc13f72 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2562,7 +2562,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
- const struct net_device *dev = skb->dev;
+ struct net_device *dev = skb->dev;
netdev_features_t features = dev->features;
u16 gso_segs = skb_shinfo(skb)->gso_segs;
__be16 protocol = skb->protocol;
@@ -2570,11 +2570,19 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
features &= ~NETIF_F_GSO_MASK;
+ /* If encapsulation offload request, verify we are testing
+ * hardware encapsulation features instead of standard
+ * features for the netdev
+ */
+ if (skb->encapsulation)
+ features = netdev_intersect_features(features,
+ dev->hw_enc_features);
+
if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
protocol = veh->h_vlan_encapsulated_proto;
} else if (!vlan_tx_tag_present(skb)) {
- return harmonize_features(skb, features);
+ goto finalize;
}
features = netdev_intersect_features(features,
@@ -2591,6 +2599,15 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
+finalize:
+ if (dev->netdev_ops->ndo_features_check) {
+ netdev_features_t dev_features;
+
+ dev_features = dev->netdev_ops->ndo_features_check(skb, dev,
+ features);
+ features = netdev_intersect_features(features, dev_features);
+ }
+
return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);
@@ -2661,13 +2678,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
if (unlikely(!skb))
goto out_null;
- /* If encapsulation offload request, verify we are testing
- * hardware encapsulation features instead of standard
- * features for the netdev
- */
- if (skb->encapsulation)
- features &= dev->hw_enc_features;
-
if (netif_needs_gso(dev, skb, features)) {
struct sk_buff *segs;
--
1.9.1
^ permalink raw reply related
* Re: Regression with commit 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses")
From: Eric Dumazet @ 2014-12-22 16:15 UTC (permalink / raw)
To: nicolas.dichtel; +Cc: netdev, Eric Dumazet
In-Reply-To: <54983612.8000600@6wind.com>
On Mon, 2014-12-22 at 16:17 +0100, Nicolas Dichtel wrote:
> One of our engineer (Huaibin Wang <huaibin.wang@6wind.com>) has reported and
> analysed this bug:
>
> This commit introduces a regression with IPv6 + IPsec transport + TCP.
>
> In TCP (net/ipv6/tcp_ipv6.c), xfrm6_policy_check() is called and thus, after
> some intermediate functions, _decode_session6() is also called.
>
> This function uses IP6CB() (u8 nexthdr = nh[IP6CB(skb)->nhoff]), which is wrong
> becauses it has been moved to the end of TCP_SKB_CB().
>
> Not sure what is the best way to fix this, any suggestion?
Thanks for the report
Presumably tcp_v6_rcv() needs to be reordered so that
xfrm6_policy_check() calls are done before the CB swap.
swap should probably be done right before bh_lock_sock_nested()
I am currently traveling and I am not sure if I can get Internet access
to post a patch soon.
^ permalink raw reply
* Re: Announce: follow #netdev01 on tweeter
From: Jesper Dangaard Brouer @ 2014-12-22 16:17 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: brouer, netdev@vger.kernel.org, Richard Guy Briggs
In-Reply-To: <54970E46.1070902@mojatatu.com>
On Sun, 21 Dec 2014 13:15:34 -0500
Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> Please help us advertise netdev01.
> For folks with twitter accounts, please follow #netdev01
> and retweet the announcements when they come in.
Link to the account:
https://twitter.com/netdev01/
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Sr. Network Kernel Developer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* Re: virtio_net: Fix napi poll list corruption
From: Marcelo Ricardo Leitner @ 2014-12-22 16:19 UTC (permalink / raw)
To: Herbert Xu, David Vrabel
Cc: netdev, xen-devel, konrad.wilk, boris.ostrovsky, edumazet,
David S. Miller
In-Reply-To: <20141220002327.GA31975@gondor.apana.org.au>
On 19-12-2014 22:23, Herbert Xu wrote:
> David Vrabel <david.vrabel@citrix.com> wrote:
>> After d75b1ade567ffab085e8adbbdacf0092d10cd09c (net: less interrupt
>> masking in NAPI) the napi instance is removed from the per-cpu list
>> prior to calling the n->poll(), and is only requeued if all of the
>> budget was used. This inadvertently broke netfront because netfront
>> does not use NAPI correctly.
>
> A similar bug exists in virtio_net.
>
> -- >8 --
> The commit d75b1ade567ffab085e8adbbdacf0092d10cd09c (net: less
> interrupt masking in NAPI) breaks virtio_net in an insidious way.
>
> It is now required that if the entire budget is consumed when poll
> returns, the napi poll_list must remain empty. However, like some
> other drivers virtio_net tries to do a last-ditch check and if
> there is more work it will call napi_schedule and then immediately
> process some of this new work. Should the entire budget be consumed
> while processing such new work then we will violate the new caller
> contract.
>
> This patch fixes this by not touching any work when we reschedule
> in virtio_net.
>
> The worst part of this bug is that the list corruption causes other
> napi users to be moved off-list. In my case I was chasing a stall
> in IPsec (IPsec uses netif_rx) and I only belatedly realised that it
> was virtio_net which caused the stall even though the virtio_net
> poll was still functioning perfectly after IPsec stalled.
Thanks for finding/fixing this, Herbert. I was debugging this one too. In my
case, vxlan interface was getting stuck.
Marcelo
^ permalink raw reply
* Re: Regression with commit 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses")
From: Nicolas Dichtel @ 2014-12-22 16:54 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, Eric Dumazet
In-Reply-To: <1419264939.11185.18.camel@edumazet-glaptop2.roam.corp.google.com>
Le 22/12/2014 17:15, Eric Dumazet a écrit :
> On Mon, 2014-12-22 at 16:17 +0100, Nicolas Dichtel wrote:
>> One of our engineer (Huaibin Wang <huaibin.wang@6wind.com>) has reported and
>> analysed this bug:
>>
>> This commit introduces a regression with IPv6 + IPsec transport + TCP.
>>
>> In TCP (net/ipv6/tcp_ipv6.c), xfrm6_policy_check() is called and thus, after
>> some intermediate functions, _decode_session6() is also called.
>>
>> This function uses IP6CB() (u8 nexthdr = nh[IP6CB(skb)->nhoff]), which is wrong
>> becauses it has been moved to the end of TCP_SKB_CB().
>>
>> Not sure what is the best way to fix this, any suggestion?
>
> Thanks for the report
>
> Presumably tcp_v6_rcv() needs to be reordered so that
> xfrm6_policy_check() calls are done before the CB swap.
>
> swap should probably be done right before bh_lock_sock_nested()
>
> I am currently traveling and I am not sure if I can get Internet access
> to post a patch soon.
Ok, thank you for the tip. I will try to cook a patch.
^ permalink raw reply
* [PATCH net] tcp6: don't move IP6CB before xfrm6_policy_check()
From: Nicolas Dichtel @ 2014-12-22 17:22 UTC (permalink / raw)
To: davem, eric.dumazet; +Cc: netdev, Nicolas Dichtel
In-Reply-To: <1419264939.11185.18.camel@edumazet-glaptop2.roam.corp.google.com>
When xfrm6_policy_check() is used, _decode_session6() is called after some
intermediate functions. This function uses IP6CB(), thus TCP_SKB_CB() must be
prepared after the call of xfrm6_policy_check().
Before this patch, scenarii with IPv6 + TCP + IPsec Transport are broken.
Fixes: 971f10eca186 ("tcp: better TCP_SKB_CB layout to reduce cache line misses")
Reported-by: Huaibin Wang <huaibin.wang@6wind.com>
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
net/ipv6/tcp_ipv6.c | 45 +++++++++++++++++++++++++++++----------------
1 file changed, 29 insertions(+), 16 deletions(-)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5ff87805258e..9c0b54e87b47 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1387,6 +1387,28 @@ ipv6_pktoptions:
return 0;
}
+static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
+ const struct tcphdr *th)
+{
+ /* This is tricky: we move IP6CB at its correct location into
+ * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because
+ * _decode_session6() uses IP6CB().
+ * barrier() makes sure compiler won't play aliasing games.
+ */
+ memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb),
+ sizeof(struct inet6_skb_parm));
+ barrier();
+
+ TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ skb->len - th->doff*4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+ TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
+ TCP_SKB_CB(skb)->tcp_tw_isn = 0;
+ TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
+ TCP_SKB_CB(skb)->sacked = 0;
+}
+
static int tcp_v6_rcv(struct sk_buff *skb)
{
const struct tcphdr *th;
@@ -1418,24 +1440,9 @@ static int tcp_v6_rcv(struct sk_buff *skb)
th = tcp_hdr(skb);
hdr = ipv6_hdr(skb);
- /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
- * barrier() makes sure compiler wont play fool^Waliasing games.
- */
- memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb),
- sizeof(struct inet6_skb_parm));
- barrier();
-
- TCP_SKB_CB(skb)->seq = ntohl(th->seq);
- TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
- skb->len - th->doff*4);
- TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
- TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
- TCP_SKB_CB(skb)->tcp_tw_isn = 0;
- TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
- TCP_SKB_CB(skb)->sacked = 0;
sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest,
- tcp_v6_iif(skb));
+ inet6_iif(skb));
if (!sk)
goto no_tcp_socket;
@@ -1451,6 +1458,8 @@ process:
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
+ tcp_v6_fill_cb(skb, hdr, th);
+
#ifdef CONFIG_TCP_MD5SIG
if (tcp_v6_inbound_md5_hash(sk, skb))
goto discard_and_relse;
@@ -1482,6 +1491,8 @@ no_tcp_socket:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
+ tcp_v6_fill_cb(skb, hdr, th);
+
if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
csum_error:
TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
@@ -1505,6 +1516,8 @@ do_time_wait:
goto discard_it;
}
+ tcp_v6_fill_cb(skb, hdr, th);
+
if (skb->len < (th->doff<<2)) {
inet_twsk_put(inet_twsk(sk));
goto bad_packet;
--
2.1.0
^ permalink raw reply related
* [PATCH for 3.19] rtlwifi: Fix error when accessing unmapped memory in skb
From: Larry Finger @ 2014-12-22 17:37 UTC (permalink / raw)
To: kvalo; +Cc: linux-wireless, Larry Finger, netdev, Eric Biggers, Stable
Under heavy memory pressure, it is possible for the allocation of a
new skb to fail. When this happens, the kernel gets a memory access
violation. Previous versions of the drivers would drop the read request;
however, this logic was missed in the 3.18 update. This patch restores
the previous behavior.
Reported-by: Eric Biggers <ebiggers3@gmail.com>
Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Stable <stable@vger.kernel.org> [3.18]
---
drivers/net/wireless/rtlwifi/pci.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/drivers/net/wireless/rtlwifi/pci.c b/drivers/net/wireless/rtlwifi/pci.c
index 846a2e6..55334ca 100644
--- a/drivers/net/wireless/rtlwifi/pci.c
+++ b/drivers/net/wireless/rtlwifi/pci.c
@@ -912,13 +912,15 @@ static void _rtl_pci_rx_interrupt(struct ieee80211_hw *hw)
}
end:
if (rtlpriv->use_new_trx_flow) {
- _rtl_pci_init_one_rxdesc(hw, (u8 *)buffer_desc,
- rxring_idx,
- rtlpci->rx_ring[rxring_idx].idx);
+ if (!_rtl_pci_init_one_rxdesc(hw, (u8 *)buffer_desc,
+ rxring_idx,
+ rtlpci->rx_ring[rxring_idx].idx))
+ return;
} else {
- _rtl_pci_init_one_rxdesc(hw, (u8 *)pdesc, rxring_idx,
- rtlpci->rx_ring[rxring_idx].idx);
-
+ if (!_rtl_pci_init_one_rxdesc(hw, (u8 *)pdesc,
+ rxring_idx,
+ rtlpci->rx_ring[rxring_idx].idx))
+ return;
if (rtlpci->rx_ring[rxring_idx].idx ==
rtlpci->rxringcount - 1)
rtlpriv->cfg->ops->set_desc(hw, (u8 *)pdesc,
--
2.1.2
^ permalink raw reply related
* [RFC PATCH 00/17] fib_trie: Reduce time spent in fib_table_lookup by 35 to 75%
From: Alexander Duyck @ 2014-12-22 17:40 UTC (permalink / raw)
To: netdev
These patches are meant to address several performance issues I have seen
in the fib_trie implementation, and fib_table_lookup specifically. With
these changes in place I have seen a reduction of up to 35 to 75% for the
total time spent in fib_table_lookup depending on the type of search being
performed.
On a VM running in my Corei7-4930K system with a trie of maximum depth of 7
this resulted in a reduction of over 370ns per packet in the total time to
process packets received from an ixgbe interface and route them to a dummy
interface. This represents a failed lookup in the local trie followed by
a successful search in the main trie.
Baseline Refactor
ixgbe->dummy routing 1.20Mpps 2.21Mpps
------------------------------------------------------------
processing time per packet 835ns 453ns
fib_table_lookup 50.1% 418ns 25.0% 113ns
check_leaf.isra.9 7.9% 66ns -- --
ixgbe_clean_rx_irq 5.3% 44ns 9.8% 44ns
ip_route_input_noref 2.9% 25ns 4.6% 21ns
pvclock_clocksource_read 2.6% 21ns 4.6% 21ns
ip_rcv 2.6% 22ns 4.0% 18ns
In the simple case of receiving a frame and dropping it before it can reach
the socket layer I saw a reduction of 40ns per packet. This represents a
trip through the local trie with the correct leaf found with no need for
any backtracing.
Baseline Refactor
ixgbe->local receive 2.65Mpps 2.96Mpps
------------------------------------------------------------
processing time per packet 377ns 337ns
fib_table_lookup 25.1% 95ns 25.8% 87ns
ixgbe_clean_rx_irq 8.7% 33ns 9.0% 30ns
check_leaf.isra.9 7.2% 27ns -- --
ip_rcv 5.7% 21ns 6.5% 22ns
These changes have resulted in several functions being inlined such as
check_leaf and fib_find_node, but due to the code simplification the
overall size of the code has been reduced.
text data bss dec hex filename
16932 376 16 17324 43ac net/ipv4/fib_trie.o - before
15259 376 8 15643 3d1b net/ipv4/fib_trie.o - after
---
Alexander Duyck (17):
fib_trie: Update usage stats to be percpu instead of global variables
fib_trie: Make leaf and tnode more uniform
fib_trie: Merge tnode_free and leaf_free into node_free
fib_trie: Merge leaf into tnode
fib_trie: Optimize fib_table_lookup to avoid wasting time on loops/variables
fib_trie: Optimize fib_find_node
fib_trie: Optimize fib_table_insert
fib_trie: Update meaning of pos to represent unchecked bits
fib_trie: Use unsigned long for anything dealing with a shift by bits
fib_trie: Push rcu_read_lock/unlock to callers
fib_trie: Move resize to after inflate/halve
fib_trie: Add functions should_inflate and should_halve
fib_trie: Push assignment of child to parent down into inflate/halve
fib_trie: Push tnode flushing down to inflate/halve
fib_trie: inflate/halve nodes in a more RCU friendly way
fib_trie: Remove checks for index >= tnode_child_length from tnode_get_child
fib_trie: Add tracking value for suffix length
include/net/ip_fib.h | 50 +
net/ipv4/fib_frontend.c | 29 -
net/ipv4/fib_rules.c | 22 -
net/ipv4/fib_trie.c | 1915 ++++++++++++++++++++++-------------------------
4 files changed, 945 insertions(+), 1071 deletions(-)
--
^ permalink raw reply
* [RFC PATCH 01/17] fib_trie: Update usage stats to be percpu instead of global variables
From: Alexander Duyck @ 2014-12-22 17:40 UTC (permalink / raw)
To: netdev
In-Reply-To: <20141222172632.1119.51469.stgit@ahduyck-vm-fedora20>
The trie usage stats were currently being shared by all threads that were
calling fib_table_lookup. As a result when multiple threads were
performing lookups simultaneously the trie would begin to cache bounce
between those threads.
In order to prevent this I have updated the usage stats to use a set of
percpu variables. By doing this we should be able to avoid the cache
bouncing and still make use of these stats.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
net/ipv4/fib_frontend.c | 2 +
net/ipv4/fib_trie.c | 68 +++++++++++++++++++++++++++++++++--------------
2 files changed, 49 insertions(+), 21 deletions(-)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 23104a3..6689020 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -67,7 +67,7 @@ static int __net_init fib4_rules_init(struct net *net)
return 0;
fail:
- kfree(local_table);
+ fib_free_table(local_table);
return -ENOMEM;
}
#else
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 18bcaf2..861ef9b 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -153,7 +153,7 @@ struct trie_stat {
struct trie {
struct rt_trie_node __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- struct trie_use_stats stats;
+ struct trie_use_stats __percpu *stats;
#endif
};
@@ -631,7 +631,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
if (IS_ERR(tn)) {
tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.resize_node_skipped++;
+ this_cpu_inc(t->stats->resize_node_skipped);
#endif
break;
}
@@ -658,7 +658,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
if (IS_ERR(tn)) {
tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.resize_node_skipped++;
+ this_cpu_inc(t->stats->resize_node_skipped);
#endif
break;
}
@@ -1357,7 +1357,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
err = fib_props[fa->fa_type].error;
if (err) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_passed++;
+ this_cpu_inc(t->stats->semantic_match_passed);
#endif
return err;
}
@@ -1372,7 +1372,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
continue;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_passed++;
+ this_cpu_inc(t->stats->semantic_match_passed);
#endif
res->prefixlen = li->plen;
res->nh_sel = nhsel;
@@ -1388,7 +1388,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
}
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_miss++;
+ this_cpu_ptr(t->stats->semantic_match_miss);
#endif
}
@@ -1399,6 +1399,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
struct trie *t = (struct trie *) tb->tb_data;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats = t->stats;
+#endif
int ret;
struct rt_trie_node *n;
struct tnode *pn;
@@ -1417,7 +1420,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
goto failed;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.gets++;
+ this_cpu_inc(stats->gets);
#endif
/* Just a leaf? */
@@ -1441,7 +1444,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.null_node_hit++;
+ this_cpu_inc(stats->null_node_hit);
#endif
goto backtrace;
}
@@ -1576,7 +1579,7 @@ backtrace:
chopped_off = 0;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.backtrack++;
+ this_cpu_inc(stats->backtrack);
#endif
goto backtrace;
}
@@ -1830,6 +1833,11 @@ int fib_table_flush(struct fib_table *tb)
void fib_free_table(struct fib_table *tb)
{
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie *t = (struct trie *)tb->tb_data;
+
+ free_percpu(t->stats);
+#endif /* CONFIG_IP_FIB_TRIE_STATS */
kfree(tb);
}
@@ -1973,7 +1981,14 @@ struct fib_table *fib_trie_table(u32 id)
tb->tb_num_default = 0;
t = (struct trie *) tb->tb_data;
- memset(t, 0, sizeof(*t));
+ RCU_INIT_POINTER(t->trie, NULL);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats = alloc_percpu(struct trie_use_stats);
+ if (!t->stats) {
+ kfree(tb);
+ tb = NULL;
+ }
+#endif
return tb;
}
@@ -2139,18 +2154,31 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
#ifdef CONFIG_IP_FIB_TRIE_STATS
static void trie_show_usage(struct seq_file *seq,
- const struct trie_use_stats *stats)
+ const struct trie_use_stats __percpu *stats)
{
+ struct trie_use_stats s = { 0 };
+ int cpu;
+
+ /* loop through all of the CPUs and gather up the stats */
+ for_each_possible_cpu(cpu) {
+ const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
+
+ s.gets += pcpu->gets;
+ s.backtrack += pcpu->backtrack;
+ s.semantic_match_passed += pcpu->semantic_match_passed;
+ s.semantic_match_miss += pcpu->semantic_match_miss;
+ s.null_node_hit += pcpu->null_node_hit;
+ s.resize_node_skipped += pcpu->resize_node_skipped;
+ }
+
seq_printf(seq, "\nCounters:\n---------\n");
- seq_printf(seq, "gets = %u\n", stats->gets);
- seq_printf(seq, "backtracks = %u\n", stats->backtrack);
+ seq_printf(seq, "gets = %u\n", s.gets);
+ seq_printf(seq, "backtracks = %u\n", s.backtrack);
seq_printf(seq, "semantic match passed = %u\n",
- stats->semantic_match_passed);
- seq_printf(seq, "semantic match miss = %u\n",
- stats->semantic_match_miss);
- seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
- seq_printf(seq, "skipped node resize = %u\n\n",
- stats->resize_node_skipped);
+ s.semantic_match_passed);
+ seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
+ seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
+ seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
}
#endif /* CONFIG_IP_FIB_TRIE_STATS */
@@ -2191,7 +2219,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
trie_collect_stats(t, &stat);
trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
- trie_show_usage(seq, &t->stats);
+ trie_show_usage(seq, t->stats);
#endif
}
}
^ permalink raw reply related
* [RFC PATCH 02/17] fib_trie: Make leaf and tnode more uniform
From: Alexander Duyck @ 2014-12-22 17:41 UTC (permalink / raw)
To: netdev
In-Reply-To: <20141222172632.1119.51469.stgit@ahduyck-vm-fedora20>
This change makes some fundamental changes to the way leaves and tnodes are
constructed. The big differences are:
1. Leaves now populate pos and bits indicating their full key size.
2. Trie nodes now mask out their lower bits to be consistent with the leaf
3. Both structures have been reordered so that rt_trie_node now consisists
of a much larger region including the pos, bits, and rcu portions of
the tnode structure.
On 32b systems this will result in the leaf being 4B larger as the pos and
bits values were added to a hole created by the key as it was only 4B in
length.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
net/ipv4/fib_trie.c | 192 ++++++++++++++++++++++-----------------------------
1 file changed, 82 insertions(+), 110 deletions(-)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 861ef9b..934bfb0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -87,24 +87,38 @@
typedef unsigned int t_key;
-#define T_TNODE 0
-#define T_LEAF 1
-#define NODE_TYPE_MASK 0x1UL
-#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+#define IS_TNODE(n) ((n)->bits)
+#define IS_LEAF(n) (!(n)->bits)
-#define IS_TNODE(n) (!(n->parent & T_LEAF))
-#define IS_LEAF(n) (n->parent & T_LEAF)
+struct tnode {
+ t_key key;
+ unsigned char bits; /* 2log(KEYLENGTH) bits needed */
+ unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ struct tnode __rcu *parent;
+ union {
+ struct rcu_head rcu;
+ struct tnode *tnode_free;
+ };
+ unsigned int full_children; /* KEYLENGTH bits needed */
+ unsigned int empty_children; /* KEYLENGTH bits needed */
+ struct rt_trie_node __rcu *child[0];
+};
struct rt_trie_node {
- unsigned long parent;
t_key key;
+ unsigned char bits;
+ unsigned char pos;
+ struct tnode __rcu *parent;
+ struct rcu_head rcu;
};
struct leaf {
- unsigned long parent;
t_key key;
- struct hlist_head list;
+ unsigned char bits;
+ unsigned char pos;
+ struct tnode __rcu *parent;
struct rcu_head rcu;
+ struct hlist_head list;
};
struct leaf_info {
@@ -115,20 +129,6 @@ struct leaf_info {
struct rcu_head rcu;
};
-struct tnode {
- unsigned long parent;
- t_key key;
- unsigned char pos; /* 2log(KEYLENGTH) bits needed */
- unsigned char bits; /* 2log(KEYLENGTH) bits needed */
- unsigned int full_children; /* KEYLENGTH bits needed */
- unsigned int empty_children; /* KEYLENGTH bits needed */
- union {
- struct rcu_head rcu;
- struct tnode *tnode_free;
- };
- struct rt_trie_node __rcu *child[0];
-};
-
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
unsigned int gets;
@@ -176,38 +176,27 @@ static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
-/*
- * caller must hold RTNL
- */
-static inline struct tnode *node_parent(const struct rt_trie_node *node)
-{
- unsigned long parent;
+/* caller must hold RTNL */
+#define node_parent(n) rtnl_dereference((n)->parent)
- parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
+/* caller must hold RCU read lock or RTNL */
+#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
- return (struct tnode *)(parent & ~NODE_TYPE_MASK);
-}
-
-/*
- * caller must hold RCU read lock or RTNL
- */
-static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
+/* wrapper for rcu_assign_pointer */
+static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
- unsigned long parent;
-
- parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
- lockdep_rtnl_is_held());
-
- return (struct tnode *)(parent & ~NODE_TYPE_MASK);
+ if (node)
+ rcu_assign_pointer(node->parent, ptr);
}
-/* Same as rcu_assign_pointer
- * but that macro() assumes that value is a pointer.
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
+
+/* This provides us with the number of children in this node, in the case of a
+ * leaf this will return 0 meaning none of the children are accessible.
*/
-static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
+static inline int tnode_child_length(const struct tnode *tn)
{
- smp_wmb();
- node->parent = (unsigned long)ptr | NODE_TYPE(node);
+ return (1ul << tn->bits) & ~(1ul);
}
/*
@@ -215,7 +204,7 @@ static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
*/
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
{
- BUG_ON(i >= 1U << tn->bits);
+ BUG_ON(i >= tnode_child_length(tn));
return rtnl_dereference(tn->child[i]);
}
@@ -225,16 +214,11 @@ static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsig
*/
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
- BUG_ON(i >= 1U << tn->bits);
+ BUG_ON(i >= tnode_child_length(tn));
return rcu_dereference_rtnl(tn->child[i]);
}
-static inline int tnode_child_length(const struct tnode *tn)
-{
- return 1 << tn->bits;
-}
-
static inline t_key mask_pfx(t_key k, unsigned int l)
{
return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
@@ -336,11 +320,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b)
*/
-static inline void check_tnode(const struct tnode *tn)
-{
- WARN_ON(tn && tn->pos+tn->bits > 32);
-}
-
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
@@ -426,11 +405,20 @@ static void tnode_free_flush(void)
}
}
-static struct leaf *leaf_new(void)
+static struct leaf *leaf_new(t_key key)
{
struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (l) {
- l->parent = T_LEAF;
+ l->parent = NULL;
+ /* set key and pos to reflect full key value
+ * any trailing zeros in the key should be ignored
+ * as the nodes are searched
+ */
+ l->key = key;
+ l->pos = KEYLENGTH;
+ /* set bits to 0 indicating we are not a tnode */
+ l->bits = 0;
+
INIT_HLIST_HEAD(&l->list);
}
return l;
@@ -451,12 +439,16 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
{
size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
struct tnode *tn = tnode_alloc(sz);
+ unsigned int shift = pos + bits;
+
+ /* verify bits and pos their msb bits clear and values are valid */
+ BUG_ON(!bits || (shift > KEYLENGTH));
if (tn) {
- tn->parent = T_TNODE;
+ tn->parent = NULL;
tn->pos = pos;
tn->bits = bits;
- tn->key = key;
+ tn->key = mask_pfx(key, pos);
tn->full_children = 0;
tn->empty_children = 1<<bits;
}
@@ -473,10 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
{
- if (n == NULL || IS_LEAF(n))
- return 0;
-
- return ((struct tnode *) n)->pos == tn->pos + tn->bits;
+ return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
}
static inline void put_child(struct tnode *tn, int i,
@@ -514,8 +503,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *
else if (!wasfull && isfull)
tn->full_children++;
- if (n)
- node_set_parent(n, tn);
+ node_set_parent(n, tn);
rcu_assign_pointer(tn->child[i], n);
}
@@ -523,7 +511,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *
#define MAX_WORK 10
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
- int i;
+ struct rt_trie_node *n = NULL;
struct tnode *old_tn;
int inflate_threshold_use;
int halve_threshold_use;
@@ -536,12 +524,11 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
tn, inflate_threshold, halve_threshold);
/* No children */
- if (tn->empty_children == tnode_child_length(tn)) {
- tnode_free_safe(tn);
- return NULL;
- }
+ if (tn->empty_children > (tnode_child_length(tn) - 1))
+ goto no_children;
+
/* One child */
- if (tn->empty_children == tnode_child_length(tn) - 1)
+ if (tn->empty_children == (tnode_child_length(tn) - 1))
goto one_child;
/*
* Double as long as the resulting node has a number of
@@ -607,11 +594,9 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
*
*/
- check_tnode(tn);
-
/* Keep root node larger */
- if (!node_parent((struct rt_trie_node *)tn)) {
+ if (!node_parent(tn)) {
inflate_threshold_use = inflate_threshold_root;
halve_threshold_use = halve_threshold_root;
} else {
@@ -637,8 +622,6 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
}
}
- check_tnode(tn);
-
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
return (struct rt_trie_node *) tn;
@@ -666,21 +649,16 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
/* Only one child remains */
- if (tn->empty_children == tnode_child_length(tn) - 1) {
+ if (tn->empty_children == (tnode_child_length(tn) - 1)) {
+ unsigned long i;
one_child:
- for (i = 0; i < tnode_child_length(tn); i++) {
- struct rt_trie_node *n;
-
- n = rtnl_dereference(tn->child[i]);
- if (!n)
- continue;
-
- /* compress one level */
-
- node_set_parent(n, NULL);
- tnode_free_safe(tn);
- return n;
- }
+ for (i = tnode_child_length(tn); !n && i;)
+ n = tnode_get_child(tn, --i);
+no_children:
+ /* compress one level */
+ node_set_parent(n, NULL);
+ tnode_free_safe(tn);
+ return n;
}
return (struct rt_trie_node *) tn;
}
@@ -760,8 +738,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
/* A leaf or an internal node with skipped bits */
- if (IS_LEAF(node) || ((struct tnode *) node)->pos >
- tn->pos + tn->bits - 1) {
+ if (IS_LEAF(node) || (node->pos > (tn->pos + tn->bits - 1))) {
put_child(tn,
tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
node);
@@ -958,11 +935,9 @@ fib_find_node(struct trie *t, u32 key)
pos = 0;
n = rcu_dereference_rtnl(t->trie);
- while (n != NULL && NODE_TYPE(n) == T_TNODE) {
+ while (n && IS_TNODE(n)) {
tn = (struct tnode *) n;
- check_tnode(tn);
-
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
pos = tn->pos + tn->bits;
n = tnode_get_child_rcu(tn,
@@ -988,7 +963,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
key = tn->key;
- while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
+ while (tn != NULL && (tp = node_parent(tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = (struct tnode *)resize(t, tn);
@@ -996,7 +971,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
tnode_put_child_reorg(tp, cindex,
(struct rt_trie_node *)tn, wasfull);
- tp = node_parent((struct rt_trie_node *) tn);
+ tp = node_parent(tn);
if (!tp)
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
@@ -1048,11 +1023,9 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
* If it doesn't, we need to replace it with a T_TNODE.
*/
- while (n != NULL && NODE_TYPE(n) == T_TNODE) {
+ while (n && IS_TNODE(n)) {
tn = (struct tnode *) n;
- check_tnode(tn);
-
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
tp = tn;
pos = tn->pos + tn->bits;
@@ -1087,12 +1060,11 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
insert_leaf_info(&l->list, li);
goto done;
}
- l = leaf_new();
+ l = leaf_new(key);
if (!l)
return NULL;
- l->key = key;
li = leaf_info_new(plen);
if (!li) {
@@ -1569,7 +1541,7 @@ backtrace:
if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
} else {
- struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
+ struct tnode *parent = node_parent_rcu(pn);
if (!parent)
goto failed;
@@ -1597,7 +1569,7 @@ EXPORT_SYMBOL_GPL(fib_table_lookup);
*/
static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
- struct tnode *tp = node_parent((struct rt_trie_node *) l);
+ struct tnode *tp = node_parent(l);
pr_debug("entering trie_leaf_remove(%p)\n", l);
@@ -2374,7 +2346,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
if (IS_TNODE(n)) {
struct tnode *tn = (struct tnode *) n;
- __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
+ __be32 prf = htonl(tn->key);
seq_indent(seq, iter->depth-1);
seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
^ permalink raw reply related
* [RFC PATCH 03/17] fib_trie: Merge tnode_free and leaf_free into node_free
From: Alexander Duyck @ 2014-12-22 17:41 UTC (permalink / raw)
To: netdev
In-Reply-To: <20141222172632.1119.51469.stgit@ahduyck-vm-fedora20>
Both the leaf and the tnode had an rcu_head in them, but they had them in
slightly different places. Since we now have them in the same spot and
know that any node with bits == 0 is a leaf and the rest are either vmalloc
or kmalloc tnodes depending on the value of bits it makes it easy to combine
the functions and reduce overhead.
In addition I have taken advantage of the rcu_head pointer to go ahead and
put together a simple linked list instead of using the tnode pointer as
this way we can merge either type of structure for freeing.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
net/ipv4/fib_trie.c | 90 +++++++++++++++++++++++----------------------------
1 file changed, 40 insertions(+), 50 deletions(-)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 934bfb0..bedb6c9 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -95,15 +95,17 @@ struct tnode {
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
struct tnode __rcu *parent;
- union {
- struct rcu_head rcu;
- struct tnode *tnode_free;
- };
+ struct rcu_head rcu;
+ /* everything above this comment must be the same as rt_trie_node */
unsigned int full_children; /* KEYLENGTH bits needed */
unsigned int empty_children; /* KEYLENGTH bits needed */
struct rt_trie_node __rcu *child[0];
};
+/* This struct represents the shared bits between tnode and leaf. If any
+ * ordering is changed here is must also be updated in tnode and leaf as
+ * well.
+ */
struct rt_trie_node {
t_key key;
unsigned char bits;
@@ -118,6 +120,7 @@ struct leaf {
unsigned char pos;
struct tnode __rcu *parent;
struct rcu_head rcu;
+ /* everything above this comment must be the same as rt_trie_node */
struct hlist_head list;
};
@@ -163,7 +166,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
-static struct tnode *tnode_free_head;
+static struct callback_head *tnode_free_head;
static size_t tnode_free_size;
/*
@@ -336,17 +339,23 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
call_rcu(&fa->rcu, __alias_free_mem);
}
-static void __leaf_free_rcu(struct rcu_head *head)
-{
- struct leaf *l = container_of(head, struct leaf, rcu);
- kmem_cache_free(trie_leaf_kmem, l);
-}
+#define TNODE_KMALLOC_MAX \
+ ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct rt_trie_node *))
-static inline void free_leaf(struct leaf *l)
+static void __node_free_rcu(struct rcu_head *head)
{
- call_rcu(&l->rcu, __leaf_free_rcu);
+ struct rt_trie_node *n = container_of(head, struct rt_trie_node, rcu);
+
+ if (IS_LEAF(n))
+ kmem_cache_free(trie_leaf_kmem, n);
+ else if (n->bits <= TNODE_KMALLOC_MAX)
+ kfree(n);
+ else
+ vfree(n);
}
+#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
+
static inline void free_leaf_info(struct leaf_info *leaf)
{
kfree_rcu(leaf, rcu);
@@ -360,43 +369,24 @@ static struct tnode *tnode_alloc(size_t size)
return vzalloc(size);
}
-static void __tnode_free_rcu(struct rcu_head *head)
-{
- struct tnode *tn = container_of(head, struct tnode, rcu);
- size_t size = sizeof(struct tnode) +
- (sizeof(struct rt_trie_node *) << tn->bits);
-
- if (size <= PAGE_SIZE)
- kfree(tn);
- else
- vfree(tn);
-}
-
-static inline void tnode_free(struct tnode *tn)
-{
- if (IS_LEAF(tn))
- free_leaf((struct leaf *) tn);
- else
- call_rcu(&tn->rcu, __tnode_free_rcu);
-}
-
static void tnode_free_safe(struct tnode *tn)
{
BUG_ON(IS_LEAF(tn));
- tn->tnode_free = tnode_free_head;
- tnode_free_head = tn;
- tnode_free_size += sizeof(struct tnode) +
- (sizeof(struct rt_trie_node *) << tn->bits);
+ tn->rcu.next = tnode_free_head;
+ tnode_free_head = &tn->rcu;
}
static void tnode_free_flush(void)
{
- struct tnode *tn;
+ struct callback_head *head;
+
+ while ((head = tnode_free_head)) {
+ struct tnode *tn = container_of(head, struct tnode, rcu);
+
+ tnode_free_head = head->next;
+ tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
- while ((tn = tnode_free_head)) {
- tnode_free_head = tn->tnode_free;
- tn->tnode_free = NULL;
- tnode_free(tn);
+ node_free(tn);
}
if (tnode_free_size >= PAGE_SIZE * sync_pages) {
@@ -437,7 +427,7 @@ static struct leaf_info *leaf_info_new(int plen)
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
- size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
+ size_t sz = offsetof(struct tnode, child[1 << bits]);
struct tnode *tn = tnode_alloc(sz);
unsigned int shift = pos + bits;
@@ -666,15 +656,15 @@ no_children:
static void tnode_clean_free(struct tnode *tn)
{
+ struct rt_trie_node *tofree;
int i;
- struct tnode *tofree;
for (i = 0; i < tnode_child_length(tn); i++) {
- tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
+ tofree = rtnl_dereference(tn->child[i]);
if (tofree)
- tnode_free(tofree);
+ node_free(tofree);
}
- tnode_free(tn);
+ node_free(tn);
}
static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -717,7 +707,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
inode->bits - 1);
if (!right) {
- tnode_free(left);
+ node_free(left);
goto nomem;
}
@@ -1068,7 +1058,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
li = leaf_info_new(plen);
if (!li) {
- free_leaf(l);
+ node_free(l);
return NULL;
}
@@ -1100,7 +1090,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
if (!tn) {
free_leaf_info(li);
- free_leaf(l);
+ node_free(l);
return NULL;
}
@@ -1580,7 +1570,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
} else
RCU_INIT_POINTER(t->trie, NULL);
- free_leaf(l);
+ node_free(l);
}
/*
^ permalink raw reply related
* [RFC PATCH 04/17] fib_trie: Merge leaf into tnode
From: Alexander Duyck @ 2014-12-22 17:41 UTC (permalink / raw)
To: netdev
In-Reply-To: <20141222172632.1119.51469.stgit@ahduyck-vm-fedora20>
This change makes it so that leaf and tnode are the same struct. As a
result there is no need for rt_trie_node anymore since everyting can be
merged into tnode.
On 32b systems this results in the leaf being 4 bytes larger, however I
don't know if that is really an issue as this and an eariler patch that
added bits & pos have increased the size from 20 to 28. If I am not
mistaken slub/slab allocate on power of 2 sizes so 20 was likely being
rounded up to 32 anyway.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
net/ipv4/fib_trie.c | 322 ++++++++++++++++++++++-----------------------------
1 file changed, 140 insertions(+), 182 deletions(-)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index bedb6c9..1c7bf43 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -96,32 +96,16 @@ struct tnode {
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
struct tnode __rcu *parent;
struct rcu_head rcu;
- /* everything above this comment must be the same as rt_trie_node */
- unsigned int full_children; /* KEYLENGTH bits needed */
- unsigned int empty_children; /* KEYLENGTH bits needed */
- struct rt_trie_node __rcu *child[0];
-};
-
-/* This struct represents the shared bits between tnode and leaf. If any
- * ordering is changed here is must also be updated in tnode and leaf as
- * well.
- */
-struct rt_trie_node {
- t_key key;
- unsigned char bits;
- unsigned char pos;
- struct tnode __rcu *parent;
- struct rcu_head rcu;
-};
-
-struct leaf {
- t_key key;
- unsigned char bits;
- unsigned char pos;
- struct tnode __rcu *parent;
- struct rcu_head rcu;
- /* everything above this comment must be the same as rt_trie_node */
- struct hlist_head list;
+ union {
+ /* The fields in this struct are valid if bits > 0 (TNODE) */
+ struct {
+ unsigned int full_children; /* KEYLENGTH bits needed */
+ unsigned int empty_children; /* KEYLENGTH bits needed */
+ struct tnode __rcu *child[0];
+ };
+ /* This list pointer if valid if bits == 0 (LEAF) */
+ struct hlist_head list;
+ };
};
struct leaf_info {
@@ -154,15 +138,15 @@ struct trie_stat {
};
struct trie {
- struct rt_trie_node __rcu *trie;
+ struct tnode __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats;
#endif
};
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
int wasfull);
-static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
+static struct tnode *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
@@ -186,10 +170,10 @@ static struct kmem_cache *trie_leaf_kmem __read_mostly;
#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
/* wrapper for rcu_assign_pointer */
-static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
+static inline void node_set_parent(struct tnode *n, struct tnode *tp)
{
- if (node)
- rcu_assign_pointer(node->parent, ptr);
+ if (n)
+ rcu_assign_pointer(n->parent, tp);
}
#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
@@ -205,7 +189,7 @@ static inline int tnode_child_length(const struct tnode *tn)
/*
* caller must hold RTNL
*/
-static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
+static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
{
BUG_ON(i >= tnode_child_length(tn));
@@ -215,7 +199,7 @@ static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsig
/*
* caller must hold RCU read lock or RTNL
*/
-static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
+static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
BUG_ON(i >= tnode_child_length(tn));
@@ -340,11 +324,11 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
}
#define TNODE_KMALLOC_MAX \
- ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct rt_trie_node *))
+ ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
static void __node_free_rcu(struct rcu_head *head)
{
- struct rt_trie_node *n = container_of(head, struct rt_trie_node, rcu);
+ struct tnode *n = container_of(head, struct tnode, rcu);
if (IS_LEAF(n))
kmem_cache_free(trie_leaf_kmem, n);
@@ -395,9 +379,9 @@ static void tnode_free_flush(void)
}
}
-static struct leaf *leaf_new(t_key key)
+static struct tnode *leaf_new(t_key key)
{
- struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+ struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (l) {
l->parent = NULL;
/* set key and pos to reflect full key value
@@ -444,7 +428,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
}
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
- sizeof(struct rt_trie_node *) << bits);
+ sizeof(struct tnode *) << bits);
return tn;
}
@@ -453,13 +437,13 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
+static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
{
return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
}
static inline void put_child(struct tnode *tn, int i,
- struct rt_trie_node *n)
+ struct tnode *n)
{
tnode_put_child_reorg(tn, i, n, -1);
}
@@ -469,10 +453,10 @@ static inline void put_child(struct tnode *tn, int i,
* Update the value of full_children and empty_children.
*/
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
int wasfull)
{
- struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
+ struct tnode *chi = rtnl_dereference(tn->child[i]);
int isfull;
BUG_ON(i >= 1<<tn->bits);
@@ -499,10 +483,9 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *
}
#define MAX_WORK 10
-static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
+static struct tnode *resize(struct trie *t, struct tnode *tn)
{
- struct rt_trie_node *n = NULL;
- struct tnode *old_tn;
+ struct tnode *old_tn, *n = NULL;
int inflate_threshold_use;
int halve_threshold_use;
int max_work;
@@ -614,7 +597,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
- return (struct rt_trie_node *) tn;
+ return tn;
/*
* Halve as long as the number of empty children in this
@@ -650,13 +633,13 @@ no_children:
tnode_free_safe(tn);
return n;
}
- return (struct rt_trie_node *) tn;
+ return tn;
}
static void tnode_clean_free(struct tnode *tn)
{
- struct rt_trie_node *tofree;
+ struct tnode *tofree;
int i;
for (i = 0; i < tnode_child_length(tn); i++) {
@@ -667,10 +650,10 @@ static void tnode_clean_free(struct tnode *tn)
node_free(tn);
}
-static struct tnode *inflate(struct trie *t, struct tnode *tn)
+static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
{
- struct tnode *oldtnode = tn;
- int olen = tnode_child_length(tn);
+ int olen = tnode_child_length(oldtnode);
+ struct tnode *tn;
int i;
pr_debug("In inflate\n");
@@ -690,11 +673,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
for (i = 0; i < olen; i++) {
struct tnode *inode;
- inode = (struct tnode *) tnode_get_child(oldtnode, i);
- if (inode &&
- IS_TNODE(inode) &&
- inode->pos == oldtnode->pos + oldtnode->bits &&
- inode->bits > 1) {
+ inode = tnode_get_child(oldtnode, i);
+ if (tnode_full(oldtnode, inode) && inode->bits > 1) {
struct tnode *left, *right;
t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
@@ -711,33 +691,29 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
goto nomem;
}
- put_child(tn, 2*i, (struct rt_trie_node *) left);
- put_child(tn, 2*i+1, (struct rt_trie_node *) right);
+ put_child(tn, 2*i, left);
+ put_child(tn, 2*i+1, right);
}
}
for (i = 0; i < olen; i++) {
- struct tnode *inode;
- struct rt_trie_node *node = tnode_get_child(oldtnode, i);
+ struct tnode *inode = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
int size, j;
/* An empty child */
- if (node == NULL)
+ if (inode == NULL)
continue;
/* A leaf or an internal node with skipped bits */
-
- if (IS_LEAF(node) || (node->pos > (tn->pos + tn->bits - 1))) {
+ if (!tnode_full(oldtnode, inode)) {
put_child(tn,
- tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
- node);
+ tkey_extract_bits(inode->key, tn->pos, tn->bits),
+ inode);
continue;
}
/* An internal node with two children */
- inode = (struct tnode *) node;
-
if (inode->bits == 1) {
put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
@@ -769,12 +745,12 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
* bit to zero.
*/
- left = (struct tnode *) tnode_get_child(tn, 2*i);
+ left = tnode_get_child(tn, 2*i);
put_child(tn, 2*i, NULL);
BUG_ON(!left);
- right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+ right = tnode_get_child(tn, 2*i+1);
put_child(tn, 2*i+1, NULL);
BUG_ON(!right);
@@ -796,12 +772,11 @@ nomem:
return ERR_PTR(-ENOMEM);
}
-static struct tnode *halve(struct trie *t, struct tnode *tn)
+static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
{
- struct tnode *oldtnode = tn;
- struct rt_trie_node *left, *right;
+ int olen = tnode_child_length(oldtnode);
+ struct tnode *tn, *left, *right;
int i;
- int olen = tnode_child_length(tn);
pr_debug("In halve\n");
@@ -830,7 +805,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (!newn)
goto nomem;
- put_child(tn, i/2, (struct rt_trie_node *)newn);
+ put_child(tn, i/2, newn);
}
}
@@ -855,7 +830,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
}
/* Two nonempty children */
- newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
+ newBinNode = tnode_get_child(tn, i/2);
put_child(tn, i/2, NULL);
put_child(newBinNode, 0, left);
put_child(newBinNode, 1, right);
@@ -871,7 +846,7 @@ nomem:
/* readside must use rcu_read_lock currently dump routines
via get_fa_head and dump */
-static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
+static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
{
struct hlist_head *head = &l->list;
struct leaf_info *li;
@@ -883,7 +858,7 @@ static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
return NULL;
}
-static inline struct list_head *get_fa_head(struct leaf *l, int plen)
+static inline struct list_head *get_fa_head(struct tnode *l, int plen)
{
struct leaf_info *li = find_leaf_info(l, plen);
@@ -915,32 +890,25 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
/* rcu_read_lock needs to be hold by caller from readside */
-static struct leaf *
-fib_find_node(struct trie *t, u32 key)
+static struct tnode *fib_find_node(struct trie *t, u32 key)
{
- int pos;
- struct tnode *tn;
- struct rt_trie_node *n;
-
- pos = 0;
- n = rcu_dereference_rtnl(t->trie);
+ struct tnode *n = rcu_dereference_rtnl(t->trie);
+ int pos = 0;
while (n && IS_TNODE(n)) {
- tn = (struct tnode *) n;
-
- if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
- pos = tn->pos + tn->bits;
- n = tnode_get_child_rcu(tn,
+ if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
+ pos = n->pos + n->bits;
+ n = tnode_get_child_rcu(n,
tkey_extract_bits(key,
- tn->pos,
- tn->bits));
+ n->pos,
+ n->bits));
} else
break;
}
/* Case we have found a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
- return (struct leaf *)n;
+ return n;
return NULL;
}
@@ -956,14 +924,13 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
while (tn != NULL && (tp = node_parent(tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
- tn = (struct tnode *)resize(t, tn);
+ tn = resize(t, tn);
- tnode_put_child_reorg(tp, cindex,
- (struct rt_trie_node *)tn, wasfull);
+ tnode_put_child_reorg(tp, cindex, tn, wasfull);
tp = node_parent(tn);
if (!tp)
- rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+ rcu_assign_pointer(t->trie, tn);
tnode_free_flush();
if (!tp)
@@ -973,9 +940,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
/* Handle last (top) tnode */
if (IS_TNODE(tn))
- tn = (struct tnode *)resize(t, tn);
+ tn = resize(t, tn);
- rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+ rcu_assign_pointer(t->trie, tn);
tnode_free_flush();
}
@@ -985,8 +952,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
- struct rt_trie_node *n;
- struct leaf *l;
+ struct tnode *n;
+ struct tnode *l;
int missbit;
struct list_head *fa_head = NULL;
struct leaf_info *li;
@@ -1014,17 +981,15 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
*/
while (n && IS_TNODE(n)) {
- tn = (struct tnode *) n;
-
- if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
- tp = tn;
- pos = tn->pos + tn->bits;
- n = tnode_get_child(tn,
+ if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
+ tp = n;
+ pos = n->pos + n->bits;
+ n = tnode_get_child(n,
tkey_extract_bits(key,
- tn->pos,
- tn->bits));
+ n->pos,
+ n->bits));
- BUG_ON(n && node_parent(n) != tn);
+ BUG_ON(n && node_parent(n) != tp);
} else
break;
}
@@ -1040,14 +1005,13 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
/* Case 1: n is a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- l = (struct leaf *) n;
li = leaf_info_new(plen);
if (!li)
return NULL;
fa_head = &li->falh;
- insert_leaf_info(&l->list, li);
+ insert_leaf_info(&n->list, li);
goto done;
}
l = leaf_new(key);
@@ -1068,10 +1032,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
if (t->trie && n == NULL) {
/* Case 2: n is NULL, and will just insert a new leaf */
- node_set_parent((struct rt_trie_node *)l, tp);
+ node_set_parent(l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(tp, cindex, (struct rt_trie_node *)l);
+ put_child(tp, cindex, l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
@@ -1094,17 +1058,17 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
return NULL;
}
- node_set_parent((struct rt_trie_node *)tn, tp);
+ node_set_parent(tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
- put_child(tn, missbit, (struct rt_trie_node *)l);
+ put_child(tn, missbit, l);
put_child(tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(tp, cindex, (struct rt_trie_node *)tn);
+ put_child(tp, cindex, tn);
} else {
- rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+ rcu_assign_pointer(t->trie, tn);
}
tp = tn;
@@ -1134,7 +1098,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
u8 tos = cfg->fc_tos;
u32 key, mask;
int err;
- struct leaf *l;
+ struct tnode *l;
if (plen > 32)
return -EINVAL;
@@ -1292,7 +1256,7 @@ err:
}
/* should be called with rcu_read_lock */
-static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
+static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
t_key key, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
@@ -1365,7 +1329,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct trie_use_stats __percpu *stats = t->stats;
#endif
int ret;
- struct rt_trie_node *n;
+ struct tnode *n;
struct tnode *pn;
unsigned int pos, bits;
t_key key = ntohl(flp->daddr);
@@ -1387,11 +1351,11 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
/* Just a leaf? */
if (IS_LEAF(n)) {
- ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
goto found;
}
- pn = (struct tnode *) n;
+ pn = n;
chopped_off = 0;
while (pn) {
@@ -1412,13 +1376,13 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
}
if (IS_LEAF(n)) {
- ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
if (ret > 0)
goto backtrace;
goto found;
}
- cn = (struct tnode *)n;
+ cn = n;
/*
* It's a tnode, and we can do some extra checks here if we
@@ -1506,7 +1470,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
current_prefix_length = mp;
}
- pn = (struct tnode *)n; /* Descend */
+ pn = n; /* Descend */
chopped_off = 0;
continue;
@@ -1557,7 +1521,7 @@ EXPORT_SYMBOL_GPL(fib_table_lookup);
/*
* Remove the leaf and return parent.
*/
-static void trie_leaf_remove(struct trie *t, struct leaf *l)
+static void trie_leaf_remove(struct trie *t, struct tnode *l)
{
struct tnode *tp = node_parent(l);
@@ -1584,7 +1548,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
u8 tos = cfg->fc_tos;
struct fib_alias *fa, *fa_to_delete;
struct list_head *fa_head;
- struct leaf *l;
+ struct tnode *l;
struct leaf_info *li;
if (plen > 32)
@@ -1682,7 +1646,7 @@ static int trie_flush_list(struct list_head *head)
return found;
}
-static int trie_flush_leaf(struct leaf *l)
+static int trie_flush_leaf(struct tnode *l)
{
int found = 0;
struct hlist_head *lih = &l->list;
@@ -1704,7 +1668,7 @@ static int trie_flush_leaf(struct leaf *l)
* Scan for the next right leaf starting at node p->child[idx]
* Since we have back pointer, no recursion necessary.
*/
-static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
+static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
{
do {
t_key idx;
@@ -1720,47 +1684,46 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
continue;
if (IS_LEAF(c))
- return (struct leaf *) c;
+ return c;
/* Rescan start scanning in new node */
- p = (struct tnode *) c;
+ p = c;
idx = 0;
}
/* Node empty, walk back up to parent */
- c = (struct rt_trie_node *) p;
+ c = p;
} while ((p = node_parent_rcu(c)) != NULL);
return NULL; /* Root of trie */
}
-static struct leaf *trie_firstleaf(struct trie *t)
+static struct tnode *trie_firstleaf(struct trie *t)
{
- struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
+ struct tnode *n = rcu_dereference_rtnl(t->trie);
if (!n)
return NULL;
if (IS_LEAF(n)) /* trie is just a leaf */
- return (struct leaf *) n;
+ return n;
return leaf_walk_rcu(n, NULL);
}
-static struct leaf *trie_nextleaf(struct leaf *l)
+static struct tnode *trie_nextleaf(struct tnode *l)
{
- struct rt_trie_node *c = (struct rt_trie_node *) l;
- struct tnode *p = node_parent_rcu(c);
+ struct tnode *p = node_parent_rcu(l);
if (!p)
return NULL; /* trie with just one leaf */
- return leaf_walk_rcu(p, c);
+ return leaf_walk_rcu(p, l);
}
-static struct leaf *trie_leafindex(struct trie *t, int index)
+static struct tnode *trie_leafindex(struct trie *t, int index)
{
- struct leaf *l = trie_firstleaf(t);
+ struct tnode *l = trie_firstleaf(t);
while (l && index-- > 0)
l = trie_nextleaf(l);
@@ -1775,7 +1738,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index)
int fib_table_flush(struct fib_table *tb)
{
struct trie *t = (struct trie *) tb->tb_data;
- struct leaf *l, *ll = NULL;
+ struct tnode *l, *ll = NULL;
int found = 0;
for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
@@ -1840,7 +1803,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
return skb->len;
}
-static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
+static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
{
struct leaf_info *li;
@@ -1876,7 +1839,7 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct leaf *l;
+ struct tnode *l;
struct trie *t = (struct trie *) tb->tb_data;
t_key key = cb->args[2];
int count = cb->args[3];
@@ -1922,7 +1885,7 @@ void __init fib_trie_init(void)
0, SLAB_PANIC, NULL);
trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
- max(sizeof(struct leaf),
+ max(sizeof(struct tnode),
sizeof(struct leaf_info)),
0, SLAB_PANIC, NULL);
}
@@ -1965,7 +1928,7 @@ struct fib_trie_iter {
unsigned int depth;
};
-static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
{
struct tnode *tn = iter->tnode;
unsigned int cindex = iter->index;
@@ -1979,7 +1942,7 @@ static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
iter->tnode, iter->index, iter->depth);
rescan:
while (cindex < (1<<tn->bits)) {
- struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
+ struct tnode *n = tnode_get_child_rcu(tn, cindex);
if (n) {
if (IS_LEAF(n)) {
@@ -1987,7 +1950,7 @@ rescan:
iter->index = cindex + 1;
} else {
/* push down one level */
- iter->tnode = (struct tnode *) n;
+ iter->tnode = n;
iter->index = 0;
++iter->depth;
}
@@ -1998,7 +1961,7 @@ rescan:
}
/* Current node exhausted, pop back up */
- p = node_parent_rcu((struct rt_trie_node *)tn);
+ p = node_parent_rcu(tn);
if (p) {
cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
tn = p;
@@ -2010,10 +1973,10 @@ rescan:
return NULL;
}
-static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
+static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
struct trie *t)
{
- struct rt_trie_node *n;
+ struct tnode *n;
if (!t)
return NULL;
@@ -2023,7 +1986,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
return NULL;
if (IS_TNODE(n)) {
- iter->tnode = (struct tnode *) n;
+ iter->tnode = n;
iter->index = 0;
iter->depth = 1;
} else {
@@ -2037,7 +2000,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct rt_trie_node *n;
+ struct tnode *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
@@ -2045,7 +2008,6 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
rcu_read_lock();
for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
- struct leaf *l = (struct leaf *)n;
struct leaf_info *li;
s->leaves++;
@@ -2053,18 +2015,17 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
- hlist_for_each_entry_rcu(li, &l->list, hlist)
+ hlist_for_each_entry_rcu(li, &n->list, hlist)
++s->prefixes;
} else {
- const struct tnode *tn = (const struct tnode *) n;
int i;
s->tnodes++;
- if (tn->bits < MAX_STAT_DEPTH)
- s->nodesizes[tn->bits]++;
+ if (n->bits < MAX_STAT_DEPTH)
+ s->nodesizes[n->bits]++;
- for (i = 0; i < (1<<tn->bits); i++)
- if (!tn->child[i])
+ for (i = 0; i < tnode_child_length(n); i++)
+ if (!rcu_access_pointer(n->child[i]))
s->nullpointers++;
}
}
@@ -2088,7 +2049,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
- bytes = sizeof(struct leaf) * stat->leaves;
+ bytes = sizeof(struct tnode) * stat->leaves;
seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
bytes += sizeof(struct leaf_info) * stat->prefixes;
@@ -2109,7 +2070,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_putc(seq, '\n');
seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct rt_trie_node *) * pointers;
+ bytes += sizeof(struct tnode *) * pointers;
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
@@ -2163,7 +2124,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"Basic info: size of leaf:"
" %Zd bytes, size of tnode: %Zd bytes.\n",
- sizeof(struct leaf), sizeof(struct tnode));
+ sizeof(struct tnode), sizeof(struct tnode));
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
@@ -2202,7 +2163,7 @@ static const struct file_operations fib_triestat_fops = {
.release = single_release_net,
};
-static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2214,7 +2175,7 @@ static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
- struct rt_trie_node *n;
+ struct tnode *n;
for (n = fib_trie_get_first(iter,
(struct trie *) tb->tb_data);
@@ -2243,7 +2204,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct fib_table *tb = iter->tb;
struct hlist_node *tb_node;
unsigned int h;
- struct rt_trie_node *n;
+ struct tnode *n;
++*pos;
/* next node in same table */
@@ -2329,29 +2290,26 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct rt_trie_node *n = v;
+ struct tnode *n = v;
if (!node_parent_rcu(n))
fib_table_print(seq, iter->tb);
if (IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *) n;
- __be32 prf = htonl(tn->key);
+ __be32 prf = htonl(n->key);
- seq_indent(seq, iter->depth-1);
+ seq_indent(seq, iter->depth - 1);
seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
- &prf, tn->pos, tn->bits, tn->full_children,
- tn->empty_children);
-
+ &prf, n->pos, n->bits, n->full_children,
+ n->empty_children);
} else {
- struct leaf *l = (struct leaf *) n;
struct leaf_info *li;
- __be32 val = htonl(l->key);
+ __be32 val = htonl(n->key);
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %pI4\n", &val);
- hlist_for_each_entry_rcu(li, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &n->list, hlist) {
struct fib_alias *fa;
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -2401,9 +2359,9 @@ struct fib_route_iter {
t_key key;
};
-static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
+static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
- struct leaf *l = NULL;
+ struct tnode *l = NULL;
struct trie *t = iter->main_trie;
/* use cache location of last found key */
@@ -2448,7 +2406,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
- struct leaf *l = v;
+ struct tnode *l = v;
++*pos;
if (v == SEQ_START_TOKEN) {
@@ -2494,7 +2452,7 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
- struct leaf *l = v;
+ struct tnode *l = v;
struct leaf_info *li;
if (v == SEQ_START_TOKEN) {
^ permalink raw reply related
* [RFC PATCH 05/17] fib_trie: Optimize fib_table_lookup to avoid wasting time on loops/variables
From: Alexander Duyck @ 2014-12-22 17:41 UTC (permalink / raw)
To: netdev
In-Reply-To: <20141222172632.1119.51469.stgit@ahduyck-vm-fedora20>
This patch is meant to reduce the complexity of fib_table_lookup by reducing
the number of variables to the bare minimum while still keeping the same if
not improved functionality versus the original.
Most of this change was started off by the desire to rid the function of
chopped_off and current_prefix_length as they actually added very little to
the function since they only applied when computing the cindex. I was able
to replace them mostly with just a check for the prefix match. As long as
the prefix between the key and the node being tested was the same we know
we can search the tnode fully versus just testing cindex 0.
The second portion of the change ended up being a massive reordering.
Originally the calls to check_leaf were up near the start of the loop, and
the backtracing and descending into lower levels of tnodes was later. This
didn't make much sense as the structure of the tree means the leaves are
always the last thing to be tested. As such I reordered things so that we
instead have a loop that will delve into the tree and only exit when we
have either found a leaf or we have exhausted the tree. The advantage of
rearranging things like this is that we can fully inline check_leaf since
there is now only one reference to it in the function.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
net/ipv4/fib_trie.c | 250 +++++++++++++++++++--------------------------------
1 file changed, 91 insertions(+), 159 deletions(-)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1c7bf43..fe2f60a 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -90,6 +90,9 @@ typedef unsigned int t_key;
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
+#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
+#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
+
struct tnode {
t_key key;
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
@@ -1281,7 +1284,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
continue;
fib_alias_accessed(fa);
err = fib_props[fa->fa_type].error;
- if (err) {
+ if (unlikely(err < 0)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(t->stats->semantic_match_passed);
#endif
@@ -1303,7 +1306,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
res->prefixlen = li->plen;
res->nh_sel = nhsel;
res->type = fa->fa_type;
- res->scope = fa->fa_info->fib_scope;
+ res->scope = fi->fib_scope;
res->fi = fi;
res->table = tb;
res->fa_head = &li->falh;
@@ -1321,27 +1324,29 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
return 1;
}
+static inline t_key prefix_mismatch(t_key key, struct tnode *n)
+{
+ t_key prefix = n->key;
+
+ return (key ^ prefix) & (prefix | -prefix);
+}
+
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
- struct trie *t = (struct trie *) tb->tb_data;
+ struct trie *t = (struct trie *)tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats = t->stats;
#endif
- int ret;
- struct tnode *n;
- struct tnode *pn;
- unsigned int pos, bits;
- t_key key = ntohl(flp->daddr);
- unsigned int chopped_off;
- t_key cindex = 0;
- unsigned int current_prefix_length = KEYLENGTH;
- struct tnode *cn;
- t_key pref_mismatch;
+ const t_key key = ntohl(flp->daddr);
+ struct tnode *n, *pn;
+ t_key cindex;
+ int ret = 1;
rcu_read_lock();
n = rcu_dereference(t->trie);
+
if (!n)
goto failed;
@@ -1349,170 +1354,97 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
this_cpu_inc(stats->gets);
#endif
- /* Just a leaf? */
- if (IS_LEAF(n)) {
- ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
- goto found;
- }
-
pn = n;
- chopped_off = 0;
-
- while (pn) {
- pos = pn->pos;
- bits = pn->bits;
-
- if (!chopped_off)
- cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
- pos, bits);
-
- n = tnode_get_child_rcu(pn, cindex);
-
- if (n == NULL) {
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->null_node_hit);
-#endif
- goto backtrace;
- }
-
- if (IS_LEAF(n)) {
- ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
- if (ret > 0)
- goto backtrace;
- goto found;
- }
-
- cn = n;
-
- /*
- * It's a tnode, and we can do some extra checks here if we
- * like, to avoid descending into a dead-end branch.
- * This tnode is in the parent's child array at index
- * key[p_pos..p_pos+p_bits] but potentially with some bits
- * chopped off, so in reality the index may be just a
- * subprefix, padded with zero at the end.
- * We can also take a look at any skipped bits in this
- * tnode - everything up to p_pos is supposed to be ok,
- * and the non-chopped bits of the index (se previous
- * paragraph) are also guaranteed ok, but the rest is
- * considered unknown.
- *
- * The skipped bits are key[pos+bits..cn->pos].
- */
-
- /* If current_prefix_length < pos+bits, we are already doing
- * actual prefix matching, which means everything from
- * pos+(bits-chopped_off) onward must be zero along some
- * branch of this subtree - otherwise there is *no* valid
- * prefix present. Here we can only check the skipped
- * bits. Remember, since we have already indexed into the
- * parent's child array, we know that the bits we chopped of
- * *are* zero.
+ cindex = 0;
+
+ /* Step 1: Travel to the longest prefix match in the trie */
+ for (;;) {
+ unsigned long index = get_index(key, n);
+
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the "bits" in the prefix. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if !(index >> bits)
+ * we know the value is child index
+ * else
+ * we have a mismatch in skip bits and failed
*/
+ if (index >> n->bits)
+ break;
- /* NOTA BENE: Checking only skipped bits
- for the new node here */
+ /* we have found a leaf. Prefixes have already been compared */
+ if (IS_LEAF(n))
+ goto found;
- if (current_prefix_length < pos+bits) {
- if (tkey_extract_bits(cn->key, current_prefix_length,
- cn->pos - current_prefix_length)
- || !(cn->child[0]))
- goto backtrace;
- }
+ pn = n;
+ cindex = index;
- /*
- * If chopped_off=0, the index is fully validated and we
- * only need to look at the skipped bits for this, the new,
- * tnode. What we actually want to do is to find out if
- * these skipped bits match our key perfectly, or if we will
- * have to count on finding a matching prefix further down,
- * because if we do, we would like to have some way of
- * verifying the existence of such a prefix at this point.
- */
+ n = rcu_dereference_rtnl(n->child[index]);
+ if (unlikely(!n))
+ goto backtrace;
+ }
- /* The only thing we can do at this point is to verify that
- * any such matching prefix can indeed be a prefix to our
- * key, and if the bits in the node we are inspecting that
- * do not match our key are not ZERO, this cannot be true.
- * Thus, find out where there is a mismatch (before cn->pos)
- * and verify that all the mismatching bits are zero in the
- * new tnode's key.
- */
+ /* Step 2: Sort out leaves and begin backtracing for longest prefix */
+ for (;;) {
+ /* record the pointer where our next node pointer is stored */
+ struct tnode __rcu **cptr = n->child;
- /*
- * Note: We aren't very concerned about the piece of
- * the key that precede pn->pos+pn->bits, since these
- * have already been checked. The bits after cn->pos
- * aren't checked since these are by definition
- * "unknown" at this point. Thus, what we want to see
- * is if we are about to enter the "prefix matching"
- * state, and in that case verify that the skipped
- * bits that will prevail throughout this subtree are
- * zero, as they have to be if we are to find a
- * matching prefix.
+ /* This test verifies that none of the bits that differ
+ * between the key and the prefix exist in the region of
+ * the lsb and higher in the prefix.
*/
+ if (unlikely(prefix_mismatch(key, n)))
+ goto backtrace;
- pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
+ /* exit out and process leaf */
+ if (unlikely(IS_LEAF(n)))
+ break;
- /*
- * In short: If skipped bits in this node do not match
- * the search key, enter the "prefix matching"
- * state.directly.
+ /* Don't bother recording parent info. Since we are in
+ * prefix match mode we will have to come back to wherever
+ * we started this traversal anyway
*/
- if (pref_mismatch) {
- /* fls(x) = __fls(x) + 1 */
- int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
-
- if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
- goto backtrace;
-
- if (current_prefix_length >= cn->pos)
- current_prefix_length = mp;
- }
-
- pn = n; /* Descend */
- chopped_off = 0;
- continue;
+ while ((n = rcu_dereference_rtnl(*cptr)) == NULL) {
backtrace:
- chopped_off++;
-
- /* As zero don't change the child key (cindex) */
- while ((chopped_off <= pn->bits)
- && !(cindex & (1<<(chopped_off-1))))
- chopped_off++;
-
- /* Decrease current_... with bits chopped off */
- if (current_prefix_length > pn->pos + pn->bits - chopped_off)
- current_prefix_length = pn->pos + pn->bits
- - chopped_off;
-
- /*
- * Either we do the actual chop off according or if we have
- * chopped off all bits in this tnode walk up to our parent.
- */
-
- if (chopped_off <= pn->bits) {
- cindex &= ~(1 << (chopped_off-1));
- } else {
- struct tnode *parent = node_parent_rcu(pn);
- if (!parent)
- goto failed;
-
- /* Get Child's index */
- cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
- pn = parent;
- chopped_off = 0;
-
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->backtrack);
+ if (!n)
+ this_cpu_inc(stats->null_node_hit);
#endif
- goto backtrace;
+ /* If we are at cindex 0 there are no more bits for
+ * us to strip at this level so we must ascend back
+ * up one level to see if there are any more bits to
+ * be stripped there.
+ */
+ while (!cindex) {
+ t_key pkey = pn->key;
+
+ pn = node_parent_rcu(pn);
+ if (unlikely(!pn))
+ goto failed;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->backtrack);
+#endif
+ /* Get Child's index */
+ cindex = get_index(pkey, pn);
+ }
+
+ /* strip the least significant bit from the cindex */
+ cindex &= cindex - 1;
+
+ /* grab pointer for next child node */
+ cptr = &pn->child[cindex];
}
}
-failed:
- ret = 1;
+
found:
+ /* Step 3: Process the leaf, if that fails fall back to backtracing */
+ ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
+ if (unlikely(ret > 0))
+ goto backtrace;
+failed:
rcu_read_unlock();
return ret;
}
^ permalink raw reply related
* [RFC PATCH 06/17] fib_trie: Optimize fib_find_node
From: Alexander Duyck @ 2014-12-22 17:41 UTC (permalink / raw)
To: netdev
In-Reply-To: <20141222172632.1119.51469.stgit@ahduyck-vm-fedora20>
This patch makes use of the same features I made use of for
fib_table_lookup to streamline fib_find_node. The resultant code should be
smaller and run faster than the original.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
net/ipv4/fib_trie.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index fe2f60a..c182143 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -892,28 +892,34 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
}
/* rcu_read_lock needs to be hold by caller from readside */
-
static struct tnode *fib_find_node(struct trie *t, u32 key)
{
struct tnode *n = rcu_dereference_rtnl(t->trie);
- int pos = 0;
- while (n && IS_TNODE(n)) {
- if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
- pos = n->pos + n->bits;
- n = tnode_get_child_rcu(n,
- tkey_extract_bits(key,
- n->pos,
- n->bits));
- } else
+ while (n) {
+ unsigned long index = get_index(key, n);
+
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the bits in the cindex. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if !(index >> bits)
+ * we know the value is cindex
+ * else
+ * we have a mismatch in skip bits and failed
+ */
+ if (index >> n->bits)
+ return NULL;
+
+ /* we have found a leaf. Prefixes have already been compared */
+ if (IS_LEAF(n))
break;
- }
- /* Case we have found a leaf. Compare prefixes */
- if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
- return n;
+ n = rcu_dereference_rtnl(n->child[index]);
+ }
- return NULL;
+ return n;
}
static void trie_rebalance(struct trie *t, struct tnode *tn)
^ permalink raw reply related
* [RFC PATCH 07/17] fib_trie: Optimize fib_table_insert
From: Alexander Duyck @ 2014-12-22 17:41 UTC (permalink / raw)
To: netdev
In-Reply-To: <20141222172632.1119.51469.stgit@ahduyck-vm-fedora20>
This patch updates the fib_table_insert function to take advantage of the
changes made to improve the performance of fib_table_lookup. As a result
the code should be smaller and run faster then the original.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
net/ipv4/fib_trie.c | 196 ++++++++++++++++++---------------------------------
1 file changed, 71 insertions(+), 125 deletions(-)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index c182143..969ecf3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -222,31 +222,6 @@ static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int
return 0;
}
-static inline int tkey_equals(t_key a, t_key b)
-{
- return a == b;
-}
-
-static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
-{
- if (bits == 0 || offset >= KEYLENGTH)
- return 1;
- bits = bits > KEYLENGTH ? KEYLENGTH : bits;
- return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
-}
-
-static inline int tkey_mismatch(t_key a, int offset, t_key b)
-{
- t_key diff = a ^ b;
- int i = offset;
-
- if (!diff)
- return 0;
- while ((diff << i) >> (KEYLENGTH-1) == 0)
- i++;
- return i;
-}
-
/*
To understand this stuff, an understanding of keys and all their bits is
necessary. Every node in the trie has a key associated with it, but not
@@ -485,6 +460,15 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
rcu_assign_pointer(tn->child[i], n);
}
+static void put_child_root(struct tnode *tp, struct trie *t,
+ t_key key, struct tnode *n)
+{
+ if (tp)
+ put_child(tp, get_index(key, tp), n);
+ else
+ rcu_assign_pointer(t->trie, n);
+}
+
#define MAX_WORK 10
static struct tnode *resize(struct trie *t, struct tnode *tn)
{
@@ -959,138 +943,100 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
- int pos, newpos;
- struct tnode *tp = NULL, *tn = NULL;
- struct tnode *n;
- struct tnode *l;
- int missbit;
struct list_head *fa_head = NULL;
+ struct tnode *l, *n, *tp = NULL;
struct leaf_info *li;
- t_key cindex;
- pos = 0;
+ li = leaf_info_new(plen);
+ if (!li)
+ return NULL;
+ fa_head = &li->falh;
+
n = rtnl_dereference(t->trie);
/* If we point to NULL, stop. Either the tree is empty and we should
* just put a new leaf in if, or we have reached an empty child slot,
* and we should just put our new leaf in that.
- * If we point to a T_TNODE, check if it matches our key. Note that
- * a T_TNODE might be skipping any number of bits - its 'pos' need
- * not be the parent's 'pos'+'bits'!
- *
- * If it does match the current key, get pos/bits from it, extract
- * the index from our key, push the T_TNODE and walk the tree.
- *
- * If it doesn't, we have to replace it with a new T_TNODE.
*
- * If we point to a T_LEAF, it might or might not have the same key
- * as we do. If it does, just change the value, update the T_LEAF's
- * value, and return it.
- * If it doesn't, we need to replace it with a T_TNODE.
+ * If we hit a node with a key that does't match then we should stop
+ * and create a new tnode to replace that node and insert ourselves
+ * and the other node into the new tnode.
*/
+ while (n) {
+ unsigned long index = get_index(key, n);
- while (n && IS_TNODE(n)) {
- if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
- tp = n;
- pos = n->pos + n->bits;
- n = tnode_get_child(n,
- tkey_extract_bits(key,
- n->pos,
- n->bits));
-
- BUG_ON(n && node_parent(n) != tp);
- } else
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the "bits" in the prefix. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if !(index >> bits)
+ * we know the value is child index
+ * else
+ * we have a mismatch in skip bits and failed
+ */
+ if (index >> n->bits)
break;
- }
-
- /*
- * n ----> NULL, LEAF or TNODE
- *
- * tp is n's (parent) ----> NULL or TNODE
- */
-
- BUG_ON(tp && IS_LEAF(tp));
-
- /* Case 1: n is a leaf. Compare prefixes */
-
- if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- li = leaf_info_new(plen);
- if (!li)
- return NULL;
+ /* we have found a leaf. Prefixes have already been compared */
+ if (IS_LEAF(n)) {
+ /* Case 1: n is a leaf, and prefixes match*/
+ insert_leaf_info(&n->list, li);
+ return fa_head;
+ }
- fa_head = &li->falh;
- insert_leaf_info(&n->list, li);
- goto done;
+ tp = n;
+ n = rcu_dereference_rtnl(n->child[index]);
}
- l = leaf_new(key);
-
- if (!l)
- return NULL;
-
- li = leaf_info_new(plen);
- if (!li) {
- node_free(l);
+ l = leaf_new(key);
+ if (!l) {
+ free_leaf_info(li);
return NULL;
}
- fa_head = &li->falh;
insert_leaf_info(&l->list, li);
- if (t->trie && n == NULL) {
- /* Case 2: n is NULL, and will just insert a new leaf */
-
- node_set_parent(l, tp);
-
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(tp, cindex, l);
- } else {
- /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
- /*
- * Add a new tnode here
- * first tnode need some special handling
- */
+ /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
+ *
+ * Add a new tnode here
+ * first tnode need some special handling
+ * leaves us in position for handling as case 3
+ */
+ if (n) {
+ struct tnode *tn;
+ int newpos;
- if (n) {
- pos = tp ? tp->pos+tp->bits : 0;
- newpos = tkey_mismatch(key, pos, n->key);
- tn = tnode_new(n->key, newpos, 1);
- } else {
- newpos = 0;
- tn = tnode_new(key, newpos, 1); /* First tnode */
- }
+ newpos = KEYLENGTH - __fls(n->key ^ key) - 1;
+ tn = tnode_new(key, newpos, 1);
if (!tn) {
free_leaf_info(li);
node_free(l);
return NULL;
}
- node_set_parent(tn, tp);
-
- missbit = tkey_extract_bits(key, newpos, 1);
- put_child(tn, missbit, l);
- put_child(tn, 1-missbit, n);
+ /* initialize routes out of node */
+ NODE_INIT_PARENT(tn, tp);
+ put_child(tn, get_index(key, tn) ^ 1, n);
- if (tp) {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(tp, cindex, tn);
- } else {
- rcu_assign_pointer(t->trie, tn);
- }
+ /* start adding routes into the node */
+ put_child_root(tp, t, key, tn);
+ node_set_parent(n, tn);
+ /* parent now has a NULL spot where the leaf can go */
tp = tn;
}
- if (tp && tp->pos + tp->bits > 32)
- pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
- tp, tp->pos, tp->bits, key, plen);
-
- /* Rebalance the trie */
+ /* Case 3: n is NULL, and will just insert a new leaf */
+ if (tp) {
+ NODE_INIT_PARENT(l, tp);
+ put_child(tp, get_index(key, tp), l);
+ trie_rebalance(t, tp);
+ } else {
+ rcu_assign_pointer(t->trie, l);
+ }
- trie_rebalance(t, tp);
-done:
return fa_head;
}
@@ -1466,11 +1412,11 @@ static void trie_leaf_remove(struct trie *t, struct tnode *l)
pr_debug("entering trie_leaf_remove(%p)\n", l);
if (tp) {
- t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
- put_child(tp, cindex, NULL);
+ put_child(tp, get_index(l->key, tp), NULL);
trie_rebalance(t, tp);
- } else
+ } else {
RCU_INIT_POINTER(t->trie, NULL);
+ }
node_free(l);
}
^ permalink raw reply related
* [RFC PATCH 08/17] fib_trie: Update meaning of pos to represent unchecked bits
From: Alexander Duyck @ 2014-12-22 17:41 UTC (permalink / raw)
To: netdev
In-Reply-To: <20141222172632.1119.51469.stgit@ahduyck-vm-fedora20>
This change moves the pos value to the other side of the "bits" field. By
doing this it actually simplifies a significant amount of code in the trie.
For example when halving a tree we know that the bit lost exists at
oldnode->pos, and if we inflate the tree the new bit being add is at
tn->pos. Previously to find those bits you would have to subtract pos and
bits from the keylength or start with a value of (1 << 31) and then shift
that.
There are a number of spots throughout the code that benefit from this. In
the case of the hot-path searches the main advantage is that we can drop 2
or more operations from the search path as we no longer need to compute the
value for the index to be shifted by and can instead just use the raw pos
value.
In addition the tkey_extract_bits is now defunct and can be replaced by
get_index since the two operations were doing the same thing, but now
get_index does it much more quickly as it is only an xor and shift versus a
pair of shifts and a subtraction.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
---
net/ipv4/fib_trie.c | 194 +++++++++++++++++++++------------------------------
1 file changed, 81 insertions(+), 113 deletions(-)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 969ecf3..d27aa27 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -90,8 +90,7 @@ typedef unsigned int t_key;
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
-#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
-#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
+#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
struct tnode {
t_key key;
@@ -209,81 +208,64 @@ static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned
return rcu_dereference_rtnl(tn->child[i]);
}
-static inline t_key mask_pfx(t_key k, unsigned int l)
-{
- return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
-}
-
-static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
-{
- if (offset < KEYLENGTH)
- return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
- else
- return 0;
-}
-
-/*
- To understand this stuff, an understanding of keys and all their bits is
- necessary. Every node in the trie has a key associated with it, but not
- all of the bits in that key are significant.
-
- Consider a node 'n' and its parent 'tp'.
-
- If n is a leaf, every bit in its key is significant. Its presence is
- necessitated by path compression, since during a tree traversal (when
- searching for a leaf - unless we are doing an insertion) we will completely
- ignore all skipped bits we encounter. Thus we need to verify, at the end of
- a potentially successful search, that we have indeed been walking the
- correct key path.
-
- Note that we can never "miss" the correct key in the tree if present by
- following the wrong path. Path compression ensures that segments of the key
- that are the same for all keys with a given prefix are skipped, but the
- skipped part *is* identical for each node in the subtrie below the skipped
- bit! trie_insert() in this implementation takes care of that - note the
- call to tkey_sub_equals() in trie_insert().
-
- if n is an internal node - a 'tnode' here, the various parts of its key
- have many different meanings.
-
- Example:
- _________________________________________________________________
- | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
- -----------------------------------------------------------------
- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-
- _________________________________________________________________
- | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
- -----------------------------------------------------------------
- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
-
- tp->pos = 7
- tp->bits = 3
- n->pos = 15
- n->bits = 4
-
- First, let's just ignore the bits that come before the parent tp, that is
- the bits from 0 to (tp->pos-1). They are *known* but at this point we do
- not use them for anything.
-
- The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
- index into the parent's child array. That is, they will be used to find
- 'n' among tp's children.
-
- The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
- for the node n.
-
- All the bits we have seen so far are significant to the node n. The rest
- of the bits are really not needed or indeed known in n->key.
-
- The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
- n's child array, and will of course be different for each child.
-
-
- The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
- at this point.
-
-*/
+/* To understand this stuff, an understanding of keys and all their bits is
+ * necessary. Every node in the trie has a key associated with it, but not
+ * all of the bits in that key are significant.
+ *
+ * Consider a node 'n' and its parent 'tp'.
+ *
+ * If n is a leaf, every bit in its key is significant. Its presence is
+ * necessitated by path compression, since during a tree traversal (when
+ * searching for a leaf - unless we are doing an insertion) we will completely
+ * ignore all skipped bits we encounter. Thus we need to verify, at the end of
+ * a potentially successful search, that we have indeed been walking the
+ * correct key path.
+ *
+ * Note that we can never "miss" the correct key in the tree if present by
+ * following the wrong path. Path compression ensures that segments of the key
+ * that are the same for all keys with a given prefix are skipped, but the
+ * skipped part *is* identical for each node in the subtrie below the skipped
+ * bit! trie_insert() in this implementation takes care of that.
+ *
+ * if n is an internal node - a 'tnode' here, the various parts of its key
+ * have many different meanings.
+ *
+ * Example:
+ * _________________________________________________________________
+ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+ * -----------------------------------------------------------------
+ * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
+ *
+ * _________________________________________________________________
+ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+ * -----------------------------------------------------------------
+ * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ *
+ * tp->pos = 22
+ * tp->bits = 3
+ * n->pos = 13
+ * n->bits = 4
+ *
+ * First, let's just ignore the bits that come before the parent tp, that is
+ * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
+ * point we do not use them for anything.
+ *
+ * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
+ * index into the parent's child array. That is, they will be used to find
+ * 'n' among tp's children.
+ *
+ * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
+ * for the node n.
+ *
+ * All the bits we have seen so far are significant to the node n. The rest
+ * of the bits are really not needed or indeed known in n->key.
+ *
+ * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
+ * n's child array, and will of course be different for each child.
+ *
+ * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
+ * at this point.
+ */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
@@ -367,7 +349,7 @@ static struct tnode *leaf_new(t_key key)
* as the nodes are searched
*/
l->key = key;
- l->pos = KEYLENGTH;
+ l->pos = 0;
/* set bits to 0 indicating we are not a tnode */
l->bits = 0;
@@ -400,7 +382,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
tn->parent = NULL;
tn->pos = pos;
tn->bits = bits;
- tn->key = mask_pfx(key, pos);
+ tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
tn->full_children = 0;
tn->empty_children = 1<<bits;
}
@@ -410,14 +392,12 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
return tn;
}
-/*
- * Check whether a tnode 'n' is "full", i.e. it is an internal node
+/* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-
static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
{
- return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
+ return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}
static inline void put_child(struct tnode *tn, int i,
@@ -641,11 +621,12 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
{
int olen = tnode_child_length(oldtnode);
struct tnode *tn;
+ t_key m;
int i;
pr_debug("In inflate\n");
- tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
+ tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
if (!tn)
return ERR_PTR(-ENOMEM);
@@ -656,21 +637,18 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
* fails. In case of failure we return the oldnode and inflate
* of tnode is ignored.
*/
+ for (i = 0, m = 1u << tn->pos; i < olen; i++) {
+ struct tnode *inode = tnode_get_child(oldtnode, i);
- for (i = 0; i < olen; i++) {
- struct tnode *inode;
-
- inode = tnode_get_child(oldtnode, i);
- if (tnode_full(oldtnode, inode) && inode->bits > 1) {
+ if (tnode_full(oldtnode, inode) && (inode->bits > 1)) {
struct tnode *left, *right;
- t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
- left = tnode_new(inode->key&(~m), inode->pos + 1,
+ left = tnode_new(inode->key & ~m, inode->pos,
inode->bits - 1);
if (!left)
goto nomem;
- right = tnode_new(inode->key|m, inode->pos + 1,
+ right = tnode_new(inode->key | m, inode->pos,
inode->bits - 1);
if (!right) {
@@ -694,9 +672,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
/* A leaf or an internal node with skipped bits */
if (!tnode_full(oldtnode, inode)) {
- put_child(tn,
- tkey_extract_bits(inode->key, tn->pos, tn->bits),
- inode);
+ put_child(tn, get_index(inode->key, tn), inode);
continue;
}
@@ -767,7 +743,7 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
pr_debug("In halve\n");
- tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
+ tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
if (!tn)
return ERR_PTR(-ENOMEM);
@@ -787,7 +763,7 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
if (left && right) {
struct tnode *newn;
- newn = tnode_new(left->key, tn->pos + tn->bits, 1);
+ newn = tnode_new(left->key, oldtnode->pos, 1);
if (!newn)
goto nomem;
@@ -915,7 +891,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
key = tn->key;
while (tn != NULL && (tp = node_parent(tn)) != NULL) {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+ cindex = get_index(key, tp);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = resize(t, tn);
@@ -1005,11 +981,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
*/
if (n) {
struct tnode *tn;
- int newpos;
-
- newpos = KEYLENGTH - __fls(n->key ^ key) - 1;
- tn = tnode_new(key, newpos, 1);
+ tn = tnode_new(key, __fls(key ^ n->key), 1);
if (!tn) {
free_leaf_info(li);
node_free(l);
@@ -1555,12 +1528,7 @@ static int trie_flush_leaf(struct tnode *l)
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
{
do {
- t_key idx;
-
- if (c)
- idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
- else
- idx = 0;
+ t_key idx = c ? idx = get_index(c->key, p) + 1 : 0;
while (idx < 1u << p->bits) {
c = tnode_get_child_rcu(p, idx++);
@@ -1847,7 +1815,7 @@ rescan:
/* Current node exhausted, pop back up */
p = node_parent_rcu(tn);
if (p) {
- cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
+ cindex = get_index(tn->key, p) + 1;
tn = p;
--iter->depth;
goto rescan;
@@ -2182,10 +2150,10 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
if (IS_TNODE(n)) {
__be32 prf = htonl(n->key);
- seq_indent(seq, iter->depth - 1);
- seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
- &prf, n->pos, n->bits, n->full_children,
- n->empty_children);
+ seq_indent(seq, iter->depth-1);
+ seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
+ &prf, KEYLENGTH - n->pos - n->bits, n->bits,
+ n->full_children, n->empty_children);
} else {
struct leaf_info *li;
__be32 val = htonl(n->key);
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox