* [PATCH net-next v3 1/2] net/ipv6: Remove expired routes with a separated list of routes.
2023-07-18 18:33 [PATCH net-next v3 0/2] Remove expired routes with a separated list of routes Kui-Feng Lee
@ 2023-07-18 18:33 ` Kui-Feng Lee
2023-07-18 18:33 ` [PATCH net-next v3 2/2] selftests: fib_tests: Add a test case for IPv6 garbage collection Kui-Feng Lee
2023-07-20 9:38 ` [PATCH net-next v3 0/2] Remove expired routes with a separated list of routes Paolo Abeni
2 siblings, 0 replies; 4+ messages in thread
From: Kui-Feng Lee @ 2023-07-18 18:33 UTC (permalink / raw)
To: dsahern, davem, edumazet, kuba, netdev, pabeni, martin.lau,
kernel-team, yhs
Cc: Kui-Feng Lee
FIB6 GC walks trees of fib6_tables to remove expired routes. Walking a tree
can be expensive if the number of routes in a table is big, even if most of
them are permanent. Checking routes in a separated list of routes having
expiration will avoid this potential issue.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
include/net/ip6_fib.h | 65 ++++++++++++++++++++++++++++++++++---------
net/ipv6/ip6_fib.c | 53 ++++++++++++++++++++++++++++++++---
net/ipv6/route.c | 6 ++--
3 files changed, 104 insertions(+), 20 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 05e6f756feaf..e6f4d986fb63 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -179,6 +179,9 @@ struct fib6_info {
refcount_t fib6_ref;
unsigned long expires;
+
+ struct hlist_node gc_link;
+
struct dst_metrics *fib6_metrics;
#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1]
@@ -247,19 +250,6 @@ static inline bool fib6_requires_src(const struct fib6_info *rt)
return rt->fib6_src.plen > 0;
}
-static inline void fib6_clean_expires(struct fib6_info *f6i)
-{
- f6i->fib6_flags &= ~RTF_EXPIRES;
- f6i->expires = 0;
-}
-
-static inline void fib6_set_expires(struct fib6_info *f6i,
- unsigned long expires)
-{
- f6i->expires = expires;
- f6i->fib6_flags |= RTF_EXPIRES;
-}
-
static inline bool fib6_check_expired(const struct fib6_info *f6i)
{
if (f6i->fib6_flags & RTF_EXPIRES)
@@ -267,6 +257,11 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i)
return false;
}
+static inline bool fib6_has_expires(const struct fib6_info *f6i)
+{
+ return f6i->fib6_flags & RTF_EXPIRES;
+}
+
/* Function to safely get fn->fn_sernum for passed in rt
* and store result in passed in cookie.
* Return true if we can get cookie safely
@@ -388,6 +383,7 @@ struct fib6_table {
struct inet_peer_base tb6_peers;
unsigned int flags;
unsigned int fib_seq;
+ struct hlist_head tb6_gc_hlist; /* GC candidates */
#define RT6_TABLE_HAS_DFLT_ROUTER BIT(0)
};
@@ -504,6 +500,49 @@ void fib6_gc_cleanup(void);
int fib6_init(void);
+/* fib6_info must be locked by the caller, and fib6_info->fib6_table can be
+ * NULL.
+ */
+static inline void fib6_set_expires_locked(struct fib6_info *f6i, unsigned long expires)
+{
+ struct fib6_table *tb6;
+
+ tb6 = f6i->fib6_table;
+ f6i->expires = expires;
+ if (tb6 && !fib6_has_expires(f6i))
+ hlist_add_head(&f6i->gc_link, &tb6->tb6_gc_hlist);
+ f6i->fib6_flags |= RTF_EXPIRES;
+}
+
+/* fib6_info must be locked by the caller, and fib6_info->fib6_table can be
+ * NULL. If fib6_table is NULL, the fib6_info will not be inserted into the
+ * list of GC candidates until it is inserted into a table.
+ */
+static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires)
+{
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_set_expires_locked(f6i, expires);
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
+}
+
+static inline void fib6_clean_expires_locked(struct fib6_info *f6i)
+{
+ struct fib6_table *tb6;
+
+ tb6 = f6i->fib6_table;
+ if (tb6 && fib6_has_expires(f6i))
+ hlist_del_init(&f6i->gc_link);
+ f6i->fib6_flags &= ~RTF_EXPIRES;
+ f6i->expires = 0;
+}
+
+static inline void fib6_clean_expires(struct fib6_info *f6i)
+{
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_clean_expires_locked(f6i);
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
+}
+
struct ipv6_route_iter {
struct seq_net_private p;
struct fib6_walker w;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index bac768d36cc1..a4422d513d4d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -160,6 +160,8 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
INIT_LIST_HEAD(&f6i->fib6_siblings);
refcount_set(&f6i->fib6_ref, 1);
+ INIT_HLIST_NODE(&f6i->gc_link);
+
return f6i;
}
@@ -246,6 +248,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
net->ipv6.fib6_null_entry);
table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&table->tb6_peers);
+ INIT_HLIST_HEAD(&table->tb6_gc_hlist);
}
return table;
@@ -1057,6 +1060,11 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
lockdep_is_held(&table->tb6_lock));
}
}
+
+ if (fib6_has_expires(rt)) {
+ hlist_del_init(&rt->gc_link);
+ rt->fib6_flags &= ~RTF_EXPIRES;
+ }
}
/*
@@ -1118,9 +1126,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
if (!(iter->fib6_flags & RTF_EXPIRES))
return -EEXIST;
if (!(rt->fib6_flags & RTF_EXPIRES))
- fib6_clean_expires(iter);
+ fib6_clean_expires_locked(iter);
else
- fib6_set_expires(iter, rt->expires);
+ fib6_set_expires_locked(iter, rt->expires);
if (rt->fib6_pmtu)
fib6_metric_set(iter, RTAX_MTU,
@@ -1480,6 +1488,9 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
list_add(&rt->nh_list, &rt->nh->f6i_list);
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
fib6_start_gc(info->nl_net, rt);
+
+ if (fib6_has_expires(rt))
+ hlist_add_head(&rt->gc_link, &table->tb6_gc_hlist);
}
out:
@@ -2295,7 +2306,7 @@ static int fib6_age(struct fib6_info *rt, void *arg)
* Routes are expired even if they are in use.
*/
- if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
+ if (fib6_has_expires(rt) && rt->expires) {
if (time_after(now, rt->expires)) {
RT6_TRACE("expiring %p\n", rt);
return -1;
@@ -2312,6 +2323,40 @@ static int fib6_age(struct fib6_info *rt, void *arg)
return 0;
}
+static void fib6_gc_table(struct net *net,
+ struct fib6_table *tb6,
+ void *arg)
+{
+ struct fib6_info *rt;
+ struct hlist_node *n;
+ struct nl_info info = {
+ .nl_net = net,
+ .skip_notify = false,
+ };
+
+ hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link)
+ if (fib6_age(rt, arg) == -1)
+ fib6_del(rt, &info);
+}
+
+static void fib6_gc_all(struct net *net, void *arg)
+{
+ struct fib6_table *table;
+ struct hlist_head *head;
+ unsigned int h;
+
+ rcu_read_lock();
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ head = &net->ipv6.fib_table_hash[h];
+ hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+ spin_lock_bh(&table->tb6_lock);
+ fib6_gc_table(net, table, arg);
+ spin_unlock_bh(&table->tb6_lock);
+ }
+ }
+ rcu_read_unlock();
+}
+
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
struct fib6_gc_args gc_args;
@@ -2327,7 +2372,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
net->ipv6.sysctl.ip6_rt_gc_interval;
gc_args.more = 0;
- fib6_clean_all(net, fib6_age, &gc_args);
+ fib6_gc_all(net, &gc_args);
now = jiffies;
net->ipv6.ip6_rt_last_gc = now;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 64e873f5895f..a69083563689 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3760,10 +3760,10 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
rt->dst_nocount = true;
if (cfg->fc_flags & RTF_EXPIRES)
- fib6_set_expires(rt, jiffies +
- clock_t_to_jiffies(cfg->fc_expires));
+ fib6_set_expires_locked(rt, jiffies +
+ clock_t_to_jiffies(cfg->fc_expires));
else
- fib6_clean_expires(rt);
+ fib6_clean_expires_locked(rt);
if (cfg->fc_protocol == RTPROT_UNSPEC)
cfg->fc_protocol = RTPROT_BOOT;
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread* [PATCH net-next v3 2/2] selftests: fib_tests: Add a test case for IPv6 garbage collection
2023-07-18 18:33 [PATCH net-next v3 0/2] Remove expired routes with a separated list of routes Kui-Feng Lee
2023-07-18 18:33 ` [PATCH net-next v3 1/2] net/ipv6: " Kui-Feng Lee
@ 2023-07-18 18:33 ` Kui-Feng Lee
2023-07-20 9:38 ` [PATCH net-next v3 0/2] Remove expired routes with a separated list of routes Paolo Abeni
2 siblings, 0 replies; 4+ messages in thread
From: Kui-Feng Lee @ 2023-07-18 18:33 UTC (permalink / raw)
To: dsahern, davem, edumazet, kuba, netdev, pabeni, martin.lau,
kernel-team, yhs
Cc: Kui-Feng Lee
Add 10 IPv6 routes with expiration time. Wait for a few seconds
to make sure they are removed correctly.
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
---
tools/testing/selftests/net/fib_tests.sh | 46 +++++++++++++++++++++++-
1 file changed, 45 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 35d89dfa6f11..87c871cae8c3 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -9,7 +9,7 @@ ret=0
ksft_skip=4
# all tests in this script. Can be overridden with -t option
-TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh"
+TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test"
VERBOSE=0
PAUSE_ON_FAIL=no
@@ -747,6 +747,49 @@ fib_notify_test()
cleanup &> /dev/null
}
+fib6_gc_test()
+{
+ setup
+
+ echo
+ echo "Fib6 garbage collection test"
+ set -e
+
+ # Check expiration of routes every 3 seconds (GC)
+ $NS_EXEC sysctl -wq net.ipv6.route.gc_interval=3
+
+ $IP link add dummy_10 type dummy
+ $IP link set dev dummy_10 up
+ $IP -6 address add 2001:10::1/64 dev dummy_10
+
+ for i in 0 1 2 3 4 5 6 7 8 9; do
+ # Expire route after 2 seconds
+ $IP -6 route add 2001:20::1$i \
+ via 2001:10::2 dev dummy_10 expires 2
+ done
+ N_EXP=$($IP -6 route list |grep expires|wc -l)
+ if [ $N_EXP -ne 10 ]; then
+ echo "FAIL: expected 10 routes with expires, got $N_EXP"
+ ret=1
+ else
+ sleep 4
+ N_EXP_s20=$($IP -6 route list |grep expires|wc -l)
+
+ if [ $N_EXP_s20 -ne 0 ]; then
+ echo "FAIL: expected 0 routes with expires, got $N_EXP_s20"
+ ret=1
+ else
+ ret=0
+ fi
+ fi
+
+ set +e
+
+ log_test $ret 0 "ipv6 route garbage collection"
+
+ cleanup &> /dev/null
+}
+
fib_suppress_test()
{
echo
@@ -2217,6 +2260,7 @@ do
ipv4_mangle) ipv4_mangle_test;;
ipv6_mangle) ipv6_mangle_test;;
ipv4_bcast_neigh) ipv4_bcast_neigh_test;;
+ fib6_gc_test|ipv6_gc) fib6_gc_test;;
help) echo "Test names: $TESTS"; exit 0;;
esac
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread* Re: [PATCH net-next v3 0/2] Remove expired routes with a separated list of routes.
2023-07-18 18:33 [PATCH net-next v3 0/2] Remove expired routes with a separated list of routes Kui-Feng Lee
2023-07-18 18:33 ` [PATCH net-next v3 1/2] net/ipv6: " Kui-Feng Lee
2023-07-18 18:33 ` [PATCH net-next v3 2/2] selftests: fib_tests: Add a test case for IPv6 garbage collection Kui-Feng Lee
@ 2023-07-20 9:38 ` Paolo Abeni
2 siblings, 0 replies; 4+ messages in thread
From: Paolo Abeni @ 2023-07-20 9:38 UTC (permalink / raw)
To: Kui-Feng Lee, dsahern, davem, edumazet, kuba, netdev, martin.lau,
kernel-team, yhs
Cc: Kui-Feng Lee
On Tue, 2023-07-18 at 11:33 -0700, Kui-Feng Lee wrote:
> FIB6 GC walks trees of fib6_tables to remove expired routes. Walking a tree
> can be expensive if the number of routes in a table is big, even if most of
> them are permanent. Checking routes in a separated list of routes having
> expiration will avoid this potential issue.
>
> Background
> ==========
>
> The size of a Linux IPv6 routing table can become a big problem if not
> managed appropriately. Now, Linux has a garbage collector to remove
> expired routes periodically. However, this may lead to a situation in which the routing path is blocked for a long period due to an
> excessive number of routes.
>
> For example, years ago, there was a commit c7bb4b89033b ("ipv6: tcp: drop
> silly ICMPv6 packet too big messages") about "ICMPv6 Packet too big
> messages". The root cause is that malicious ICMPv6 packets were sent back
> for every small packet sent to them. These packets add routes with an
> expiration time that prompts the GC to periodically check all routes in the
> tables, including permanent ones.
>
> Why Route Expires
> =================
>
> Users can add IPv6 routes with an expiration time manually. However,
> the Neighbor Discovery protocol may also generate routes that can
> expire. For example, Router Advertisement (RA) messages may create a
> default route with an expiration time. [RFC 4861] For IPv4, it is not
> possible to set an expiration time for a route, and there is no RA, so
> there is no need to worry about such issues.
>
> Create Routes with Expires
> ==========================
>
> You can create routes with an expiration time using the ip command.
>
> For example,
>
> ip -6 route add 2001:b000:591::3 via fe80::5054:ff:fe12:3457 \
> dev enp0s3 expires 30
>
> The route that has been generated will be deleted automatically in 30
> seconds.
>
> GC of FIB6
> ==========
>
> The function called fib6_run_gc() is responsible for performing
> garbage collection (GC) for the Linux IPv6 stack. It checks for the
> expiration of every route by traversing the trees of routing
> tables. The time taken to traverse a routing table increases with its
> size. Holding the routing table lock during traversal is particularly
> undesirable. Therefore, it is preferable to keep the lock for the
> shortest possible duration.
>
> Solution
> ========
>
> The cause of the issue is keeping the routing table locked during the
> traversal of large trees. To solve this problem, we can create a separate
> list of routes that have expiration. This will prevent GC from checking
> permanent routes.
>
> Result
> ======
>
> We conducted a test to measure the execution times of fib6_gc_timer_cb()
> and observed that it enhances the GC of FIB6. During the test, we added
> permanent routes with the following numbers: 1000, 3000, 6000, and
> 9000. Additionally, we added a route with an expiration time.
>
> Here are the average execution times for the kernel without the patch.
> - 120020 ns with 1000 permanent routes
> - 308920 ns with 3000 ...
> - 581470 ns with 6000 ...
> - 855310 ns with 9000 ...
>
> The kernel with the patch consistently takes around 14000 ns to execute,
> regardless of the number of permanent routes that are installed.
>
> Major changes from v2:
>
> - Remove unnecessary and incorrect sysctl restoring in the test case.
>
> Major changes from v1:
>
> - Moved gc_link to avoid creating a hole in fib6_info.
>
> - Moved fib6_set_expires*() and fib6_clean_expires*() to the header
> file and inlined. And removed duplicated lines.
>
> - Added a test case.
>
> ---
> v1: https://lore.kernel.org/all/20230710203609.520720-1-kuifeng@meta.com/
> v2: https://lore.kernel.org/all/20230718180321.294721-1-kuifeng@meta.com/
Too bad I did not notice v3 before starting reviewing v2.
When posting a new version you must wait the 24h quarantine period,
see:
https://elixir.bootlin.com/linux/v6.4/source/Documentation/process/maintainer-netdev.rst#L15
I assume this does not cope with the feedback on previous version ;)
/P
^ permalink raw reply [flat|nested] 4+ messages in thread