* [RFC PATCH net-next v1 2/3] net: dst_cache: add input_dst_cache API
2024-01-25 13:08 [RFC PATCH net-next v1 0/3] net: route: improve route hinting Leone Fernando
2024-01-25 13:12 ` [RFC PATCH net-next v1 1/3] net: route: expire rt if the dst it holds is expired Leone Fernando
@ 2024-01-25 13:14 ` Leone Fernando
2024-01-25 13:15 ` [RFC PATCH net-next v1 3/3] net: route: replace route hints with input_dst_cache Leone Fernando
2 siblings, 0 replies; 4+ messages in thread
From: Leone Fernando @ 2024-01-25 13:14 UTC (permalink / raw)
To: dennis, tj, cl, davem, edumazet, kuba, pabeni, dsahern, linux-mm,
linux-kernel, netdev
The input_dst_cache allows fast lookup of frequently encountered dsts.
In order to provide stable results, I implemented a simple linear
hashtable with each bucket containing a constant number of
entries (DST_CACHE_INPUT_BUCKET_SIZE).
Similarly to how the route hint is used, I defined the hashtable key to
contain the daddr and the tos of the IP header.
Lookup is performed in a straightforward manner: start at the bucket head
corresponding to the hashed key and search the following
DST_CACHE_INPUT_BUCKET_SIZE entries of the array for a matching key.
When inserting a new dst into the cache, if all the bucket entries are
full, the oldest one is deleted to make room for the new dst.
Signed-off-by: Leone Fernando <leone4fernando@gmail.com>
---
include/linux/percpu.h | 4 ++
include/net/dst_cache.h | 56 ++++++++++++++++
net/core/dst_cache.c | 145 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 205 insertions(+)
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 8c677f185901..562d846b81fe 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -141,6 +141,10 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
#define alloc_percpu_gfp(type, gfp) \
(typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \
__alignof__(type), gfp)
+/* like alloc_percpu_gfp(), but sized and aligned for type[size] */
+#define alloc_percpu_array_gfp(type, size, gfp)				\
+	((typeof(type) __percpu *)					\
+		__alloc_percpu_gfp(sizeof(type[size]),			\
+				   __alignof__(type[size]), gfp))
#define alloc_percpu(type) \
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))
diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h
index df6622a5fe98..560e7aec9347 100644
--- a/include/net/dst_cache.h
+++ b/include/net/dst_cache.h
@@ -8,11 +8,38 @@
#include <net/ip6_fib.h>
#endif
+/* log2 of the number of entries in one per-cpu input cache */
+#define DST_CACHE_INPUT_SHIFT		9
+#define DST_CACHE_INPUT_SIZE		(1 << DST_CACHE_INPUT_SHIFT)
+/* number of consecutive entries forming one bucket */
+#define DST_CACHE_INPUT_BUCKET_SIZE	4
+/* clears the low bits of a hash so that it indexes the first entry of a bucket */
+#define DST_CACHE_INPUT_HASH_MASK	(~(DST_CACHE_INPUT_BUCKET_SIZE - 1))
+/* key stored in an entry that holds no dst */
+#define INVALID_DST_CACHE_INPUT_KEY	(~(u64)0)
+
struct dst_cache {
struct dst_cache_pcpu __percpu *cache;
unsigned long reset_ts;
};
+extern unsigned int dst_cache_net_id __read_mostly;
+
+/**
+ * idst_for_each_in_bucket - iterate over a dst cache bucket
+ * @pos:	the type * to use as a loop cursor
+ * @head:	the head of the cpu dst cache.
+ * @hash:	index of the first entry of the bucket
+ *
+ * Visits the DST_CACHE_INPUT_BUCKET_SIZE consecutive entries starting at
+ * @head[@hash]. All arguments are evaluated more than once, so they must
+ * be free of side effects.
+ */
+#define idst_for_each_in_bucket(pos, head, hash)			\
+	for ((pos) = &(head)[hash];					\
+	     (pos) < &(head)[(hash) + DST_CACHE_INPUT_BUCKET_SIZE];	\
+	     (pos)++)
+
+/**
+ * idst_for_each_in_cache - iterate over the dst cache
+ * @pos:	the type * to use as a loop cursor
+ * @head:	the head of the cpu dst cache.
+ *
+ * Visits all DST_CACHE_INPUT_SIZE entries starting at @head. Both
+ * arguments are evaluated more than once, so they must be free of side
+ * effects.
+ */
+#define idst_for_each_in_cache(pos, head)				\
+	for ((pos) = (head);						\
+	     (pos) < (head) + DST_CACHE_INPUT_SIZE;			\
+	     (pos)++)
+
/**
* dst_cache_get - perform cache lookup
* @dst_cache: the cache
@@ -106,4 +133,33 @@ int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp);
*/
void dst_cache_destroy(struct dst_cache *dst_cache);
+/**
+ * dst_cache_input_get_noref - look up a dst in the input cache without
+ *                             taking a reference
+ * @dst_cache: the input cache
+ * @skb: the packet whose IPv4 destination address and tos form the lookup key
+ *
+ * Local BH must be disabled.
+ *
+ * Return: the cached dst for the packet's key, or NULL when no entry matches
+ * or the matching entry's dst is obsolete and fails its check.
+ */
+struct dst_entry *dst_cache_input_get_noref(struct dst_cache *dst_cache,
+ struct sk_buff *skb);
+
+/**
+ * dst_cache_input_add - add the dst of the given skb to the input cache
+ * @dst_cache: the input cache
+ * @skb: the packet whose dst is cached; its IPv4 destination address and
+ *       tos form the key
+ *
+ * If every entry of the target bucket is in use, the least recently
+ * refreshed one is replaced with the new dst.
+ *
+ * Local BH must be disabled.
+ */
+void dst_cache_input_add(struct dst_cache *dst_cache,
+ const struct sk_buff *skb);
+
+/**
+ * dst_cache_input_init - register the per-network-namespace input cache
+ *
+ * Only registers the pernet operations; the per-cpu storage itself is
+ * allocated by the pernet init hook (dst_cache_input_net_init()).
+ *
+ * Return: the result of register_pernet_subsys().
+ */
+int __init dst_cache_input_init(void);
+
#endif
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 0ccfd5fa5cb9..a635c0e52400 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -13,6 +13,7 @@
#include <net/ip6_fib.h>
#endif
#include <uapi/linux/in.h>
+#include <net/netns/generic.h>
struct dst_cache_pcpu {
unsigned long refresh_ts;
@@ -21,9 +22,12 @@ struct dst_cache_pcpu {
union {
struct in_addr in_saddr;
struct in6_addr in6_saddr;
+ u64 key;
};
};
+unsigned int dst_cache_net_id __read_mostly;
+
static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
struct dst_entry *dst, u32 cookie)
{
@@ -181,3 +185,144 @@ void dst_cache_reset_now(struct dst_cache *dst_cache)
}
}
EXPORT_SYMBOL_GPL(dst_cache_reset_now);
+
+/* Store @dst under @key in the slot @idst and stamp the slot as just used. */
+static void dst_cache_input_set(struct dst_cache_pcpu *idst,
+				struct dst_entry *dst, u64 key)
+{
+	/* input cache entries always use cookie 0 */
+	dst_cache_per_cpu_dst_set(idst, dst, 0);
+
+	idst->refresh_ts = jiffies;
+	idst->key = key;
+}
+
+/*
+ * Validate the dst held by @idst and return it without taking a reference.
+ *
+ * A slot with no dst, or whose dst is obsolete and fails its ops->check()
+ * callback, is invalidated (its key is reset to INVALID_DST_CACHE_INPUT_KEY)
+ * and NULL is returned. On success the slot's refresh_ts is bumped, which
+ * is what dst_cache_input_add() compares when it picks an entry to replace.
+ */
+static struct dst_entry *__dst_cache_input_get_noref(struct dst_cache_pcpu *idst)
+{
+	struct dst_entry *dst = idst->dst;
+
+	/* never dereference an empty slot, even if its key happened to match */
+	if (unlikely(!dst ||
+		     (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
+		dst_cache_input_set(idst, NULL, INVALID_DST_CACHE_INPUT_KEY);
+		return NULL;
+	}
+
+	idst->refresh_ts = jiffies;
+	return dst;
+}
+
+/*
+ * Build the cache key of an IPv4 packet: the destination address occupies
+ * bits 8..39 and the tos byte the low 8 bits. The address is used as-is
+ * (no byte swapping) since the key is only hashed and compared for
+ * equality; __force tells sparse the endian-typed value is deliberately
+ * treated as a plain integer.
+ */
+static inline u64 create_dst_cache_key_ip4(const struct sk_buff *skb)
+{
+	const struct iphdr *iphdr = ip_hdr(skb);
+
+	return ((u64)(__force u32)iphdr->daddr << 8) | iphdr->tos;
+}
+
+/* Map @key to the index of the first entry of its bucket. */
+static inline u32 hash_dst_cache_key(u64 key)
+{
+	u32 slot = hash_64(key, DST_CACHE_INPUT_SHIFT);
+
+	return slot & DST_CACHE_INPUT_HASH_MASK;
+}
+
+/*
+ * Scan the bucket selected by the packet's key on this CPU; the first entry
+ * carrying the same key decides the result.
+ */
+struct dst_entry *dst_cache_input_get_noref(struct dst_cache *dst_cache,
+					    struct sk_buff *skb)
+{
+	struct dst_cache_pcpu *cpu_cache = this_cpu_ptr(dst_cache->cache);
+	u64 key = create_dst_cache_key_ip4(skb);
+	u32 hash = hash_dst_cache_key(key);
+	struct dst_cache_pcpu *idst;
+
+	idst_for_each_in_bucket(idst, cpu_cache, hash) {
+		if (idst->key == key)
+			return __dst_cache_input_get_noref(idst);
+	}
+
+	return NULL;
+}
+
+/*
+ * Mark every entry of every possible CPU's input cache as empty and drop
+ * the dst reference it holds, if any.
+ */
+static void dst_cache_input_reset_now(struct dst_cache *dst_cache)
+{
+	struct dst_cache_pcpu *caches;
+	struct dst_cache_pcpu *idst;
+	int i;
+
+	for_each_possible_cpu(i) {
+		caches = per_cpu_ptr(dst_cache->cache, i);
+		idst_for_each_in_cache(idst, caches) {
+			idst->key = INVALID_DST_CACHE_INPUT_KEY;
+			if (idst->dst) {
+				dst_release(idst->dst);
+				/* clear the slot so the released dst can never
+				 * be looked at or released a second time
+				 */
+				idst->dst = NULL;
+			}
+		}
+	}
+}
+
+/* Per-netns init: allocate the per-cpu entry array and mark all entries empty. */
+static int __net_init dst_cache_input_net_init(struct net *net)
+{
+	struct dst_cache *input_cache = net_generic(net, dst_cache_net_id);
+	struct dst_cache_pcpu __percpu *entries;
+
+	entries = alloc_percpu_array_gfp(struct dst_cache_pcpu,
+					 DST_CACHE_INPUT_SIZE,
+					 GFP_KERNEL | __GFP_ZERO);
+	input_cache->cache = entries;
+	if (!entries)
+		return -ENOMEM;
+
+	dst_cache_input_reset_now(input_cache);
+	return 0;
+}
+
+/* Per-netns teardown: drop every cached dst, then free the per-cpu array. */
+static void __net_exit dst_cache_input_net_exit(struct net *net)
+{
+	struct dst_cache *input_cache = net_generic(net, dst_cache_net_id);
+
+	dst_cache_input_reset_now(input_cache);
+
+	free_percpu(input_cache->cache);
+	input_cache->cache = NULL;
+}
+
+/* An entry is unused when it carries the reserved invalid key. */
+static inline bool idst_empty(struct dst_cache_pcpu *idst)
+{
+	bool unused = (idst->key == INVALID_DST_CACHE_INPUT_KEY);
+
+	return unused;
+}
+
+/*
+ * Cache the dst attached to @skb in this CPU's bucket for the packet's key.
+ * The first empty entry of the bucket is used; if there is none, the entry
+ * with the oldest refresh_ts is overwritten. An skb without a dst is
+ * ignored.
+ */
+void dst_cache_input_add(struct dst_cache *dst_cache, const struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct dst_cache_pcpu *victim = NULL;
+	struct dst_cache_pcpu *pcpu_cache;
+	struct dst_cache_pcpu *idst;
+	u32 hash;
+	u64 key;
+
+	/* a valid key must never be paired with a NULL dst: the lookup path
+	 * dereferences the dst of any entry whose key matches
+	 */
+	if (unlikely(!dst))
+		return;
+
+	pcpu_cache = this_cpu_ptr(dst_cache->cache);
+	key = create_dst_cache_key_ip4(skb);
+	hash = hash_dst_cache_key(key);
+	idst_for_each_in_bucket(idst, pcpu_cache, hash) {
+		if (idst_empty(idst)) {
+			victim = idst;
+			break;
+		}
+		/* otherwise remember the least recently refreshed entry */
+		if (!victim || time_before(idst->refresh_ts, victim->refresh_ts))
+			victim = idst;
+	}
+
+	dst_cache_input_set(victim, dst, key);
+}
+
+/*
+ * Per-netns registration: net_generic(net, dst_cache_net_id) yields the
+ * struct dst_cache of a namespace, which the init/exit hooks set up and
+ * tear down.
+ */
+static struct pernet_operations dst_cache_input_ops __net_initdata = {
+	.id	= &dst_cache_net_id,
+	.size	= sizeof(struct dst_cache),
+	.init	= dst_cache_input_net_init,
+	.exit	= dst_cache_input_net_exit,
+};
+
+/* Register the input cache's pernet operations; run as a subsys initcall. */
+int __init dst_cache_input_init(void)
+{
+	int err = register_pernet_subsys(&dst_cache_input_ops);
+
+	return err;
+}
+subsys_initcall(dst_cache_input_init);
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread* [RFC PATCH net-next v1 3/3] net: route: replace route hints with input_dst_cache
2024-01-25 13:08 [RFC PATCH net-next v1 0/3] net: route: improve route hinting Leone Fernando
2024-01-25 13:12 ` [RFC PATCH net-next v1 1/3] net: route: expire rt if the dst it holds is expired Leone Fernando
2024-01-25 13:14 ` [RFC PATCH net-next v1 2/3] net: dst_cache: add input_dst_cache API Leone Fernando
@ 2024-01-25 13:15 ` Leone Fernando
2 siblings, 0 replies; 4+ messages in thread
From: Leone Fernando @ 2024-01-25 13:15 UTC (permalink / raw)
To: dennis, tj, cl, davem, edumazet, kuba, pabeni, dsahern, linux-mm,
linux-kernel, netdev
Replace route hints with cached dsts - ip_rcv_finish_core will first try
to use the cache and only then fall back to the demux or perform a full
lookup.
Only add newly found dsts to the cache after all the checks have passed
successfully to avoid adding a dropped packet's dst to the cache.
Multicast dsts are not added to the dst_cache, as that would require additional
checks, and multicast packets are rarer and take a slower path anyway.
A check was added to ip_route_use_dst_cache that prevents forwarding
packets received by devices for which forwarding is disabled.
Signed-off-by: Leone Fernando <leone4fernando@gmail.com>
---
include/net/route.h | 6 ++---
net/ipv4/ip_input.c | 58 ++++++++++++++++++++++++---------------------
net/ipv4/route.c | 36 +++++++++++++++++++++-------
3 files changed, 61 insertions(+), 39 deletions(-)
diff --git a/include/net/route.h b/include/net/route.h
index 980ab474eabd..a5a2f55947d6 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -189,9 +189,9 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
struct in_device *in_dev, u32 *itag);
int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
u8 tos, struct net_device *devin);
-int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src,
- u8 tos, struct net_device *devin,
- const struct sk_buff *hint);
+int ip_route_use_dst_cache(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ struct dst_entry *dst);
static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
u8 tos, struct net_device *devin)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 5e9c8156656a..35c8b122d62f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -305,30 +305,44 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
return true;
}
-static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
- const struct sk_buff *hint)
+static bool ip_can_add_dst_cache(struct sk_buff *skb, __u16 rt_type)
{
- return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
- ip_hdr(hint)->tos == iph->tos;
+ return skb_valid_dst(skb) &&
+ rt_type != RTN_BROADCAST &&
+ rt_type != RTN_MULTICAST &&
+ !(IPCB(skb)->flags & IPSKB_MULTIPATH);
+}
+
+/* The cache may be consulted only for an skb that has no dst attached yet,
+ * and only while the netns has no custom fib4 rules.
+ */
+static bool ip_can_use_dst_cache(const struct net *net, struct sk_buff *skb)
+{
+	if (skb_dst(skb))
+		return false;
+
+	return !fib4_has_custom_rules(net);
+}
int tcp_v4_early_demux(struct sk_buff *skb);
int udp_v4_early_demux(struct sk_buff *skb);
static int ip_rcv_finish_core(struct net *net, struct sock *sk,
- struct sk_buff *skb, struct net_device *dev,
- const struct sk_buff *hint)
+ struct sk_buff *skb, struct net_device *dev)
{
+ struct dst_cache *dst_cache = net_generic(net, dst_cache_net_id);
const struct iphdr *iph = ip_hdr(skb);
+ struct dst_entry *dst;
int err, drop_reason;
struct rtable *rt;
+ bool do_cache;
drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
- if (ip_can_use_hint(skb, iph, hint)) {
- err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
- dev, hint);
- if (unlikely(err))
- goto drop_error;
+ do_cache = ip_can_use_dst_cache(net, skb);
+ if (do_cache) {
+ dst = dst_cache_input_get_noref(dst_cache, skb);
+ if (dst) {
+ err = ip_route_use_dst_cache(skb, iph->daddr,
+ iph->saddr, iph->tos,
+ dev, dst);
+ if (unlikely(err))
+ goto drop_error;
+ do_cache = false;
+ }
}
if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
@@ -418,6 +432,9 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
}
}
+ if (do_cache && ip_can_add_dst_cache(skb, rt->rt_type))
+ dst_cache_input_add(dst_cache, skb);
+
return NET_RX_SUCCESS;
drop:
@@ -444,7 +461,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
+ ret = ip_rcv_finish_core(net, sk, skb, dev);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -581,21 +598,11 @@ static void ip_sublist_rcv_finish(struct list_head *head)
}
}
-static struct sk_buff *ip_extract_route_hint(const struct net *net,
- struct sk_buff *skb, int rt_type)
-{
- if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
- IPCB(skb)->flags & IPSKB_MULTIPATH)
- return NULL;
-
- return skb;
-}
-
static void ip_list_rcv_finish(struct net *net, struct sock *sk,
struct list_head *head)
{
- struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
+ struct sk_buff *skb, *next;
struct list_head sublist;
INIT_LIST_HEAD(&sublist);
@@ -610,14 +617,11 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip_rcv(skb);
if (!skb)
continue;
- if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
+ if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP)
continue;
dst = skb_dst(skb);
if (curr_dst != dst) {
- hint = ip_extract_route_hint(net, skb,
- ((struct rtable *)dst)->rt_type);
-
/* dispatch old sublist */
if (!list_empty(&sublist))
ip_sublist_rcv_finish(&sublist);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7c5e68117ee2..3f1977f9b25c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2157,14 +2157,14 @@ static int ip_mkroute_input(struct sk_buff *skb,
/* Implements all the saddr-related checks as ip_route_input_slow(),
* assuming daddr is valid and the destination is not a local broadcast one.
- * Uses the provided hint instead of performing a route lookup.
+ * Uses the provided dst from dst_cache instead of performing a route lookup.
*/
-int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev,
- const struct sk_buff *hint)
+int ip_route_use_dst_cache(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ struct dst_entry *dst)
{
struct in_device *in_dev = __in_dev_get_rcu(dev);
- struct rtable *rt = skb_rtable(hint);
+ struct rtable *rt = (struct rtable *)dst;
struct net *net = dev_net(dev);
int err = -EINVAL;
u32 tag = 0;
@@ -2178,21 +2178,39 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
goto martian_source;
- if (rt->rt_type != RTN_LOCAL)
- goto skip_validate_source;
+ if (ipv4_is_loopback(daddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_destination;
+ if (rt->rt_type != RTN_LOCAL) {
+ if (!IN_DEV_FORWARD(in_dev)) {
+ err = -EHOSTUNREACH;
+ goto out_err;
+ }
+ goto skip_validate_source;
+ }
tos &= IPTOS_RT_MASK;
err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
if (err < 0)
goto martian_source;
skip_validate_source:
- skb_dst_copy(skb, hint);
+ skb_dst_set_noref(skb, dst);
return 0;
martian_source:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+out_err:
return err;
+
+martian_destination:
+ RT_CACHE_STAT_INC(in_martian_dst);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
+ &daddr, &saddr, dev->name);
+#endif
+ err = -EINVAL;
+ goto out_err;
}
/* get device for dst_alloc with local routes */
@@ -2213,7 +2231,7 @@ static struct net_device *ip_rt_get_dev(struct net *net,
* addresses, because every properly looped back packet
* must have correct destination already attached by output routine.
* Changes in the enforced policies must be applied also to
- * ip_route_use_hint().
+ * ip_route_use_dst_cache().
*
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread