* [PATCH net-next 01/12] net: add dst_get_noref and refdst_ptr helpers
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 02/12] ipvs: avoid routing by TOS for real server Julian Anastasov
` (11 subsequent siblings)
12 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
Needed to hide refdst details.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
include/net/dst.h | 14 +++++++++++++-
1 files changed, 13 insertions(+), 1 deletions(-)
diff --git a/include/net/dst.h b/include/net/dst.h
index 853cda1..967f42a 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -264,10 +264,22 @@ static inline struct dst_entry *dst_clone(struct dst_entry *dst)
extern void dst_release(struct dst_entry *dst);
+/* Get noref version of dst */
+static inline struct dst_entry *dst_get_noref(struct dst_entry *dst)
+{
+ return (struct dst_entry *) ((unsigned long) dst | SKB_DST_NOREF);
+}
+
+/* Return dst pointer from refdst */
+static inline struct dst_entry *refdst_ptr(unsigned long refdst)
+{
+ return (struct dst_entry *)(refdst & SKB_DST_PTRMASK);
+}
+
static inline void refdst_drop(unsigned long refdst)
{
if (!(refdst & SKB_DST_NOREF))
- dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK));
+ dst_release(refdst_ptr(refdst));
}
/**
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH net-next 02/12] ipvs: avoid routing by TOS for real server
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 01/12] net: add dst_get_noref and refdst_ptr helpers Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 03/12] ipvs: prefer NETDEV_DOWN event to free cached dsts Julian Anastasov
` (10 subsequent siblings)
12 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
Avoid replacing the cached route for the real server
on every packet with a different TOS. I doubt that routing
by TOS for the real server is used at all, so we should be
better off with such an optimization.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
include/net/ip_vs.h | 1 -
net/netfilter/ipvs/ip_vs_xmit.c | 58 +++++++++++++++++----------------------
2 files changed, 25 insertions(+), 34 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 68c69d5..459c328 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -753,7 +753,6 @@ struct ip_vs_dest {
/* for destination cache */
spinlock_t dst_lock; /* lock of dst_cache */
struct dst_entry *dst_cache; /* destination cache entry */
- u32 dst_rtos; /* RT_TOS(tos) for dst */
u32 dst_cookie;
union nf_inet_addr dst_saddr;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index ee6b7a9..4b0bd15 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -57,27 +57,24 @@ enum {
* Destination cache to speed up outgoing route lookup
*/
static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
- u32 dst_cookie)
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct dst_entry *dst, u32 dst_cookie)
{
struct dst_entry *old_dst;
old_dst = dest->dst_cache;
dest->dst_cache = dst;
- dest->dst_rtos = rtos;
dest->dst_cookie = dst_cookie;
dst_release(old_dst);
}
static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
+__ip_vs_dst_check(struct ip_vs_dest *dest)
{
struct dst_entry *dst = dest->dst_cache;
if (!dst)
return NULL;
- if ((dst->obsolete || rtos != dest->dst_rtos) &&
- dst->ops->check(dst, dest->dst_cookie) == NULL) {
+ if (dst->obsolete && dst->ops->check(dst, dest->dst_cookie) == NULL) {
dest->dst_cache = NULL;
dst_release(dst);
return NULL;
@@ -104,7 +101,7 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
/* Get route to daddr, update *saddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
- u32 rtos, int rt_mode, __be32 *saddr)
+ int rt_mode, __be32 *saddr)
{
struct flowi4 fl4;
struct rtable *rt;
@@ -113,7 +110,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
- fl4.flowi4_tos = rtos;
fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
FLOWI_FLAG_KNOWN_NH : 0;
@@ -124,7 +120,7 @@ retry:
if (PTR_ERR(rt) == -EINVAL && *saddr &&
rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
*saddr = 0;
- flowi4_update_output(&fl4, 0, rtos, daddr, 0);
+ flowi4_update_output(&fl4, 0, 0, daddr, 0);
goto retry;
}
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
@@ -132,7 +128,7 @@ retry:
} else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
ip_rt_put(rt);
*saddr = fl4.saddr;
- flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr);
+ flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
loop++;
goto retry;
}
@@ -143,7 +139,7 @@ retry:
/* Get route to destination or remote server */
static struct rtable *
__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
- __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr)
+ __be32 daddr, int rt_mode, __be32 *ret_saddr)
{
struct net *net = dev_net(skb_dst(skb)->dev);
struct rtable *rt; /* Route to the other host */
@@ -152,19 +148,18 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
if (dest) {
spin_lock(&dest->dst_lock);
- if (!(rt = (struct rtable *)
- __ip_vs_dst_check(dest, rtos))) {
- rt = do_output_route4(net, dest->addr.ip, rtos,
- rt_mode, &dest->dst_saddr.ip);
+ rt = (struct rtable *) __ip_vs_dst_check(dest);
+ if (!rt) {
+ rt = do_output_route4(net, dest->addr.ip, rt_mode,
+ &dest->dst_saddr.ip);
if (!rt) {
spin_unlock(&dest->dst_lock);
return NULL;
}
- __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
- IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, "
- "rtos=%X\n",
+ __ip_vs_dst_set(dest, dst_clone(&rt->dst), 0);
+ IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
&dest->addr.ip, &dest->dst_saddr.ip,
- atomic_read(&rt->dst.__refcnt), rtos);
+ atomic_read(&rt->dst.__refcnt));
}
daddr = dest->addr.ip;
if (ret_saddr)
@@ -177,7 +172,7 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
* for performance reasons because we do not remember saddr
*/
rt_mode &= ~IP_VS_RT_MODE_CONNECT;
- rt = do_output_route4(net, daddr, rtos, rt_mode, &saddr);
+ rt = do_output_route4(net, daddr, rt_mode, &saddr);
if (!rt)
return NULL;
if (ret_saddr)
@@ -307,7 +302,7 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
if (dest) {
spin_lock(&dest->dst_lock);
- rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
+ rt = (struct rt6_info *)__ip_vs_dst_check(dest);
if (!rt) {
u32 cookie;
@@ -320,7 +315,7 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
}
rt = (struct rt6_info *) dst;
cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
- __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+ __ip_vs_dst_set(dest, dst_clone(&rt->dst), cookie);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
&dest->addr.in6, &dest->dst_saddr.in6,
atomic_read(&rt->dst.__refcnt));
@@ -449,8 +444,9 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos),
- IP_VS_RT_MODE_NON_LOCAL, NULL)))
+ rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
+ NULL);
+ if (!rt)
goto tx_error_icmp;
/* MTU checking */
@@ -581,10 +577,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos),
IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_RDR, NULL)))
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR, NULL)))
goto tx_error_icmp;
local = rt->rt_flags & RTCF_LOCAL;
/*
@@ -832,10 +827,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_CONNECT,
- &saddr)))
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_CONNECT, &saddr)))
goto tx_error_icmp;
if (rt->rt_flags & RTCF_LOCAL) {
ip_rt_put(rt);
@@ -1067,7 +1061,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos),
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_KNOWN_NH, NULL)))
@@ -1223,7 +1216,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(ip_hdr(skb)->tos),
rt_mode, NULL)))
goto tx_error_icmp;
local = rt->rt_flags & RTCF_LOCAL;
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH net-next 03/12] ipvs: prefer NETDEV_DOWN event to free cached dsts
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 01/12] net: add dst_get_noref and refdst_ptr helpers Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 02/12] ipvs: avoid routing by TOS for real server Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 9:56 ` Hans Schillstrom
2013-03-06 8:42 ` [PATCH net-next 04/12] ipvs: convert the IP_VS_XMIT macros to functions Julian Anastasov
` (9 subsequent siblings)
12 siblings, 1 reply; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
The real server becomes unreachable on down event,
no need to wait device unregistration. Should help in
releasing dsts early before dst->dev is replaced with lo.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
net/netfilter/ipvs/ip_vs_ctl.c | 8 +++-----
1 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c68198b..76fc8f2 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1512,10 +1512,8 @@ __ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
spin_unlock_bh(&dest->dst_lock);
}
-/*
- * Netdev event receiver
- * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
- * a device that is "unregister" it must be released.
+/* Netdev event receiver
+ * Currently only NETDEV_DOWN is handled to release refs to cached dsts
*/
static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
void *ptr)
@@ -1527,7 +1525,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
struct ip_vs_dest *dest;
unsigned int idx;
- if (event != NETDEV_UNREGISTER || !ipvs)
+ if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
EnterFunction(2);
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 03/12] ipvs: prefer NETDEV_DOWN event to free cached dsts
2013-03-06 8:42 ` [PATCH net-next 03/12] ipvs: prefer NETDEV_DOWN event to free cached dsts Julian Anastasov
@ 2013-03-06 9:56 ` Hans Schillstrom
2013-03-06 21:21 ` Julian Anastasov
0 siblings, 1 reply; 26+ messages in thread
From: Hans Schillstrom @ 2013-03-06 9:56 UTC (permalink / raw)
To: Julian Anastasov; +Cc: Simon Horman, lvs-devel, netdev
[-- Attachment #1: Type: text/plain, Size: 1719 bytes --]
Hi Julian
On Wed, 2013-03-06 at 10:42 +0200, Julian Anastasov wrote:
> The real server becomes unreachable on down event,
> no need to wait device unregistration. Should help in
> releasing dsts early before dst->dev is replaced with lo.
Have you tested this in a network namespace?
i.e. kill the namespace with heavy traffic through it
From what I remember this was a tricky area...
I have some test cases for this, should I run them ?
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
> ---
> net/netfilter/ipvs/ip_vs_ctl.c | 8 +++-----
> 1 files changed, 3 insertions(+), 5 deletions(-)
>
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index c68198b..76fc8f2 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -1512,10 +1512,8 @@ __ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
> spin_unlock_bh(&dest->dst_lock);
>
> }
> -/*
> - * Netdev event receiver
> - * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
> - * a device that is "unregister" it must be released.
> +/* Netdev event receiver
> + * Currently only NETDEV_DOWN is handled to release refs to cached dsts
> */
> static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
> void *ptr)
> @@ -1527,7 +1525,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
> struct ip_vs_dest *dest;
> unsigned int idx;
>
> - if (event != NETDEV_UNREGISTER || !ipvs)
> + if (event != NETDEV_DOWN || !ipvs)
> return NOTIFY_DONE;
> IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
> EnterFunction(2);
Regards
Hans
[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6177 bytes --]
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 03/12] ipvs: prefer NETDEV_DOWN event to free cached dsts
2013-03-06 9:56 ` Hans Schillstrom
@ 2013-03-06 21:21 ` Julian Anastasov
2013-03-07 7:43 ` Hans Schillstrom
0 siblings, 1 reply; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 21:21 UTC (permalink / raw)
To: Hans Schillstrom; +Cc: Simon Horman, lvs-devel, netdev
Hello,
On Wed, 6 Mar 2013, Hans Schillstrom wrote:
> Hi Julian
>
> On Wed, 2013-03-06 at 10:42 +0200, Julian Anastasov wrote:
> > The real server becomes unreachable on down event,
> > no need to wait device unregistration. Should help in
> > releasing dsts early before dst->dev is replaced with lo.
>
> Have you test this in a network namespace ?
> i.e. kill the namespace with heave traffic through it
This should not be a problem. Even without ns exit
situation, we can have a case where device goes down,
the output routes will start to fail and we will
not cache route anymore. It can happen while the
input device floods us with requests.
> From what I remember this was a tricky area...
>
> I have some test cases for this, should I run them ?
If it is easy...
> > Signed-off-by: Julian Anastasov <ja@ssi.bg>
> > ---
> > net/netfilter/ipvs/ip_vs_ctl.c | 8 +++-----
> > 1 files changed, 3 insertions(+), 5 deletions(-)
> >
> > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> > index c68198b..76fc8f2 100644
> > --- a/net/netfilter/ipvs/ip_vs_ctl.c
> > +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> > @@ -1512,10 +1512,8 @@ __ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
> > spin_unlock_bh(&dest->dst_lock);
> >
> > }
> > -/*
> > - * Netdev event receiver
> > - * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
> > - * a device that is "unregister" it must be released.
> > +/* Netdev event receiver
> > + * Currently only NETDEV_DOWN is handled to release refs to cached dsts
> > */
> > static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
> > void *ptr)
> > @@ -1527,7 +1525,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
> > struct ip_vs_dest *dest;
> > unsigned int idx;
> >
> > - if (event != NETDEV_UNREGISTER || !ipvs)
> > + if (event != NETDEV_DOWN || !ipvs)
> > return NOTIFY_DONE;
> > IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
> > EnterFunction(2);
>
>
> Regards
> Hans
Regards
--
Julian Anastasov <ja@ssi.bg>
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 03/12] ipvs: prefer NETDEV_DOWN event to free cached dsts
2013-03-06 21:21 ` Julian Anastasov
@ 2013-03-07 7:43 ` Hans Schillstrom
0 siblings, 0 replies; 26+ messages in thread
From: Hans Schillstrom @ 2013-03-07 7:43 UTC (permalink / raw)
To: Julian Anastasov; +Cc: Simon Horman, lvs-devel, netdev
[-- Attachment #1: Type: text/plain, Size: 2612 bytes --]
Hello
On Wed, 2013-03-06 at 23:21 +0200, Julian Anastasov wrote:
> Hello,
>
> On Wed, 6 Mar 2013, Hans Schillstrom wrote:
>
> > Hi Julian
> >
> > On Wed, 2013-03-06 at 10:42 +0200, Julian Anastasov wrote:
> > > The real server becomes unreachable on down event,
> > > no need to wait device unregistration. Should help in
> > > releasing dsts early before dst->dev is replaced with lo.
> >
> > Have you test this in a network namespace ?
> > i.e. kill the namespace with heave traffic through it
>
> This should not be a problem. Even without ns exit
> situation, we can have a case where device goes down,
> the output routes will start to fail and we will
> not cache route anymore. It can happen while the
> input device floods us with requests.
I've gone through the unregister code a couple of times now
and in theory it should work.
The test suite has also been running without problems
> > From what I remember this was a tricky area...
> >
> > I have some test cases for this, should I run them ?
>
> If it is easy...
>
> > > Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
> > > ---
> > > net/netfilter/ipvs/ip_vs_ctl.c | 8 +++-----
> > > 1 files changed, 3 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> > > index c68198b..76fc8f2 100644
> > > --- a/net/netfilter/ipvs/ip_vs_ctl.c
> > > +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> > > @@ -1512,10 +1512,8 @@ __ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
> > > spin_unlock_bh(&dest->dst_lock);
> > >
> > > }
> > > -/*
> > > - * Netdev event receiver
> > > - * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
> > > - * a device that is "unregister" it must be released.
> > > +/* Netdev event receiver
> > > + * Currently only NETDEV_DOWN is handled to release refs to cached dsts
> > > */
> > > static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
> > > void *ptr)
> > > @@ -1527,7 +1525,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
> > > struct ip_vs_dest *dest;
> > > unsigned int idx;
> > >
> > > - if (event != NETDEV_UNREGISTER || !ipvs)
> > > + if (event != NETDEV_DOWN || !ipvs)
> > > return NOTIFY_DONE;
> > > IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
> > > EnterFunction(2);
> >
> >
> > Regards
> > Hans
>
> Regards
>
> --
> Julian Anastasov <ja@ssi.bg>
Regards
Hans
[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6177 bytes --]
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH net-next 04/12] ipvs: convert the IP_VS_XMIT macros to functions
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
` (2 preceding siblings ...)
2013-03-06 8:42 ` [PATCH net-next 03/12] ipvs: prefer NETDEV_DOWN event to free cached dsts Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 05/12] ipvs: rename functions related to dst_cache reset Julian Anastasov
` (8 subsequent siblings)
12 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
It was a bad idea to hide return statements in macros.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
net/netfilter/ipvs/ip_vs_xmit.c | 134 +++++++++++++++++++++------------------
1 files changed, 72 insertions(+), 62 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 4b0bd15..7cd7c61 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -376,45 +376,59 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
dest->dst_saddr.ip = 0;
}
-#define IP_VS_XMIT_TUNNEL(skb, cp) \
-({ \
- int __ret = NF_ACCEPT; \
- \
- (skb)->ipvs_property = 1; \
- if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
- __ret = ip_vs_confirm_conntrack(skb); \
- if (__ret == NF_ACCEPT) { \
- nf_reset(skb); \
- skb_forward_csum(skb); \
- } \
- __ret; \
-})
-
-#define IP_VS_XMIT_NAT(pf, skb, cp, local) \
-do { \
- (skb)->ipvs_property = 1; \
- if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
- ip_vs_notrack(skb); \
- else \
- ip_vs_update_conntrack(skb, cp, 1); \
- if (local) \
- return NF_ACCEPT; \
- skb_forward_csum(skb); \
- NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- skb_dst(skb)->dev, dst_output); \
-} while (0)
-
-#define IP_VS_XMIT(pf, skb, cp, local) \
-do { \
- (skb)->ipvs_property = 1; \
- if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
- ip_vs_notrack(skb); \
- if (local) \
- return NF_ACCEPT; \
- skb_forward_csum(skb); \
- NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- skb_dst(skb)->dev, dst_output); \
-} while (0)
+/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
+static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
+ struct ip_vs_conn *cp)
+{
+ int ret = NF_ACCEPT;
+
+ skb->ipvs_property = 1;
+ if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
+ ret = ip_vs_confirm_conntrack(skb);
+ if (ret == NF_ACCEPT) {
+ nf_reset(skb);
+ skb_forward_csum(skb);
+ }
+ return ret;
+}
+
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 1);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
+}
+
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
+}
/*
@@ -425,7 +439,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
/* we do not touch skb and do not need pskb ptr */
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
@@ -476,7 +490,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -537,7 +551,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -562,7 +576,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rtable *rt; /* Route to the other host */
int mtu;
struct iphdr *iph = ip_hdr(skb);
- int local;
+ int local, rc;
EnterFunction(10);
@@ -655,10 +669,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
LeaveFunction(10);
- return NF_STOLEN;
+ return rc;
tx_error_icmp:
dst_link_failure(skb);
@@ -678,7 +692,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct rt6_info *rt; /* Route to the other host */
int mtu;
- int local;
+ int local, rc;
EnterFunction(10);
@@ -771,10 +785,10 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
LeaveFunction(10);
- return NF_STOLEN;
+ return rc;
tx_error_icmp:
dst_link_failure(skb);
@@ -833,7 +847,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error_icmp;
if (rt->rt_flags & RTCF_LOCAL) {
ip_rt_put(rt);
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
tdev = rt->dst.dev;
@@ -905,7 +919,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
ip_local_out(skb);
else if (ret == NF_DROP)
@@ -948,7 +962,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error_icmp;
if (__ip_vs_is_local_route6(rt)) {
dst_release(&rt->dst);
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
tdev = rt->dst.dev;
@@ -1023,7 +1037,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
ip6_local_out(skb);
else if (ret == NF_DROP)
@@ -1067,7 +1081,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error_icmp;
if (rt->rt_flags & RTCF_LOCAL) {
ip_rt_put(rt);
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
/* MTU checking */
@@ -1097,7 +1111,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -1126,7 +1140,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error_icmp;
if (__ip_vs_is_local_route6(rt)) {
dst_release(&rt->dst);
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
/* MTU checking */
@@ -1162,7 +1176,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -1283,9 +1297,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
-
- rc = NF_STOLEN;
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
goto out;
tx_error_icmp:
@@ -1404,9 +1416,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
-
- rc = NF_STOLEN;
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
goto out;
tx_error_icmp:
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH net-next 05/12] ipvs: rename functions related to dst_cache reset
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
` (3 preceding siblings ...)
2013-03-06 8:42 ` [PATCH net-next 04/12] ipvs: convert the IP_VS_XMIT macros to functions Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 06/12] ipvs: optimize dst usage for real server Julian Anastasov
` (7 subsequent siblings)
12 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
Move and give better names to two functions:
- ip_vs_dst_reset to __ip_vs_dst_cache_reset
- __ip_vs_dev_reset to ip_vs_forget_dev
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
include/net/ip_vs.h | 1 -
net/netfilter/ipvs/ip_vs_ctl.c | 34 ++++++++++++++++++++++------------
net/netfilter/ipvs/ip_vs_xmit.c | 14 --------------
3 files changed, 22 insertions(+), 27 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 459c328..c05c59c 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1415,7 +1415,6 @@ extern int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
extern int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, int offset,
unsigned int hooknum, struct ip_vs_iphdr *iph);
-extern void ip_vs_dst_reset(struct ip_vs_dest *dest);
#ifdef CONFIG_IP_VS_IPV6
extern int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 76fc8f2..7b774af 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -639,6 +639,17 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
return dest;
}
+/* Release dst_cache for dest in user context */
+static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
+{
+ struct dst_entry *old_dst;
+
+ old_dst = dest->dst_cache;
+ dest->dst_cache = NULL;
+ dst_release(old_dst);
+ dest->dst_saddr.ip = 0;
+}
+
/*
* Lookup dest by {svc,addr,port} in the destination trash.
* The destination trash is used to hold the destinations that are removed
@@ -688,7 +699,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port));
list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
__ip_vs_unbind_svc(dest);
free_percpu(dest->stats.cpustats);
kfree(dest);
@@ -715,7 +726,7 @@ static void ip_vs_trash_cleanup(struct net *net)
list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
__ip_vs_unbind_svc(dest);
free_percpu(dest->stats.cpustats);
kfree(dest);
@@ -809,7 +820,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
dest->l_threshold = udest->l_threshold;
spin_lock_bh(&dest->dst_lock);
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
if (add)
@@ -1035,7 +1046,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
dest->vfwmark,
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port));
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
/* simply decrease svc->refcnt here, let the caller check
and release the service if nobody refers to it.
Only user context can release destination and service,
@@ -1494,11 +1505,10 @@ void ip_vs_service_net_cleanup(struct net *net)
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
}
-/*
- * Release dst hold by dst_cache
- */
+
+/* Put all references for device (dst_cache) */
static inline void
-__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
+ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
{
spin_lock_bh(&dest->dst_lock);
if (dest->dst_cache && dest->dst_cache->dev == dev) {
@@ -1507,7 +1517,7 @@ __ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
}
spin_unlock_bh(&dest->dst_lock);
@@ -1535,7 +1545,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
if (net_eq(svc->net, net)) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
- __ip_vs_dev_reset(dest, dev);
+ ip_vs_forget_dev(dest, dev);
}
}
}
@@ -1544,7 +1554,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
if (net_eq(svc->net, net)) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
- __ip_vs_dev_reset(dest, dev);
+ ip_vs_forget_dev(dest, dev);
}
}
@@ -1552,7 +1562,7 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
}
list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
- __ip_vs_dev_reset(dest, dev);
+ ip_vs_forget_dev(dest, dev);
}
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 7cd7c61..6448a2e 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -362,20 +362,6 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
#endif
-/*
- * Release dest->dst_cache before a dest is removed
- */
-void
-ip_vs_dst_reset(struct ip_vs_dest *dest)
-{
- struct dst_entry *old_dst;
-
- old_dst = dest->dst_cache;
- dest->dst_cache = NULL;
- dst_release(old_dst);
- dest->dst_saddr.ip = 0;
-}
-
/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
struct ip_vs_conn *cp)
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH net-next 06/12] ipvs: optimize dst usage for real server
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
` (4 preceding siblings ...)
2013-03-06 8:42 ` [PATCH net-next 05/12] ipvs: rename functions related to dst_cache reset Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 20:18 ` David Miller
2013-03-06 8:42 ` [PATCH net-next 07/12] ipvs: convert app locks Julian Anastasov
` (6 subsequent siblings)
12 siblings, 1 reply; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
Currently, when forwarding requests to real servers,
we use dst_lock and atomic operations when cloning the
dst_cache value. As the dst_cache value does not change
most of the time, it is better to use RCU and to lock
dst_lock only when we need to replace the obsoleted dst.
For this to work we keep dst_cache in a new structure protected
by RCU. For packets to remote real servers we will use the noref
version of dst_cache; it will be valid while we are in an RCU
read-side critical section because now dst_release for replaced
dsts will be invoked after the grace period. NAT-ed packets
via loopback that are not sent but are passed to the local stack
with NF_ACCEPT need a dst clone (skb_dst_force).
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
include/net/ip_vs.h | 12 +-
net/netfilter/ipvs/ip_vs_core.c | 11 +-
net/netfilter/ipvs/ip_vs_ctl.c | 24 ++-
net/netfilter/ipvs/ip_vs_xmit.c | 366 ++++++++++++++++++++++++++-------------
4 files changed, 275 insertions(+), 138 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index c05c59c..f8cc8f4 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -724,6 +724,13 @@ struct ip_vs_service {
struct ip_vs_pe *pe;
};
+/* Information for cached dst */
+struct ip_vs_dest_dst {
+ struct dst_entry *dst_cache; /* destination cache entry */
+ u32 dst_cookie;
+ union nf_inet_addr dst_saddr;
+ struct rcu_head rcu_head;
+};
/*
* The real server destination forwarding entry
@@ -752,9 +759,7 @@ struct ip_vs_dest {
/* for destination cache */
spinlock_t dst_lock; /* lock of dst_cache */
- struct dst_entry *dst_cache; /* destination cache entry */
- u32 dst_cookie;
- union nf_inet_addr dst_saddr;
+ struct ip_vs_dest_dst __rcu *dest_dst; /* cached dst info */
/* for virtual service */
struct ip_vs_service *svc; /* service it belongs to */
@@ -1415,6 +1420,7 @@ extern int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
extern int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, int offset,
unsigned int hooknum, struct ip_vs_iphdr *iph);
+extern void ip_vs_dest_dst_rcu_free(struct rcu_head *head);
#ifdef CONFIG_IP_VS_IPV6
extern int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 47edf5a..7e03f42 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1403,10 +1403,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
goto ignore_ipip;
/* Prefer the resulting PMTU */
if (dest) {
- spin_lock(&dest->dst_lock);
- if (dest->dst_cache)
- mtu = dst_mtu(dest->dst_cache);
- spin_unlock(&dest->dst_lock);
+ struct ip_vs_dest_dst *dest_dst;
+
+ rcu_read_lock();
+ dest_dst = rcu_dereference(dest->dest_dst);
+ if (dest_dst)
+ mtu = dst_mtu(dest_dst->dst_cache);
+ rcu_read_unlock();
}
if (mtu > 68 + sizeof(struct iphdr))
mtu -= sizeof(struct iphdr);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 7b774af..844fb9b 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -639,15 +639,25 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
return dest;
}
-/* Release dst_cache for dest in user context */
+void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_dst *dest_dst = container_of(head,
+ struct ip_vs_dest_dst,
+ rcu_head);
+
+ dst_release(dest_dst->dst_cache);
+ kfree(dest_dst);
+}
+
+/* Release dest_dst and dst_cache for dest in user context */
static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
{
- struct dst_entry *old_dst;
+ struct ip_vs_dest_dst *old = rcu_dereference_raw(dest->dest_dst);
- old_dst = dest->dst_cache;
- dest->dst_cache = NULL;
- dst_release(old_dst);
- dest->dst_saddr.ip = 0;
+ if (old) {
+ RCU_INIT_POINTER(dest->dest_dst, NULL);
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+ }
}
/*
@@ -1511,7 +1521,7 @@ static inline void
ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
{
spin_lock_bh(&dest->dst_lock);
- if (dest->dst_cache && dest->dst_cache->dev == dev) {
+ if (dest->dest_dst && dest->dest_dst->dst_cache->dev == dev) {
IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
dev->name,
IP_VS_DBG_ADDR(dest->af, &dest->addr),
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 6448a2e..439a67f 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -17,6 +17,8 @@
* - not all connections have destination server, for example,
* connections in backup server when fwmark is used
* - bypass connections use daddr from packet
+ * - we can use dst without ref while sending in RCU section, we use
+ * ref when returning NF_ACCEPT for NAT-ed packet via loopback
* LOCAL_OUT rules:
* - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
* - skb->pkt_type is not set yet
@@ -53,34 +55,51 @@ enum {
IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */
};
+static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
+{
+ return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
+}
+
+static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
+{
+ kfree(dest_dst);
+}
+
/*
* Destination cache to speed up outgoing route lookup
*/
static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, struct dst_entry *dst, u32 dst_cookie)
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
+ struct dst_entry *dst, u32 dst_cookie)
{
- struct dst_entry *old_dst;
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst,
+ lockdep_is_held(&dest->dst_lock));
- old_dst = dest->dst_cache;
- dest->dst_cache = dst;
- dest->dst_cookie = dst_cookie;
- dst_release(old_dst);
+ if (dest_dst) {
+ dest_dst->dst_cache = dst;
+ dest_dst->dst_cookie = dst_cookie;
+ }
+ rcu_assign_pointer(dest->dest_dst, dest_dst);
+
+ if (old)
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}
-static inline struct dst_entry *
+static inline struct ip_vs_dest_dst *
__ip_vs_dst_check(struct ip_vs_dest *dest)
{
- struct dst_entry *dst = dest->dst_cache;
+ struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
+ struct dst_entry *dst;
- if (!dst)
+ if (!dest_dst)
return NULL;
- if (dst->obsolete && dst->ops->check(dst, dest->dst_cookie) == NULL) {
- dest->dst_cache = NULL;
- dst_release(dst);
+ dst = dest_dst->dst_cache;
+ if (dst->obsolete &&
+ dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
return NULL;
- }
- dst_hold(dst);
- return dst;
+ return dest_dst;
}
static inline bool
@@ -136,35 +155,48 @@ retry:
return rt;
}
-/* Get route to destination or remote server */
-static struct rtable *
+/* Get route (refdst) to destination or remote server */
+static unsigned long
__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
__be32 daddr, int rt_mode, __be32 *ret_saddr)
{
struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
struct rtable *rt; /* Route to the other host */
struct rtable *ort; /* Original route */
+ unsigned long refdst;
int local;
if (dest) {
- spin_lock(&dest->dst_lock);
- rt = (struct rtable *) __ip_vs_dst_check(dest);
- if (!rt) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rtable *) dest_dst->dst_cache;
+ else {
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock(&dest->dst_lock);
+ return 0;
+ }
rt = do_output_route4(net, dest->addr.ip, rt_mode,
- &dest->dst_saddr.ip);
+ &dest_dst->dst_saddr.ip);
if (!rt) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
spin_unlock(&dest->dst_lock);
- return NULL;
+ ip_vs_dest_dst_free(dest_dst);
+ return 0;
}
- __ip_vs_dst_set(dest, dst_clone(&rt->dst), 0);
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+ spin_unlock(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
- &dest->addr.ip, &dest->dst_saddr.ip,
+ &dest->addr.ip, &dest_dst->dst_saddr.ip,
atomic_read(&rt->dst.__refcnt));
}
+ refdst = (unsigned long) dst_get_noref(&rt->dst);
daddr = dest->addr.ip;
if (ret_saddr)
- *ret_saddr = dest->dst_saddr.ip;
- spin_unlock(&dest->dst_lock);
+ *ret_saddr = dest_dst->dst_saddr.ip;
} else {
__be32 saddr = htonl(INADDR_ANY);
@@ -174,7 +206,8 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
rt_mode &= ~IP_VS_RT_MODE_CONNECT;
rt = do_output_route4(net, daddr, rt_mode, &saddr);
if (!rt)
- return NULL;
+ return 0;
+ refdst = (unsigned long) &rt->dst;
if (ret_saddr)
*ret_saddr = saddr;
}
@@ -185,26 +218,26 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
(rt->rt_flags & RTCF_LOCAL) ?
"local":"non-local", &daddr);
- ip_rt_put(rt);
- return NULL;
+ refdst_drop(refdst);
+ return 0;
}
if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
!((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) {
IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
"requires NAT method, dest: %pI4\n",
&ip_hdr(skb)->daddr, &daddr);
- ip_rt_put(rt);
- return NULL;
+ refdst_drop(refdst);
+ return 0;
}
if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
"to non-local address, dest: %pI4\n",
&ip_hdr(skb)->saddr, &daddr);
- ip_rt_put(rt);
- return NULL;
+ refdst_drop(refdst);
+ return 0;
}
- return rt;
+ return refdst;
}
/* Reroute packet to local IPv4 stack after DNAT */
@@ -287,47 +320,61 @@ out_err:
}
/*
- * Get route to destination or remote server
+ * Get route (refdst) to destination or remote server
*/
-static struct rt6_info *
+static unsigned long
__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
struct in6_addr *daddr, struct in6_addr *ret_saddr,
int do_xfrm, int rt_mode)
{
struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
struct rt6_info *rt; /* Route to the other host */
struct rt6_info *ort; /* Original route */
+ unsigned long refdst;
struct dst_entry *dst;
int local;
if (dest) {
- spin_lock(&dest->dst_lock);
- rt = (struct rt6_info *)__ip_vs_dst_check(dest);
- if (!rt) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rt6_info *) dest_dst->dst_cache;
+ else {
u32 cookie;
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock(&dest->dst_lock);
+ return 0;
+ }
dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
- &dest->dst_saddr.in6,
+ &dest_dst->dst_saddr.in6,
do_xfrm);
if (!dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
spin_unlock(&dest->dst_lock);
- return NULL;
+ ip_vs_dest_dst_free(dest_dst);
+ return 0;
}
rt = (struct rt6_info *) dst;
cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
- __ip_vs_dst_set(dest, dst_clone(&rt->dst), cookie);
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+ spin_unlock(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
- &dest->addr.in6, &dest->dst_saddr.in6,
+ &dest->addr.in6, &dest_dst->dst_saddr.in6,
atomic_read(&rt->dst.__refcnt));
}
+ refdst = (unsigned long) dst_get_noref(&rt->dst);
if (ret_saddr)
- *ret_saddr = dest->dst_saddr.in6;
- spin_unlock(&dest->dst_lock);
+ *ret_saddr = dest_dst->dst_saddr.in6;
} else {
dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
if (!dst)
- return NULL;
+ return 0;
rt = (struct rt6_info *) dst;
+ refdst = (unsigned long) dst;
}
local = __ip_vs_is_local_route6(rt);
@@ -335,8 +382,8 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
rt_mode)) {
IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",
local ? "local":"non-local", daddr);
- dst_release(&rt->dst);
- return NULL;
+ refdst_drop(refdst);
+ return 0;
}
if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
!((ort = (struct rt6_info *) skb_dst(skb)) &&
@@ -344,8 +391,8 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
IP_VS_DBG_RL("Redirect from non-local address %pI6c to local "
"requires NAT method, dest: %pI6c\n",
&ipv6_hdr(skb)->daddr, daddr);
- dst_release(&rt->dst);
- return NULL;
+ refdst_drop(refdst);
+ return 0;
}
if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
@@ -353,11 +400,11 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
IP_VS_DBG_RL("Stopping traffic from loopback address %pI6c "
"to non-local address, dest: %pI6c\n",
&ipv6_hdr(skb)->saddr, daddr);
- dst_release(&rt->dst);
- return NULL;
+ refdst_drop(refdst);
+ return 0;
}
- return rt;
+ return refdst;
}
#endif
@@ -438,22 +485,25 @@ int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rtable *rt; /* Route to the other host */
+ struct dst_entry *dst;
+ unsigned long refdst;
struct iphdr *iph = ip_hdr(skb);
int mtu;
EnterFunction(10);
- rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
- NULL);
- if (!rt)
+ rcu_read_lock();
+ refdst = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
+ IP_VS_RT_MODE_NON_LOCAL, NULL);
+ if (!refdst)
goto tx_error_icmp;
+ dst = refdst_ptr(refdst);
/* MTU checking */
- mtu = dst_mtu(&rt->dst);
+ mtu = dst_mtu(dst);
if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
!skb_is_gso(skb)) {
- ip_rt_put(rt);
+ refdst_drop(refdst);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error;
@@ -464,19 +514,21 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
* after ip_defrag. Is copy-on-write needed?
*/
if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
+ refdst_drop(refdst);
+ rcu_read_unlock();
return NF_STOLEN;
}
ip_send_check(ip_hdr(skb));
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ skb_dst_set(skb, (struct dst_entry *) refdst);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
@@ -484,6 +536,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
+ rcu_read_unlock();
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
@@ -494,18 +547,21 @@ int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
{
- struct rt6_info *rt; /* Route to the other host */
+ struct dst_entry *dst;
+ unsigned long refdst;
int mtu;
EnterFunction(10);
- rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr.in6, NULL, 0,
- IP_VS_RT_MODE_NON_LOCAL);
- if (!rt)
+ rcu_read_lock();
+ refdst = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr.in6, NULL, 0,
+ IP_VS_RT_MODE_NON_LOCAL);
+ if (!refdst)
goto tx_error_icmp;
+ dst = refdst_ptr(refdst);
/* MTU checking */
- mtu = dst_mtu(&rt->dst);
+ mtu = dst_mtu(dst);
if (__mtu_check_toobig_v6(skb, mtu)) {
if (!skb->dev) {
struct net *net = dev_net(skb_dst(skb)->dev);
@@ -515,7 +571,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* only send ICMP too big on first fragment */
if (!iph->fragoffs)
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
+ refdst_drop(refdst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error;
}
@@ -526,18 +582,20 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
skb = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(skb == NULL)) {
- dst_release(&rt->dst);
+ refdst_drop(refdst);
+ rcu_read_unlock();
return NF_STOLEN;
}
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ skb_dst_set(skb, (struct dst_entry *) refdst);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
@@ -545,6 +603,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
+ rcu_read_unlock();
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
@@ -560,12 +619,14 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rtable *rt; /* Route to the other host */
+ unsigned long refdst;
int mtu;
struct iphdr *iph = ip_hdr(skb);
int local, rc;
EnterFunction(10);
+ rcu_read_lock();
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
@@ -576,11 +637,13 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_RDR, NULL)))
+ refdst = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR, NULL);
+ if (!refdst)
goto tx_error_icmp;
+ rt = (struct rtable *) refdst_ptr(refdst);
local = rt->rt_flags & RTCF_LOCAL;
/*
* Avoid duplicate tuple in reply direction for NAT traffic
@@ -634,9 +697,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if (!local) {
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ skb_dst_set(skb, (struct dst_entry *) refdst);
} else {
- ip_rt_put(rt);
+ refdst_drop(refdst);
/*
* Some IPv4 replies get local address from routes,
* not from iph, so while we DNAT after routing
@@ -656,6 +719,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->local_df = 1;
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
LeaveFunction(10);
return rc;
@@ -663,11 +727,12 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
+ rcu_read_unlock();
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
tx_error_put:
- ip_rt_put(rt);
+ refdst_drop(refdst);
goto tx_error;
}
@@ -677,11 +742,13 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
{
struct rt6_info *rt; /* Route to the other host */
+ unsigned long refdst;
int mtu;
int local, rc;
EnterFunction(10);
+ rcu_read_lock();
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !iph->fragoffs)) {
__be16 _pt, *p;
@@ -692,11 +759,13 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_RDR))))
+ refdst = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ 0, (IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR));
+ if (!refdst)
goto tx_error_icmp;
+ rt = (struct rt6_info *) refdst_ptr(refdst);
local = __ip_vs_is_local_route6(rt);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
@@ -756,10 +825,12 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (!local || !skb->dev) {
/* drop the old route when skb is not shared */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ skb_dst_set(skb, (struct dst_entry *) refdst);
+ if (local)
+ skb_dst_force(skb);
} else {
/* destined to loopback, do we need to change route? */
- dst_release(&rt->dst);
+ refdst_drop(refdst);
}
IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
@@ -772,6 +843,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->local_df = 1;
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
LeaveFunction(10);
return rc;
@@ -779,11 +851,12 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
+ rcu_read_unlock();
LeaveFunction(10);
kfree_skb(skb);
return NF_STOLEN;
tx_error_put:
- dst_release(&rt->dst);
+ refdst_drop(refdst);
goto tx_error;
}
#endif
@@ -814,6 +887,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
struct rtable *rt; /* Route to the other host */
+ unsigned long refdst;
__be32 saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
struct iphdr *old_iph = ip_hdr(skb);
@@ -826,13 +900,17 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_CONNECT, &saddr)))
+ rcu_read_lock();
+ refdst = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_CONNECT, &saddr);
+ if (!refdst)
goto tx_error_icmp;
+ rt = (struct rtable *) refdst_ptr(refdst);
if (rt->rt_flags & RTCF_LOCAL) {
- ip_rt_put(rt);
+ refdst_drop(refdst);
+ rcu_read_unlock();
return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
@@ -865,7 +943,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
- ip_rt_put(rt);
+ refdst_drop(refdst);
+ rcu_read_unlock();
kfree_skb(skb);
IP_VS_ERR_RL("%s(): no memory\n", __func__);
return NF_STOLEN;
@@ -886,7 +965,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ skb_dst_set(skb, (struct dst_entry *) refdst);
/*
* Push down and install the IPIP header.
@@ -910,6 +989,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
@@ -918,11 +998,12 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
+ rcu_read_unlock();
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
tx_error_put:
- ip_rt_put(rt);
+ refdst_drop(refdst);
goto tx_error;
}
@@ -932,6 +1013,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
+ unsigned long refdst;
struct in6_addr saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
struct ipv6hdr *old_iph = ipv6_hdr(skb);
@@ -942,12 +1024,16 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
- &saddr, 1, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL))))
+ rcu_read_lock();
+ refdst = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+ &saddr, 1, (IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL));
+ if (!refdst)
goto tx_error_icmp;
+ rt = (struct rt6_info *) refdst_ptr(refdst);
if (__ip_vs_is_local_route6(rt)) {
- dst_release(&rt->dst);
+ refdst_drop(refdst);
+ rcu_read_unlock();
return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
@@ -986,7 +1072,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
- dst_release(&rt->dst);
+ refdst_drop(refdst);
+ rcu_read_unlock();
kfree_skb(skb);
IP_VS_ERR_RL("%s(): no memory\n", __func__);
return NF_STOLEN;
@@ -1004,7 +1091,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ skb_dst_set(skb, (struct dst_entry *) refdst);
/*
* Push down and install the IPIP header.
@@ -1028,6 +1115,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ip6_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
@@ -1036,11 +1124,12 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
+ rcu_read_unlock();
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
tx_error_put:
- dst_release(&rt->dst);
+ refdst_drop(refdst);
goto tx_error;
}
#endif
@@ -1055,18 +1144,23 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rtable *rt; /* Route to the other host */
+ unsigned long refdst;
struct iphdr *iph = ip_hdr(skb);
int mtu;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_KNOWN_NH, NULL)))
+ rcu_read_lock();
+ refdst = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_KNOWN_NH, NULL);
+ if (!refdst)
goto tx_error_icmp;
+ rt = (struct rtable *) refdst_ptr(refdst);
if (rt->rt_flags & RTCF_LOCAL) {
- ip_rt_put(rt);
+ refdst_drop(refdst);
+ rcu_read_unlock();
return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
@@ -1075,7 +1169,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
!skb_is_gso(skb)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
+ refdst_drop(refdst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error;
}
@@ -1085,19 +1179,21 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
* after ip_defrag. Is copy-on-write needed?
*/
if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
+ refdst_drop(refdst);
+ rcu_read_unlock();
return NF_STOLEN;
}
ip_send_check(ip_hdr(skb));
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ skb_dst_set(skb, (struct dst_entry *) refdst);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
@@ -1105,6 +1201,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
+ rcu_read_unlock();
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
@@ -1116,16 +1213,21 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
{
struct rt6_info *rt; /* Route to the other host */
+ unsigned long refdst;
int mtu;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL))))
+ rcu_read_lock();
+ refdst = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ 0, (IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL));
+ if (!refdst)
goto tx_error_icmp;
+ rt = (struct rt6_info *) refdst_ptr(refdst);
if (__ip_vs_is_local_route6(rt)) {
- dst_release(&rt->dst);
+ refdst_drop(refdst);
+ rcu_read_unlock();
return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
@@ -1140,7 +1242,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* only send ICMP too big on first fragment */
if (!iph->fragoffs)
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
+ refdst_drop(refdst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error;
}
@@ -1151,18 +1253,20 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
skb = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(skb == NULL)) {
- dst_release(&rt->dst);
+ refdst_drop(refdst);
+ rcu_read_unlock();
return NF_STOLEN;
}
/* drop old route */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ skb_dst_set(skb, (struct dst_entry *) refdst);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
@@ -1170,6 +1274,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
+ rcu_read_unlock();
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
@@ -1187,6 +1292,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_iphdr *iph)
{
struct rtable *rt; /* Route to the other host */
+ unsigned long refdst;
int mtu;
int rc;
int local;
@@ -1215,9 +1321,12 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
rt_mode = (hooknum != NF_INET_FORWARD) ?
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- rt_mode, NULL)))
+ rcu_read_lock();
+ refdst = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode,
+ NULL);
+ if (!refdst)
goto tx_error_icmp;
+ rt = (struct rtable *) refdst_ptr(refdst);
local = rt->rt_flags & RTCF_LOCAL;
/*
@@ -1268,9 +1377,9 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if (!local) {
/* drop the old route when skb is not shared */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ skb_dst_set(skb, (struct dst_entry *) refdst);
} else {
- ip_rt_put(rt);
+ refdst_drop(refdst);
/*
* Some IPv4 replies get local address from routes,
* not from iph, so while we DNAT after routing
@@ -1284,18 +1393,20 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->local_df = 1;
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
goto out;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
+ rcu_read_unlock();
dev_kfree_skb(skb);
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
tx_error_put:
- ip_rt_put(rt);
+ refdst_drop(refdst);
goto tx_error;
}
@@ -1306,6 +1417,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_iphdr *iph)
{
struct rt6_info *rt; /* Route to the other host */
+ unsigned long refdst;
int mtu;
int rc;
int local;
@@ -1334,10 +1446,12 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
rt_mode = (hooknum != NF_INET_FORWARD) ?
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, rt_mode)))
+ rcu_read_lock();
+ refdst = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ 0, rt_mode);
+ if (!refdst)
goto tx_error_icmp;
-
+ rt = (struct rt6_info *) refdst_ptr(refdst);
local = __ip_vs_is_local_route6(rt);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
@@ -1393,28 +1507,32 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (!local || !skb->dev) {
/* drop the old route when skb is not shared */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ skb_dst_set(skb, (struct dst_entry *) refdst);
+ if (local)
+ skb_dst_force(skb);
} else {
/* destined to loopback, do we need to change route? */
- dst_release(&rt->dst);
+ refdst_drop(refdst);
}
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
goto out;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
+ rcu_read_unlock();
dev_kfree_skb(skb);
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
tx_error_put:
- dst_release(&rt->dst);
+ refdst_drop(refdst);
goto tx_error;
}
#endif
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 06/12] ipvs: optimize dst usage for real server
2013-03-06 8:42 ` [PATCH net-next 06/12] ipvs: optimize dst usage for real server Julian Anastasov
@ 2013-03-06 20:18 ` David Miller
2013-03-06 21:58 ` Julian Anastasov
0 siblings, 1 reply; 26+ messages in thread
From: David Miller @ 2013-03-06 20:18 UTC (permalink / raw)
To: ja; +Cc: horms, lvs-devel, netdev
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 6 Mar 2013 10:42:16 +0200
> Currently when forwarding requests to real servers
> we use dst_lock and atomic operations when cloning the
> dst_cache value. As the dst_cache value does not change
> most of the time it is better to use RCU and to lock
> dst_lock only when we need to replace the obsoleted dst.
> For this to work we keep dst_cache in new structure protected
> by RCU. For packets to remote real servers we will use noref
> version of dst_cache, it will be valid while we are in RCU
> read-side critical section because now dst_release for replaced
> dsts will be invoked after the grace period. NAT-ed packets
> via loopback that are not sent but are passed to local stack
> with NF_ACCEPT need a dst clone (skb_dst_force).
>
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
I think it's much cleaner to have the callers of route lookups do
skb_set_dst_noref().
Then you don't need any new interfaces, and you therefore don't need
to expose bits of the noref implementation and semantics at all.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 06/12] ipvs: optimize dst usage for real server
2013-03-06 20:18 ` David Miller
@ 2013-03-06 21:58 ` Julian Anastasov
2013-03-06 22:06 ` David Miller
0 siblings, 1 reply; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 21:58 UTC (permalink / raw)
To: David Miller; +Cc: horms, lvs-devel, netdev
Hello,
On Wed, 6 Mar 2013, David Miller wrote:
> From: Julian Anastasov <ja@ssi.bg>
> Date: Wed, 6 Mar 2013 10:42:16 +0200
>
> > Currently when forwarding requests to real servers
> > we use dst_lock and atomic operations when cloning the
> > dst_cache value. As the dst_cache value does not change
> > most of the time it is better to use RCU and to lock
> > dst_lock only when we need to replace the obsoleted dst.
> > For this to work we keep dst_cache in new structure protected
> > by RCU. For packets to remote real servers we will use noref
> > version of dst_cache, it will be valid while we are in RCU
> > read-side critical section because now dst_release for replaced
> > dsts will be invoked after the grace period. NAT-ed packets
> > via loopback that are not sent but are passed to local stack
> > with NF_ACCEPT need a dst clone (skb_dst_force).
> >
> > Signed-off-by: Julian Anastasov <ja@ssi.bg>
>
> It think it's much cleaner to have the callers of route lookups do
> skb_set_dst_noref().
This was my first thought but commit 27b75c95f1
(net: avoid RCU for NOCACHE dst) uses dst_hold for
DST_NOCACHE dsts. IPVS wants to cache DST_NOCACHE entries
(due to known-NH reasons) and later to use them for
noref dsts...
Also, in the IPVS case we can not set skb_dst
at the time when route is looked up because due to
MTU and other checks we can fail, functions like
icmp_send() will need the original skb_dst in
such case.
There are two choices: __ip_vs_get_out_rt
to return refdst as implemented in this patchset or
to return dst with additional 'bool *noref' argument,
so that caller can decide between skb_dst_set or
skb_set_dst_noref. Then may be we will need just
a new skb_dst_set_noref_{always,force} func that will
contain the old skb_set_dst_noref code, i.e. without
dst_hold? Not sure which variant sounds better.
> Then you don't need any new interfaces, and you therefore don't need
> to expose bits of the noref implementation and semantics at all.
Regards
--
Julian Anastasov <ja@ssi.bg>
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 06/12] ipvs: optimize dst usage for real server
2013-03-06 21:58 ` Julian Anastasov
@ 2013-03-06 22:06 ` David Miller
2013-03-07 0:14 ` Julian Anastasov
0 siblings, 1 reply; 26+ messages in thread
From: David Miller @ 2013-03-06 22:06 UTC (permalink / raw)
To: ja; +Cc: horms, lvs-devel, netdev
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 6 Mar 2013 23:58:09 +0200 (EET)
> There are two choices: __ip_vs_get_out_rt
> to return refdst as implemented in this patchset or
> to return dst with additional 'bool *noref' argument,
> so that caller can decide between skb_dst_set or
> skb_set_dst_noref. Then may be we will need just
> a new skb_dst_set_noref_{always,force} func that will
> contain the old skb_set_dst_noref code, i.e. without
> dst_hold? Not sure which variant sounds better.
Both variants are starting to sound equally ugly :-)
Let me ask about this situation in another way.
IP input route lookup clients handle this transparently,
even for routes with next-hop exceptions and nocache
routes, by passing the SKB into the lookup function and
there it will sort out whether noref is actually possible.
Is there a reason that IPVS's route lookup architecture
can't work this way too?
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 06/12] ipvs: optimize dst usage for real server
2013-03-06 22:06 ` David Miller
@ 2013-03-07 0:14 ` Julian Anastasov
0 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-07 0:14 UTC (permalink / raw)
To: David Miller; +Cc: horms, lvs-devel, netdev
Hello,
On Wed, 6 Mar 2013, David Miller wrote:
> From: Julian Anastasov <ja@ssi.bg>
> Date: Wed, 6 Mar 2013 23:58:09 +0200 (EET)
>
> > There are two choices: __ip_vs_get_out_rt
> > to return refdst as implemented in this patchset or
> > to return dst with additional 'bool *noref' argument,
> > so that caller can decide between skb_dst_set or
> > skb_set_dst_noref. Then may be we will need just
> > a new skb_dst_set_noref_{always,force} func that will
> > contain the old skb_set_dst_noref code, i.e. without
> > dst_hold? Not sure which variant sounds better.
>
> Both variants are starting to sound equally ugly :-)
>
> Let me ask about this situation in another way.
>
> IP input route lookup clients handle this transparently,
> even for routes with next-hop exceptions and nocache
> routes, by passing the SKB into the lookup function and
> there it will sort out whether noref is actually possible.
>
> Is there a reason that IPVS's route lookup architecture
> can't work this way too?
Maybe it is possible, e.g. by adding more
arguments to __ip_vs_get_out_rt and moving all dst
checks and icmp_send there. But there are some dst
checks specific to the forwarding method, so I'm not
sure in the end result yet.
For IPVS, noref is always possible because the
dst_cache var holds reference, even for the NOCACHE case
which is more likely to occur. So, some new variant of
skb_dst_set_noref that supports NOCACHE without dst_hold
is still needed.
Regards
--
Julian Anastasov <ja@ssi.bg>
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH net-next 07/12] ipvs: convert app locks
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
` (5 preceding siblings ...)
2013-03-06 8:42 ` [PATCH net-next 06/12] ipvs: optimize dst usage for real server Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 08/12] ipvs: remove rs_lock by using RCU Julian Anastasov
` (5 subsequent siblings)
12 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
We use locks like tcp_app_lock, udp_app_lock,
sctp_app_lock to protect access to the protocol hash tables
from readers in packet context while the application
instances (inc) are [un]registered under global mutex.
As the hash tables are mostly read when conns are
created and bound to app, use RCU for readers and reclaim
app instance after grace period.
Simplify ip_vs_app_inc_get because we use usecnt
only for statistics and rely on module refcounting.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
include/net/ip_vs.h | 4 +---
net/netfilter/ipvs/ip_vs_app.c | 27 +++++++++++++++++++--------
net/netfilter/ipvs/ip_vs_ftp.c | 2 ++
net/netfilter/ipvs/ip_vs_proto_sctp.c | 18 ++++++------------
net/netfilter/ipvs/ip_vs_proto_tcp.c | 18 ++++++------------
net/netfilter/ipvs/ip_vs_proto_udp.c | 19 ++++++-------------
6 files changed, 40 insertions(+), 48 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index f8cc8f4..f0038a8 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -823,6 +823,7 @@ struct ip_vs_app {
struct ip_vs_app *app; /* its real application */
__be16 port; /* port number in net order */
atomic_t usecnt; /* usage counter */
+ struct rcu_head rcu_head;
/*
* output hook: Process packet in inout direction, diff set for TCP.
@@ -908,7 +909,6 @@ struct netns_ipvs {
#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
struct list_head tcp_apps[TCP_APP_TAB_SIZE];
- spinlock_t tcp_app_lock;
#endif
/* ip_vs_proto_udp */
#ifdef CONFIG_IP_VS_PROTO_UDP
@@ -916,7 +916,6 @@ struct netns_ipvs {
#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
struct list_head udp_apps[UDP_APP_TAB_SIZE];
- spinlock_t udp_app_lock;
#endif
/* ip_vs_proto_sctp */
#ifdef CONFIG_IP_VS_PROTO_SCTP
@@ -925,7 +924,6 @@ struct netns_ipvs {
#define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1)
/* Hash table for SCTP application incarnations */
struct list_head sctp_apps[SCTP_APP_TAB_SIZE];
- spinlock_t sctp_app_lock;
#endif
/* ip_vs_conn */
atomic_t conn_count; /* connection counter */
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 0b779d7..a956030 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -58,6 +58,18 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)
module_put(app->module);
}
+static void ip_vs_app_inc_destroy(struct ip_vs_app *inc)
+{
+ kfree(inc->timeout_table);
+ kfree(inc);
+}
+
+static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head);
+
+ ip_vs_app_inc_destroy(inc);
+}
/*
* Allocate/initialize app incarnation and register it in proto apps.
@@ -106,8 +118,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
return 0;
out:
- kfree(inc->timeout_table);
- kfree(inc);
+ ip_vs_app_inc_destroy(inc);
return ret;
}
@@ -131,8 +142,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
list_del(&inc->a_list);
- kfree(inc->timeout_table);
- kfree(inc);
+ call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free);
}
@@ -144,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
{
int result;
- atomic_inc(&inc->usecnt);
- if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
- atomic_dec(&inc->usecnt);
+ result = ip_vs_app_get(inc->app);
+ if (result)
+ atomic_inc(&inc->usecnt);
return result;
}
@@ -156,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
*/
void ip_vs_app_inc_put(struct ip_vs_app *inc)
{
- ip_vs_app_put(inc->app);
atomic_dec(&inc->usecnt);
+ ip_vs_app_put(inc->app);
}
@@ -218,6 +228,7 @@ out_unlock:
/*
* ip_vs_app unregistration routine
* We are sure there are no app incarnations attached to services
+ * Caller should use synchronize_rcu() or rcu_barrier()
*/
void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 4f53a5f..7f90825 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -480,6 +480,7 @@ static int __init ip_vs_ftp_init(void)
int rv;
rv = register_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns on error */
return rv;
}
@@ -489,6 +490,7 @@ static int __init ip_vs_ftp_init(void)
static void __exit ip_vs_ftp_exit(void)
{
unregister_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns */
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index ae8ec6f..9302448 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -1014,30 +1014,25 @@ static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
hash = sctp_app_hashkey(port);
- spin_lock_bh(&ipvs->sctp_app_lock);
list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->sctp_app_lock);
return ret;
}
static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
- spin_lock_bh(&ipvs->sctp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->sctp_app_lock);
+ list_del_rcu(&inc->p_list);
}
static int sctp_app_conn_bind(struct ip_vs_conn *cp)
@@ -1053,12 +1048,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = sctp_app_hashkey(cp->vport);
- spin_lock(&ipvs->sctp_app_lock);
- list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->sctp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -1074,7 +1069,7 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->sctp_app_lock);
+ rcu_read_unlock();
out:
return result;
}
@@ -1088,7 +1083,6 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->sctp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
sizeof(sctp_timeouts));
if (!pd->timeout_table)
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 9af653a..0bbc3fe 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -580,18 +580,16 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
hash = tcp_app_hashkey(port);
- spin_lock_bh(&ipvs->tcp_app_lock);
list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->tcp_app_lock);
return ret;
}
@@ -599,13 +597,10 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
static void
tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
- spin_lock_bh(&ipvs->tcp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->tcp_app_lock);
+ list_del_rcu(&inc->p_list);
}
@@ -624,12 +619,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = tcp_app_hashkey(cp->vport);
- spin_lock(&ipvs->tcp_app_lock);
- list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->tcp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -646,7 +641,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->tcp_app_lock);
+ rcu_read_unlock();
out:
return result;
@@ -676,7 +671,6 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->tcp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
sizeof(tcp_timeouts));
if (!pd->timeout_table)
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 503a842..1a03e2d 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -359,19 +359,16 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc)
hash = udp_app_hashkey(port);
-
- spin_lock_bh(&ipvs->udp_app_lock);
list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->udp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->udp_app_lock);
return ret;
}
@@ -380,12 +377,9 @@ static void
udp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
- struct netns_ipvs *ipvs = net_ipvs(net);
- spin_lock_bh(&ipvs->udp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->udp_app_lock);
+ list_del_rcu(&inc->p_list);
}
@@ -403,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = udp_app_hashkey(cp->vport);
- spin_lock(&ipvs->udp_app_lock);
- list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->udp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -425,7 +419,7 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->udp_app_lock);
+ rcu_read_unlock();
out:
return result;
@@ -467,7 +461,6 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->udp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
sizeof(udp_timeouts));
if (!pd->timeout_table)
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH net-next 08/12] ipvs: remove rs_lock by using RCU
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
` (6 preceding siblings ...)
2013-03-06 8:42 ` [PATCH net-next 07/12] ipvs: convert app locks Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 09/12] ipvs: convert locks used in persistence engines Julian Anastasov
` (4 subsequent siblings)
12 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
rs_lock was used to protect rs_table (hash table)
from updaters (under global mutex) and readers (packet handlers).
We can remove rs_lock by using RCU lock for readers. Reclaiming
dest only with kfree_rcu is enough because the readers access
only fields from the ip_vs_dest structure.
Use hlist for rs_table.
As we are now using hlist_del_rcu, introduce in_rs_table
flag as replacement for the list_empty checks which do not
work with RCU. It is needed because only NAT dests are in
the rs_table.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
include/net/ip_vs.h | 14 ++++---
net/netfilter/ipvs/ip_vs_core.c | 5 +--
net/netfilter/ipvs/ip_vs_ctl.c | 74 ++++++++++++++-------------------------
3 files changed, 36 insertions(+), 57 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index f0038a8..b2657c1 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -738,7 +738,7 @@ struct ip_vs_dest_dst {
*/
struct ip_vs_dest {
struct list_head n_list; /* for the dests in the service */
- struct list_head d_list; /* for table with all the dests */
+ struct hlist_node d_list; /* for table with all the dests */
u16 af; /* address family */
__be16 port; /* port number of the server */
@@ -767,6 +767,9 @@ struct ip_vs_dest {
__be16 vport; /* virtual port number */
union nf_inet_addr vaddr; /* virtual IP address */
__u32 vfwmark; /* firewall mark of service */
+
+ struct rcu_head rcu_head;
+ unsigned int in_rs_table:1; /* we are in rs_table */
};
@@ -897,7 +900,7 @@ struct netns_ipvs {
#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
- struct list_head rs_table[IP_VS_RTAB_SIZE];
+ struct hlist_head rs_table[IP_VS_RTAB_SIZE];
/* ip_vs_app */
struct list_head app_list;
/* ip_vs_proto */
@@ -933,7 +936,6 @@ struct netns_ipvs {
int num_services; /* no of virtual services */
- rwlock_t rs_lock; /* real services table */
/* Trash for destinations */
struct list_head dest_trash;
/* Service counters */
@@ -1364,9 +1366,9 @@ static inline void ip_vs_service_put(struct ip_vs_service *svc)
atomic_dec(&svc->usecnt);
}
-extern struct ip_vs_dest *
-ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
- const union nf_inet_addr *daddr, __be16 dport);
+extern bool
+ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *daddr, __be16 dport);
extern int ip_vs_use_count_inc(void);
extern void ip_vs_use_count_dec(void);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 7e03f42..2ea2862 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1164,9 +1164,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
sizeof(_ports), _ports, &iph);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(net, af, iph.protocol,
- &iph.saddr,
- pptr[0])) {
+ if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+ pptr[0])) {
/*
* Notify the real server: there is no
* existing entry if it is not RST
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 844fb9b..f4b53c4 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -506,17 +506,13 @@ static inline unsigned int ip_vs_rs_hashkey(int af,
& IP_VS_RTAB_MASK;
}
-/*
- * Hashes ip_vs_dest in rs_table by <proto,addr,port>.
- * should be called with locked tables.
- */
-static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
+/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
+static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
unsigned int hash;
- if (!list_empty(&dest->d_list)) {
- return 0;
- }
+ if (dest->in_rs_table)
+ return;
/*
* Hash by proto,addr,port,
@@ -524,34 +520,25 @@ static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
*/
hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
- list_add(&dest->d_list, &ipvs->rs_table[hash]);
-
- return 1;
+ hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
+ dest->in_rs_table = 1;
}
-/*
- * UNhashes ip_vs_dest from rs_table.
- * should be called with locked tables.
- */
-static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+/* Unhash ip_vs_dest from rs_table. */
+static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
/*
* Remove it from the rs_table table.
*/
- if (!list_empty(&dest->d_list)) {
- list_del_init(&dest->d_list);
+ if (dest->in_rs_table) {
+ hlist_del_rcu(&dest->d_list);
+ dest->in_rs_table = 0;
}
-
- return 1;
}
-/*
- * Lookup real service by <proto,addr,port> in the real service table.
- */
-struct ip_vs_dest *
-ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
- const union nf_inet_addr *daddr,
- __be16 dport)
+/* Check if real service by <proto,addr,port> is present */
+bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *daddr, __be16 dport)
{
struct netns_ipvs *ipvs = net_ipvs(net);
unsigned int hash;
@@ -563,21 +550,21 @@ ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
*/
hash = ip_vs_rs_hashkey(af, daddr, dport);
- read_lock(&ipvs->rs_lock);
- list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
if ((dest->af == af)
&& ip_vs_addr_equal(af, &dest->addr, daddr)
&& (dest->port == dport)
&& ((dest->protocol == protocol) ||
dest->vfwmark)) {
/* HIT */
- read_unlock(&ipvs->rs_lock);
- return dest;
+ rcu_read_unlock();
+ return true;
}
}
- read_unlock(&ipvs->rs_lock);
+ rcu_read_unlock();
- return NULL;
+ return false;
}
/*
@@ -610,9 +597,6 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* the backup synchronization daemon. It finds the
* destination to be bound to the received connection
* on the backup.
- *
- * ip_vs_lookup_real_service() looked promissing, but
- * seems not working as expected.
*/
struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
const union nf_inet_addr *daddr,
@@ -712,7 +696,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
__ip_vs_dst_cache_reset(dest);
__ip_vs_unbind_svc(dest);
free_percpu(dest->stats.cpustats);
- kfree(dest);
+ kfree_rcu(dest, rcu_head);
}
}
@@ -739,7 +723,7 @@ static void ip_vs_trash_cleanup(struct net *net)
__ip_vs_dst_cache_reset(dest);
__ip_vs_unbind_svc(dest);
free_percpu(dest->stats.cpustats);
- kfree(dest);
+ kfree_rcu(dest, rcu_head);
}
}
@@ -804,9 +788,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
* Put the real service in rs_table if not present.
* For now only for NAT!
*/
- write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_hash(ipvs, dest);
- write_unlock_bh(&ipvs->rs_lock);
}
atomic_set(&dest->conn_flags, conn_flags);
@@ -902,7 +884,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atomic_set(&dest->persistconns, 0);
atomic_set(&dest->refcnt, 1);
- INIT_LIST_HEAD(&dest->d_list);
+ INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
@@ -1042,9 +1024,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
/*
* Remove it from the d-linked list with the real services.
*/
- write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_unhash(dest);
- write_unlock_bh(&ipvs->rs_lock);
/*
* Decrease the refcnt of the dest, and free the dest
@@ -1064,7 +1044,7 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
time, so the operation here is OK */
atomic_dec(&dest->svc->refcnt);
free_percpu(dest->stats.cpustats);
- kfree(dest);
+ kfree_rcu(dest, rcu_head);
} else {
IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
"dest->refcnt=%d\n",
@@ -3801,11 +3781,9 @@ int __net_init ip_vs_control_net_init(struct net *net)
int idx;
struct netns_ipvs *ipvs = net_ipvs(net);
- rwlock_init(&ipvs->rs_lock);
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH net-next 09/12] ipvs: convert locks used in persistence engines
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
` (7 preceding siblings ...)
2013-03-06 8:42 ` [PATCH net-next 08/12] ipvs: remove rs_lock by using RCU Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 10/12] ipvs: convert connection locking Julian Anastasov
` (3 subsequent siblings)
12 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
Allow the readers to use RCU lock and for
PE module registrations use global mutex instead of
spinlock. All PE modules need to use rcu_barrier
in their module exit handler.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
net/netfilter/ipvs/ip_vs_pe.c | 43 +++++++++++-------------------------
net/netfilter/ipvs/ip_vs_pe_sip.c | 1 +
2 files changed, 14 insertions(+), 30 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 5cf859c..5d9774c 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -13,8 +13,8 @@
/* IPVS pe list */
static LIST_HEAD(ip_vs_pe);
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_pe_lock);
+/* semaphore for IPVS PEs. */
+static DEFINE_MUTEX(ip_vs_pe_mutex);
/* Bind a service with a pe */
void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
@@ -36,9 +36,8 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
pe_name);
- spin_lock_bh(&ip_vs_pe_lock);
-
- list_for_each_entry(pe, &ip_vs_pe, n_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {
/* Test and get the modules atomically */
if (pe->module &&
!try_module_get(pe->module)) {
@@ -47,14 +46,14 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
}
if (strcmp(pe_name, pe->name)==0) {
/* HIT */
- spin_unlock_bh(&ip_vs_pe_lock);
+ rcu_read_unlock();
return pe;
}
if (pe->module)
module_put(pe->module);
}
+ rcu_read_unlock();
- spin_unlock_bh(&ip_vs_pe_lock);
return NULL;
}
@@ -83,22 +82,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
/* increase the module use count */
ip_vs_use_count_inc();
- spin_lock_bh(&ip_vs_pe_lock);
-
- if (!list_empty(&pe->n_list)) {
- spin_unlock_bh(&ip_vs_pe_lock);
- ip_vs_use_count_dec();
- pr_err("%s(): [%s] pe already linked\n",
- __func__, pe->name);
- return -EINVAL;
- }
-
+ mutex_lock(&ip_vs_pe_mutex);
/* Make sure that the pe with this name doesn't exist
* in the pe list.
*/
list_for_each_entry(tmp, &ip_vs_pe, n_list) {
if (strcmp(tmp->name, pe->name) == 0) {
- spin_unlock_bh(&ip_vs_pe_lock);
+ mutex_unlock(&ip_vs_pe_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] pe already existed "
"in the system\n", __func__, pe->name);
@@ -106,8 +96,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
}
}
/* Add it into the d-linked pe list */
- list_add(&pe->n_list, &ip_vs_pe);
- spin_unlock_bh(&ip_vs_pe_lock);
+ list_add_rcu(&pe->n_list, &ip_vs_pe);
+ mutex_unlock(&ip_vs_pe_mutex);
pr_info("[%s] pe registered.\n", pe->name);
@@ -118,17 +108,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe);
/* Unregister a pe from the pe list */
int unregister_ip_vs_pe(struct ip_vs_pe *pe)
{
- spin_lock_bh(&ip_vs_pe_lock);
- if (list_empty(&pe->n_list)) {
- spin_unlock_bh(&ip_vs_pe_lock);
- pr_err("%s(): [%s] pe is not in the list. failed\n",
- __func__, pe->name);
- return -EINVAL;
- }
-
+ mutex_lock(&ip_vs_pe_mutex);
/* Remove it from the d-linked pe list */
- list_del(&pe->n_list);
- spin_unlock_bh(&ip_vs_pe_lock);
+ list_del_rcu(&pe->n_list);
+ mutex_unlock(&ip_vs_pe_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 12475ef..0b3ac73 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -172,6 +172,7 @@ static int __init ip_vs_sip_init(void)
static void __exit ip_vs_sip_cleanup(void)
{
unregister_ip_vs_pe(&ip_vs_sip_pe);
+ rcu_barrier();
}
module_init(ip_vs_sip_init);
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH net-next 10/12] ipvs: convert connection locking
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
` (8 preceding siblings ...)
2013-03-06 8:42 ` [PATCH net-next 09/12] ipvs: convert locks used in persistence engines Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 8:42 ` [PATCH net-next 11/12] ipvs: reorder keys in connection structure Julian Anastasov
` (2 subsequent siblings)
12 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
Convert __ip_vs_conntbl_lock_array as follows:
- readers that do not modify conn lists will use RCU lock
- updaters that modify lists will use spinlock_t
Now for conn lookups we will use RCU read-side
critical section. Without using __ip_vs_conn_get such
places have access to connection fields and can
dereference some pointers like pe and pe_data plus
the ability to update timer expiration. If full access
is required we contend for reference.
We add barrier in __ip_vs_conn_put, so that
other CPUs see the refcnt operation after other writes.
With the introduction of ip_vs_conn_unlink()
we try to reorganize ip_vs_conn_expire(), so that
unhashing of connections that should stay more time is
avoided, even if it is for very short time.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
include/net/ip_vs.h | 12 ++
net/netfilter/ipvs/ip_vs_conn.c | 230 +++++++++++++++++++++------------------
2 files changed, 134 insertions(+), 108 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index b2657c1..9059360 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -620,6 +620,8 @@ struct ip_vs_conn {
const struct ip_vs_pe *pe;
char *pe_data;
__u8 pe_data_len;
+
+ struct rcu_head rcu_head;
};
/*
@@ -1173,9 +1175,19 @@ struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
int inverse);
+/* Get reference to gain full access to conn.
+ * By default, RCU read-side critical sections have access only to
+ * conn fields and its PE data, see ip_vs_conn_rcu_free() for reference.
+ */
+static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp)
+{
+ return atomic_inc_not_zero(&cp->refcnt);
+}
+
/* put back the conn without restarting its timer */
static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
{
+ smp_mb__before_atomic_dec();
atomic_dec(&cp->refcnt);
}
extern void ip_vs_conn_put(struct ip_vs_conn *cp);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 704e514..b0cd2be 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -79,51 +79,21 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
struct ip_vs_aligned_lock
{
- rwlock_t l;
+ spinlock_t l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
-static inline void ct_read_lock(unsigned int key)
-{
- read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock(unsigned int key)
-{
- read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
static inline void ct_write_lock(unsigned int key)
{
- write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+ spin_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_unlock(unsigned int key)
{
- write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_lock_bh(unsigned int key)
-{
- read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock_bh(unsigned int key)
-{
- read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock_bh(unsigned int key)
-{
- write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock_bh(unsigned int key)
-{
- write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+ spin_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
@@ -201,9 +171,9 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
spin_lock(&cp->lock);
if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
cp->flags |= IP_VS_CONN_F_HASHED;
atomic_inc(&cp->refcnt);
+ hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
ret = 1;
} else {
pr_err("%s(): request for already hashed, called from %pF\n",
@@ -220,7 +190,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
/*
* UNhashes ip_vs_conn from ip_vs_conn_tab.
- * returns bool success.
+ * returns bool success. Caller should hold conn reference.
*/
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
@@ -234,7 +204,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_HASHED) {
- hlist_del(&cp->c_list);
+ hlist_del_rcu(&cp->c_list);
cp->flags &= ~IP_VS_CONN_F_HASHED;
atomic_dec(&cp->refcnt);
ret = 1;
@@ -247,6 +217,36 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
return ret;
}
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ bool ret;
+
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ ret = false;
+ /* Decrease refcnt and unlink conn only if we are last user */
+ if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ ret = true;
+ }
+ } else
+ ret = atomic_read(&cp->refcnt) ? false : true;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock(hash);
+
+ return ret;
+}
+
/*
* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
@@ -262,9 +262,9 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
hash = ip_vs_conn_hashkey_param(p, false);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->af == p->af &&
p->cport == cp->cport && p->vport == cp->vport &&
ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
@@ -272,14 +272,15 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
p->protocol == cp->protocol &&
ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
/* HIT */
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
+ rcu_read_unlock();
return cp;
}
}
- ct_read_unlock(hash);
+ rcu_read_unlock();
return NULL;
}
@@ -346,14 +347,16 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
hash = ip_vs_conn_hashkey_param(p, false);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (!ip_vs_conn_net_eq(cp, p->net))
continue;
if (p->pe_data && p->pe->ct_match) {
- if (p->pe == cp->pe && p->pe->ct_match(p, cp))
- goto out;
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
continue;
}
@@ -365,15 +368,15 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
p->af, p->vaddr, &cp->vaddr) &&
p->cport == cp->cport && p->vport == cp->vport &&
cp->flags & IP_VS_CONN_F_TEMPLATE &&
- p->protocol == cp->protocol)
- goto out;
+ p->protocol == cp->protocol) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
}
cp = NULL;
out:
- if (cp)
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -398,23 +401,24 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
*/
hash = ip_vs_conn_hashkey_param(p, true);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->af == p->af &&
p->vport == cp->cport && p->cport == cp->dport &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
p->protocol == cp->protocol &&
ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
/* HIT */
- atomic_inc(&cp->refcnt);
ret = cp;
break;
}
}
- ct_read_unlock(hash);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -757,41 +761,36 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
* Simply decrease the refcnt of the template,
* don't restart its timer.
*/
- atomic_dec(&ct->refcnt);
+ __ip_vs_conn_put(ct);
return 0;
}
return 1;
}
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
+ rcu_head);
+
+ ip_vs_pe_put(cp->pe);
+ kfree(cp->pe_data);
+ kmem_cache_free(ip_vs_conn_cachep, cp);
+}
+
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
struct net *net = ip_vs_conn_net(cp);
struct netns_ipvs *ipvs = net_ipvs(net);
- cp->timeout = 60*HZ;
-
- /*
- * hey, I'm using it
- */
- atomic_inc(&cp->refcnt);
-
/*
* do I control anybody?
*/
if (atomic_read(&cp->n_control))
goto expire_later;
- /*
- * unhash it if it is hashed in the conn table
- */
- if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
- goto expire_later;
-
- /*
- * refcnt==1 implies I'm the only one referrer
- */
- if (likely(atomic_read(&cp->refcnt) == 1)) {
+ /* Unlink conn if not referenced anymore */
+ if (likely(ip_vs_conn_unlink(cp))) {
/* delete the timer if it is activated by other users */
del_timer(&cp->timer);
@@ -810,38 +809,41 @@ static void ip_vs_conn_expire(unsigned long data)
ip_vs_conn_drop_conntrack(cp);
}
- ip_vs_pe_put(cp->pe);
- kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
atomic_dec(&ipvs->conn_count);
-
- kmem_cache_free(ip_vs_conn_cachep, cp);
return;
}
- /* hash it back to the table */
- ip_vs_conn_hash(cp);
-
expire_later:
- IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
- atomic_read(&cp->refcnt)-1,
+ IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+ atomic_read(&cp->refcnt),
atomic_read(&cp->n_control));
+ atomic_inc(&cp->refcnt);
+ cp->timeout = 60*HZ;
+
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
ip_vs_conn_put(cp);
}
-
+/* Modify timer, so that it expires as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
- if (del_timer(&cp->timer))
- mod_timer(&cp->timer, jiffies);
+ /* Using mod_timer_pending will ensure the timer is not
+ * modified after the final del_timer in ip_vs_conn_expire.
+ */
+ if (timer_pending(&cp->timer) &&
+ time_after(cp->timer.expires, jiffies))
+ mod_timer_pending(&cp->timer, jiffies);
}
@@ -952,14 +954,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
struct ip_vs_iter_state *iter = seq->private;
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
- ct_read_lock_bh(idx);
- hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ /* __ip_vs_conn_get() is not needed by
+ * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+ */
if (pos-- == 0) {
iter->l = &ip_vs_conn_tab[idx];
return cp;
}
}
- ct_read_unlock_bh(idx);
+ rcu_read_unlock();
}
return NULL;
@@ -977,6 +982,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_conn *cp = v;
struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_node *e;
struct hlist_head *l = iter->l;
int idx;
@@ -985,19 +991,19 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return ip_vs_conn_array(seq, 0);
/* more on same hash chain? */
- if (cp->c_list.next)
- return hlist_entry(cp->c_list.next, struct ip_vs_conn, c_list);
+ e = rcu_dereference(hlist_next_rcu(&cp->c_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_conn, c_list);
+ rcu_read_unlock();
idx = l - ip_vs_conn_tab;
- ct_read_unlock_bh(idx);
-
while (++idx < ip_vs_conn_tab_size) {
- ct_read_lock_bh(idx);
- hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
iter->l = &ip_vs_conn_tab[idx];
return cp;
}
- ct_read_unlock_bh(idx);
+ rcu_read_unlock();
}
iter->l = NULL;
return NULL;
@@ -1009,7 +1015,7 @@ static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
struct hlist_head *l = iter->l;
if (l)
- ct_read_unlock_bh(l - ip_vs_conn_tab);
+ rcu_read_unlock();
}
static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
@@ -1188,7 +1194,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
void ip_vs_random_dropentry(struct net *net)
{
int idx;
- struct ip_vs_conn *cp;
+ struct ip_vs_conn *cp, *cp_c;
/*
* Randomly scan 1/32 of the whole table every second
@@ -1199,9 +1205,9 @@ void ip_vs_random_dropentry(struct net *net)
/*
* Lock is actually needed in this loop.
*/
- ct_write_lock_bh(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
@@ -1228,12 +1234,15 @@ void ip_vs_random_dropentry(struct net *net)
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (cp->control) {
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
}
}
- ct_write_unlock_bh(hash);
+ rcu_read_unlock();
}
}
@@ -1244,7 +1253,7 @@ void ip_vs_random_dropentry(struct net *net)
static void ip_vs_conn_flush(struct net *net)
{
int idx;
- struct ip_vs_conn *cp;
+ struct ip_vs_conn *cp, *cp_c;
struct netns_ipvs *ipvs = net_ipvs(net);
flush_again:
@@ -1252,19 +1261,22 @@ flush_again:
/*
* Lock is actually needed in this loop.
*/
- ct_write_lock_bh(idx);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
if (!ip_vs_conn_net_eq(cp, net))
continue;
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (cp->control) {
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
}
}
- ct_write_unlock_bh(idx);
+ rcu_read_unlock();
}
/* the counter may be not NULL, because maybe some conn entries
@@ -1331,7 +1343,7 @@ int __init ip_vs_conn_init(void)
INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
- rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+ spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
/* calculate the random value for connection hash */
@@ -1342,6 +1354,8 @@ int __init ip_vs_conn_init(void)
void ip_vs_conn_cleanup(void)
{
+ /* Wait all ip_vs_conn_rcu_free() callbacks to complete */
+ rcu_barrier();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
vfree(ip_vs_conn_tab);
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH net-next 11/12] ipvs: reorder keys in connection structure
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
` (9 preceding siblings ...)
2013-03-06 8:42 ` [PATCH net-next 10/12] ipvs: convert connection locking Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-06 9:43 ` Hans Schillstrom
2013-03-06 8:42 ` [PATCH net-next 12/12] ipvs: avoid kmem_cache_zalloc in ip_vs_conn_new Julian Anastasov
2013-03-07 10:09 ` [PATCH net-next 00/12] IPVS optimizations Jesper Dangaard Brouer
12 siblings, 1 reply; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
__ip_vs_conn_in_get and ip_vs_conn_out_get are
hot places. Optimize them, so that ports are matched first.
By moving net and fwmark below, on 32-bit arch we can fit
caddr in 32-byte cache line and all addresses in 64-byte
cache line.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
include/net/ip_vs.h | 12 ++++++------
net/netfilter/ipvs/ip_vs_conn.c | 8 ++++----
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 9059360..2bc30e6 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -566,20 +566,19 @@ struct ip_vs_conn_param {
*/
struct ip_vs_conn {
struct hlist_node c_list; /* hashed list heads */
-#ifdef CONFIG_NET_NS
- struct net *net; /* Name space */
-#endif
/* Protocol, addresses and port numbers */
- u16 af; /* address family */
__be16 cport;
- __be16 vport;
__be16 dport;
- __u32 fwmark; /* Fire wall mark from skb */
+ __be16 vport;
+ u16 af; /* address family */
union nf_inet_addr caddr; /* client address */
union nf_inet_addr vaddr; /* virtual address */
union nf_inet_addr daddr; /* destination address */
volatile __u32 flags; /* status flags */
__u16 protocol; /* Which protocol (TCP/UDP) */
+#ifdef CONFIG_NET_NS
+ struct net *net; /* Name space */
+#endif
/* counter and timer */
atomic_t refcnt; /* reference count */
@@ -593,6 +592,7 @@ struct ip_vs_conn {
* state transition triggerd
* synchronization
*/
+ __u32 fwmark; /* Fire wall mark from skb */
unsigned long sync_endtime; /* jiffies + sent_retries */
/* Control members */
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index b0cd2be..a4d8ec5 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -265,8 +265,8 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
rcu_read_lock();
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == p->af &&
- p->cport == cp->cport && p->vport == cp->vport &&
+ if (p->cport == cp->cport && p->vport == cp->vport &&
+ cp->af == p->af &&
ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
@@ -404,8 +404,8 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
rcu_read_lock();
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == p->af &&
- p->vport == cp->cport && p->cport == cp->dport &&
+ if (p->vport == cp->cport && p->cport == cp->dport &&
+ cp->af == p->af &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
p->protocol == cp->protocol &&
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 11/12] ipvs: reorder keys in connection structure
2013-03-06 8:42 ` [PATCH net-next 11/12] ipvs: reorder keys in connection structure Julian Anastasov
@ 2013-03-06 9:43 ` Hans Schillstrom
2013-03-06 21:01 ` Julian Anastasov
0 siblings, 1 reply; 26+ messages in thread
From: Hans Schillstrom @ 2013-03-06 9:43 UTC (permalink / raw)
To: Julian Anastasov; +Cc: Simon Horman, lvs-devel, netdev
[-- Attachment #1: Type: text/plain, Size: 3887 bytes --]
Hi Julian
Great job you have done !
I'll test it immediately...
On Wed, 2013-03-06 at 10:42 +0200, Julian Anastasov wrote:
> __ip_vs_conn_in_get and ip_vs_conn_out_get are
> hot places. Optimize them, so that ports are matched first.
> By moving net and fwmark below, on 32-bit arch we can fit
> caddr in 32-byte cache line and all addresses in 64-byte
> cache line.
Earlier I made some rearrangements like the one you have made.
My conclusion at that time was that the best gain was to have
fwmark and net within the first 64 bytes, and move daddr to the next
cache line.
I used UDP at ~7 Gbit/sec and 256k source addresses into an x86_64 machine,
and a 50/50 mix of fwmarks and ports in those tests.
I guess that you have made similar test, and even take
ip_vs_conn_out_get() into your calculations ?
Regards
Hans
>
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
> ---
> include/net/ip_vs.h | 12 ++++++------
> net/netfilter/ipvs/ip_vs_conn.c | 8 ++++----
> 2 files changed, 10 insertions(+), 10 deletions(-)
>
> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> index 9059360..2bc30e6 100644
> --- a/include/net/ip_vs.h
> +++ b/include/net/ip_vs.h
> @@ -566,20 +566,19 @@ struct ip_vs_conn_param {
> */
> struct ip_vs_conn {
> struct hlist_node c_list; /* hashed list heads */
> -#ifdef CONFIG_NET_NS
> - struct net *net; /* Name space */
> -#endif
> /* Protocol, addresses and port numbers */
> - u16 af; /* address family */
> __be16 cport;
> - __be16 vport;
> __be16 dport;
> - __u32 fwmark; /* Fire wall mark from skb */
> + __be16 vport;
> + u16 af; /* address family */
> union nf_inet_addr caddr; /* client address */
> union nf_inet_addr vaddr; /* virtual address */
> union nf_inet_addr daddr; /* destination address */
> volatile __u32 flags; /* status flags */
> __u16 protocol; /* Which protocol (TCP/UDP) */
> +#ifdef CONFIG_NET_NS
> + struct net *net; /* Name space */
> +#endif
>
> /* counter and timer */
> atomic_t refcnt; /* reference count */
> @@ -593,6 +592,7 @@ struct ip_vs_conn {
> * state transition triggerd
> * synchronization
> */
> + __u32 fwmark; /* Fire wall mark from skb */
> unsigned long sync_endtime; /* jiffies + sent_retries */
>
> /* Control members */
> diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
> index b0cd2be..a4d8ec5 100644
> --- a/net/netfilter/ipvs/ip_vs_conn.c
> +++ b/net/netfilter/ipvs/ip_vs_conn.c
> @@ -265,8 +265,8 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
> rcu_read_lock();
>
> hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
> - if (cp->af == p->af &&
> - p->cport == cp->cport && p->vport == cp->vport &&
> + if (p->cport == cp->cport && p->vport == cp->vport &&
> + cp->af == p->af &&
> ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
> ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
> ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
> @@ -404,8 +404,8 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
> rcu_read_lock();
>
> hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
> - if (cp->af == p->af &&
> - p->vport == cp->cport && p->cport == cp->dport &&
> + if (p->vport == cp->cport && p->cport == cp->dport &&
> + cp->af == p->af &&
> ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
> ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
> p->protocol == cp->protocol &&
[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6177 bytes --]
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 11/12] ipvs: reorder keys in connection structure
2013-03-06 9:43 ` Hans Schillstrom
@ 2013-03-06 21:01 ` Julian Anastasov
2013-03-07 7:49 ` Hans Schillstrom
0 siblings, 1 reply; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 21:01 UTC (permalink / raw)
To: Hans Schillstrom; +Cc: Simon Horman, lvs-devel, netdev
Hello,
On Wed, 6 Mar 2013, Hans Schillstrom wrote:
> Hi Julian
> Great job you have done !
> I'll test it immediately...
Thanks, it would be good to catch the problems
in early phase...
> On Wed, 2013-03-06 at 10:42 +0200, Julian Anastasov wrote:
> > __ip_vs_conn_in_get and ip_vs_conn_out_get are
> > hot places. Optimize them, so that ports are matched first.
> > By moving net and fwmark below, on 32-bit arch we can fit
> > caddr in 32-byte cache line and all addresses in 64-byte
> > cache line.
>
> Earlier I made some rearrangements like the one you have made.
> My conclusion at that time was that the best gain was to have
> fwmark and net within the first 64 bytes, and move daddr to the next
> cache line.
But fwmark is used only for lookups in backup
server. The net field is checked first only in
ip_vs_ct_in_get (on scheduling), it can be optimized too.
Modern CPUs have 64-byte cache line and may be the
places of these fields do not play much because checking
the two ports is enough to differentiate most of the
connections. The addresses play when ports do not
differ, i.e. mostly for persistent connections. So,
on 64-byte cache line it would be more difficult to
see any difference.
> I uesd UDP at ~7Gbit/sec and 256k source address into a x86_64 machine,
> and a 50/50 mix of fwmarks and port in that tests.
>
> I guess that you have made similar test, and even take
> ip_vs_conn_out_get() into your calculations ?
No, I have only virtual boxes for tests...
> Regards
> Hans
>
> >
> > Signed-off-by: Julian Anastasov <ja@ssi.bg>
> > ---
> > include/net/ip_vs.h | 12 ++++++------
> > net/netfilter/ipvs/ip_vs_conn.c | 8 ++++----
> > 2 files changed, 10 insertions(+), 10 deletions(-)
> >
> > diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> > index 9059360..2bc30e6 100644
> > --- a/include/net/ip_vs.h
> > +++ b/include/net/ip_vs.h
> > @@ -566,20 +566,19 @@ struct ip_vs_conn_param {
> > */
> > struct ip_vs_conn {
> > struct hlist_node c_list; /* hashed list heads */
> > -#ifdef CONFIG_NET_NS
> > - struct net *net; /* Name space */
> > -#endif
> > /* Protocol, addresses and port numbers */
> > - u16 af; /* address family */
> > __be16 cport;
> > - __be16 vport;
> > __be16 dport;
> > - __u32 fwmark; /* Fire wall mark from skb */
> > + __be16 vport;
> > + u16 af; /* address family */
> > union nf_inet_addr caddr; /* client address */
> > union nf_inet_addr vaddr; /* virtual address */
> > union nf_inet_addr daddr; /* destination address */
> > volatile __u32 flags; /* status flags */
> > __u16 protocol; /* Which protocol (TCP/UDP) */
> > +#ifdef CONFIG_NET_NS
> > + struct net *net; /* Name space */
> > +#endif
> >
> > /* counter and timer */
> > atomic_t refcnt; /* reference count */
> > @@ -593,6 +592,7 @@ struct ip_vs_conn {
> > * state transition triggerd
> > * synchronization
> > */
> > + __u32 fwmark; /* Fire wall mark from skb */
> > unsigned long sync_endtime; /* jiffies + sent_retries */
> >
> > /* Control members */
> > diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
> > index b0cd2be..a4d8ec5 100644
> > --- a/net/netfilter/ipvs/ip_vs_conn.c
> > +++ b/net/netfilter/ipvs/ip_vs_conn.c
> > @@ -265,8 +265,8 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
> > rcu_read_lock();
> >
> > hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
> > - if (cp->af == p->af &&
> > - p->cport == cp->cport && p->vport == cp->vport &&
> > + if (p->cport == cp->cport && p->vport == cp->vport &&
> > + cp->af == p->af &&
> > ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
> > ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
> > ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
> > @@ -404,8 +404,8 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
> > rcu_read_lock();
> >
> > hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
> > - if (cp->af == p->af &&
> > - p->vport == cp->cport && p->cport == cp->dport &&
> > + if (p->vport == cp->cport && p->cport == cp->dport &&
> > + cp->af == p->af &&
> > ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
> > ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
> > p->protocol == cp->protocol &&
Regards
--
Julian Anastasov <ja@ssi.bg>
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 11/12] ipvs: reorder keys in connection structure
2013-03-06 21:01 ` Julian Anastasov
@ 2013-03-07 7:49 ` Hans Schillstrom
2013-03-07 23:23 ` Julian Anastasov
0 siblings, 1 reply; 26+ messages in thread
From: Hans Schillstrom @ 2013-03-07 7:49 UTC (permalink / raw)
To: Julian Anastasov; +Cc: Simon Horman, lvs-devel, netdev
[-- Attachment #1: Type: text/plain, Size: 5410 bytes --]
Hi Julian
On Wed, 2013-03-06 at 23:01 +0200, Julian Anastasov wrote:
> Hello,
>
> On Wed, 6 Mar 2013, Hans Schillstrom wrote:
>
> > Hi Julian
> > Great job you have done !
> > I'll test it immediately...
>
> Thanks, it would be good to catch the problems
> in early phase...
>
> > On Wed, 2013-03-06 at 10:42 +0200, Julian Anastasov wrote:
> > > __ip_vs_conn_in_get and ip_vs_conn_out_get are
> > > hot places. Optimize them, so that ports are matched first.
> > > By moving net and fwmark below, on 32-bit arch we can fit
> > > caddr in 32-byte cache line and all addresses in 64-byte
> > > cache line.
> >
> > Earlier I made some rearrangements like the one you have made.
> > My conclusion at that time was that the best gain was to have
> > fwmark and net within the first 64 bytes, and move daddr to the next
> > cache line.
>
> But fwmark is used only for lookups in backup
> server. The net field is checked first only in
> ip_vs_ct_in_get (on scheduling), it can be optimized too.
> Modern CPUs have 64-byte cache line and may be the
> places of these fields do not play much because checking
> the two ports is enough to differentiate most of the
> connections. The addresses play when ports do not
> differ, i.e. mostly for persistent connections. So,
> on 64-byte cache line it would be more difficult to
> see any difference.
I made some tests on weaker machine (i7-3930K) with moderate background
load, there is absolute no measurable difference with daddr in first
cache line or in second line.
So based on that I prefer your solution since it keeps data together.
> > I used UDP at ~7 Gbit/sec and 256k source addresses into an x86_64 machine,
> > and a 50/50 mix of fwmarks and ports in those tests.
> >
> > I guess that you have made similar test, and even take
> > ip_vs_conn_out_get() into your calculations ?
>
> No, I have only virtual boxes for tests...
>
> > Regards
> > Hans
> >
> > >
> > > Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
> > > ---
> > > include/net/ip_vs.h | 12 ++++++------
> > > net/netfilter/ipvs/ip_vs_conn.c | 8 ++++----
> > > 2 files changed, 10 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> > > index 9059360..2bc30e6 100644
> > > --- a/include/net/ip_vs.h
> > > +++ b/include/net/ip_vs.h
> > > @@ -566,20 +566,19 @@ struct ip_vs_conn_param {
> > > */
> > > struct ip_vs_conn {
> > > struct hlist_node c_list; /* hashed list heads */
> > > -#ifdef CONFIG_NET_NS
> > > - struct net *net; /* Name space */
> > > -#endif
> > > /* Protocol, addresses and port numbers */
> > > - u16 af; /* address family */
> > > __be16 cport;
> > > - __be16 vport;
> > > __be16 dport;
> > > - __u32 fwmark; /* Fire wall mark from skb */
> > > + __be16 vport;
> > > + u16 af; /* address family */
> > > union nf_inet_addr caddr; /* client address */
> > > union nf_inet_addr vaddr; /* virtual address */
> > > union nf_inet_addr daddr; /* destination address */
> > > volatile __u32 flags; /* status flags */
> > > __u16 protocol; /* Which protocol (TCP/UDP) */
> > > +#ifdef CONFIG_NET_NS
> > > + struct net *net; /* Name space */
> > > +#endif
> > >
> > > /* counter and timer */
> > > atomic_t refcnt; /* reference count */
> > > @@ -593,6 +592,7 @@ struct ip_vs_conn {
> > > * state transition triggerd
> > > * synchronization
> > > */
> > > + __u32 fwmark; /* Fire wall mark from skb */
> > > unsigned long sync_endtime; /* jiffies + sent_retries */
> > >
> > > /* Control members */
> > > diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
> > > index b0cd2be..a4d8ec5 100644
> > > --- a/net/netfilter/ipvs/ip_vs_conn.c
> > > +++ b/net/netfilter/ipvs/ip_vs_conn.c
> > > @@ -265,8 +265,8 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
> > > rcu_read_lock();
> > >
> > > hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
> > > - if (cp->af == p->af &&
> > > - p->cport == cp->cport && p->vport == cp->vport &&
> > > + if (p->cport == cp->cport && p->vport == cp->vport &&
> > > + cp->af == p->af &&
> > > ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
> > > ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
> > > ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
> > > @@ -404,8 +404,8 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
> > > rcu_read_lock();
> > >
> > > hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
> > > - if (cp->af == p->af &&
> > > - p->vport == cp->cport && p->cport == cp->dport &&
> > > + if (p->vport == cp->cport && p->cport == cp->dport &&
> > > + cp->af == p->af &&
> > > ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
> > > ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
> > > p->protocol == cp->protocol &&
>
> Regards
>
> --
> Julian Anastasov <ja@ssi.bg>
[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6177 bytes --]
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 11/12] ipvs: reorder keys in connection structure
2013-03-07 7:49 ` Hans Schillstrom
@ 2013-03-07 23:23 ` Julian Anastasov
0 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-07 23:23 UTC (permalink / raw)
To: Hans Schillstrom; +Cc: Simon Horman, lvs-devel, netdev
Hello,
On Thu, 7 Mar 2013, Hans Schillstrom wrote:
> I made some tests on a weaker machine (i7-3930K) with moderate background
> load; there is absolutely no measurable difference with daddr in the first
> cache line or in the second line.
> So based on that I prefer your solution, since it keeps the data together.
Thanks for the review and the tests. But I'll
slightly extend this patch in v2 with more optimizations...
Regards
--
Julian Anastasov <ja@ssi.bg>
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH net-next 12/12] ipvs: avoid kmem_cache_zalloc in ip_vs_conn_new
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
` (10 preceding siblings ...)
2013-03-06 8:42 ` [PATCH net-next 11/12] ipvs: reorder keys in connection structure Julian Anastasov
@ 2013-03-06 8:42 ` Julian Anastasov
2013-03-07 10:09 ` [PATCH net-next 00/12] IPVS optimizations Jesper Dangaard Brouer
12 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-06 8:42 UTC (permalink / raw)
To: Simon Horman; +Cc: lvs-devel, netdev
We have many fields to set and few to reset,
so use kmem_cache_alloc instead to save some cycles.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
net/netfilter/ipvs/ip_vs_conn.c | 16 +++++++++++++++-
1 files changed, 15 insertions(+), 1 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index a4d8ec5..1a0c7f3 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -860,7 +860,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
p->protocol);
- cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
+ cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
IP_VS_ERR_RL("%s(): no memory\n", __func__);
return NULL;
@@ -886,6 +886,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
+ } else {
+ cp->pe = NULL;
+ cp->pe_data = NULL;
+ cp->pe_data_len = 0;
}
spin_lock_init(&cp->lock);
@@ -896,18 +900,28 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
*/
atomic_set(&cp->refcnt, 1);
+ cp->control = NULL;
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
+ cp->packet_xmit = NULL;
+ cp->app = NULL;
+ cp->app_data = NULL;
+ /* reset struct ip_vs_seq */
+ cp->in_seq.delta = 0;
+ cp->out_seq.delta = 0;
+
atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
/* Bind the connection with a destination server */
+ cp->dest = NULL;
ip_vs_bind_dest(cp, dest);
/* Set its state and timeout */
cp->state = 0;
+ cp->old_state = 0;
cp->timeout = 3*HZ;
cp->sync_endtime = jiffies & ~3UL;
--
1.7.3.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 00/12] IPVS optimizations
2013-03-06 8:42 [PATCH net-next 00/12] IPVS optimizations Julian Anastasov
` (11 preceding siblings ...)
2013-03-06 8:42 ` [PATCH net-next 12/12] ipvs: avoid kmem_cache_zalloc in ip_vs_conn_new Julian Anastasov
@ 2013-03-07 10:09 ` Jesper Dangaard Brouer
2013-03-07 23:46 ` Julian Anastasov
12 siblings, 1 reply; 26+ messages in thread
From: Jesper Dangaard Brouer @ 2013-03-07 10:09 UTC (permalink / raw)
To: Julian Anastasov; +Cc: Simon Horman, lvs-devel, netdev
On Wed, 2013-03-06 at 10:42 +0200, Julian Anastasov wrote:
> This is a first patchset for IPVS optimizations.
> Another patchset will address the locking in schedulers
> and moving the global _bh disabling from LOCAL_OUT to all
> locks. It is in TODO list.
Do you have any performance measurements?
Or is this code primarily cleanup optimizations, and the next patchset
will address the performance part?
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Sr. Network Kernel Developer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH net-next 00/12] IPVS optimizations
2013-03-07 10:09 ` [PATCH net-next 00/12] IPVS optimizations Jesper Dangaard Brouer
@ 2013-03-07 23:46 ` Julian Anastasov
0 siblings, 0 replies; 26+ messages in thread
From: Julian Anastasov @ 2013-03-07 23:46 UTC (permalink / raw)
To: Jesper Dangaard Brouer; +Cc: Simon Horman, lvs-devel, netdev
Hello,
On Thu, 7 Mar 2013, Jesper Dangaard Brouer wrote:
> On Wed, 2013-03-06 at 10:42 +0200, Julian Anastasov wrote:
> > This is a first patchset for IPVS optimizations.
> > Another patchset will address the locking in schedulers
> > and moving the global _bh disabling from LOCAL_OUT to all
> > locks. It is in TODO list.
>
> Do you have any performance measurements?
Unfortunately, I don't have suitable
hardware to do real tests, only a virtual setup.
> Or is this code primarily cleanup optimizations, and the next patchset
> will address the performance part?
This patchset converts all global locks to RCU,
except for the __ip_vs_svc_lock usage. The states in
some schedulers will probably need some protection
with a lock as before; others like WLC and LC will
run (after patchset 2) just under the RCU lock for the
svc->destinations list. But I'll need some time to
finalize this 2nd patchset because there are other
patches in the queue...
So, the answer is that packets that create
connections (via schedulers) hit this __ip_vs_svc_lock
with read_lock, plus some write lock in the scheduler.
This happens only when a connection is created. Subsequent
packets should not reach any global locks in IPVS. The cached
dst->refcnt is not changed anymore for traffic to the
real server.
Regards
--
Julian Anastasov <ja@ssi.bg>
^ permalink raw reply [flat|nested] 26+ messages in thread