* [PATCH] ipvs: Fix race conditions in lblcr scheduler
@ 2008-08-17 22:53 Sven Wegener
2008-08-18 23:32 ` Simon Horman
0 siblings, 1 reply; 6+ messages in thread
From: Sven Wegener @ 2008-08-17 22:53 UTC (permalink / raw)
To: netdev, lvs-devel; +Cc: wensong, horms, ja
We can't access the cache entry outside of our critical read-locked region,
because someone may free that entry. Also getting an entry under read lock,
then locking for write and trying to delete that entry looks fishy, but should
be no problem here, because we're only comparing a pointer. Also there is no
need for our own rwlock, there is already one in the service structure for use
in the schedulers.
Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
---
net/ipv4/ipvs/ip_vs_lblcr.c | 229 +++++++++++++++++++++----------------------
1 files changed, 114 insertions(+), 115 deletions(-)
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index f1c8450..96bfdc2 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
return NULL;
}
- e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
+ e = kmalloc(sizeof(*e), GFP_ATOMIC);
if (e == NULL) {
IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
return NULL;
@@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
e->dest = dest;
/* link it to the list */
- write_lock(&set->lock);
e->next = set->list;
set->list = e;
atomic_inc(&set->size);
- write_unlock(&set->lock);
set->lastmod = jiffies;
return e;
@@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
struct ip_vs_dest_list *e, **ep;
- write_lock(&set->lock);
for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
if (e->dest == dest) {
/* HIT */
@@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
}
ep = &e->next;
}
- write_unlock(&set->lock);
}
static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
@@ -174,7 +170,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
if (set == NULL)
return NULL;
- read_lock(&set->lock);
/* select the first destination server, whose weight > 0 */
for (e=set->list; e!=NULL; e=e->next) {
least = e->dest;
@@ -188,7 +183,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
goto nextstage;
}
}
- read_unlock(&set->lock);
return NULL;
/* find the destination with the weighted least load */
@@ -207,7 +201,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
loh = doh;
}
}
- read_unlock(&set->lock);
IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
@@ -229,7 +222,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
if (set == NULL)
return NULL;
- read_lock(&set->lock);
/* select the first destination server, whose weight > 0 */
for (e=set->list; e!=NULL; e=e->next) {
most = e->dest;
@@ -239,7 +231,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
goto nextstage;
}
}
- read_unlock(&set->lock);
return NULL;
/* find the destination with the weighted most load */
@@ -256,7 +247,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
moh = doh;
}
}
- read_unlock(&set->lock);
IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
@@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry {
* IPVS lblcr hash table
*/
struct ip_vs_lblcr_table {
- rwlock_t lock; /* lock for this table */
struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
@@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = {
static struct ctl_table_header * sysctl_header;
-/*
- * new/free a ip_vs_lblcr_entry, which is a mapping of a destination
- * IP address to a server.
- */
-static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
-{
- struct ip_vs_lblcr_entry *en;
-
- en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
- if (en == NULL) {
- IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
- return NULL;
- }
-
- INIT_LIST_HEAD(&en->list);
- en->addr = daddr;
-
- /* initilize its dest set */
- atomic_set(&(en->set.size), 0);
- en->set.list = NULL;
- rwlock_init(&en->set.lock);
-
- return en;
-}
-
-
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
list_del(&en->list);
@@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
* Hash an entry in the ip_vs_lblcr_table.
* returns bool success.
*/
-static int
+static void
ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
- unsigned hash;
-
- if (!list_empty(&en->list)) {
- IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
- "called from %p\n", __builtin_return_address(0));
- return 0;
- }
+ unsigned hash = ip_vs_lblcr_hashkey(en->addr);
- /*
- * Hash by destination IP address
- */
- hash = ip_vs_lblcr_hashkey(en->addr);
-
- write_lock(&tbl->lock);
list_add(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
- write_unlock(&tbl->lock);
-
- return 1;
}
/*
- * Get ip_vs_lblcr_entry associated with supplied parameters.
+ * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
+ * read lock.
*/
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
{
- unsigned hash;
+ unsigned hash = ip_vs_lblcr_hashkey(addr);
struct ip_vs_lblcr_entry *en;
- hash = ip_vs_lblcr_hashkey(addr);
+ list_for_each_entry(en, &tbl->bucket[hash], list)
+ if (en->addr == addr)
+ return en;
+
+ return NULL;
+}
- read_lock(&tbl->lock);
- list_for_each_entry(en, &tbl->bucket[hash], list) {
- if (en->addr == addr) {
- /* HIT */
- read_unlock(&tbl->lock);
- return en;
+/*
+ * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
+ * IP address to a server. Called under write lock.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_lblcr_entry *en;
+
+ en = ip_vs_lblcr_get(tbl, daddr);
+ if (!en) {
+ en = kmalloc(sizeof(*en), GFP_ATOMIC);
+ if (!en) {
+ IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
+ return NULL;
}
+
+ en->addr = daddr;
+ en->lastuse = jiffies;
+
+ /* initilize its dest set */
+ atomic_set(&(en->set.size), 0);
+ en->set.list = NULL;
+ rwlock_init(&en->set.lock);
+
+ ip_vs_lblcr_hash(tbl, en);
}
- read_unlock(&tbl->lock);
+ write_lock(&en->set.lock);
+ ip_vs_dest_set_insert(&en->set, dest);
+ write_unlock(&en->set.lock);
- return NULL;
+ return en;
}
@@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
int i;
struct ip_vs_lblcr_entry *en, *nxt;
+ /* No locking required, only called during cleanup. */
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- write_lock(&tbl->lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
ip_vs_lblcr_free(en);
- atomic_dec(&tbl->entries);
}
- write_unlock(&tbl->lock);
}
}
-static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
+static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int i, j;
struct ip_vs_lblcr_entry *en, *nxt;
@@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&tbl->lock);
+ write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
now))
@@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
}
- write_unlock(&tbl->lock);
+ write_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
@@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
*/
static void ip_vs_lblcr_check_expire(unsigned long data)
{
- struct ip_vs_lblcr_table *tbl;
+ struct ip_vs_service *svc = (struct ip_vs_service *) data;
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int goal;
int i, j;
struct ip_vs_lblcr_entry *en, *nxt;
- tbl = (struct ip_vs_lblcr_table *)data;
-
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
- ip_vs_lblcr_full_check(tbl);
+ ip_vs_lblcr_full_check(svc);
tbl->counter = 1;
goto out;
}
@@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&tbl->lock);
+ write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
continue;
@@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
atomic_dec(&tbl->entries);
goal--;
}
- write_unlock(&tbl->lock);
+ write_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
@@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
/*
* Allocate the ip_vs_lblcr_table for this service
*/
- tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
+ tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
if (tbl == NULL) {
IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
return -ENOMEM;
}
svc->sched_data = tbl;
IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
- "current service\n",
- sizeof(struct ip_vs_lblcr_table));
+ "current service\n", sizeof(*tbl));
/*
* Initialize the hash buckets
@@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
INIT_LIST_HEAD(&tbl->bucket[i]);
}
- rwlock_init(&tbl->lock);
tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
@@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
* Hook periodic timer for garbage collection
*/
setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
- (unsigned long)tbl);
- tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
- add_timer(&tbl->periodic_timer);
+ (unsigned long)svc);
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
return 0;
}
@@ -564,9 +535,9 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
ip_vs_lblcr_flush(tbl);
/* release the table itself */
- kfree(svc->sched_data);
+ kfree(tbl);
IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
- sizeof(struct ip_vs_lblcr_table));
+ sizeof(*tbl));
return 0;
}
@@ -663,50 +634,78 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
- struct ip_vs_dest *dest;
- struct ip_vs_lblcr_table *tbl;
- struct ip_vs_lblcr_entry *en;
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
struct iphdr *iph = ip_hdr(skb);
+ struct ip_vs_dest *dest = NULL;
+ struct ip_vs_lblcr_entry *en;
IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
- tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
+ /* First look in our cache */
+ read_lock(&svc->sched_lock);
en = ip_vs_lblcr_get(tbl, iph->daddr);
- if (en == NULL) {
- dest = __ip_vs_lblcr_schedule(svc, iph);
- if (dest == NULL) {
- IP_VS_DBG(1, "no destination available\n");
- return NULL;
- }
- en = ip_vs_lblcr_new(iph->daddr);
- if (en == NULL) {
- return NULL;
- }
- ip_vs_dest_set_insert(&en->set, dest);
- ip_vs_lblcr_hash(tbl, en);
- } else {
+ if (en) {
+ /* We only hold a read lock, but this is atomic */
+ en->lastuse = jiffies;
+
+ /* Get the least loaded destination */
+ read_lock(&en->set.lock);
dest = ip_vs_dest_set_min(&en->set);
- if (!dest || is_overloaded(dest, svc)) {
- dest = __ip_vs_lblcr_schedule(svc, iph);
- if (dest == NULL) {
- IP_VS_DBG(1, "no destination available\n");
- return NULL;
- }
- ip_vs_dest_set_insert(&en->set, dest);
- }
+ read_unlock(&en->set.lock);
+
+ /* More than one destination + enough time passed by, cleanup */
if (atomic_read(&en->set.size) > 1 &&
- jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
+ time_after(jiffies, en->set.lastmod +
+ sysctl_ip_vs_lblcr_expiration)) {
struct ip_vs_dest *m;
+
+ write_lock(&en->set.lock);
m = ip_vs_dest_set_max(&en->set);
if (m)
ip_vs_dest_set_erase(&en->set, m);
+ write_unlock(&en->set.lock);
+ }
+
+ /* If the destination is not overloaded, use it */
+ if (dest && !is_overloaded(dest, svc)) {
+ read_unlock(&svc->sched_lock);
+ goto out;
+ }
+
+ /* The cache entry is invalid, time to schedule */
+ dest = __ip_vs_lblcr_schedule(svc, iph);
+ if (!dest) {
+ IP_VS_DBG(1, "no destination available\n");
+ read_unlock(&svc->sched_lock);
+ return NULL;
}
+
+ /* Update our cache entry */
+ write_lock(&en->set.lock);
+ ip_vs_dest_set_insert(&en->set, dest);
+ write_unlock(&en->set.lock);
+ }
+ read_unlock(&svc->sched_lock);
+
+ if (dest)
+ goto out;
+
+ /* No cache entry, time to schedule */
+ dest = __ip_vs_lblcr_schedule(svc, iph);
+ if (!dest) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
}
- en->lastuse = jiffies;
+ /* If we fail to create a cache entry, we'll just use the valid dest */
+ write_lock(&svc->sched_lock);
+ ip_vs_lblcr_new(tbl, iph->daddr, dest);
+ write_unlock(&svc->sched_lock);
+
+out:
IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
"--> server %u.%u.%u.%u:%d\n",
- NIPQUAD(en->addr),
+ NIPQUAD(iph->addr),
NIPQUAD(dest->addr),
ntohs(dest->port));
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH] ipvs: Fix race conditions in lblcr scheduler
2008-08-17 22:53 [PATCH] ipvs: Fix race conditions in lblcr scheduler Sven Wegener
@ 2008-08-18 23:32 ` Simon Horman
2008-08-18 23:42 ` Sven Wegener
0 siblings, 1 reply; 6+ messages in thread
From: Simon Horman @ 2008-08-18 23:32 UTC (permalink / raw)
To: Sven Wegener; +Cc: netdev, lvs-devel, wensong, ja
On Mon, Aug 18, 2008 at 12:53:05AM +0200, Sven Wegener wrote:
> We can't access the cache entry outside of our critical read-locked region,
> because someone may free that entry. Also getting an entry under read lock,
> then locking for write and trying to delete that entry looks fishy, but should
> be no problem here, because we're only comparing a pointer. Also there is no
> need for our own rwlock, there is already one in the service structure for use
> in the schedulers.
Hi Sven,
this looks good to me. Just a few minor comments inline.
> Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
> ---
> net/ipv4/ipvs/ip_vs_lblcr.c | 229 +++++++++++++++++++++----------------------
> 1 files changed, 114 insertions(+), 115 deletions(-)
>
> diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
> index f1c8450..96bfdc2 100644
> --- a/net/ipv4/ipvs/ip_vs_lblcr.c
> +++ b/net/ipv4/ipvs/ip_vs_lblcr.c
> @@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
> return NULL;
> }
>
> - e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
> + e = kmalloc(sizeof(*e), GFP_ATOMIC);
I think that I prefer using struct ip_vs_dest_list rather than *e.
Ditto for *tbl below.
> if (e == NULL) {
> IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
> return NULL;
> @@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
> e->dest = dest;
>
> /* link it to the list */
> - write_lock(&set->lock);
> e->next = set->list;
> set->list = e;
> atomic_inc(&set->size);
> - write_unlock(&set->lock);
>
> set->lastmod = jiffies;
> return e;
> @@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
> {
> struct ip_vs_dest_list *e, **ep;
>
> - write_lock(&set->lock);
> for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
> if (e->dest == dest) {
> /* HIT */
> @@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
> }
> ep = &e->next;
> }
> - write_unlock(&set->lock);
> }
>
> static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
> @@ -174,7 +170,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
> if (set == NULL)
> return NULL;
>
> - read_lock(&set->lock);
> /* select the first destination server, whose weight > 0 */
> for (e=set->list; e!=NULL; e=e->next) {
> least = e->dest;
> @@ -188,7 +183,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
> goto nextstage;
> }
> }
> - read_unlock(&set->lock);
> return NULL;
>
> /* find the destination with the weighted least load */
> @@ -207,7 +201,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
> loh = doh;
> }
> }
> - read_unlock(&set->lock);
>
> IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
> "activeconns %d refcnt %d weight %d overhead %d\n",
> @@ -229,7 +222,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
> if (set == NULL)
> return NULL;
>
> - read_lock(&set->lock);
> /* select the first destination server, whose weight > 0 */
> for (e=set->list; e!=NULL; e=e->next) {
> most = e->dest;
> @@ -239,7 +231,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
> goto nextstage;
> }
> }
> - read_unlock(&set->lock);
> return NULL;
>
> /* find the destination with the weighted most load */
> @@ -256,7 +247,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
> moh = doh;
> }
> }
> - read_unlock(&set->lock);
>
> IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
> "activeconns %d refcnt %d weight %d overhead %d\n",
> @@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry {
> * IPVS lblcr hash table
> */
> struct ip_vs_lblcr_table {
> - rwlock_t lock; /* lock for this table */
> struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
> atomic_t entries; /* number of entries */
> int max_size; /* maximum size of entries */
> @@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = {
>
> static struct ctl_table_header * sysctl_header;
>
> -/*
> - * new/free a ip_vs_lblcr_entry, which is a mapping of a destination
> - * IP address to a server.
> - */
> -static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
> -{
> - struct ip_vs_lblcr_entry *en;
> -
> - en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
> - if (en == NULL) {
> - IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
> - return NULL;
> - }
> -
> - INIT_LIST_HEAD(&en->list);
> - en->addr = daddr;
> -
> - /* initilize its dest set */
> - atomic_set(&(en->set.size), 0);
> - en->set.list = NULL;
> - rwlock_init(&en->set.lock);
> -
> - return en;
> -}
> -
> -
> static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
> {
> list_del(&en->list);
> @@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
> * Hash an entry in the ip_vs_lblcr_table.
> * returns bool success.
> */
> -static int
> +static void
> ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
> {
> - unsigned hash;
> -
> - if (!list_empty(&en->list)) {
> - IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
> - "called from %p\n", __builtin_return_address(0));
> - return 0;
> - }
> + unsigned hash = ip_vs_lblcr_hashkey(en->addr);
>
> - /*
> - * Hash by destination IP address
> - */
> - hash = ip_vs_lblcr_hashkey(en->addr);
> -
> - write_lock(&tbl->lock);
> list_add(&en->list, &tbl->bucket[hash]);
> atomic_inc(&tbl->entries);
> - write_unlock(&tbl->lock);
> -
> - return 1;
> }
>
>
> /*
> - * Get ip_vs_lblcr_entry associated with supplied parameters.
> + * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
> + * read lock.
> */
> static inline struct ip_vs_lblcr_entry *
> ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
> {
> - unsigned hash;
> + unsigned hash = ip_vs_lblcr_hashkey(addr);
> struct ip_vs_lblcr_entry *en;
>
> - hash = ip_vs_lblcr_hashkey(addr);
> + list_for_each_entry(en, &tbl->bucket[hash], list)
> + if (en->addr == addr)
> + return en;
> +
> + return NULL;
> +}
>
> - read_lock(&tbl->lock);
>
> - list_for_each_entry(en, &tbl->bucket[hash], list) {
> - if (en->addr == addr) {
> - /* HIT */
> - read_unlock(&tbl->lock);
> - return en;
> +/*
> + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
> + * IP address to a server. Called under write lock.
> + */
> +static inline struct ip_vs_lblcr_entry *
> +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr,
> + struct ip_vs_dest *dest)
> +{
> + struct ip_vs_lblcr_entry *en;
> +
> + en = ip_vs_lblcr_get(tbl, daddr);
> + if (!en) {
> + en = kmalloc(sizeof(*en), GFP_ATOMIC);
> + if (!en) {
> + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
> + return NULL;
> }
> +
> + en->addr = daddr;
> + en->lastuse = jiffies;
> +
> + /* initilize its dest set */
> + atomic_set(&(en->set.size), 0);
> + en->set.list = NULL;
> + rwlock_init(&en->set.lock);
> +
> + ip_vs_lblcr_hash(tbl, en);
> }
>
> - read_unlock(&tbl->lock);
> + write_lock(&en->set.lock);
> + ip_vs_dest_set_insert(&en->set, dest);
> + write_unlock(&en->set.lock);
>
> - return NULL;
> + return en;
> }
>
>
> @@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
> int i;
> struct ip_vs_lblcr_entry *en, *nxt;
>
> + /* No locking required, only called during cleanup. */
> for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
> - write_lock(&tbl->lock);
> list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
> ip_vs_lblcr_free(en);
> - atomic_dec(&tbl->entries);
> }
> - write_unlock(&tbl->lock);
> }
> }
>
>
> -static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
> +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
> {
> + struct ip_vs_lblcr_table *tbl = svc->sched_data;
> unsigned long now = jiffies;
> int i, j;
> struct ip_vs_lblcr_entry *en, *nxt;
> @@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
> for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
> j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
>
> - write_lock(&tbl->lock);
> + write_lock(&svc->sched_lock);
> list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
> if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
> now))
> @@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
> ip_vs_lblcr_free(en);
> atomic_dec(&tbl->entries);
> }
> - write_unlock(&tbl->lock);
> + write_unlock(&svc->sched_lock);
> }
> tbl->rover = j;
> }
> @@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
> */
> static void ip_vs_lblcr_check_expire(unsigned long data)
> {
> - struct ip_vs_lblcr_table *tbl;
> + struct ip_vs_service *svc = (struct ip_vs_service *) data;
> + struct ip_vs_lblcr_table *tbl = svc->sched_data;
> unsigned long now = jiffies;
> int goal;
> int i, j;
> struct ip_vs_lblcr_entry *en, *nxt;
>
> - tbl = (struct ip_vs_lblcr_table *)data;
> -
> if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
> /* do full expiration check */
> - ip_vs_lblcr_full_check(tbl);
> + ip_vs_lblcr_full_check(svc);
> tbl->counter = 1;
> goto out;
> }
> @@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
> for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
> j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
>
> - write_lock(&tbl->lock);
> + write_lock(&svc->sched_lock);
> list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
> if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
> continue;
> @@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
> atomic_dec(&tbl->entries);
> goal--;
> }
> - write_unlock(&tbl->lock);
> + write_unlock(&svc->sched_lock);
> if (goal <= 0)
> break;
> }
> @@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
> /*
> * Allocate the ip_vs_lblcr_table for this service
> */
> - tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
> + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
> if (tbl == NULL) {
> IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
> return -ENOMEM;
> }
> svc->sched_data = tbl;
> IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
> - "current service\n",
> - sizeof(struct ip_vs_lblcr_table));
> + "current service\n", sizeof(*tbl));
>
> /*
> * Initialize the hash buckets
> @@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
> for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
> INIT_LIST_HEAD(&tbl->bucket[i]);
> }
> - rwlock_init(&tbl->lock);
> tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
> tbl->rover = 0;
> tbl->counter = 1;
> @@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
> * Hook periodic timer for garbage collection
> */
> setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
> - (unsigned long)tbl);
> - tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
> - add_timer(&tbl->periodic_timer);
> + (unsigned long)svc);
> + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
>
> return 0;
> }
> @@ -564,9 +535,9 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
> ip_vs_lblcr_flush(tbl);
>
> /* release the table itself */
> - kfree(svc->sched_data);
> + kfree(tbl);
> IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
> - sizeof(struct ip_vs_lblcr_table));
> + sizeof(*tbl));
>
> return 0;
> }
> @@ -663,50 +634,78 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
> static struct ip_vs_dest *
> ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
> {
> - struct ip_vs_dest *dest;
> - struct ip_vs_lblcr_table *tbl;
> - struct ip_vs_lblcr_entry *en;
> + struct ip_vs_lblcr_table *tbl = svc->sched_data;
> struct iphdr *iph = ip_hdr(skb);
> + struct ip_vs_dest *dest = NULL;
> + struct ip_vs_lblcr_entry *en;
>
> IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
>
> - tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
> + /* First look in our cache */
> + read_lock(&svc->sched_lock);
> en = ip_vs_lblcr_get(tbl, iph->daddr);
> - if (en == NULL) {
> - dest = __ip_vs_lblcr_schedule(svc, iph);
> - if (dest == NULL) {
> - IP_VS_DBG(1, "no destination available\n");
> - return NULL;
> - }
> - en = ip_vs_lblcr_new(iph->daddr);
> - if (en == NULL) {
> - return NULL;
> - }
> - ip_vs_dest_set_insert(&en->set, dest);
> - ip_vs_lblcr_hash(tbl, en);
> - } else {
> + if (en) {
> + /* We only hold a read lock, but this is atomic */
> + en->lastuse = jiffies;
> +
> + /* Get the least loaded destination */
> + read_lock(&en->set.lock);
> dest = ip_vs_dest_set_min(&en->set);
> - if (!dest || is_overloaded(dest, svc)) {
> - dest = __ip_vs_lblcr_schedule(svc, iph);
> - if (dest == NULL) {
> - IP_VS_DBG(1, "no destination available\n");
> - return NULL;
> - }
> - ip_vs_dest_set_insert(&en->set, dest);
> - }
> + read_unlock(&en->set.lock);
> +
> + /* More than one destination + enough time passed by, cleanup */
> if (atomic_read(&en->set.size) > 1 &&
> - jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
> + time_after(jiffies, en->set.lastmod +
> + sysctl_ip_vs_lblcr_expiration)) {
> struct ip_vs_dest *m;
> +
> + write_lock(&en->set.lock);
> m = ip_vs_dest_set_max(&en->set);
> if (m)
> ip_vs_dest_set_erase(&en->set, m);
> + write_unlock(&en->set.lock);
> + }
> +
> + /* If the destination is not overloaded, use it */
> + if (dest && !is_overloaded(dest, svc)) {
> + read_unlock(&svc->sched_lock);
> + goto out;
> + }
> +
> + /* The cache entry is invalid, time to schedule */
> + dest = __ip_vs_lblcr_schedule(svc, iph);
> + if (!dest) {
> + IP_VS_DBG(1, "no destination available\n");
> + read_unlock(&svc->sched_lock);
> + return NULL;
> }
> +
> + /* Update our cache entry */
> + write_lock(&en->set.lock);
> + ip_vs_dest_set_insert(&en->set, dest);
> + write_unlock(&en->set.lock);
> + }
> + read_unlock(&svc->sched_lock);
> +
> + if (dest)
> + goto out;
> +
> + /* No cache entry, time to schedule */
> + dest = __ip_vs_lblcr_schedule(svc, iph);
> + if (!dest) {
> + IP_VS_DBG(1, "no destination available\n");
> + return NULL;
> }
> - en->lastuse = jiffies;
>
> + /* If we fail to create a cache entry, we'll just use the valid dest */
> + write_lock(&svc->sched_lock);
> + ip_vs_lblcr_new(tbl, iph->daddr, dest);
> + write_unlock(&svc->sched_lock);
> +
> +out:
> IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
> "--> server %u.%u.%u.%u:%d\n",
> - NIPQUAD(en->addr),
> + NIPQUAD(iph->addr),
Minor problem, this should be iph->daddr
> NIPQUAD(dest->addr),
> ntohs(dest->port));
>
> --
> To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] ipvs: Fix race conditions in lblcr scheduler
2008-08-18 23:32 ` Simon Horman
@ 2008-08-18 23:42 ` Sven Wegener
2008-08-19 0:41 ` Simon Horman
0 siblings, 1 reply; 6+ messages in thread
From: Sven Wegener @ 2008-08-18 23:42 UTC (permalink / raw)
To: Simon Horman; +Cc: netdev, lvs-devel, wensong, ja
On Tue, 19 Aug 2008, Simon Horman wrote:
> On Mon, Aug 18, 2008 at 12:53:05AM +0200, Sven Wegener wrote:
> > We can't access the cache entry outside of our critical read-locked region,
> > because someone may free that entry. Also getting an entry under read lock,
> > then locking for write and trying to delete that entry looks fishy, but should
> > be no problem here, because we're only comparing a pointer. Also there is no
> > need for our own rwlock, there is already one in the service structure for use
> > in the schedulers.
>
> Hi Sven,
>
> this looks good to me. Just a few minor comments inline.
>
> > Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
> > ---
> > net/ipv4/ipvs/ip_vs_lblcr.c | 229 +++++++++++++++++++++----------------------
> > 1 files changed, 114 insertions(+), 115 deletions(-)
> >
> > diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
> > index f1c8450..96bfdc2 100644
> > --- a/net/ipv4/ipvs/ip_vs_lblcr.c
> > +++ b/net/ipv4/ipvs/ip_vs_lblcr.c
> > @@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
> > return NULL;
> > }
> >
> > - e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
> > + e = kmalloc(sizeof(*e), GFP_ATOMIC);
>
> I think that I prefer using struct ip_vs_dest_list rather than *e.
> Ditto for *tbl below.
Actually, it's part of CodingStyle to use *e.
> > +out:
> > IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
> > "--> server %u.%u.%u.%u:%d\n",
> > - NIPQUAD(en->addr),
> > + NIPQUAD(iph->addr),
>
> Minor problem, this should be iph->daddr
Good catch, I've updated my patch locally. Let's get a consensus on the
sizeof issue and I'll repost.
Sven
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] ipvs: Fix race conditions in lblcr scheduler
2008-08-18 23:42 ` Sven Wegener
@ 2008-08-19 0:41 ` Simon Horman
2008-08-19 6:16 ` Sven Wegener
0 siblings, 1 reply; 6+ messages in thread
From: Simon Horman @ 2008-08-19 0:41 UTC (permalink / raw)
To: Sven Wegener; +Cc: netdev, lvs-devel, wensong, ja
On Tue, Aug 19, 2008 at 01:42:41AM +0200, Sven Wegener wrote:
> On Tue, 19 Aug 2008, Simon Horman wrote:
>
> > On Mon, Aug 18, 2008 at 12:53:05AM +0200, Sven Wegener wrote:
> > > We can't access the cache entry outside of our critical read-locked region,
> > > because someone may free that entry. Also getting an entry under read lock,
> > > then locking for write and trying to delete that entry looks fishy, but should
> > > be no problem here, because we're only comparing a pointer. Also there is no
> > > need for our own rwlock, there is already one in the service structure for use
> > > in the schedulers.
> >
> > Hi Sven,
> >
> > this looks good to me. Just a few minor comments inline.
> >
> > > Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
> > > ---
> > > net/ipv4/ipvs/ip_vs_lblcr.c | 229 +++++++++++++++++++++----------------------
> > > 1 files changed, 114 insertions(+), 115 deletions(-)
> > >
> > > diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
> > > index f1c8450..96bfdc2 100644
> > > --- a/net/ipv4/ipvs/ip_vs_lblcr.c
> > > +++ b/net/ipv4/ipvs/ip_vs_lblcr.c
> > > @@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
> > > return NULL;
> > > }
> > >
> > > - e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
> > > + e = kmalloc(sizeof(*e), GFP_ATOMIC);
> >
> > I think that I prefer using struct ip_vs_dest_list rather than *e.
> > Ditto for *tbl below.
>
> Actually, it's part of CodingStyle to use *e.
>
> > > +out:
> > > IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
> > > "--> server %u.%u.%u.%u:%d\n",
> > > - NIPQUAD(en->addr),
> > > + NIPQUAD(iph->addr),
> >
> > Minor problem, this should be iph->daddr
>
> Good catch, I've updated my patch locally. Let's get a consensus on the
> sizeof issue and I'll repost.
I'm happy with sizeof(*ptr) now. So I think that the lblc patch is ready
and this one just needs the minor daddr fix.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH] ipvs: Fix race conditions in lblcr scheduler
2008-08-19 0:41 ` Simon Horman
@ 2008-08-19 6:16 ` Sven Wegener
2008-08-19 7:41 ` Simon Horman
0 siblings, 1 reply; 6+ messages in thread
From: Sven Wegener @ 2008-08-19 6:16 UTC (permalink / raw)
To: Simon Horman; +Cc: netdev, lvs-devel, wensong, ja
We can't access the cache entry outside of our critical read-locked region,
because someone may free that entry. Also getting an entry under read lock,
then locking for write and trying to delete that entry looks fishy, but should
be no problem here, because we're only comparing a pointer. Also there is no
need for our own rwlock, there is already one in the service structure for use
in the schedulers.
Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
---
net/ipv4/ipvs/ip_vs_lblcr.c | 229 +++++++++++++++++++++----------------------
1 files changed, 114 insertions(+), 115 deletions(-)
Fix the addr->daddr in the debug output.
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index f1c8450..375a1ff 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
return NULL;
}
- e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
+ e = kmalloc(sizeof(*e), GFP_ATOMIC);
if (e == NULL) {
IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
return NULL;
@@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
e->dest = dest;
/* link it to the list */
- write_lock(&set->lock);
e->next = set->list;
set->list = e;
atomic_inc(&set->size);
- write_unlock(&set->lock);
set->lastmod = jiffies;
return e;
@@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
struct ip_vs_dest_list *e, **ep;
- write_lock(&set->lock);
for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
if (e->dest == dest) {
/* HIT */
@@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
}
ep = &e->next;
}
- write_unlock(&set->lock);
}
static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
@@ -174,7 +170,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
if (set == NULL)
return NULL;
- read_lock(&set->lock);
/* select the first destination server, whose weight > 0 */
for (e=set->list; e!=NULL; e=e->next) {
least = e->dest;
@@ -188,7 +183,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
goto nextstage;
}
}
- read_unlock(&set->lock);
return NULL;
/* find the destination with the weighted least load */
@@ -207,7 +201,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
loh = doh;
}
}
- read_unlock(&set->lock);
IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
@@ -229,7 +222,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
if (set == NULL)
return NULL;
- read_lock(&set->lock);
/* select the first destination server, whose weight > 0 */
for (e=set->list; e!=NULL; e=e->next) {
most = e->dest;
@@ -239,7 +231,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
goto nextstage;
}
}
- read_unlock(&set->lock);
return NULL;
/* find the destination with the weighted most load */
@@ -256,7 +247,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
moh = doh;
}
}
- read_unlock(&set->lock);
IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
@@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry {
* IPVS lblcr hash table
*/
struct ip_vs_lblcr_table {
- rwlock_t lock; /* lock for this table */
struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
@@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = {
static struct ctl_table_header * sysctl_header;
-/*
- * new/free a ip_vs_lblcr_entry, which is a mapping of a destination
- * IP address to a server.
- */
-static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
-{
- struct ip_vs_lblcr_entry *en;
-
- en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
- if (en == NULL) {
- IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
- return NULL;
- }
-
- INIT_LIST_HEAD(&en->list);
- en->addr = daddr;
-
- /* initilize its dest set */
- atomic_set(&(en->set.size), 0);
- en->set.list = NULL;
- rwlock_init(&en->set.lock);
-
- return en;
-}
-
-
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
list_del(&en->list);
@@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
* Hash an entry in the ip_vs_lblcr_table.
* returns bool success.
*/
-static int
+static void
ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
- unsigned hash;
-
- if (!list_empty(&en->list)) {
- IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
- "called from %p\n", __builtin_return_address(0));
- return 0;
- }
+ unsigned hash = ip_vs_lblcr_hashkey(en->addr);
- /*
- * Hash by destination IP address
- */
- hash = ip_vs_lblcr_hashkey(en->addr);
-
- write_lock(&tbl->lock);
list_add(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
- write_unlock(&tbl->lock);
-
- return 1;
}
/*
- * Get ip_vs_lblcr_entry associated with supplied parameters.
+ * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
+ * read lock.
*/
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
{
- unsigned hash;
+ unsigned hash = ip_vs_lblcr_hashkey(addr);
struct ip_vs_lblcr_entry *en;
- hash = ip_vs_lblcr_hashkey(addr);
+ list_for_each_entry(en, &tbl->bucket[hash], list)
+ if (en->addr == addr)
+ return en;
+
+ return NULL;
+}
- read_lock(&tbl->lock);
- list_for_each_entry(en, &tbl->bucket[hash], list) {
- if (en->addr == addr) {
- /* HIT */
- read_unlock(&tbl->lock);
- return en;
+/*
+ * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
+ * IP address to a server. Called under write lock.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_lblcr_entry *en;
+
+ en = ip_vs_lblcr_get(tbl, daddr);
+ if (!en) {
+ en = kmalloc(sizeof(*en), GFP_ATOMIC);
+ if (!en) {
+ IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
+ return NULL;
}
+
+ en->addr = daddr;
+ en->lastuse = jiffies;
+
+ /* initilize its dest set */
+ atomic_set(&(en->set.size), 0);
+ en->set.list = NULL;
+ rwlock_init(&en->set.lock);
+
+ ip_vs_lblcr_hash(tbl, en);
}
- read_unlock(&tbl->lock);
+ write_lock(&en->set.lock);
+ ip_vs_dest_set_insert(&en->set, dest);
+ write_unlock(&en->set.lock);
- return NULL;
+ return en;
}
@@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
int i;
struct ip_vs_lblcr_entry *en, *nxt;
+ /* No locking required, only called during cleanup. */
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- write_lock(&tbl->lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
ip_vs_lblcr_free(en);
- atomic_dec(&tbl->entries);
}
- write_unlock(&tbl->lock);
}
}
-static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
+static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int i, j;
struct ip_vs_lblcr_entry *en, *nxt;
@@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&tbl->lock);
+ write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
now))
@@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
}
- write_unlock(&tbl->lock);
+ write_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
@@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
*/
static void ip_vs_lblcr_check_expire(unsigned long data)
{
- struct ip_vs_lblcr_table *tbl;
+ struct ip_vs_service *svc = (struct ip_vs_service *) data;
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int goal;
int i, j;
struct ip_vs_lblcr_entry *en, *nxt;
- tbl = (struct ip_vs_lblcr_table *)data;
-
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
- ip_vs_lblcr_full_check(tbl);
+ ip_vs_lblcr_full_check(svc);
tbl->counter = 1;
goto out;
}
@@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&tbl->lock);
+ write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
continue;
@@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
atomic_dec(&tbl->entries);
goal--;
}
- write_unlock(&tbl->lock);
+ write_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
@@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
/*
* Allocate the ip_vs_lblcr_table for this service
*/
- tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
+ tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
if (tbl == NULL) {
IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
return -ENOMEM;
}
svc->sched_data = tbl;
IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
- "current service\n",
- sizeof(struct ip_vs_lblcr_table));
+ "current service\n", sizeof(*tbl));
/*
* Initialize the hash buckets
@@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
INIT_LIST_HEAD(&tbl->bucket[i]);
}
- rwlock_init(&tbl->lock);
tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
@@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
* Hook periodic timer for garbage collection
*/
setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
- (unsigned long)tbl);
- tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
- add_timer(&tbl->periodic_timer);
+ (unsigned long)svc);
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
return 0;
}
@@ -564,9 +535,9 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
ip_vs_lblcr_flush(tbl);
/* release the table itself */
- kfree(svc->sched_data);
+ kfree(tbl);
IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
- sizeof(struct ip_vs_lblcr_table));
+ sizeof(*tbl));
return 0;
}
@@ -663,50 +634,78 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
- struct ip_vs_dest *dest;
- struct ip_vs_lblcr_table *tbl;
- struct ip_vs_lblcr_entry *en;
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
struct iphdr *iph = ip_hdr(skb);
+ struct ip_vs_dest *dest = NULL;
+ struct ip_vs_lblcr_entry *en;
IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
- tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
+ /* First look in our cache */
+ read_lock(&svc->sched_lock);
en = ip_vs_lblcr_get(tbl, iph->daddr);
- if (en == NULL) {
- dest = __ip_vs_lblcr_schedule(svc, iph);
- if (dest == NULL) {
- IP_VS_DBG(1, "no destination available\n");
- return NULL;
- }
- en = ip_vs_lblcr_new(iph->daddr);
- if (en == NULL) {
- return NULL;
- }
- ip_vs_dest_set_insert(&en->set, dest);
- ip_vs_lblcr_hash(tbl, en);
- } else {
+ if (en) {
+ /* We only hold a read lock, but this is atomic */
+ en->lastuse = jiffies;
+
+ /* Get the least loaded destination */
+ read_lock(&en->set.lock);
dest = ip_vs_dest_set_min(&en->set);
- if (!dest || is_overloaded(dest, svc)) {
- dest = __ip_vs_lblcr_schedule(svc, iph);
- if (dest == NULL) {
- IP_VS_DBG(1, "no destination available\n");
- return NULL;
- }
- ip_vs_dest_set_insert(&en->set, dest);
- }
+ read_unlock(&en->set.lock);
+
+ /* More than one destination + enough time passed by, cleanup */
if (atomic_read(&en->set.size) > 1 &&
- jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
+ time_after(jiffies, en->set.lastmod +
+ sysctl_ip_vs_lblcr_expiration)) {
struct ip_vs_dest *m;
+
+ write_lock(&en->set.lock);
m = ip_vs_dest_set_max(&en->set);
if (m)
ip_vs_dest_set_erase(&en->set, m);
+ write_unlock(&en->set.lock);
+ }
+
+ /* If the destination is not overloaded, use it */
+ if (dest && !is_overloaded(dest, svc)) {
+ read_unlock(&svc->sched_lock);
+ goto out;
+ }
+
+ /* The cache entry is invalid, time to schedule */
+ dest = __ip_vs_lblcr_schedule(svc, iph);
+ if (!dest) {
+ IP_VS_DBG(1, "no destination available\n");
+ read_unlock(&svc->sched_lock);
+ return NULL;
}
+
+ /* Update our cache entry */
+ write_lock(&en->set.lock);
+ ip_vs_dest_set_insert(&en->set, dest);
+ write_unlock(&en->set.lock);
+ }
+ read_unlock(&svc->sched_lock);
+
+ if (dest)
+ goto out;
+
+ /* No cache entry, time to schedule */
+ dest = __ip_vs_lblcr_schedule(svc, iph);
+ if (!dest) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
}
- en->lastuse = jiffies;
+ /* If we fail to create a cache entry, we'll just use the valid dest */
+ write_lock(&svc->sched_lock);
+ ip_vs_lblcr_new(tbl, iph->daddr, dest);
+ write_unlock(&svc->sched_lock);
+
+out:
IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
"--> server %u.%u.%u.%u:%d\n",
- NIPQUAD(en->addr),
+ NIPQUAD(iph->daddr),
NIPQUAD(dest->addr),
ntohs(dest->port));
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH] ipvs: Fix race conditions in lblcr scheduler
2008-08-19 6:16 ` Sven Wegener
@ 2008-08-19 7:41 ` Simon Horman
0 siblings, 0 replies; 6+ messages in thread
From: Simon Horman @ 2008-08-19 7:41 UTC (permalink / raw)
To: Sven Wegener; +Cc: netdev, lvs-devel, wensong, ja
On Tue, Aug 19, 2008 at 08:16:19AM +0200, Sven Wegener wrote:
> We can't access the cache entry outside of our critical read-locked region,
> because someone may free that entry. Also getting an entry under read lock,
> then locking for write and trying to delete that entry looks fishy, but should
> be no problem here, because we're only comparing a pointer. Also there is no
> need for our own rwlock, there is already one in the service structure for use
> in the schedulers.
>
> Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Thanks, applied to lvs-2.6
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2008-08-19 7:41 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-08-17 22:53 [PATCH] ipvs: Fix race conditions in lblcr scheduler Sven Wegener
2008-08-18 23:32 ` Simon Horman
2008-08-18 23:42 ` Sven Wegener
2008-08-19 0:41 ` Simon Horman
2008-08-19 6:16 ` Sven Wegener
2008-08-19 7:41 ` Simon Horman
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).