* [RFC PATCH] ip: re-introduce fragments cache worker
From: Paolo Abeni @ 2018-07-06 10:10 UTC
  To: netdev; +Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown

Currently, the IP frag cache is fragile under overload. With
flow control disabled:

./super_netperf.sh 10  -H 192.168.101.2 -t UDP_STREAM -l 60
9618.08
./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60
28.66
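
For reference, super_netperf.sh is not part of the kernel tree:
roughly, it starts the requested number of netperf instances in
parallel with the remaining arguments and sums the reported figures
of merit. A minimal sketch, with approximated flags and parsing:

    #!/bin/sh
    # run $1 netperf instances in parallel, sum their results
    N=$1; shift
    for i in $(seq "$N"); do
            netperf -P 0 -v 0 "$@" &   # print only the figure of merit
    done > /tmp/super_netperf.$$
    wait
    awk '{ sum += $1 } END { print sum }' /tmp/super_netperf.$$
    rm -f /tmp/super_netperf.$$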

Once the overload condition is reached, the system does not
recover until it is almost completely idle:

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60 &
sleep 4; I=0;
for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; done
13.72

This is due to the removal of the fragment cache worker, which
was responsible for freeing some IP fragment cache memory when
the high threshold was reached, allowing the system to cope
with subsequent fragmented packets.

This commit re-introduces the worker, on a per-netns basis. Thanks
to rhashtable walkers, we need to disable BHs only around each
individual entry removal.
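
For reference, the walker usage pattern looks roughly as follows (a
sketch, error handling trimmed; the real eviction loop is in the
patch below):

    rhashtable_walk_enter(&ht, &iter);  /* no locks held, can sleep */
    rhashtable_walk_start(&iter);       /* enters an RCU read section */
    while ((obj = rhashtable_walk_next(&iter)) != NULL) {
            if (IS_ERR(obj))            /* -EAGAIN: resize event, */
                    continue;           /* just keep walking */
            /* take a reference; BHs are disabled only while
             * actually removing the entry
             */
    }
    rhashtable_walk_stop(&iter);        /* leaves the RCU section */
    rhashtable_walk_exit(&iter);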

After this commit (matching the figures measured before the IP
frag worker removal):

./super_netperf.sh 10  -H 192.168.101.2 -t UDP_STREAM -l 60
9618.08

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60
8599.77

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60 &
sleep 4; I=0;
for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; done
9623.12

Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
Note: tweaking the ipfrag sysctls (examples below) does not
completely solve the issue:
- raising ipfrag_high_thresh increases the number of parallel
  connections required to degrade the tput, but once the IP
  fragment cache capacity is reached, the goodput still drops
  almost to 0; with the worker we get much nicer behaviour.
- setting ipfrag_time to 2 increases the chance to recover from
  overload (test #2 above), but over several runs of that test
  I got on average 50% of the expected tput with a very large
  variance; with the worker we always see the expected/line
  rate tput.
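
For reference, the tweaking above was done along these lines
(values are illustrative only):

    # grow the fragment cache (bytes); the default is 4M
    sysctl -w net.ipv4.ipfrag_high_thresh=16777216
    # expire incomplete reassembly queues after 2 seconds
    # instead of the default 30
    sysctl -w net.ipv4.ipfrag_time=2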
---
 include/net/inet_frag.h  |  8 ++---
 net/ipv4/inet_fragment.c | 72 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index ed07e3786d98..1f12692d7f7d 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -11,6 +11,8 @@ struct netns_frags {
 	int			timeout;
 	int			max_dist;
 	struct inet_frags	*f;
+	struct work_struct	frags_work;
+	struct rhashtable_iter	iter;
 
 	struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
 
@@ -101,11 +103,7 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline int inet_frags_init_net(struct netns_frags *nf)
-{
-	atomic_long_set(&nf->mem, 0);
-	return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
-}
+int inet_frags_init_net(struct netns_frags *nf);
 void inet_frags_exit_net(struct netns_frags *nf);
 
 void inet_frag_kill(struct inet_frag_queue *q);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d093..0f5b29ce96de 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -88,10 +88,76 @@ static void inet_frags_free_cb(void *ptr, void *arg)
 	inet_frag_put(fq);
 }
 
+static void inet_frag_schedule_worker(struct netns_frags *nf)
+{
+	if (unlikely(!work_pending(&nf->frags_work)))
+		schedule_work(&nf->frags_work);
+}
+
+#define INETFRAGS_EVICT_MAX	64
+static void inet_frag_worker(struct work_struct *work)
+{
+	struct netns_frags *nf;
+	bool reschedule;
+	int evicted = 0;
+
+	nf = container_of(work, struct netns_frags, frags_work);
+
+	rhashtable_walk_start(&nf->iter);
+
+	while ((reschedule = (frag_mem_limit(nf) > nf->low_thresh))) {
+		struct inet_frag_queue *fq = rhashtable_walk_next(&nf->iter);
+
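+		/* -EAGAIN: a resize event moved the walk to the new
+		 * table; seeing an entry twice is harmless here, as
+		 * evicted queues have already left the table
+		 */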
+		if (IS_ERR(fq) && PTR_ERR(fq) == -EAGAIN)
+			continue;
+		if (!fq) {
+			/* end of table, restart the walk */
+			rhashtable_walk_stop(&nf->iter);
+			rhashtable_walk_exit(&nf->iter);
+			rhashtable_walk_enter(&nf->rhashtable, &nf->iter);
+			rhashtable_walk_start(&nf->iter);
+			continue;
+		}
+		if (!refcount_inc_not_zero(&fq->refcnt))
+			continue;
+
+		spin_lock_bh(&fq->lock);
+		inet_frag_kill(fq);
+		spin_unlock_bh(&fq->lock);
+		inet_frag_put(fq);
+
+		/* limit the amount of work we can do before a reschedule,
+	 * to avoid starving other queued work items
+		 */
+		if (++evicted > INETFRAGS_EVICT_MAX)
+			break;
+	}
+
+	rhashtable_walk_stop(&nf->iter);
+
+	if (reschedule)
+		inet_frag_schedule_worker(nf);
+}
+
+int inet_frags_init_net(struct netns_frags *nf)
+{
+	int ret;
+
+	atomic_long_set(&nf->mem, 0);
+	INIT_WORK(&nf->frags_work, inet_frag_worker);
+	ret = rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
+	if (ret)
+		return ret;
+	rhashtable_walk_enter(&nf->rhashtable, &nf->iter);
+	return ret;
+}
+EXPORT_SYMBOL(inet_frags_init_net);
+
 void inet_frags_exit_net(struct netns_frags *nf)
 {
 	nf->low_thresh = 0; /* prevent creation of new frags */
-
+	cancel_work_sync(&nf->frags_work);
+	rhashtable_walk_exit(&nf->iter);
 	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
@@ -157,8 +223,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 {
 	struct inet_frag_queue *q;
 
-	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
+		inet_frag_schedule_worker(nf);
 		return NULL;
+	}
 
 	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
 	if (!q)
-- 
2.17.1
