[RFC PATCH] ip: re-introduce fragments cache worker

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH] ip: re-introduce fragments cache worker
@ 2018-07-06 10:10 Paolo Abeni
  2018-07-06 11:23 ` Eric Dumazet
  0 siblings, 1 reply; 15+ messages in thread
From: Paolo Abeni @ 2018-07-06 10:10 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown

Currently, the ip frag cache is fragile to overload. With
flow control disabled:

./super_netperf.sh 10  -H 192.168.101.2 -t UDP_STREAM -l 60
9618.08
./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60
28.66

Once that the overload condition is reached, the system does not
recover until it's almost completely idle:

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60 &
sleep 4; I=0;
for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; done
13.72

This is due to the removal of the fragment cache worker, which
was responsible to free some IP fragment cache memory when the
high threshold was reached, allowing the system to cope successfully
with the next fragmented packets.

This commit re-introduces the worker, on a per netns basis. Thanks
to rhashtable walkers we can block the bh only for an entry removal.

After this commit (and before IP frag worker removal):

./super_netperf.sh 10  -H 192.168.101.2 -t UDP_STREAM -l 60
9618.08

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60
8599.77

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60 &
sleep 4; I=0;
for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; done
9623.12

Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
Note: tweaking ipfrag sysfs does not solve completely the issue:
- raising ipfrag_high_thresh increases the number of parallels
  connections required to degrade the tput, but reached the IP
  fragment cache capacity the good-put still goes almost to 0,
  with the worker we get much more nice behaviour.
- setting ipfrag_time to 2 increases the change to recover from
  overload (the 2# test above), but with several experiments 
  in such test, I got an average of 50% the expected tput with a very
  large variance, with the worker we always see the expected/
  line rate tput.
---
 include/net/inet_frag.h  |  8 ++---
 net/ipv4/inet_fragment.c | 72 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index ed07e3786d98..1f12692d7f7d 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -11,6 +11,8 @@ struct netns_frags {
 	int			timeout;
 	int			max_dist;
 	struct inet_frags	*f;
+	struct work_struct	frags_work;
+	struct rhashtable_iter	iter;
 
 	struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
 
@@ -101,11 +103,7 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline int inet_frags_init_net(struct netns_frags *nf)
-{
-	atomic_long_set(&nf->mem, 0);
-	return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
-}
+int inet_frags_init_net(struct netns_frags *nf);
 void inet_frags_exit_net(struct netns_frags *nf);
 
 void inet_frag_kill(struct inet_frag_queue *q);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d093..0f5b29ce96de 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -88,10 +88,76 @@ static void inet_frags_free_cb(void *ptr, void *arg)
 	inet_frag_put(fq);
 }
 
+static void inet_frag_schedule_worker(struct netns_frags *nf)
+{
+	if (unlikely(!work_pending(&nf->frags_work)))
+		schedule_work(&nf->frags_work);
+}
+
+#define INETFRAGS_EVICT_MAX	64
+static void inet_frag_worker(struct work_struct *work)
+{
+	struct netns_frags *nf;
+	bool reschedule;
+	int evicted = 0;
+
+	nf = container_of(work, struct netns_frags, frags_work);
+
+	rhashtable_walk_start(&nf->iter);
+
+	while ((reschedule = (frag_mem_limit(nf) > nf->low_thresh))) {
+		struct inet_frag_queue *fq = rhashtable_walk_next(&nf->iter);
+
+		if (IS_ERR(fq) && PTR_ERR(fq) == -EAGAIN)
+			continue;
+		if (!fq) {
+			/* end of table, restart the walk */
+			rhashtable_walk_stop(&nf->iter);
+			rhashtable_walk_exit(&nf->iter);
+			rhashtable_walk_enter(&nf->rhashtable, &nf->iter);
+			rhashtable_walk_start(&nf->iter);
+			continue;
+		}
+		if (!refcount_inc_not_zero(&fq->refcnt))
+			continue;
+
+		spin_lock_bh(&fq->lock);
+		inet_frag_kill(fq);
+		spin_unlock_bh(&fq->lock);
+		inet_frag_put(fq);
+
+		/* limit the amount of work we can do before a reschedule,
+		 * to avoid starving others queued works
+		 */
+		if (++evicted > INETFRAGS_EVICT_MAX)
+			break;
+	}
+
+	rhashtable_walk_stop(&nf->iter);
+
+	if (reschedule)
+		inet_frag_schedule_worker(nf);
+}
+
+int inet_frags_init_net(struct netns_frags *nf)
+{
+	int ret;
+
+	atomic_long_set(&nf->mem, 0);
+	INIT_WORK(&nf->frags_work, inet_frag_worker);
+	ret = rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
+	if (ret)
+		return ret;
+	rhashtable_walk_enter(&nf->rhashtable, &nf->iter);
+	return ret;
+}
+EXPORT_SYMBOL(inet_frags_init_net);
+
 void inet_frags_exit_net(struct netns_frags *nf)
 {
 	nf->low_thresh = 0; /* prevent creation of new frags */
-
+	cancel_work_sync(&nf->frags_work);
+	rhashtable_walk_exit(&nf->iter);
 	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
@@ -157,8 +223,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 {
 	struct inet_frag_queue *q;
 
-	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
+		inet_frag_schedule_worker(nf);
 		return NULL;
+	}
 
 	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
 	if (!q)
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-06 10:10 [RFC PATCH] ip: re-introduce fragments cache worker Paolo Abeni
@ 2018-07-06 11:23 ` Eric Dumazet
  2018-07-06 11:56   ` Paolo Abeni
  0 siblings, 1 reply; 15+ messages in thread
From: Eric Dumazet @ 2018-07-06 11:23 UTC (permalink / raw)
  To: Paolo Abeni, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown



On 07/06/2018 03:10 AM, Paolo Abeni wrote:
> Currently, the ip frag cache is fragile to overload. With
> flow control disabled:
> 
> ./super_netperf.sh 10  -H 192.168.101.2 -t UDP_STREAM -l 60
> 9618.08
> ./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60
> 28.66
> 
> Once that the overload condition is reached, the system does not
> recover until it's almost completely idle:
> 
> ./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60 &
> sleep 4; I=0;
> for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; done
> 13.72
> 
> This is due to the removal of the fragment cache worker, which
> was responsible to free some IP fragment cache memory when the
> high threshold was reached, allowing the system to cope successfully
> with the next fragmented packets.
> 
> This commit re-introduces the worker, on a per netns basis. Thanks
> to rhashtable walkers we can block the bh only for an entry removal.
> 
> After this commit (and before IP frag worker removal):
> 
> ./super_netperf.sh 10  -H 192.168.101.2 -t UDP_STREAM -l 60
> 9618.08
> 
> ./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60
> 8599.77
> 
> ./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60 &
> sleep 4; I=0;
> for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; done
> 9623.12
> 
> Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> Note: tweaking ipfrag sysfs does not solve completely the issue:
> - raising ipfrag_high_thresh increases the number of parallels
>   connections required to degrade the tput, but reached the IP
>   fragment cache capacity the good-put still goes almost to 0,
>   with the worker we get much more nice behaviour.
> - setting ipfrag_time to 2 increases the change to recover from
>   overload (the 2# test above), but with several experiments 
>   in such test, I got an average of 50% the expected tput with a very
>   large variance, with the worker we always see the expected/
>   line rate tput.
> ---

Ho hum. No please.

I do not think adding back a GC is wise, since my patches were going in the direction
of allowing us to increase limits on current hardware.

Meaning that the amount of frags to evict would be quite big under DDOS.
(One inet_frag_queue allocated for every incoming tiny frame :/ )

A GC is a _huge_ problem, burning one cpu (you would have to provision for this CPU)
compared to letting normal per frag timer doing its job.

My plan was to reduce the per frag timer under load (default is 30 seconds), since
this is exactly what your patch is indirectly doing, by aggressively pruning
frags under stress.

That would be a much simpler heuristic. [1]

BTW my own results (before patch) are :

lpaa5:/export/hda3/google/edumazet# ./super_netperf 10 -H 10.246.7.134 -t UDP_STREAM -l 60
   9602
lpaa5:/export/hda3/google/edumazet# ./super_netperf 200 -H 10.246.7.134 -t UDP_STREAM -l 60
   9557

On receiver (normal settings here) I had :

lpaa6:/export/hda3/google/edumazet# grep . /proc/sys/net/ipv4/ipfrag_*
/proc/sys/net/ipv4/ipfrag_high_thresh:104857600
/proc/sys/net/ipv4/ipfrag_low_thresh:78643200
/proc/sys/net/ipv4/ipfrag_max_dist:0
/proc/sys/net/ipv4/ipfrag_secret_interval:0
/proc/sys/net/ipv4/ipfrag_time:30

lpaa6:/export/hda3/google/edumazet# grep FRAG /proc/net/sockstat
FRAG: inuse 824 memory 53125312

[1] Something like (for IPv4 only here)

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d0931df8429a33e8d03e719b87da0747..88ed61bcda00f3357724e5c4dbcb97400b4a8b21 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -155,9 +155,15 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
                                               struct inet_frags *f,
                                               void *arg)
 {
+       long high_thresh = READ_ONCE(nf->high_thresh);
        struct inet_frag_queue *q;
+       u64 timeout;
+       long usage;
 
-       if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+       if (!high_thresh)
+               return NULL;
+       usage = frag_mem_limit(nf);
+       if (usage > high_thresh)
                return NULL;
 
        q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
@@ -171,6 +177,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
        timer_setup(&q->timer, f->frag_expire, 0);
        spin_lock_init(&q->lock);
        refcount_set(&q->refcnt, 3);
+       timeout = (u64)nf->timeout * (high_thresh - usage);
+       mod_timer(&q->timer, jiffies + div64_long(timeout, high_thresh));
 
        return q;
 }
@@ -186,8 +194,6 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
        if (!q)
                return NULL;
 
-       mod_timer(&q->timer, jiffies + nf->timeout);
-
        err = rhashtable_insert_fast(&nf->rhashtable, &q->node,
                                     f->rhash_params);
        if (err < 0) {

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-06 11:23 ` Eric Dumazet
@ 2018-07-06 11:56   ` Paolo Abeni
  2018-07-06 12:09     ` Eric Dumazet
  0 siblings, 1 reply; 15+ messages in thread
From: Paolo Abeni @ 2018-07-06 11:56 UTC (permalink / raw)
  To: Eric Dumazet, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown

Hi,

On Fri, 2018-07-06 at 04:23 -0700, Eric Dumazet wrote:
> Ho hum. No please.
> 
> I do not think adding back a GC is wise, since my patches were going in the direction
> of allowing us to increase limits on current hardware.
> 
> Meaning that the amount of frags to evict would be quite big under DDOS.
> (One inet_frag_queue allocated for every incoming tiny frame :/ )
> 
> A GC is a _huge_ problem, burning one cpu (you would have to provision for this CPU)
> compared to letting normal per frag timer doing its job.
> 
> My plan was to reduce the per frag timer under load (default is 30 seconds), since
> this is exactly what your patch is indirectly doing, by aggressively pruning
> frags under stress.
> 
> That would be a much simpler heuristic. [1]
> 
> BTW my own results (before patch) are :
> 
> lpaa5:/export/hda3/google/edumazet# ./super_netperf 10 -H 10.246.7.134 -t UDP_STREAM -l 60
>    9602
> lpaa5:/export/hda3/google/edumazet# ./super_netperf 200 -H 10.246.7.134 -t UDP_STREAM -l 60
>    9557
> 
> On receiver (normal settings here) I had :
> 
> lpaa6:/export/hda3/google/edumazet# grep . /proc/sys/net/ipv4/ipfrag_*
> /proc/sys/net/ipv4/ipfrag_high_thresh:104857600
> /proc/sys/net/ipv4/ipfrag_low_thresh:78643200
> /proc/sys/net/ipv4/ipfrag_max_dist:0
> /proc/sys/net/ipv4/ipfrag_secret_interval:0
> /proc/sys/net/ipv4/ipfrag_time:30
> 
> lpaa6:/export/hda3/google/edumazet# grep FRAG /proc/net/sockstat
> FRAG: inuse 824 memory 53125312

Than you for the feedback.

With your setting, you need a bit more concurrent connections (400 ?)
to saturate the ipfrag cache. Above that number, performances will
still sink.

> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index c9e35b81d0931df8429a33e8d03e719b87da0747..88ed61bcda00f3357724e5c4dbcb97400b4a8b21 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -155,9 +155,15 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
>                                                struct inet_frags *f,
>                                                void *arg)
>  {
> +       long high_thresh = READ_ONCE(nf->high_thresh);
>         struct inet_frag_queue *q;
> +       u64 timeout;
> +       long usage;
>  
> -       if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
> +       if (!high_thresh)
> +               return NULL;
> +       usage = frag_mem_limit(nf);
> +       if (usage > high_thresh)
>                 return NULL;
>  
>         q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
> @@ -171,6 +177,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
>         timer_setup(&q->timer, f->frag_expire, 0);
>         spin_lock_init(&q->lock);
>         refcount_set(&q->refcnt, 3);
> +       timeout = (u64)nf->timeout * (high_thresh - usage);
> +       mod_timer(&q->timer, jiffies + div64_long(timeout, high_thresh));
>  
>         return q;
>  }

This looks nice, I'll try to test it in my use case and I'll report
here.

Perhaps we can use the default timeout when usage < low_thresh, to
avoid some maths in possibly common scenario?

I have doubt: under DDOS we will trigger <max numfrags> timeout per
jiffy, can that keep a CPU busy, too?

Cheers,

Paolo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-06 11:56   ` Paolo Abeni
@ 2018-07-06 12:09     ` Eric Dumazet
  2018-07-06 13:56       ` Paolo Abeni
  0 siblings, 1 reply; 15+ messages in thread
From: Eric Dumazet @ 2018-07-06 12:09 UTC (permalink / raw)
  To: Paolo Abeni, Eric Dumazet, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown



On 07/06/2018 04:56 AM, Paolo Abeni wrote:
> Hi,
> 
> On Fri, 2018-07-06 at 04:23 -0700, Eric Dumazet wrote:
>> Ho hum. No please.
>>
>> I do not think adding back a GC is wise, since my patches were going in the direction
>> of allowing us to increase limits on current hardware.
>>
>> Meaning that the amount of frags to evict would be quite big under DDOS.
>> (One inet_frag_queue allocated for every incoming tiny frame :/ )
>>
>> A GC is a _huge_ problem, burning one cpu (you would have to provision for this CPU)
>> compared to letting normal per frag timer doing its job.
>>
>> My plan was to reduce the per frag timer under load (default is 30 seconds), since
>> this is exactly what your patch is indirectly doing, by aggressively pruning
>> frags under stress.
>>
>> That would be a much simpler heuristic. [1]
>>
>> BTW my own results (before patch) are :
>>
>> lpaa5:/export/hda3/google/edumazet# ./super_netperf 10 -H 10.246.7.134 -t UDP_STREAM -l 60
>>    9602
>> lpaa5:/export/hda3/google/edumazet# ./super_netperf 200 -H 10.246.7.134 -t UDP_STREAM -l 60
>>    9557
>>
>> On receiver (normal settings here) I had :
>>
>> lpaa6:/export/hda3/google/edumazet# grep . /proc/sys/net/ipv4/ipfrag_*
>> /proc/sys/net/ipv4/ipfrag_high_thresh:104857600
>> /proc/sys/net/ipv4/ipfrag_low_thresh:78643200
>> /proc/sys/net/ipv4/ipfrag_max_dist:0
>> /proc/sys/net/ipv4/ipfrag_secret_interval:0
>> /proc/sys/net/ipv4/ipfrag_time:30
>>
>> lpaa6:/export/hda3/google/edumazet# grep FRAG /proc/net/sockstat
>> FRAG: inuse 824 memory 53125312
> 
> Than you for the feedback.
> 
> With your setting, you need a bit more concurrent connections (400 ?)
> to saturate the ipfrag cache. Above that number, performances will
> still sink.

Maybe, but IP defrag can not be 'perfect'.

For this particular use case I could still bump high_thresh to 6 GB and all would be good :)

> This looks nice, I'll try to test it in my use case and I'll report
> here.
> 
> Perhaps we can use the default timeout when usage < low_thresh, to
> avoid some maths in possibly common scenario?

On current 64bit hardware, a divide here is not a big cost (compared to the rest
of frag setup)
and I would rather starting having smaller timeouts sooner than later ;)

(low_thresh is typically set to 75% of high_thresh)

> 
> I have doubt: under DDOS we will trigger <max numfrags> timeout per
> jiffy, can that keep a CPU busy, too?

Yes, the cpu(s) handling the RX queue(s), which are already provisioned for networking stuff ;)

Even without any frag being received, these cpu can be 100% busy.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-06 12:09     ` Eric Dumazet
@ 2018-07-06 13:56       ` Paolo Abeni
  2018-07-06 14:20         ` Eric Dumazet
  2018-07-06 14:37         ` Eric Dumazet
  0 siblings, 2 replies; 15+ messages in thread
From: Paolo Abeni @ 2018-07-06 13:56 UTC (permalink / raw)
  To: Eric Dumazet, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown

On Fri, 2018-07-06 at 05:09 -0700, Eric Dumazet wrote:
> On 07/06/2018 04:56 AM, Paolo Abeni wrote:
> > With your setting, you need a bit more concurrent connections (400 ?)
> > to saturate the ipfrag cache. Above that number, performances will
> > still sink.
> 
> Maybe, but IP defrag can not be 'perfect'.
> 
> For this particular use case I could still bump high_thresh to 6 GB and all would be good :)

Understood.

I'd like to be sure I stated the problem I see clearly. With the
current code the "goodput" goes to almost 0 as soon as the ipfrag cache
load goes above it's capacity. Before the worker removal, after
reaching high_thresh, the "goodput" degratated slowly and even with a
load more than an order of magnitude higher, the performances were
still quite good. I think we can't ask customers to add more memory for
a kernel upgrade; even changing the default sysfs configuration is
somewhat troubling.

> > This looks nice, I'll try to test it in my use case and I'll report
> > here.

I tried the patch, but the result are not encouraging:

./super_netperf.sh 200 -H 192.168.101.2 -t UDP_STREAM -l 60
34.94

# on the receiver side:
echo 2 >  /proc/sys/net/ipv4/ipfrag_time

# on the sender side:
./super_netperf.sh 200 -H 192.168.101.2 -t UDP_STREAM -l 60
85.8

# still on receiver side, while the test is running:
nstat>/dev/null ;sleep 1; nstat |grep IpReasm
IpReasmTimeout                  2128               0.0
IpReasmReqds                    754770             0.0
IpReasmOKs                      135                0.0
IpReasmFails                    752811             0.0

grep FRAG /proc/net/sockstat
FRAG: inuse 124 memory 5286144

The patch has some effect, as I basically saw no timeout without it,
but still does not look aggressive enough. Or possibly it's evicting
the fragments that are more likely to be used/completed (the most
recents one).

> > I have doubt: under DDOS we will trigger <max numfrags> timeout per
> > jiffy, can that keep a CPU busy, too?
> 
> Yes, the cpu(s) handling the RX queue(s), which are already provisioned for networking stuff ;)
> 
> Even without any frag being received, these cpu can be 100% busy.

With:

schedule_work_on(smp_processor_id(), #... )

We can be sure to run exclusively on the cpu handling the RX queue even with the worker.

Cheers,

Paolo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-06 13:56       ` Paolo Abeni
@ 2018-07-06 14:20         ` Eric Dumazet
  2018-07-09  9:43           ` Paolo Abeni
  2018-07-06 14:37         ` Eric Dumazet
  1 sibling, 1 reply; 15+ messages in thread
From: Eric Dumazet @ 2018-07-06 14:20 UTC (permalink / raw)
  To: Paolo Abeni, Eric Dumazet, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown



On 07/06/2018 06:56 AM, Paolo Abeni wrote:
> On Fri, 2018-07-06 at 05:09 -0700, Eric Dumazet wrote:
>> On 07/06/2018 04:56 AM, Paolo Abeni wrote:
>>> With your setting, you need a bit more concurrent connections (400 ?)
>>> to saturate the ipfrag cache. Above that number, performances will
>>> still sink.
>>
>> Maybe, but IP defrag can not be 'perfect'.
>>
>> For this particular use case I could still bump high_thresh to 6 GB and all would be good :)
> 
> Understood.
> 
> I'd like to be sure I stated the problem I see clearly. With the
> current code the "goodput" goes to almost 0 as soon as the ipfrag cache
> load goes above it's capacity. Before the worker removal, after
> reaching high_thresh, the "goodput" degratated slowly and even with a
> load more than an order of magnitude higher, the performances were
> still quite good. I think we can't ask customers to add more memory for
> a kernel upgrade; even changing the default sysfs configuration is
> somewhat troubling.
> 
>>> This looks nice, I'll try to test it in my use case and I'll report
>>> here.
> 
> I tried the patch, but the result are not encouraging:
> 
> ./super_netperf.sh 200 -H 192.168.101.2 -t UDP_STREAM -l 60
> 34.94
> 
> # on the receiver side:
> echo 2 >  /proc/sys/net/ipv4/ipfrag_time
> 
> # on the sender side:
> ./super_netperf.sh 200 -H 192.168.101.2 -t UDP_STREAM -l 60
> 85.8
> 
> # still on receiver side, while the test is running:
> nstat>/dev/null ;sleep 1; nstat |grep IpReasm
> IpReasmTimeout                  2128               0.0
> IpReasmReqds                    754770             0.0
> IpReasmOKs                      135                0.0
> IpReasmFails                    752811             0.0
> 
> grep FRAG /proc/net/sockstat
> FRAG: inuse 124 memory 5286144
> 
> The patch has some effect, as I basically saw no timeout without it,
> but still does not look aggressive enough. Or possibly it's evicting
> the fragments that are more likely to be used/completed (the most
> recents one).

Hey, that was simply an idea (not even compiled), not the final patch.

I will test/polish it later, I am coming back from vacations and have a backlog.

Here are my results : (Note that I have _not_ changed /proc/sys/net/ipv4/ipfrag_time )

lpaa6:~# grep . /proc/sys/net/ipv4/ipfrag_* ; grep FRAG /proc/net/sockstat
/proc/sys/net/ipv4/ipfrag_high_thresh:104857600
/proc/sys/net/ipv4/ipfrag_low_thresh:78643200
/proc/sys/net/ipv4/ipfrag_max_dist:0
/proc/sys/net/ipv4/ipfrag_secret_interval:0
/proc/sys/net/ipv4/ipfrag_time:30
FRAG: inuse 1379 memory 105006776

lpaa5:/export/hda3/google/edumazet# ./super_netperf 400 -H 10.246.7.134 -t UDP_STREAM -l 60
netperf: send_omni: send_data failed: No route to host
netperf: send_omni: send_data failed: No route to host
   9063


I would say that it looks pretty good to me.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-06 14:20         ` Eric Dumazet
@ 2018-07-09  9:43           ` Paolo Abeni
  2018-07-09 11:34             ` Eric Dumazet
  0 siblings, 1 reply; 15+ messages in thread
From: Paolo Abeni @ 2018-07-09  9:43 UTC (permalink / raw)
  To: Eric Dumazet, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown

On Fri, 2018-07-06 at 07:20 -0700, Eric Dumazet wrote:
> I will test/polish it later, I am coming back from vacations and have a backlog.
> 
> Here are my results : (Note that I have _not_ changed /proc/sys/net/ipv4/ipfrag_time )
> 
> lpaa6:~# grep . /proc/sys/net/ipv4/ipfrag_* ; grep FRAG /proc/net/sockstat
> /proc/sys/net/ipv4/ipfrag_high_thresh:104857600
> /proc/sys/net/ipv4/ipfrag_low_thresh:78643200
> /proc/sys/net/ipv4/ipfrag_max_dist:0
> /proc/sys/net/ipv4/ipfrag_secret_interval:0
> /proc/sys/net/ipv4/ipfrag_time:30
> FRAG: inuse 1379 memory 105006776
> 
> lpaa5:/export/hda3/google/edumazet# ./super_netperf 400 -H 10.246.7.134 -t UDP_STREAM -l 60
> netperf: send_omni: send_data failed: No route to host
> netperf: send_omni: send_data failed: No route to host
>    9063
> 
> 
> I would say that it looks pretty good to me.

Is that with an unmodifed kernel?

I would be happy if I could replicate such results. With the same
configuration I see:

[netdev9 ~]# grep . /proc/sys/net/ipv4/ipfrag_*; nstat>/dev/null; sleep 1; nstat|grep IpR; grep FRAG /proc/net/sockstat
/proc/sys/net/ipv4/ipfrag_high_thresh:104857600
/proc/sys/net/ipv4/ipfrag_low_thresh:3145728
/proc/sys/net/ipv4/ipfrag_max_dist:64
/proc/sys/net/ipv4/ipfrag_secret_interval:0
/proc/sys/net/ipv4/ipfrag_time:30
IpReasmReqds                    827385             0.0
IpReasmFails                    827385             0.0
FRAG: inuse 1038 memory 105326208

[netdev8 ~]# ./super_netperf.sh 400 -H 192.168.101.2 -t UDP_STREAM -l 60 
213.6

Note: this setup is intentionally lossy (on the sender side), to stress
the frag cache:

[netdev8 ~]#  tc -s qdisc show dev em1
qdisc mq 8001: root 
 Sent 73950097203 bytes 49639120 pkt (dropped 2052241, overlimits 0 requeues 41) 
 backlog 0b 0p requeues 41 
# ...

drops here are due to ldelay being higher than fq_codel's target (I use
fq_codel default values). Can you please share your sender's TC conf
and number of tx queues? 

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-09  9:43           ` Paolo Abeni
@ 2018-07-09 11:34             ` Eric Dumazet
  2018-07-09 11:39               ` Eric Dumazet
  0 siblings, 1 reply; 15+ messages in thread
From: Eric Dumazet @ 2018-07-09 11:34 UTC (permalink / raw)
  To: Paolo Abeni, Eric Dumazet, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown



On 07/09/2018 02:43 AM, Paolo Abeni wrote:
> On Fri, 2018-07-06 at 07:20 -0700, Eric Dumazet wrote:
>> I will test/polish it later, I am coming back from vacations and have a backlog.
>>
>> Here are my results : (Note that I have _not_ changed /proc/sys/net/ipv4/ipfrag_time )
>>
>> lpaa6:~# grep . /proc/sys/net/ipv4/ipfrag_* ; grep FRAG /proc/net/sockstat
>> /proc/sys/net/ipv4/ipfrag_high_thresh:104857600
>> /proc/sys/net/ipv4/ipfrag_low_thresh:78643200
>> /proc/sys/net/ipv4/ipfrag_max_dist:0
>> /proc/sys/net/ipv4/ipfrag_secret_interval:0
>> /proc/sys/net/ipv4/ipfrag_time:30
>> FRAG: inuse 1379 memory 105006776
>>
>> lpaa5:/export/hda3/google/edumazet# ./super_netperf 400 -H 10.246.7.134 -t UDP_STREAM -l 60
>> netperf: send_omni: send_data failed: No route to host
>> netperf: send_omni: send_data failed: No route to host
>>    9063
>>
>>
>> I would say that it looks pretty good to me.
> 
> Is that with an unmodifed kernel?
> 
> I would be happy if I could replicate such results. With the same
> configuration I see:
> 
> [netdev9 ~]# grep . /proc/sys/net/ipv4/ipfrag_*; nstat>/dev/null; sleep 1; nstat|grep IpR; grep FRAG /proc/net/sockstat
> /proc/sys/net/ipv4/ipfrag_high_thresh:104857600
> /proc/sys/net/ipv4/ipfrag_low_thresh:3145728
> /proc/sys/net/ipv4/ipfrag_max_dist:64
> /proc/sys/net/ipv4/ipfrag_secret_interval:0
> /proc/sys/net/ipv4/ipfrag_time:30
> IpReasmReqds                    827385             0.0
> IpReasmFails                    827385             0.0
> FRAG: inuse 1038 memory 105326208
> 
> [netdev8 ~]# ./super_netperf.sh 400 -H 192.168.101.2 -t UDP_STREAM -l 60 
> 213.6
> 
> Note: this setup is intentionally lossy (on the sender side), to stress
> the frag cache:
> 
> [netdev8 ~]#  tc -s qdisc show dev em1
> qdisc mq 8001: root 
>  Sent 73950097203 bytes 49639120 pkt (dropped 2052241, overlimits 0 requeues 41) 
>  backlog 0b 0p requeues 41 
> # ...
> 
> drops here are due to ldelay being higher than fq_codel's target (I use
> fq_codel default values). Can you please share your sender's TC conf
> and number of tx queues? 

You seem to self inflict losses on the sender, and that is terrible for the
(convoluted) stress test you want to run.

I use mq + fq : no losses on the sender.

Do not send patches to solve a problem that does not exist on the field.

If some customers are using netperf and UDP_STREAM with frags, just tell them to
use TCP instead :)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-09 11:34             ` Eric Dumazet
@ 2018-07-09 11:39               ` Eric Dumazet
  2018-07-09 12:50                 ` Eric Dumazet
  0 siblings, 1 reply; 15+ messages in thread
From: Eric Dumazet @ 2018-07-09 11:39 UTC (permalink / raw)
  To: Paolo Abeni, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown



On 07/09/2018 04:34 AM, Eric Dumazet wrote:
> and number of tx queues? 
> 
> You seem to self inflict losses on the sender, and that is terrible for the
> (convoluted) stress test you want to run.
> 
> I use mq + fq : no losses on the sender.
> 
> Do not send patches to solve a problem that does not exist on the field.
> 
> If some customers are using netperf and UDP_STREAM with frags, just tell them to
> use TCP instead :)
> 

Alternatively, you could try to patch fq_codel to drop all frags of one UDP datagram
instead of few of them.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-09 11:39               ` Eric Dumazet
@ 2018-07-09 12:50                 ` Eric Dumazet
  2018-07-20 14:48                   ` Paolo Abeni
  0 siblings, 1 reply; 15+ messages in thread
From: Eric Dumazet @ 2018-07-09 12:50 UTC (permalink / raw)
  To: Paolo Abeni, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown



On 07/09/2018 04:39 AM, Eric Dumazet wrote:

> Alternatively, you could try to patch fq_codel to drop all frags of one UDP datagram
> instead of few of them.

A first step would be to make sure fq_codel_hash() (using skb_get_hash(skb)) selects
the same bucket for all frags of a datagram :/

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-09 12:50                 ` Eric Dumazet
@ 2018-07-20 14:48                   ` Paolo Abeni
  2018-07-20 15:58                     ` Eric Dumazet
  0 siblings, 1 reply; 15+ messages in thread
From: Paolo Abeni @ 2018-07-20 14:48 UTC (permalink / raw)
  To: Eric Dumazet, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown

Hi,

On Mon, 2018-07-09 at 05:50 -0700, Eric Dumazet wrote:
> On 07/09/2018 04:39 AM, Eric Dumazet wrote:
> 
> > Alternatively, you could try to patch fq_codel to drop all frags of one UDP datagram
> > instead of few of them.
> 
> A first step would be to make sure fq_codel_hash() (using skb_get_hash(skb)) selects
> the same bucket for all frags of a datagram :/

I gave the above a shot and I have some non upstream ready but somewhat
working code. Anyway it has some issues I'm unable to solve: 
* it's very invasive for fq_codel, because I need to parse each packet
looking for the fragment id
* the parsing overhead can't be easily avoided for non fragments

I tried also something hopefully along the same lines of your other
suggestion (drop eariler the fragment queues when above low threshold):
when allocating a new frag queue and the ipfrag mem is above the low
th, another frag queue is selected in a pseudorandom way and dropped.

This latter patch is much smaller, cope quite well with fragment drops,
and the goodput degradates gracefully when the ipfrag cache is
overloaded. 

I'm wodering if you could consider this second option, too.

Thank you,

Paolo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-20 14:48                   ` Paolo Abeni
@ 2018-07-20 15:58                     ` Eric Dumazet
  2018-07-20 17:31                       ` Paolo Abeni
  0 siblings, 1 reply; 15+ messages in thread
From: Eric Dumazet @ 2018-07-20 15:58 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: Eric Dumazet, netdev, David Miller, Florian Westphal, neilb

On Fri, Jul 20, 2018 at 7:48 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> Hi,
>
> On Mon, 2018-07-09 at 05:50 -0700, Eric Dumazet wrote:
> > On 07/09/2018 04:39 AM, Eric Dumazet wrote:
> >
> > > Alternatively, you could try to patch fq_codel to drop all frags of one UDP datagram
> > > instead of few of them.
> >
> > A first step would be to make sure fq_codel_hash() (using skb_get_hash(skb)) selects
> > the same bucket for all frags of a datagram :/
>
> I gave the above a shot and I have some non upstream ready but somewhat
> working code. Anyway it has some issues I'm unable to solve:
> * it's very invasive for fq_codel, because I need to parse each packet
> looking for the fragment id
> * the parsing overhead can't be easily avoided for non fragments

Have you tried using ip_defrag(net, skb,  IP_DEFRAG_QDISC) from fq_codel ?
(adding a new value in ip_defrag_users enum)

if (skb->protocol == htons(ETH_P_IP) {
      if (ip_is_fragment(ip_hdr(skb))) {
           if ((ip_defrag(net, skb, IP_DEFRAG_QDISC))
                return 0;
...



>
> I tried also something hopefully along the same lines of your other
> suggestion (drop eariler the fragment queues when above low threshold):
> when allocating a new frag queue and the ipfrag mem is above the low
> th, another frag queue is selected in a pseudorandom way and dropped.

The problem with any strategy like that, is that forthcoming fragments
for this frag queue
will create another frag queue, that will never have a chance to complete.

Some workloads might benefit, others might not.

>
> This latter patch is much smaller, cope quite well with fragment drops,
> and the goodput degradates gracefully when the ipfrag cache is
> overloaded.
>
> I'm wodering if you could consider this second option, too.
>
> Thank you,
>
> Paolo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-20 15:58                     ` Eric Dumazet
@ 2018-07-20 17:31                       ` Paolo Abeni
  2018-07-20 17:37                         ` Eric Dumazet
  0 siblings, 1 reply; 15+ messages in thread
From: Paolo Abeni @ 2018-07-20 17:31 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Eric Dumazet, netdev, David Miller, Florian Westphal, neilb

On Fri, 2018-07-20 at 08:58 -0700, Eric Dumazet wrote:
> On Fri, Jul 20, 2018 at 7:48 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > 
> > Hi,
> > 
> > On Mon, 2018-07-09 at 05:50 -0700, Eric Dumazet wrote:
> > > On 07/09/2018 04:39 AM, Eric Dumazet wrote:
> > > 
> > > > Alternatively, you could try to patch fq_codel to drop all frags of one UDP datagram
> > > > instead of few of them.
> > > 
> > > A first step would be to make sure fq_codel_hash() (using skb_get_hash(skb)) selects
> > > the same bucket for all frags of a datagram :/
> > 
> > I gave the above a shot and I have some non upstream ready but somewhat
> > working code. Anyway it has some issues I'm unable to solve:
> > * it's very invasive for fq_codel, because I need to parse each packet
> > looking for the fragment id
> > * the parsing overhead can't be easily avoided for non fragments
> 
> Have you tried using ip_defrag(net, skb,  IP_DEFRAG_QDISC) from fq_codel ?
> (adding a new value in ip_defrag_users enum)
> 
> if (skb->protocol == htons(ETH_P_IP) {
>       if (ip_is_fragment(ip_hdr(skb))) {
>            if ((ip_defrag(net, skb, IP_DEFRAG_QDISC))
>                 return 0;
> ...

Thank you for the feedback. I must admit this quite in the opposite
direction of what I have attempted so far. I'll try that.
Thanks.
Still for ipv6 it will require a litte more work inside fq_codel.

> > I tried also something hopefully along the same lines of your other
> > suggestion (drop eariler the fragment queues when above low threshold):
> > when allocating a new frag queue and the ipfrag mem is above the low
> > th, another frag queue is selected in a pseudorandom way and dropped.
> 
> The problem with any strategy like that, is that forthcoming fragments
> for this frag queue
> will create another frag queue, that will never have a chance to complete.
> 
> Some workloads might benefit, others might not.

Yes, of course: is an heuristic, but is cheap code-wise, and it can be
disabled setting low th >= high th, so that the kernel will behave
exactly as it does now, and the kind of workloads we could cope with
will increase without adding new knobs.

Cheers,

Paolo

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-20 17:31                       ` Paolo Abeni
@ 2018-07-20 17:37                         ` Eric Dumazet
  0 siblings, 0 replies; 15+ messages in thread
From: Eric Dumazet @ 2018-07-20 17:37 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: Eric Dumazet, netdev, David Miller, Florian Westphal, neilb

On Fri, Jul 20, 2018 at 10:32 AM Paolo Abeni <pabeni@redhat.com> wrote:

> Thank you for the feedback. I must admit this quite in the opposite
> direction of what I have attempted so far. I'll try that.
> Thanks.
> Still for ipv6 it will require a litte more work inside fq_codel.

IPv6 packets would use nf_ct_frag6_gather(), I gave you hint about IPv4 only ;)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [RFC PATCH] ip: re-introduce fragments cache worker
  2018-07-06 13:56       ` Paolo Abeni
  2018-07-06 14:20         ` Eric Dumazet
@ 2018-07-06 14:37         ` Eric Dumazet
  1 sibling, 0 replies; 15+ messages in thread
From: Eric Dumazet @ 2018-07-06 14:37 UTC (permalink / raw)
  To: Paolo Abeni, Eric Dumazet, netdev
  Cc: David S. Miller, Eric Dumazet, Florian Westphal, NeilBrown



On 07/06/2018 06:56 AM, Paolo Abeni wrote:
> With:
> 
> schedule_work_on(smp_processor_id(), #... )
> 
> We can be sure to run exclusively on the cpu handling the RX queue even with the worker.
> 

Make sure to test your patch with 16 RX queues (16 cpus) feeding the defrag unit.

In this (common) mode, then having one cpu trying to purge frags wont be enough.

Really the GC can not cope with DDOS, we tried that in the past (IPv4 route cache)
and failed miserably.

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2018-07-20 18:26 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-07-06 10:10 [RFC PATCH] ip: re-introduce fragments cache worker Paolo Abeni
2018-07-06 11:23 ` Eric Dumazet
2018-07-06 11:56   ` Paolo Abeni
2018-07-06 12:09     ` Eric Dumazet
2018-07-06 13:56       ` Paolo Abeni
2018-07-06 14:20         ` Eric Dumazet
2018-07-09  9:43           ` Paolo Abeni
2018-07-09 11:34             ` Eric Dumazet
2018-07-09 11:39               ` Eric Dumazet
2018-07-09 12:50                 ` Eric Dumazet
2018-07-20 14:48                   ` Paolo Abeni
2018-07-20 15:58                     ` Eric Dumazet
2018-07-20 17:31                       ` Paolo Abeni
2018-07-20 17:37                         ` Eric Dumazet
2018-07-06 14:37         ` Eric Dumazet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).