* ip_evictor can loop
From: Prasanna Meda @ 2004-08-19 2:20 UTC
To: netdev; +Cc: davem, linux-kernel
The code has changed here in 2.6, but ip_frag_mem is still
dynamic: while the evictor drops and reacquires the locks,
new entries can be inserted into the LRU list at the same
time as a single item is being deleted, so insertions and
deletions overlap.
If fragments arrive faster than they can be freed, the
ip_evictor code may loop for as long as ip_frag_mem stays
above the low threshold.
There should be some way of getting out of this loop.
Unless I am missing something, possible fixes are:
1. Calculate the goal ahead of time, and account for the
memory freed via return values or arguments from
ipq_destroy etc.
2. After calling ip_evictor, the ip_defrag() code should
make a second check against the high threshold and should
not queue the packet if we are under attack or memory
pressure (a sketch follows below).
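For illustration, fix #2 would amount to roughly the following
near the top of ip_defrag(), after the existing ip_evictor()
call. This is only a sketch: the surrounding context lines are
paraphrased from the 2.6 source of the time rather than copied
from the tree.

        /* Start by cleaning up the memory (existing code). */
        if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
                ip_evictor();

        /* Proposed second check: if the evictor could not bring
         * us back under the high threshold, we are under attack
         * or memory pressure, so drop the fragment instead of
         * queueing it.
         */
        if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh) {
                IP_INC_STATS_BH(IpReasmFails);
                kfree_skb(skb);
                return NULL;
        }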
-----
The code is given below, from net/ipv4/ip_fragment.c:
/* Memory limiting on fragments. Evictor trashes the oldest
 * fragment queue until we are back under the low threshold.
 */
static void ip_evictor(void)
{
        struct ipq *qp;
        struct list_head *tmp;

        for(;;) {
                if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh)
                        return;
                read_lock(&ipfrag_lock);
                if (list_empty(&ipq_lru_list)) {
                        read_unlock(&ipfrag_lock);
                        return;
                }
                tmp = ipq_lru_list.next;
                qp = list_entry(tmp, struct ipq, lru_list);
                atomic_inc(&qp->refcnt);
                read_unlock(&ipfrag_lock);

                spin_lock(&qp->lock);
                if (!(qp->last_in&COMPLETE))
                        ipq_kill(qp);   /* -----> gets write lock ipfrag_lock */
                spin_unlock(&qp->lock);

                ipq_put(qp);
                IP_INC_STATS_BH(ReasmFails);
        }
}
------
Fix from Dave:
I think #1 is the ideal fix.
Do you understand that packet processing is dead on
the cpu executing this code? That means the worst
possible case is that all cpus in the system enter this
loop, and they will absolutely make forward progress and
eventually bring the value back down under the low
threshold.
So you'd need a multi-processor system where one cpu sits in
the ip_evictor() loop and another receives exactly one fragment
each time the first cpu brings the usage below the low
threshold. That is the only way to loop, because otherwise
the other cpus will work to lower the IP fragment memory usage
below the threshold.
I really think systems will get out of this lock-step state
very quickly even under enormous packet load.
Nevertheless I will add an implementation of #1 in the
tree. Something like this:
===== net/ipv4/ip_fragment.c 1.8 vs edited =====
--- 1.8/net/ipv4/ip_fragment.c 2003-05-28 00:49:28 -07:00
+++ edited/net/ipv4/ip_fragment.c 2004-08-18 14:23:59 -07:00
@@ -168,14 +168,18 @@
 atomic_t ip_frag_mem = ATOMIC_INIT(0);  /* Memory used for fragments */
 
 /* Memory Tracking Functions. */
-static __inline__ void frag_kfree_skb(struct sk_buff *skb)
+static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
 {
+        if (work)
+                *work -= skb->truesize;
         atomic_sub(skb->truesize, &ip_frag_mem);
         kfree_skb(skb);
 }
 
-static __inline__ void frag_free_queue(struct ipq *qp)
+static __inline__ void frag_free_queue(struct ipq *qp, int *work)
 {
+        if (work)
+                *work -= sizeof(struct ipq);
         atomic_sub(sizeof(struct ipq), &ip_frag_mem);
         kfree(qp);
 }
@@ -194,7 +198,7 @@
 /* Destruction primitives. */
 
 /* Complete destruction of ipq. */
-static void ip_frag_destroy(struct ipq *qp)
+static void ip_frag_destroy(struct ipq *qp, int *work)
 {
         struct sk_buff *fp;
@@ -206,18 +210,18 @@
         while (fp) {
                 struct sk_buff *xp = fp->next;
 
-                frag_kfree_skb(fp);
+                frag_kfree_skb(fp, work);
                 fp = xp;
         }
 
         /* Finally, release the queue descriptor itself. */
-        frag_free_queue(qp);
+        frag_free_queue(qp, work);
 }
 
-static __inline__ void ipq_put(struct ipq *ipq)
+static __inline__ void ipq_put(struct ipq *ipq, int *work)
 {
         if (atomic_dec_and_test(&ipq->refcnt))
-                ip_frag_destroy(ipq);
+                ip_frag_destroy(ipq, work);
 }
 
 /* Kill ipq entry. It is not destroyed immediately,
@@ -242,10 +246,13 @@
 {
         struct ipq *qp;
         struct list_head *tmp;
+        int work;
 
-        for(;;) {
-                if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh)
-                        return;
+        work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
+        if (work <= 0)
+                return;
+
+        while (work > 0) {
                 read_lock(&ipfrag_lock);
                 if (list_empty(&ipq_lru_list)) {
                         read_unlock(&ipfrag_lock);
@@ -261,7 +268,7 @@
                         ipq_kill(qp);
                 spin_unlock(&qp->lock);
 
-                ipq_put(qp);
+                ipq_put(qp, &work);
                 IP_INC_STATS_BH(IpReasmFails);
         }
 }
@@ -293,7 +300,7 @@
         }
 out:
         spin_unlock(&qp->lock);
-        ipq_put(qp);
+        ipq_put(qp, NULL);
 }
 
 /* Creation primitives. */
@@ -316,7 +323,7 @@
                         atomic_inc(&qp->refcnt);
                         write_unlock(&ipfrag_lock);
                         qp_in->last_in |= COMPLETE;
-                        ipq_put(qp_in);
+                        ipq_put(qp_in, NULL);
                         return qp;
                 }
         }
@@ -505,7 +512,7 @@
                                 qp->fragments = next;
 
                         qp->meat -= free_it->len;
-                        frag_kfree_skb(free_it);
+                        frag_kfree_skb(free_it, NULL);
                 }
         }
@@ -656,7 +663,7 @@
                 ret = ip_frag_reasm(qp, dev);
 
                 spin_unlock(&qp->lock);
-                ipq_put(qp);
+                ipq_put(qp, NULL);
                 return ret;
         }
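Putting the hunks together, ip_evictor() with the work-based
accounting looks roughly like this (reconstructed from the quoted
code plus the diff above, so details may differ slightly from the
actual tree):

static void ip_evictor(void)
{
        struct ipq *qp;
        struct list_head *tmp;
        int work;

        /* Decide up front how much memory has to be freed; each
         * eviction subtracts what it actually released from 'work'
         * via ipq_put(qp, &work), so the loop is bounded even if
         * new fragments keep arriving.
         */
        work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
        if (work <= 0)
                return;

        while (work > 0) {
                read_lock(&ipfrag_lock);
                if (list_empty(&ipq_lru_list)) {
                        read_unlock(&ipfrag_lock);
                        return;
                }
                tmp = ipq_lru_list.next;
                qp = list_entry(tmp, struct ipq, lru_list);
                atomic_inc(&qp->refcnt);
                read_unlock(&ipfrag_lock);

                spin_lock(&qp->lock);
                if (!(qp->last_in&COMPLETE))
                        ipq_kill(qp);
                spin_unlock(&qp->lock);

                ipq_put(qp, &work);
                IP_INC_STATS_BH(IpReasmFails);
        }
}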
-----
Thanks,
Prasanna.
* Re: ip_evictor can loop
From: David S. Miller @ 2004-08-19 2:25 UTC
To: Prasanna Meda; +Cc: netdev, linux-kernel
BTW, I forgot to mention to Prasanna when I discussed this
with him in private that ipv6 has an identical problem, since
the ipv6 code is derived from the ipv4 code.
Fix for that one follows.
# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
# 2004/08/18 14:34:14-07:00 davem@nuts.davemloft.net
# [IPV6]: ip6_evictor() has same problem as ip_evictor().
#
# Signed-off-by: David S. Miller <davem@redhat.com>
#
# net/ipv6/reassembly.c
# 2004/08/18 14:33:54-07:00 davem@nuts.davemloft.net +22 -15
# [IPV6]: ip6_evictor() has same problem as ip_evictor().
#
diff -Nru a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
--- a/net/ipv6/reassembly.c 2004-08-18 19:13:13 -07:00
+++ b/net/ipv6/reassembly.c 2004-08-18 19:13:13 -07:00
@@ -195,14 +195,18 @@
 atomic_t ip6_frag_mem = ATOMIC_INIT(0);
 
 /* Memory Tracking Functions. */
-static inline void frag_kfree_skb(struct sk_buff *skb)
+static inline void frag_kfree_skb(struct sk_buff *skb, int *work)
 {
+        if (work)
+                *work -= skb->truesize;
         atomic_sub(skb->truesize, &ip6_frag_mem);
         kfree_skb(skb);
 }
 
-static inline void frag_free_queue(struct frag_queue *fq)
+static inline void frag_free_queue(struct frag_queue *fq, int *work)
 {
+        if (work)
+                *work -= sizeof(struct frag_queue);
         atomic_sub(sizeof(struct frag_queue), &ip6_frag_mem);
         kfree(fq);
 }
@@ -220,7 +224,7 @@
 /* Destruction primitives. */
 
 /* Complete destruction of fq. */
-static void ip6_frag_destroy(struct frag_queue *fq)
+static void ip6_frag_destroy(struct frag_queue *fq, int *work)
 {
         struct sk_buff *fp;
@@ -232,17 +236,17 @@
         while (fp) {
                 struct sk_buff *xp = fp->next;
 
-                frag_kfree_skb(fp);
+                frag_kfree_skb(fp, work);
                 fp = xp;
         }
 
-        frag_free_queue(fq);
+        frag_free_queue(fq, work);
 }
 
-static __inline__ void fq_put(struct frag_queue *fq)
+static __inline__ void fq_put(struct frag_queue *fq, int *work)
 {
         if (atomic_dec_and_test(&fq->refcnt))
-                ip6_frag_destroy(fq);
+                ip6_frag_destroy(fq, work);
 }
 
 /* Kill fq entry. It is not destroyed immediately,
@@ -264,10 +268,13 @@
 {
         struct frag_queue *fq;
         struct list_head *tmp;
+        int work;
 
-        for(;;) {
-                if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh)
-                        return;
+        work = atomic_read(&ip6_frag_mem) - sysctl_ip6frag_low_thresh;
+        if (work <= 0)
+                return;
+
+        while(work > 0) {
                 read_lock(&ip6_frag_lock);
                 if (list_empty(&ip6_frag_lru_list)) {
                         read_unlock(&ip6_frag_lock);
@@ -283,7 +290,7 @@
                         fq_kill(fq);
                 spin_unlock(&fq->lock);
 
-                fq_put(fq);
+                fq_put(fq, &work);
                 IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
         }
 }
@@ -320,7 +327,7 @@
         }
 out:
         spin_unlock(&fq->lock);
-        fq_put(fq);
+        fq_put(fq, NULL);
 }
 
 /* Creation primitives. */
@@ -340,7 +347,7 @@
                         atomic_inc(&fq->refcnt);
                         write_unlock(&ip6_frag_lock);
                         fq_in->last_in |= COMPLETE;
-                        fq_put(fq_in);
+                        fq_put(fq_in, NULL);
                         return fq;
                 }
         }
@@ -539,7 +546,7 @@
                                 fq->fragments = next;
 
                         fq->meat -= free_it->len;
-                        frag_kfree_skb(free_it);
+                        frag_kfree_skb(free_it, NULL);
                 }
         }
@@ -734,7 +741,7 @@
                 ret = ip6_frag_reasm(fq, skbp, nhoffp, dev);
 
                 spin_unlock(&fq->lock);
-                fq_put(fq);
+                fq_put(fq, NULL);
                 return ret;
         }