Re: fixing sk_stream_rfree()

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: "David S. Miller" <davem@davemloft.net>
To: netdev@vger.kernel.org
Cc: herbert@gondor.apana.org.au
Subject: Re: fixing sk_stream_rfree()
Date: Sat, 15 Apr 2006 02:03:08 -0700 (PDT)	[thread overview]
Message-ID: <20060415.020308.108561770.davem@davemloft.net> (raw)
In-Reply-To: <20060415.012327.68977713.davem@davemloft.net>

From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 15 Apr 2006 01:23:27 -0700 (PDT)

> I came up with one more possible approach, it goes something like
> this:
> 
> 1) Get rid of the skb->destructor callback for TCP receive
>    data.
> 
> 2) Adjust sk_rmem_alloc and sk_forward_alloc explicitly when we add
>    packets to sk_receive_queue/out_of_order_queue, and when we unlink
>    them from sk_receive_queue and __kfree_skb() them up.

It turns out DCCP doesn't use the sk_forward_alloc memory
scheduling at least not for receive.

So, as food for thought, below is a comile-tested-only
implementation of the above idea.  The basic transformations
are:

1) sk_eat_skb --> sk_stream_eat_skb which does the sk_forward_alloc
   et al. accounting by hand which would have been done by the
   skb->destructor sk_stream_rfree()

2) __skb_queue_purge --> sk_stream_eat_queue

3) sk_stream_set_owner_r --> sk_stream_charge_r which does
   the accounting for a new receive skb but does not hook up
   the destructor

4) unlink and free SKB not received to userspace yet -->
   sk_stream_eat_skb()

... and then I noticed that ip_rcv() unshares the SKB, so
skb->users should always be 1 and therefore it is impossible
for sk_stream_rfree() to be called from a non socket locked
context.

And all of this was a wild goose chase, oh well :-)
So the BUG_ON(!sk_forward_alloc) problem is elsewhere. :-/

diff --git a/arch/x86_64/kernel/functionlist b/arch/x86_64/kernel/functionlist
index 2bcebdc..da3379a 100644
--- a/arch/x86_64/kernel/functionlist
+++ b/arch/x86_64/kernel/functionlist
@@ -751,7 +751,6 @@
 *(.text.strcmp)
 *(.text.steal_locks)
 *(.text.sock_create)
-*(.text.sk_stream_rfree)
 *(.text.sk_stream_mem_schedule)
 *(.text.skip_atoi)
 *(.text.sk_alloc)
diff --git a/include/net/sock.h b/include/net/sock.h
index af2b054..54d5a2f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -442,14 +442,56 @@ static inline int sk_stream_memory_free(
 	return sk->sk_wmem_queued < sk->sk_sndbuf;
 }
 
-extern void sk_stream_rfree(struct sk_buff *skb);
-
-static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
+/* Charge SKB as receive queue memory to SK.  The socket must be locked
+ * when we get here.
+ */
+static inline void sk_stream_charge_r(struct sk_buff *skb, struct sock *sk)
 {
-	skb->sk = sk;
-	skb->destructor = sk_stream_rfree;
 	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
 	sk->sk_forward_alloc -= skb->truesize;
+}
+
+/* Release SKB as receive queue memory from SK.  The socket must be
+ * locked when we get here.
+ */
+static inline void sk_stream_release_r(struct sk_buff *skb, struct sock *sk)
+{
+	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+	sk->sk_forward_alloc += skb->truesize;
+}
+
+/**
+ * sk_stream_eat_skb - Release a skb if it is no longer needed
+ * @sk: socket to eat this skb from
+ * @skb: socket buffer to eat
+ * @queue: skb queue to unlink from
+ *
+ * This routine must be called with interrupts disabled or with the socket
+ * locked so that the sk_buff queue operation is ok.
+ */
+static inline void sk_stream_eat_skb(struct sk_buff *skb, struct sock *sk, struct sk_buff_head *queue)
+{
+	__skb_unlink(skb, queue);
+	sk_stream_release_r(skb, sk);
+	__kfree_skb(skb);
+}
+
+/*
+ * sk_stream_eat_queue - Release an entire queue of skbs, when no longer needed
+ * @sk: socket to eat these skbs from
+ * @queue: queue of skbs to eat
+ *
+ * This routine must be called with interrupts disabled or with the socket
+ * locked so that the sk_buff queue operation is ok.
+ */
+static inline void sk_stream_eat_queue(struct sock *sk, struct sk_buff_head *queue)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(queue)) != NULL) {
+		sk_stream_release_r(skb, sk);
+		__kfree_skb(skb);
+	}
 }
 
 static inline void sk_stream_free_skb(struct sock *sk, struct sk_buff *skb)
diff --git a/net/core/stream.c b/net/core/stream.c
index 35e2525..a034f2d 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -172,16 +172,6 @@ do_interrupted:
 
 EXPORT_SYMBOL(sk_stream_wait_memory);
 
-void sk_stream_rfree(struct sk_buff *skb)
-{
-	struct sock *sk = skb->sk;
-
-	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
-	sk->sk_forward_alloc += skb->truesize;
-}
-
-EXPORT_SYMBOL(sk_stream_rfree);
-
 int sk_stream_error(struct sock *sk, int flags, int err)
 {
 	if (err == -EPIPE)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 87f68e7..c45a9c2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_
 				break;
 		}
 		if (skb->h.th->fin) {
-			sk_eat_skb(sk, skb);
+			sk_stream_eat_skb(skb, sk, &sk->sk_receive_queue);
 			++seq;
 			break;
 		}
-		sk_eat_skb(sk, skb);
+		sk_stream_eat_skb(skb, sk, &sk->sk_receive_queue);
 		if (!desc->count)
 			break;
 	}
@@ -1355,15 +1355,17 @@ skip_copy:
 
 		if (skb->h.th->fin)
 			goto found_fin_ok;
-		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+		if (!(flags & MSG_PEEK)) {
+			sk_stream_eat_skb(skb, sk, &sk->sk_receive_queue);
+		}
 		continue;
 
 	found_fin_ok:
 		/* Process the FIN. */
 		++*seq;
-		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+		if (!(flags & MSG_PEEK)) {
+			sk_stream_eat_skb(skb, sk, &sk->sk_receive_queue);
+		}
 		break;
 	} while (len > 0);
 
@@ -1489,6 +1491,7 @@ void tcp_close(struct sock *sk, long tim
 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
 			  skb->h.th->fin;
 		data_was_unread += len;
+		sk_stream_release_r(skb, sk);
 		__kfree_skb(skb);
 	}
 
@@ -1650,9 +1653,9 @@ int tcp_disconnect(struct sock *sk, int 
 		sk->sk_err = ECONNRESET;
 
 	tcp_clear_xmit_timers(sk);
-	__skb_queue_purge(&sk->sk_receive_queue);
+	sk_stream_eat_queue(sk, &sk->sk_receive_queue);
 	sk_stream_writequeue_purge(sk);
-	__skb_queue_purge(&tp->out_of_order_queue);
+	sk_stream_eat_queue(sk, &tp->out_of_order_queue);
 
 	inet->dport = 0;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9f0cca4..5a30648 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2875,7 +2875,7 @@ static void tcp_fin(struct sk_buff *skb,
 	/* It _is_ possible, that we have something out-of-order _after_ FIN.
 	 * Probably, we should reset in this case. For now drop them.
 	 */
-	__skb_queue_purge(&tp->out_of_order_queue);
+	sk_stream_eat_queue(sk, &tp->out_of_order_queue);
 	if (tp->rx_opt.sack_ok)
 		tcp_sack_reset(&tp->rx_opt);
 	sk_stream_mem_reclaim(sk);
@@ -3093,8 +3093,7 @@ static void tcp_ofo_queue(struct sock *s
 
 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 			SOCK_DEBUG(sk, "ofo packet was already received \n");
-			__skb_unlink(skb, &tp->out_of_order_queue);
-			__kfree_skb(skb);
+			sk_stream_eat_skb(skb, sk, &tp->out_of_order_queue);
 			continue;
 		}
 		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
@@ -3166,7 +3165,7 @@ queue_and_out:
 				    !sk_stream_rmem_schedule(sk, skb))
 					goto drop;
 			}
-			sk_stream_set_owner_r(skb, sk);
+			sk_stream_charge_r(skb, sk);
 			__skb_queue_tail(&sk->sk_receive_queue, skb);
 		}
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -3248,7 +3247,7 @@ drop:
 	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
 		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 
-	sk_stream_set_owner_r(skb, sk);
+	sk_stream_charge_r(skb, sk);
 
 	if (!skb_peek(&tp->out_of_order_queue)) {
 		/* Initial out of order segment, build 1 SACK. */
@@ -3290,6 +3289,7 @@ drop:
 		    before(seq, TCP_SKB_CB(skb1)->end_seq)) {
 			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
 				/* All the bits are present. Drop. */
+				sk_stream_release_r(skb, sk);
 				__kfree_skb(skb);
 				tcp_dsack_set(tp, seq, end_seq);
 				goto add_sack;
@@ -3311,9 +3311,8 @@ drop:
 			       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
 			       break;
 		       }
-		       __skb_unlink(skb1, &tp->out_of_order_queue);
 		       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
-		       __kfree_skb(skb1);
+		       sk_stream_eat_skb(skb1, sk, &tp->out_of_order_queue);
 		}
 
 add_sack:
@@ -3340,8 +3339,7 @@ tcp_collapse(struct sock *sk, struct sk_
 		/* No new bits? It is possible on ofo queue. */
 		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 			struct sk_buff *next = skb->next;
-			__skb_unlink(skb, list);
-			__kfree_skb(skb);
+			sk_stream_eat_skb(skb, sk, list);
 			NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
 			skb = next;
 			continue;
@@ -3387,7 +3385,7 @@ tcp_collapse(struct sock *sk, struct sk_
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
 		__skb_insert(nskb, skb->prev, skb, list);
-		sk_stream_set_owner_r(nskb, sk);
+		sk_stream_charge_r(nskb, sk);
 
 		/* Copy data, releasing collapsed skbs. */
 		while (copy > 0) {
@@ -3405,8 +3403,7 @@ tcp_collapse(struct sock *sk, struct sk_
 			}
 			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 				struct sk_buff *next = skb->next;
-				__skb_unlink(skb, list);
-				__kfree_skb(skb);
+				sk_stream_eat_skb(skb, sk, list);
 				NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
 				skb = next;
 				if (skb == tail || skb->h.th->syn || skb->h.th->fin)
@@ -3494,7 +3491,7 @@ static int tcp_prune_queue(struct sock *
 	/* First, purge the out_of_order queue. */
 	if (!skb_queue_empty(&tp->out_of_order_queue)) {
 		NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
-		__skb_queue_purge(&tp->out_of_order_queue);
+		sk_stream_eat_queue(sk, &tp->out_of_order_queue);
 
 		/* Reset SACK state.  A conforming SACK implementation will
 		 * do the same at a timeout based retransmit.  When a connection
@@ -3703,10 +3700,8 @@ static void tcp_check_urg(struct sock * 
 	    tp->copied_seq != tp->rcv_nxt) {
 		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 		tp->copied_seq++;
-		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
-			__skb_unlink(skb, &sk->sk_receive_queue);
-			__kfree_skb(skb);
-		}
+		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))
+			sk_stream_eat_skb(skb, sk, &sk->sk_receive_queue);
 	}
 
 	tp->urg_data   = TCP_URG_NOTYET;
@@ -3950,7 +3945,7 @@ int tcp_rcv_established(struct sock *sk,
 				/* Bulk data transfer: receiver */
 				__skb_pull(skb,tcp_header_len);
 				__skb_queue_tail(&sk->sk_receive_queue, skb);
-				sk_stream_set_owner_r(skb, sk);
+				sk_stream_charge_r(skb, sk);
 				tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 			}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 672950e..e518d97 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1294,7 +1294,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
   	sk_stream_writequeue_purge(sk);
 
 	/* Cleans up our, hopefully empty, out_of_order_queue. */
-  	__skb_queue_purge(&tp->out_of_order_queue);
+	sk_stream_eat_queue(sk, &tp->out_of_order_queue);
 
 	/* Clean prequeue, it must be empty really */
 	__skb_queue_purge(&tp->ucopy.prequeue);

next prev parent reply	other threads:[~2006-04-15  9:03 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-04-15  3:59 fixing sk_stream_rfree() David S. Miller
2006-04-15  8:23 ` David S. Miller
2006-04-15  9:03   ` David S. Miller [this message]
2006-04-16 11:18 ` Herbert Xu
2006-04-17  5:32   ` David S. Miller
2006-04-17  6:17     ` Herbert Xu
2006-04-17 20:27       ` Jesse Brandeburg
2006-04-17 20:29       ` David S. Miller
2006-04-17 21:09         ` Jesse Brandeburg
2006-04-17 21:40           ` Herbert Xu
2006-04-17 21:56             ` Jesse Brandeburg
2006-04-18 11:49 ` Ingo Oeser

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:2bcebdc dfblob:da3379a dfblob:af2b054 dfblob:54d5a2f
dfblob:35e2525 dfblob:a034f2d dfblob:87f68e7 dfblob:c45a9c2
dfblob:9f0cca4 dfblob:5a30648 dfblob:672950e dfblob:e518d97 )
 OR (
bs:"Re: fixing sk_stream_rfree()" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060415.020308.108561770.davem@davemloft.net \
    --to=davem@davemloft.net \
    --cc=herbert@gondor.apana.org.au \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).