netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Willem de Bruijn <willemb@google.com>
To: netdev@vger.kernel.org
Cc: davem@davemloft.net, Willem de Bruijn <willemb@google.com>
Subject: [PATCH net-next 5/7] packet: rollover only to socket with headroom
Date: Wed,  6 May 2015 14:27:15 -0400	[thread overview]
Message-ID: <1430936837-22655-6-git-send-email-willemb@google.com> (raw)
In-Reply-To: <1430936837-22655-1-git-send-email-willemb@google.com>

From: Willem de Bruijn <willemb@google.com>

Only migrate flows to sockets that have sufficient headroom, where
sufficient is defined as having at least 25% empty space.

The kernel has three different buffer types: a regular socket, a ring
with frames (TPACKET_V[12]) or a ring with blocks (TPACKET_V3). The
latter two do not expose a read pointer to the kernel, so headroom is
not computed easily. All three need a different implementation to
estimate free space.

Tested:
  Ran bench_rollover for 10 sec with 1.5 Mpps of single flow input.

  bench_rollover has as many sockets as there are NIC receive queues
  in the system. Each socket is owned by a process that is pinned to
  one of the receive cpus. RFS is disabled. RPS is enabled with an
  identity mapping (cpu x -> cpu x), to count drops with softnettop.

      lpbb5:/export/hda3/willemb# ./bench_rollover -l 1000 -s -r
      Press [Enter] to exit

      cpu         rx       rx.k     drop.k   rollover     r.huge   r.failed
        0    1563828    1563828          0    8674715          0          0
        1    1212590    1212590          0          0          0          0
        2    1236807    1236808          0          0          0          0
        3    1246906    1246906          0          0          0          0
        4    1236603    1236603          0          0          0          0
        5    1238808    1238808          0          0          0          0
        6    1250217    1250217          0          0          0          0
        7    1252906    1252906          0          0          0          0

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/packet/af_packet.c | 68 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 14 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index fdb5261..d0c4c95 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1234,31 +1234,71 @@ static void packet_free_pending(struct packet_sock *po)
 	free_percpu(po->tx_ring.pending_refcnt);
 }
 
-static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+#define ROOM_POW_OFF	2
+#define ROOM_NONE	0x0
+#define ROOM_LOW	0x1
+#define ROOM_NORMAL	0x2
+
+static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
+{
+	int idx, len;
+
+	len = po->rx_ring.frame_max + 1;
+	idx = po->rx_ring.head;
+	if (pow_off)
+		idx += len >> pow_off;
+	if (idx >= len)
+		idx -= len;
+	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+{
+	int idx, len;
+
+	len = po->rx_ring.prb_bdqc.knum_blocks;
+	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+	if (pow_off)
+		idx += len >> pow_off;
+	if (idx >= len)
+		idx -= len;
+	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 {
 	struct sock *sk = &po->sk;
+	int ret = ROOM_NONE;
 	bool has_room;
 
 	if (po->prot_hook.func != tpacket_rcv) {
-		has_room = (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
-			   <= sk->sk_rcvbuf;
+		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+					  - skb->truesize;
+		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+			ret = ROOM_NORMAL;
+		else if (avail > 0)
+			ret = ROOM_LOW;
 	} else {
 		spin_lock(&sk->sk_receive_queue.lock);
-		if (po->tp_version == TPACKET_V3)
-			has_room = prb_lookup_block(po, &po->rx_ring,
-						    po->rx_ring.prb_bdqc.kactive_blk_num,
-						    TP_STATUS_KERNEL);
-		else
-			has_room = packet_lookup_frame(po, &po->rx_ring,
-						       po->rx_ring.head,
-						       TP_STATUS_KERNEL);
+		if (po->tp_version == TPACKET_V3) {
+			if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
+				ret = ROOM_NORMAL;
+			else if (__tpacket_v3_has_room(po, 0))
+				ret = ROOM_LOW;
+		} else {
+			if (__tpacket_has_room(po, ROOM_POW_OFF))
+				ret = ROOM_NORMAL;
+			else if (__tpacket_has_room(po, 0))
+				ret = ROOM_LOW;
+		}
 		spin_unlock(&sk->sk_receive_queue.lock);
 	}
 
+	has_room = ret == ROOM_NORMAL;
 	if (po->pressure == has_room)
 		xchg(&po->pressure, !has_room);
 
-	return has_room;
+	return ret;
 }
 
 static void packet_sock_destruct(struct sock *sk)
@@ -1329,14 +1369,14 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
 	unsigned int i, j;
 
 	po = pkt_sk(f->arr[idx]);
-	if (try_self && packet_rcv_has_room(po, skb))
+	if (try_self && packet_rcv_has_room(po, skb) != ROOM_NONE)
 		return idx;
 
 	i = j = min_t(int, po->rollover->sock, num - 1);
 	do {
 		po_next = pkt_sk(f->arr[i]);
 		if (po_next != po && !po_next->pressure &&
-		    packet_rcv_has_room(po_next, skb)) {
+		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
 			if (i != j)
 				po->rollover->sock = i;
 			return i;
-- 
2.2.0.rc0.207.ga3a616c

  parent reply	other threads:[~2015-05-06 18:27 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-05-06 18:27 [PATCH net-next 0/7] packet: refine rollover Willem de Bruijn
2015-05-06 18:27 ` [PATCH net-next 1/7] packet: rollover prepare: move code out of callsites Willem de Bruijn
2015-05-06 18:27 ` [PATCH net-next 2/7] packet: rollover prepare: per-socket state Willem de Bruijn
2015-05-06 18:27 ` [PATCH net-next 3/7] packet: rollover prepare: single return in packet_rcv_has_room Willem de Bruijn
2015-05-07 13:49   ` David Laight
2015-05-07 16:05     ` Willem de Bruijn
2015-05-06 18:27 ` [PATCH net-next 4/7] packet: rollover lock contention avoidance Willem de Bruijn
2015-05-06 19:44   ` Eric Dumazet
2015-05-06 21:05     ` Willem de Bruijn
2015-05-06 18:27 ` Willem de Bruijn [this message]
2015-05-06 18:27 ` [PATCH net-next 6/7] packet: rollover huge flows before small flows Willem de Bruijn
2015-05-06 19:34   ` Eric Dumazet
2015-05-06 20:06     ` Willem de Bruijn
2015-05-06 20:16       ` Eric Dumazet
2015-05-06 20:19         ` Willem de Bruijn
2015-05-06 18:27 ` [PATCH net-next 7/7] packet: rollover statistics Willem de Bruijn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1430936837-22655-6-git-send-email-willemb@google.com \
    --to=willemb@google.com \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).