Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/4] [RFC] Add c/r support for connected INET sockets
From: Dan Smith @ 2009-10-20 21:06 UTC (permalink / raw)
  To: containers; +Cc: netdev, Oren Laadan, John Dykstra
In-Reply-To: <1256072803-3518-1-git-send-email-danms@us.ibm.com>

This patch adds basic support for C/R of open INET sockets.  I think that
all the important bits of the TCP and ICSK socket structures is saved,
but I think there is still some additional IPv6 stuff that needs to be
handled.

With this patch applied, the following script can be used to demonstrate
the functionality:

  https://lists.linux-foundation.org/pipermail/containers/2009-October/021239.html

It shows that this enables migration of a sendmail process with open
connections from one machine to another without dropping.

We still need comments from the netdev people about what sort of sanity
checking we need to do on the values in the ckpt_hdr_socket_inet
structure on restart.

Note that this still doesn't address lingering sockets yet.

Changes in v2:
 - Restore saddr, rcv_saddr, daddr, sport, and dport from the sockaddr
   structure instead of saving them separately
 - Fix 'sock' naming in sock_cptrst()
 - Don't take the queue lock before skb_queue_tail() since it is
   done for us
 - Allow "listen only" restore behavior if RESTART_SOCK_LISTENONLY
   flag is specified on sys_restart()
 - Pull the implementation of the list of listening sockets back into
   this patch
 - Fix dangling printk
 - Add some comments around the parent/child restore logic

Cc: netdev@vger.kernel.org
Cc: Oren Laadan <orenl@librato.com>
Cc: John Dykstra <jdykstra72@gmail.com>
Signed-off-by: Dan Smith <danms@us.ibm.com>
---
 checkpoint/sys.c                 |    4 +
 include/linux/checkpoint.h       |    5 +-
 include/linux/checkpoint_hdr.h   |   97 ++++++++++++++
 include/linux/checkpoint_types.h |    2 +
 net/checkpoint.c                 |   23 ++--
 net/ipv4/checkpoint.c            |  263 +++++++++++++++++++++++++++++++++++++-
 6 files changed, 379 insertions(+), 15 deletions(-)

diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 260a1ee..df00973 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -221,6 +221,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 
 	kfree(ctx->pids_arr);
 
+	sock_listening_list_free(&ctx->listen_sockets);
+
 	kfree(ctx);
 }
 
@@ -249,6 +251,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 	spin_lock_init(&ctx->lock);
 #endif
 
+	INIT_LIST_HEAD(&ctx->listen_sockets);
+
 	err = -EBADF;
 	ctx->file = fget(fd);
 	if (!ctx->file)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 1da0b04..73d1677 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -19,6 +19,7 @@
 #define RESTART_TASKSELF	0x1
 #define RESTART_FROZEN		0x2
 #define RESTART_GHOST		0x4
+#define RESTART_SOCK_LISTENONLY 0x8
 
 #ifdef __KERNEL__
 #ifdef CONFIG_CHECKPOINT
@@ -48,7 +49,8 @@
 #define RESTART_USER_FLAGS  \
 	(RESTART_TASKSELF | \
 	 RESTART_FROZEN | \
-	 RESTART_GHOST)
+	 RESTART_GHOST | \
+	 RESTART_SOCK_LISTENONLY)
 
 extern int walk_task_subtree(struct task_struct *task,
 			     int (*func)(struct task_struct *, void *),
@@ -102,6 +104,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
 			      struct sockaddr *rem, unsigned *rem_len);
 void sock_restore_header_info(struct sk_buff *skb,
 			      struct ckpt_hdr_socket_buffer *h);
+void sock_listening_list_free(struct list_head *head);
 
 /* ckpt kflags */
 #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 3e6cab1..0c10657 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -20,6 +20,7 @@
 #include <linux/socket.h>
 #include <linux/un.h>
 #include <linux/in.h>
+#include <linux/in6.h>
 #else
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -569,6 +570,102 @@ struct ckpt_hdr_socket_unix {
 
 struct ckpt_hdr_socket_inet {
 	struct ckpt_hdr h;
+	__u32 daddr;
+	__u32 rcv_saddr;
+	__u32 saddr;
+	__u16 dport;
+	__u16 num;
+	__u16 sport;
+	__s16 uc_ttl;
+	__u16 cmsg_flags;
+
+	struct {
+		__u64 timeout;
+		__u32 ato;
+		__u32 lrcvtime;
+		__u16 last_seg_size;
+		__u16 rcv_mss;
+		__u8 pending;
+		__u8 quick;
+		__u8 pingpong;
+		__u8 blocked;
+	} icsk_ack __attribute__ ((aligned(8)));
+
+	/* FIXME: Skipped opt, tos, multicast, cork settings */
+
+	struct {
+		__u64 last_synq_overflow;
+
+		__u32 rcv_nxt;
+		__u32 copied_seq;
+		__u32 rcv_wup;
+		__u32 snd_nxt;
+		__u32 snd_una;
+		__u32 snd_sml;
+		__u32 rcv_tstamp;
+		__u32 lsndtime;
+
+		__u32 snd_wl1;
+		__u32 snd_wnd;
+		__u32 max_window;
+		__u32 mss_cache;
+		__u32 window_clamp;
+		__u32 rcv_ssthresh;
+		__u32 frto_highmark;
+
+		__u32 srtt;
+		__u32 mdev;
+		__u32 mdev_max;
+		__u32 rttvar;
+		__u32 rtt_seq;
+
+		__u32 packets_out;
+		__u32 retrans_out;
+
+		__u32 snd_up;
+		__u32 rcv_wnd;
+		__u32 write_seq;
+		__u32 pushed_seq;
+		__u32 lost_out;
+		__u32 sacked_out;
+		__u32 fackets_out;
+		__u32 tso_deferred;
+		__u32 bytes_acked;
+
+		__s32 lost_cnt_hint;
+		__u32 retransmit_high;
+
+		__u32 lost_retrans_low;
+
+		__u32 prior_ssthresh;
+		__u32 high_seq;
+
+		__u32 retrans_stamp;
+		__u32 undo_marker;
+		__s32 undo_retrans;
+		__u32 total_retrans;
+
+		__u32 urg_seq;
+		__u32 keepalive_time;
+		__u32 keepalive_intvl;
+
+		__u16 urg_data;
+		__u16 advmss;
+		__u8 frto_counter;
+		__u8 nonagle;
+
+		__u8 ecn_flags;
+		__u8 reordering;
+
+		__u8 keepalive_probes;
+	} tcp __attribute__ ((aligned(8)));
+
+	struct {
+		struct in6_addr saddr;
+		struct in6_addr rcv_saddr;
+		struct in6_addr daddr;
+	} inet6 __attribute__ ((aligned(8)));
+
 	__u32 laddr_len;
 	__u32 raddr_len;
 	struct sockaddr_in laddr;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index fa57cdc..91c141b 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -65,6 +65,8 @@ struct ckpt_ctx {
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
 	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
 
+	struct list_head listen_sockets;/* listening parent sockets */
+
 	/* [multi-process checkpoint] */
 	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
 	int nr_tasks;                   /* size of tasks array */
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 5ed2724..3e7574d 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -122,6 +122,7 @@ void sock_restore_header_info(struct sk_buff *skb,
 
 static int __sock_write_buffers(struct ckpt_ctx *ctx,
 				struct sk_buff_head *queue,
+				uint16_t family,
 				int dst_objref)
 {
 	struct sk_buff *skb;
@@ -130,11 +131,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 		struct ckpt_hdr_socket_buffer *h;
 		int ret = 0;
 
-		/* FIXME: This could be a false positive for non-unix
-		 *        buffers, so add a type check here in the
-		 *        future
-		 */
-		if (UNIXCB(skb).fp) {
+		if ((family == AF_UNIX) && UNIXCB(skb).fp) {
 			ckpt_write_err(ctx, "TE", "af_unix: pass fd", -EBUSY);
 			return -EBUSY;
 		}
@@ -174,6 +171,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 
 static int sock_write_buffers(struct ckpt_ctx *ctx,
 			      struct sk_buff_head *queue,
+			      uint16_t family,
 			      int dst_objref)
 {
 	struct ckpt_hdr_socket_queue *h;
@@ -193,7 +191,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx,
 	h->skb_count = ret;
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
 	if (!ret)
-		ret = __sock_write_buffers(ctx, &tmpq, dst_objref);
+		ret = __sock_write_buffers(ctx, &tmpq, family, dst_objref);
 
  out:
 	ckpt_hdr_put(ctx, h);
@@ -215,12 +213,14 @@ int sock_deferred_write_buffers(void *data)
 		return dst_objref;
 	}
 
-	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
+	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue,
+				 dq->sk->sk_family, dst_objref);
 	ckpt_debug("write recv buffers: %i\n", ret);
 	if (ret < 0)
 		return ret;
 
-	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
+	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue,
+				 dq->sk->sk_family, dst_objref);
 	ckpt_debug("write send buffers: %i\n", ret);
 
 	return ret;
@@ -745,10 +745,9 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
 		goto err;
 
 	if ((h->sock_common.family == AF_INET) &&
-	    (h->sock.state != TCP_LISTEN)) {
-		/* Temporary hack to enable restore of TCP_LISTEN sockets
-		 * while forcing anything else to a closed state
-		 */
+	    (h->sock.state != TCP_LISTEN) &&
+	    (ctx->uflags & RESTART_SOCK_LISTENONLY)) {
+		ckpt_debug("Forcing open socket closed\n");
 		sock->sk->sk_state = TCP_CLOSE;
 		sock->state = SS_UNCONNECTED;
 	}
diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
index 9cbbf5e..5913652 100644
--- a/net/ipv4/checkpoint.c
+++ b/net/ipv4/checkpoint.c
@@ -17,6 +17,7 @@
 #include <linux/deferqueue.h>
 #include <net/tcp_states.h>
 #include <net/tcp.h>
+#include <net/ipv6.h>
 
 struct dq_sock {
 	struct ckpt_ctx *ctx;
@@ -28,6 +29,233 @@ struct dq_buffers {
 	struct sock *sk;
 };
 
+struct listen_item {
+	struct sock *sk;
+	struct list_head list;
+};
+
+void sock_listening_list_free(struct list_head *head)
+{
+	struct listen_item *item, *tmp;
+
+	list_for_each_entry_safe(item, tmp, head, list) {
+		list_del(&item->list);
+		kfree(item);
+	}
+}
+
+static int sock_listening_list_add(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct listen_item *item;
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->sk = sk;
+	list_add(&item->list, &ctx->listen_sockets);
+
+	return 0;
+}
+
+static struct sock *sock_get_parent(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct listen_item *item;
+
+	list_for_each_entry(item, &ctx->listen_sockets, list) {
+		if (inet_sk(sk)->sport == inet_sk(item->sk)->sport)
+			return item->sk;
+	}
+
+	return NULL;
+}
+
+static int sock_hash_parent(void *data)
+{
+	struct dq_sock *dq = (struct dq_sock *)data;
+	struct sock *parent;
+
+	ckpt_debug("INET post-restart hash\n");
+
+	dq->sk->sk_prot->hash(dq->sk);
+
+	/* If there is a listening socket with the same source port,
+	 * then become a child of that socket [we are the result of an
+	 * accept()].  Otherwise hash ourselves directly in [we are
+	 * the result of a connect()]
+	 */
+
+	parent = sock_get_parent(dq->ctx, dq->sk);
+	if (parent) {
+		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
+		local_bh_disable();
+		__inet_inherit_port(parent, dq->sk);
+		local_bh_enable();
+	} else {
+		inet_sk(dq->sk)->num = 0;
+		inet_hash_connect(&tcp_death_row, dq->sk);
+		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
+	}
+
+	return 0;
+}
+
+static int sock_defer_hash(struct ckpt_ctx *ctx, struct sock *sock)
+{
+	struct dq_sock dq;
+
+	dq.sk = sock;
+	dq.ctx = ctx;
+
+	return deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
+			      sock_hash_parent, NULL);
+}
+
+static int sock_inet_tcp_cptrst(struct ckpt_ctx *ctx,
+				struct tcp_sock *sk,
+				struct ckpt_hdr_socket_inet *hh,
+				int op)
+{
+	CKPT_COPY(op, hh->tcp.rcv_nxt, sk->rcv_nxt);
+	CKPT_COPY(op, hh->tcp.copied_seq, sk->copied_seq);
+	CKPT_COPY(op, hh->tcp.rcv_wup, sk->rcv_wup);
+	CKPT_COPY(op, hh->tcp.snd_nxt, sk->snd_nxt);
+	CKPT_COPY(op, hh->tcp.snd_una, sk->snd_una);
+	CKPT_COPY(op, hh->tcp.snd_sml, sk->snd_sml);
+	CKPT_COPY(op, hh->tcp.rcv_tstamp, sk->rcv_tstamp);
+	CKPT_COPY(op, hh->tcp.lsndtime, sk->lsndtime);
+
+	CKPT_COPY(op, hh->tcp.snd_wl1, sk->snd_wl1);
+	CKPT_COPY(op, hh->tcp.snd_wnd, sk->snd_wnd);
+	CKPT_COPY(op, hh->tcp.max_window, sk->max_window);
+	CKPT_COPY(op, hh->tcp.mss_cache, sk->mss_cache);
+	CKPT_COPY(op, hh->tcp.window_clamp, sk->window_clamp);
+	CKPT_COPY(op, hh->tcp.rcv_ssthresh, sk->rcv_ssthresh);
+	CKPT_COPY(op, hh->tcp.frto_highmark, sk->frto_highmark);
+	CKPT_COPY(op, hh->tcp.advmss, sk->advmss);
+	CKPT_COPY(op, hh->tcp.frto_counter, sk->frto_counter);
+	CKPT_COPY(op, hh->tcp.nonagle, sk->nonagle);
+
+	CKPT_COPY(op, hh->tcp.srtt, sk->srtt);
+	CKPT_COPY(op, hh->tcp.mdev, sk->mdev);
+	CKPT_COPY(op, hh->tcp.mdev_max, sk->mdev_max);
+	CKPT_COPY(op, hh->tcp.rttvar, sk->rttvar);
+	CKPT_COPY(op, hh->tcp.rtt_seq, sk->rtt_seq);
+
+	CKPT_COPY(op, hh->tcp.packets_out, sk->packets_out);
+	CKPT_COPY(op, hh->tcp.retrans_out, sk->retrans_out);
+
+	CKPT_COPY(op, hh->tcp.urg_data, sk->urg_data);
+	CKPT_COPY(op, hh->tcp.ecn_flags, sk->ecn_flags);
+	CKPT_COPY(op, hh->tcp.reordering, sk->reordering);
+	CKPT_COPY(op, hh->tcp.snd_up, sk->snd_up);
+
+	CKPT_COPY(op, hh->tcp.keepalive_probes, sk->keepalive_probes);
+
+	CKPT_COPY(op, hh->tcp.rcv_wnd, sk->rcv_wnd);
+	CKPT_COPY(op, hh->tcp.write_seq, sk->write_seq);
+	CKPT_COPY(op, hh->tcp.pushed_seq, sk->pushed_seq);
+	CKPT_COPY(op, hh->tcp.lost_out, sk->lost_out);
+	CKPT_COPY(op, hh->tcp.sacked_out, sk->sacked_out);
+	CKPT_COPY(op, hh->tcp.fackets_out, sk->fackets_out);
+	CKPT_COPY(op, hh->tcp.tso_deferred, sk->tso_deferred);
+	CKPT_COPY(op, hh->tcp.bytes_acked, sk->bytes_acked);
+
+	CKPT_COPY(op, hh->tcp.lost_cnt_hint, sk->lost_cnt_hint);
+	CKPT_COPY(op, hh->tcp.retransmit_high, sk->retransmit_high);
+
+	CKPT_COPY(op, hh->tcp.lost_retrans_low, sk->lost_retrans_low);
+
+	CKPT_COPY(op, hh->tcp.prior_ssthresh, sk->prior_ssthresh);
+	CKPT_COPY(op, hh->tcp.high_seq, sk->high_seq);
+
+	CKPT_COPY(op, hh->tcp.retrans_stamp, sk->retrans_stamp);
+	CKPT_COPY(op, hh->tcp.undo_marker, sk->undo_marker);
+	CKPT_COPY(op, hh->tcp.undo_retrans, sk->undo_retrans);
+	CKPT_COPY(op, hh->tcp.total_retrans, sk->total_retrans);
+
+	CKPT_COPY(op, hh->tcp.urg_seq, sk->urg_seq);
+	CKPT_COPY(op, hh->tcp.keepalive_time, sk->keepalive_time);
+	CKPT_COPY(op, hh->tcp.keepalive_intvl, sk->keepalive_intvl);
+
+	return 0;
+}
+
+static int sock_inet_restore_addrs(struct inet_sock *inet,
+				   struct ckpt_hdr_socket_inet *hh)
+{
+	inet->daddr = hh->raddr.sin_addr.s_addr;
+	inet->saddr = hh->laddr.sin_addr.s_addr;
+	inet->rcv_saddr = inet->saddr;
+
+	inet->dport = hh->raddr.sin_port;
+	inet->sport = hh->laddr.sin_port;
+
+	return 0;
+}
+
+static int sock_inet_cptrst(struct ckpt_ctx *ctx,
+			    struct sock *sk,
+			    struct ckpt_hdr_socket_inet *hh,
+			    int op)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ret;
+
+	if (op == CKPT_CPT) {
+		CKPT_COPY(op, hh->daddr, inet->daddr);
+		CKPT_COPY(op, hh->rcv_saddr, inet->rcv_saddr);
+		CKPT_COPY(op, hh->dport, inet->dport);
+		CKPT_COPY(op, hh->saddr, inet->saddr);
+		CKPT_COPY(op, hh->sport, inet->sport);
+	} else {
+		ret = sock_inet_restore_addrs(inet, hh);
+		if (ret)
+			return ret;
+	}
+
+	CKPT_COPY(op, hh->num, inet->num);
+	CKPT_COPY(op, hh->uc_ttl, inet->uc_ttl);
+	CKPT_COPY(op, hh->cmsg_flags, inet->cmsg_flags);
+
+	CKPT_COPY(op, hh->icsk_ack.pending, icsk->icsk_ack.pending);
+	CKPT_COPY(op, hh->icsk_ack.quick, icsk->icsk_ack.quick);
+	CKPT_COPY(op, hh->icsk_ack.pingpong, icsk->icsk_ack.pingpong);
+	CKPT_COPY(op, hh->icsk_ack.blocked, icsk->icsk_ack.blocked);
+	CKPT_COPY(op, hh->icsk_ack.ato, icsk->icsk_ack.ato);
+	CKPT_COPY(op, hh->icsk_ack.timeout, icsk->icsk_ack.timeout);
+	CKPT_COPY(op, hh->icsk_ack.lrcvtime, icsk->icsk_ack.lrcvtime);
+	CKPT_COPY(op,
+		  hh->icsk_ack.last_seg_size, icsk->icsk_ack.last_seg_size);
+	CKPT_COPY(op, hh->icsk_ack.rcv_mss, icsk->icsk_ack.rcv_mss);
+
+	if (sk->sk_protocol == IPPROTO_TCP)
+		ret = sock_inet_tcp_cptrst(ctx, tcp_sk(sk), hh, op);
+	else if (sk->sk_protocol == IPPROTO_UDP)
+		ret = 0;
+	else {
+		ckpt_write_err(ctx, "T", "unknown socket protocol %d",
+			       sk->sk_protocol);
+		ret = -EINVAL;
+	}
+
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *inet6 = inet6_sk(sk);
+		if (op == CKPT_CPT) {
+			ipv6_addr_copy(&hh->inet6.saddr, &inet6->saddr);
+			ipv6_addr_copy(&hh->inet6.rcv_saddr, &inet6->rcv_saddr);
+			ipv6_addr_copy(&hh->inet6.daddr, &inet6->daddr);
+		} else {
+			ipv6_addr_copy(&inet6->saddr, &hh->inet6.saddr);
+			ipv6_addr_copy(&inet6->rcv_saddr, &hh->inet6.rcv_saddr);
+			ipv6_addr_copy(&inet6->daddr, &hh->inet6.daddr);
+		}
+	}
+
+	return ret;
+}
+
 int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 {
 	struct ckpt_hdr_socket_inet *in;
@@ -43,6 +271,10 @@ int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 	if (ret)
 		goto out;
 
+	ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_CPT);
+	if (ret < 0)
+		goto out;
+
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
  out:
 	ckpt_hdr_put(ctx, in);
@@ -87,9 +319,9 @@ static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
 	if (ret < 0)
 		goto out;
 
-	spin_lock(&queue->lock);
+	sock_restore_header_info(skb, h);
+
 	skb_queue_tail(queue, skb);
-	spin_unlock(&queue->lock);
  out:
 	ckpt_hdr_put(ctx, h);
 
@@ -209,8 +441,35 @@ int inet_restore(struct ckpt_ctx *ctx,
 			ckpt_debug("inet listen: %i\n", ret);
 			if (ret < 0)
 				goto out;
+
+			/* We are a listening socket, so add ourselves
+			 * to the list of parent sockets.  This will
+			 * allow our children to find us later and
+			 * link up
+			 */
+
+			ret = sock_listening_list_add(ctx, sock->sk);
+			if (ret < 0)
+				goto out;
 		}
 	} else {
+		ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_RST);
+		if (ret)
+			goto out;
+
+		if ((h->sock.state == TCP_ESTABLISHED) &&
+		    (h->sock.protocol == IPPROTO_TCP)) {
+			/* A connected socket that was spawned from an
+			 * accept() needs to be hashed with its parent
+			 * listening socket in order to receive
+			 * traffic on the original port.  Since we may
+			 * not have restarted the parent yet, we defer
+			 * this until later when we know we have all
+			 * the listening sockets accounted for.
+			 */
+			ret = sock_defer_hash(ctx, sock->sk);
+		}
+
 		if (!sock_flag(sock->sk, SOCK_DEAD))
 			ret = inet_defer_restore_buffers(ctx, sock->sk);
 	}
-- 
1.6.2.5


^ permalink raw reply related

* Re: pktgen and spin_lock_bh in xmit path
From: Ben Greear @ 2009-10-20 21:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE0770.8060708@gmail.com>

On 10/20/2009 11:54 AM, Eric Dumazet wrote:

> -	queue_map = skb_get_queue_mapping(pkt_dev->skb);
> +	queue_map = pkt_dev->cur_queue_map;
> +	/*
> +	 * tells skb_tx_hash() to use this tx queue.
> +	 * We should reset skb->mapping before each xmit() because
> +	 * xmit() might change it.
> +	 */
> +	skb_record_rx_queue(pkt_dev->skb, queue_map);
>   	txq = netdev_get_tx_queue(odev, queue_map);

I think that must be wrong.  The record_rx_queue sets it to queue_map + 1,
but the hard-start-xmit method (in ixgbe/ixgbe_main.c, at least), takes the
skb->queue_map and uses it as an index with no subtraction.

This causes watchdog timeouts because we are calling txq_trans_update in pktgen on
queue 0, for instance, but sending pkts on queue 1.  If queue 1 is ever
busy when the WD fires, link is reset.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply

* Re: pktgen and spin_lock_bh in xmit path
From: Eric Dumazet @ 2009-10-20 21:22 UTC (permalink / raw)
  To: Ben Greear; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE2735.9000807@candelatech.com>

Ben Greear a écrit :
> On 10/20/2009 11:54 AM, Eric Dumazet wrote:
> 
>> -    queue_map = skb_get_queue_mapping(pkt_dev->skb);
>> +    queue_map = pkt_dev->cur_queue_map;
>> +    /*
>> +     * tells skb_tx_hash() to use this tx queue.
>> +     * We should reset skb->mapping before each xmit() because
>> +     * xmit() might change it.
>> +     */
>> +    skb_record_rx_queue(pkt_dev->skb, queue_map);
>>       txq = netdev_get_tx_queue(odev, queue_map);
> 
> I think that must be wrong.  The record_rx_queue sets it to queue_map + 1,
> but the hard-start-xmit method (in ixgbe/ixgbe_main.c, at least), takes the
> skb->queue_map and uses it as an index with no subtraction.


Yes but check net/core/dev.c I quoted in my previous mail :
We change queue_map if skb goes through dev_queue_xmit()
(as done by macvlan)

u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
        u32 hash;

        if (skb_rx_queue_recorded(skb)) {
                hash = skb_get_rx_queue(skb);
                while (unlikely(hash >= dev->real_num_tx_queues))
                        hash -= dev->real_num_tx_queues;
                return hash;
        }

        if (skb->sk && skb->sk->sk_hash)
                hash = skb->sk->sk_hash;
        else
                hash = skb->protocol;

        hash = jhash_1word(hash, skb_tx_hashrnd);

        return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
EXPORT_SYMBOL(skb_tx_hash);

static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                                        struct sk_buff *skb)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        u16 queue_index = 0;

        if (ops->ndo_select_queue)
                queue_index = ops->ndo_select_queue(dev, skb);
        else if (dev->real_num_tx_queues > 1)
                queue_index = skb_tx_hash(dev, skb);

        skb_set_queue_mapping(skb, queue_index);
        return netdev_get_tx_queue(dev, queue_index);
}

So if skb->queue_mapping was X+1 before entering dev_pick_tx(), it is X when
leaving dev_pick_tx()

> 
> This causes watchdog timeouts because we are calling txq_trans_update in
> pktgen on
> queue 0, for instance, but sending pkts on queue 1.  If queue 1 is ever
> busy when the WD fires, link is reset.
> 

Problem is not pktgen IMHO. 

Problem is skb->queue_mapping has different meaning if skb is directly given to a real device -> start_xmit()

( In this case skb->queue_mapping should be between [O ... real_num_tx_queues-1])

But if it goes through dev_queue_xmit(), it should be set between [1 .. real_num_tx_queues], because
dev_pick_tx() will decrement skb->queue_mapping

In fact skb->queue_mapping only works for forwarded packets, not locally generated ones.

I am too tired to cook a fix at this moment, sorry :(

^ permalink raw reply

* Re: pktgen and spin_lock_bh in xmit path
From: Ben Greear @ 2009-10-20 21:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE2A24.6080300@gmail.com>

On 10/20/2009 02:22 PM, Eric Dumazet wrote:

> Problem is skb->queue_mapping has different meaning if skb is directly given to a real device ->  start_xmit()
>
> ( In this case skb->queue_mapping should be between [O ... real_num_tx_queues-1])
>
> But if it goes through dev_queue_xmit(), it should be set between [1 .. real_num_tx_queues], because
> dev_pick_tx() will decrement skb->queue_mapping
>
> In fact skb->queue_mapping only works for forwarded packets, not locally generated ones.
>
> I am too tired to cook a fix at this moment, sorry :(

That's definitely a nasty little issue.  Using skb_set_queue_mapping
in pktgen makes it run for me, but may just be getting lucky with the
mac-vlan interfaces which will do the dev_queue_xmit (but, I don't so much
care exactly what queue is used as long as things don't crash and the
link doesn't reset).

Don't worry about a quick patch on my account.  I seem to have it working
to at least some degree (no funny crashes, no link watchdog timeouts).

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply

* Re: pktgen and spin_lock_bh in xmit path
From: Eric Dumazet @ 2009-10-20 21:57 UTC (permalink / raw)
  To: Ben Greear; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE2C00.8030900@candelatech.com>

Ben Greear a écrit :
> That's definitely a nasty little issue.  Using skb_set_queue_mapping
> in pktgen makes it run for me, but may just be getting lucky with the
> mac-vlan interfaces which will do the dev_queue_xmit (but, I don't so much
> care exactly what queue is used as long as things don't crash and the
> link doesn't reset).
> 
> Don't worry about a quick patch on my account.  I seem to have it working
> to at least some degree (no funny crashes, no link watchdog timeouts).
> 

Could you try following patch ?

This makes queue_mapping invariant if set in range [0 ... real_num_tx_queues-1]

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index df7b23a..3ef2429 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2023,17 +2023,17 @@ static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_bu
 
 static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
 {
-	skb->queue_mapping = rx_queue + 1;
+	skb->queue_mapping = rx_queue;
 }
 
 static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
 {
-	return skb->queue_mapping - 1;
+	return skb->queue_mapping;
 }
 
 static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
 {
-	return (skb->queue_mapping != 0);
+	return (skb->queue_mapping != 0xffff);
 }
 
 extern u16 skb_tx_hash(const struct net_device *dev,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 80a9616..5f04f00 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -198,6 +198,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	skb->truesize = size + sizeof(struct sk_buff);
 	atomic_set(&skb->users, 1);
+	skb->queue_mapping = 0xffff;
 	skb->head = data;
 	skb->data = data;
 	skb_reset_tail_pointer(skb);



^ permalink raw reply related

* Re: [patch 0/3] KS8851 updates for -rc5
From: Ben Dooks @ 2009-10-20 22:26 UTC (permalink / raw)
  To: Doong, Ping; +Cc: netdev, Li, Charles
In-Reply-To: <6DA9692E60FD324FBF17D6336890798104259C@MELANITE.micrel.com>

Doong, Ping wrote:
> Hi Ben,
> 
> I'm sorry. I am busy on other project, and have got the change to verify
> your new patch on our side. I will do it soon.

 From testing, these verify for me now that I've seen the
issues for myself. I belive they're ready for merging so
unless Dave (or someone else) objects, then I'd like to
see them in as soon as possible.

Thanks for the bug reports.

-- 
Ben Dooks, Software Engineer, Simtec Electronics

http://www.simtec.co.uk/

^ permalink raw reply

* [PATCH v2] fix section mismatch in fec.c
From: Steven King @ 2009-10-20 23:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: netdev

fec_enet_init is called by both fec_probe and fec_resume, so it shouldn't 
be marked as __init.

Signed-off-by: Steven King <sfking@fdwdc.com>


diff --git a/drivers/net/fec.c b/drivers/net/fec.c
index 2923438..16a1d58 100644
--- a/drivers/net/fec.c
+++ b/drivers/net/fec.c
@@ -1654,7 +1654,7 @@ static const struct net_device_ops fec_netdev_ops = {
   *
   * index is only used in legacy code
   */
-int __init fec_enet_init(struct net_device *dev, int index)
+static int fec_enet_init(struct net_device *dev, int index)
 {
 	struct fec_enet_private *fep = netdev_priv(dev);
 	struct bufdesc *cbd_base;

-- 
Steven King -- sfking at fdwdc dot com

^ permalink raw reply related

* Re: pktgen and spin_lock_bh in xmit path
From: Ben Greear @ 2009-10-20 23:17 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE3253.10302@gmail.com>

On 10/20/2009 02:57 PM, Eric Dumazet wrote:
> Ben Greear a écrit :
>> That's definitely a nasty little issue.  Using skb_set_queue_mapping
>> in pktgen makes it run for me, but may just be getting lucky with the
>> mac-vlan interfaces which will do the dev_queue_xmit (but, I don't so much
>> care exactly what queue is used as long as things don't crash and the
>> link doesn't reset).
>>
>> Don't worry about a quick patch on my account.  I seem to have it working
>> to at least some degree (no funny crashes, no link watchdog timeouts).
>>
>
> Could you try following patch ?
>
> This makes queue_mapping invariant if set in range [0 ... real_num_tx_queues-1]

Yes, that runs w/out causing link resets and without crashes (just tested it for
a few minutes).

Interestingly, the pkts sent by pktgen on the mac-vlans end up in
tx-queues that match processor ID, even though I'm on .31 where mac-vlans
have only one tx-queue and pktgen is setting the queue to 0 in the skb
(per your previous patch).

Must be something somewhere doing the mapping of processor ids to txqs.

I'll try to figure this out this evening or tomorrow.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply

* Re: [PATCH RFC] Per route TCP options
From: David Miller @ 2009-10-21  0:36 UTC (permalink / raw)
  To: gilad; +Cc: netdev, ori
In-Reply-To: <1256052161-14156-1-git-send-email-gilad@codefidence.com>


When sending more than one patch in a patch set, you must
number your patches so that people know in what order your
pathes should be applied.

Mailing lists and other entities reorder list postings
quite severely, so it's not "obvious" what order your
postings matter just based upon arrival order.

^ permalink raw reply

* Re: [PATCH RFC] Allow to turn off TCP window scale opt per route
From: Stephen Hemminger @ 2009-10-21  1:40 UTC (permalink / raw)
  To: Gilad Ben-Yossef; +Cc: netdev, ori, Gilad Ben-Yossef
In-Reply-To: <1256052161-14156-7-git-send-email-gilad@codefidence.com>

On Tue, 20 Oct 2009 17:22:39 +0200
Gilad Ben-Yossef <gilad@codefidence.com> wrote:

> Add and use no window scale bit in the features field.
> 
> Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
> Sigend-off-by: Ori Finkelman <ori@comsleep.com>
> Sigend-off-by: Yony Amit <yony@comsleep.com>

The same effect can by just using window limit on route


^ permalink raw reply

* Re: [PATCH v2] fix section mismatch in fec.c
From: David Miller @ 2009-10-21  1:51 UTC (permalink / raw)
  To: sfking; +Cc: linux-kernel, netdev
In-Reply-To: <200910201600.10545.sfking@fdwdc.com>

From: Steven King <sfking@fdwdc.com>
Date: Tue, 20 Oct 2009 16:00:10 -0700

> fec_enet_init is called by both fec_probe and fec_resume, so it shouldn't 
> be marked as __init.
> 
> Signed-off-by: Steven King <sfking@fdwdc.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH 0/4 v4] net: Implement fast TX queue selection
From: David Miller @ 2009-10-21  1:59 UTC (permalink / raw)
  To: krkumar2; +Cc: netdev, herbert, dada1
In-Reply-To: <20091020094607.10404.81794.sendpatchset@localhost.localdomain>


I've applied this set to net-next-2.6, will push out to kernel.org
after some build tests, thanks!

^ permalink raw reply

* Re: [patch 0/3] KS8851 updates for -rc5
From: David Miller @ 2009-10-21  2:11 UTC (permalink / raw)
  To: ben; +Cc: Ping.Doong, netdev, Charles.Li
In-Reply-To: <4ADE3916.3050206@simtec.co.uk>

From: Ben Dooks <ben@simtec.co.uk>
Date: Tue, 20 Oct 2009 23:26:30 +0100

> Doong, Ping wrote:
>> Hi Ben,
>> I'm sorry. I am busy on other project, and have got the change to
>> verify
>> your new patch on our side. I will do it soon.
> 
> From testing, these verify for me now that I've seen the
> issues for myself. I belive they're ready for merging so
> unless Dave (or someone else) objects, then I'd like to
> see them in as soon as possible.

All applied to net-next-2.6, thanks.

^ permalink raw reply

* Re: [patch 0/3] KS8851 updates for -rc5
From: David Miller @ 2009-10-21  2:12 UTC (permalink / raw)
  To: ben; +Cc: Ping.Doong, netdev, Charles.Li
In-Reply-To: <20091020.191142.193713900.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Tue, 20 Oct 2009 19:11:42 -0700 (PDT)

> From: Ben Dooks <ben@simtec.co.uk>
> Date: Tue, 20 Oct 2009 23:26:30 +0100
> 
>> Doong, Ping wrote:
>>> Hi Ben,
>>> I'm sorry. I am busy on other project, and have got the change to
>>> verify
>>> your new patch on our side. I will do it soon.
>> 
>> From testing, these verify for me now that I've seen the
>> issues for myself. I belive they're ready for merging so
>> unless Dave (or someone else) objects, then I'd like to
>> see them in as soon as possible.
> 
> All applied to net-next-2.6, thanks.

Sorry, I meant 'net-2.6'

^ permalink raw reply

* Re: [PATCH RFC] Per route TCP options
From: Bill Fink @ 2009-10-21  2:13 UTC (permalink / raw)
  To: Gilad Ben-Yossef; +Cc: netdev, ori
In-Reply-To: <1256052161-14156-1-git-send-email-gilad@codefidence.com>

On Tue, 20 Oct 2009, Gilad Ben-Yossef wrote:

> Turn the global sysctls allowing disabling of TCP SACK, DSCAK,
> time stamp and window scale into per route entry feature options,
> laying the ground to future removal of the relevant global sysctls.
> 
> You really only want to disable SACK, DSACK, time stamp or window
> scale if you've got a piece of broken networking equipment somewhere 
> as a stop gap until you can bring a big enough hammer to deal with
> the broken network equipment. It doesn't make sense to "punish" the
> entire connections going through the machine to destinations not 
> related to the broken equipment.

For certain test situations, it is sometimes desirable to globally
disable TCP timestamps.  Although I have not personally wanted to
globally disable the other mentioned features, I can imagine test
scenarios where it could be useful.  Admittedly it could also be
accomplished with per route features, just not as conveniently,
especially if there are a large number of interfaces and/or routes.

						-Bill

^ permalink raw reply

* [net-next-2.6 PATCH 1/3] irq: Export irq_set_affinity() for drivers
From: Jeff Kirsher @ 2009-10-21  2:26 UTC (permalink / raw)
  To: davem; +Cc: gospo, netdev, Peter P Waskiewicz Jr, Jeff Kirsher

From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>

This patch allows drivers to specify an IRQ affinity mask for
their respective interrupt sources.  This is very useful on
network adapters using MSI-X, where aligning network flows
linearly to CPUs greatly improves efficiency of the network
stack.

Today, users must either hand-set affinity through /proc, or
use a script through the same interface.  This patch will allow
a driver to come completely pre-canned with an optimal
configuration.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 kernel/irq/manage.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c66..185eb26 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -137,6 +137,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
 }
+EXPORT_SYMBOL(irq_set_affinity);
 
 #ifndef CONFIG_AUTO_IRQ_AFFINITY
 /*


^ permalink raw reply related

* [net-next-2.6 PATCH 2/3] ixgbe: Set MSI-X vectors to NOBALANCING and set affinity
From: Jeff Kirsher @ 2009-10-21  2:27 UTC (permalink / raw)
  To: davem; +Cc: gospo, netdev, Peter P Waskiewicz Jr, Jeff Kirsher
In-Reply-To: <20091021022626.32449.73883.stgit@localhost.localdomain>

From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>

This patch will set each MSI-X vector to IRQF_NOBALANCING to
prevent autobalance of the interrupts, then applies a CPU
affinity.  This will only be done when Flow Director is enabled,
which needs interrupts to be processed on the same CPUs where the
applications are running.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe_main.c |   34 +++++++++++++++++++++++++++++-----
 1 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 4c8a449..d2280c3 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -1565,8 +1565,10 @@ static int ixgbe_request_msix_irqs(struct ixgbe_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
 	irqreturn_t (*handler)(int, void *);
-	int i, vector, q_vectors, err;
+	int i, vector, q_vectors, cpu, err;
 	int ri=0, ti=0;
+	u32 intr_flags = 0;
+	u32 num_cpus = num_online_cpus();
 
 	/* Decrement for Other and TCP Timer vectors */
 	q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS;
@@ -1576,17 +1578,22 @@ static int ixgbe_request_msix_irqs(struct ixgbe_adapter *adapter)
 	if (err)
 		goto out;
 
+	/* If Flow Director is enabled, we want to affinitize vectors */
+	if ((adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE) ||
+	    (adapter->flags & IXGBE_FLAG_FDIR_PERFECT_CAPABLE))
+		intr_flags = IRQF_NOBALANCING;
+
 #define SET_HANDLER(_v) ((!(_v)->rxr_count) ? &ixgbe_msix_clean_tx : \
                          (!(_v)->txr_count) ? &ixgbe_msix_clean_rx : \
                          &ixgbe_msix_clean_many)
-	for (vector = 0; vector < q_vectors; vector++) {
+	for (vector = 0, cpu = 0; vector < q_vectors; vector++) {
 		handler = SET_HANDLER(adapter->q_vector[vector]);
 
-		if(handler == &ixgbe_msix_clean_rx) {
+		if (handler == &ixgbe_msix_clean_rx) {
 			sprintf(adapter->name[vector], "%s-%s-%d",
 				netdev->name, "rx", ri++);
 		}
-		else if(handler == &ixgbe_msix_clean_tx) {
+		else if (handler == &ixgbe_msix_clean_tx) {
 			sprintf(adapter->name[vector], "%s-%s-%d",
 				netdev->name, "tx", ti++);
 		}
@@ -1595,7 +1602,8 @@ static int ixgbe_request_msix_irqs(struct ixgbe_adapter *adapter)
 				netdev->name, "TxRx", vector);
 
 		err = request_irq(adapter->msix_entries[vector].vector,
-		                  handler, 0, adapter->name[vector],
+		                  handler, intr_flags,
+		                  adapter->name[vector],
 		                  adapter->q_vector[vector]);
 		if (err) {
 			DPRINTK(PROBE, ERR,
@@ -1603,9 +1611,25 @@ static int ixgbe_request_msix_irqs(struct ixgbe_adapter *adapter)
 			        "Error: %d\n", err);
 			goto free_queue_irqs;
 		}
+		if (intr_flags) {
+			/*
+			 * We're not balancing the vector, so affinitize it.
+			 * Best default layout is try and assign one vector
+			 * per CPU.  If we have more vectors than online
+			 * CPUs, then try to first affinitize Rx, then lay
+			 * Tx over the same Rx CPU map.  This can always be
+			 * overridden using smp_affinity in /proc
+			 */
+
+			irq_set_affinity(adapter->msix_entries[vector].vector,
+			                 cpumask_of(cpu));
+			if (++cpu >= num_cpus)
+				cpu = 0;
+		}
 	}
 
 	sprintf(adapter->name[vector], "%s:lsc", netdev->name);
+	/* We don't care if this vector is irqbalanced or not */
 	err = request_irq(adapter->msix_entries[vector].vector,
 	                  &ixgbe_msix_lsc, 0, adapter->name[vector], netdev);
 	if (err) {


^ permalink raw reply related

* [net-next-2.6 PATCH 3/3] ixgbe: Make queue pairs on single MSI-X interrupts
From: Jeff Kirsher @ 2009-10-21  2:27 UTC (permalink / raw)
  To: davem; +Cc: gospo, netdev, Peter P Waskiewicz Jr, Jeff Kirsher
In-Reply-To: <20091021022626.32449.73883.stgit@localhost.localdomain>

From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>

This patch pairs similar-numbered Rx and Tx queues onto a single
MSI-X vector.  For example, Tx queue 0 and Rx queue 0's interrupt
with be ethX-RxTx-0.  This allows for more efficient cleanup, since
fewer interrupts will be firing during device operation.  It also
helps with a cleaner CPU affinity for IRQ affinity.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe_main.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index d2280c3..00b6829 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -3579,10 +3579,10 @@ static int ixgbe_set_interrupt_capability(struct ixgbe_adapter *adapter)
 	 * It's easy to be greedy for MSI-X vectors, but it really
 	 * doesn't do us much good if we have a lot more vectors
 	 * than CPU's.  So let's be conservative and only ask for
-	 * (roughly) twice the number of vectors as there are CPU's.
+	 * (roughly) the same number of vectors as there are CPU's.
 	 */
 	v_budget = min(adapter->num_rx_queues + adapter->num_tx_queues,
-	               (int)(num_online_cpus() * 2)) + NON_Q_VECTORS;
+	               (int)num_online_cpus()) + NON_Q_VECTORS;
 
 	/*
 	 * At the same time, hardware can only support a maximum of


^ permalink raw reply related

* Re: pktgen and spin_lock_bh in xmit path
From: Ben Greear @ 2009-10-21  3:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE44FC.4030406@candelatech.com>

Ben Greear wrote:
> On 10/20/2009 02:57 PM, Eric Dumazet wrote:
>> Ben Greear a écrit :
>>> That's definitely a nasty little issue.  Using skb_set_queue_mapping
>>> in pktgen makes it run for me, but may just be getting lucky with the
>>> mac-vlan interfaces which will do the dev_queue_xmit (but, I don't 
>>> so much
>>> care exactly what queue is used as long as things don't crash and the
>>> link doesn't reset).
>>>
>>> Don't worry about a quick patch on my account.  I seem to have it 
>>> working
>>> to at least some degree (no funny crashes, no link watchdog timeouts).
>>>
>>
>> Could you try following patch ?
>>
>> This makes queue_mapping invariant if set in range [0 ... 
>> real_num_tx_queues-1]
>
> Yes, that runs w/out causing link resets and without crashes (just 
> tested it for
> a few minutes).
>
> Interestingly, the pkts sent by pktgen on the mac-vlans end up in
> tx-queues that match processor ID, even though I'm on .31 where mac-vlans
> have only one tx-queue and pktgen is setting the queue to 0 in the skb
> (per your previous patch).
Ok, this is because ixgbe implements the ndo_select_queue, which is 
called from
dev_pick_tx.

So, as far as I can tell, as long as you are using ixgbe with 82559, it 
doesn't matter what you set
for the queue-map in the skb..it's always over-written.

I don't know if this is a bug or a feature, but it explains the behavior 
with tx and rx queues
that I saw when using pktgen on mac-vlans...

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com> 
Candela Technologies Inc  http://www.candelatech.com



^ permalink raw reply

* Re: pktgen and spin_lock_bh in xmit path
From: Eric Dumazet @ 2009-10-21  3:12 UTC (permalink / raw)
  To: Ben Greear; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE7A63.4090404@candelatech.com>

Ben Greear a écrit :
> Ben Greear wrote:
>> On 10/20/2009 02:57 PM, Eric Dumazet wrote:
>>> Ben Greear a écrit :
>>>> That's definitely a nasty little issue.  Using skb_set_queue_mapping
>>>> in pktgen makes it run for me, but may just be getting lucky with the
>>>> mac-vlan interfaces which will do the dev_queue_xmit (but, I don't
>>>> so much
>>>> care exactly what queue is used as long as things don't crash and the
>>>> link doesn't reset).
>>>>
>>>> Don't worry about a quick patch on my account.  I seem to have it
>>>> working
>>>> to at least some degree (no funny crashes, no link watchdog timeouts).
>>>>
>>>
>>> Could you try following patch ?
>>>
>>> This makes queue_mapping invariant if set in range [0 ...
>>> real_num_tx_queues-1]
>>
>> Yes, that runs w/out causing link resets and without crashes (just
>> tested it for
>> a few minutes).
>>
>> Interestingly, the pkts sent by pktgen on the mac-vlans end up in
>> tx-queues that match processor ID, even though I'm on .31 where mac-vlans
>> have only one tx-queue and pktgen is setting the queue to 0 in the skb
>> (per your previous patch).
> Ok, this is because ixgbe implements the ndo_select_queue, which is
> called from
> dev_pick_tx.
> 
> So, as far as I can tell, as long as you are using ixgbe with 82559, it
> doesn't matter what you set
> for the queue-map in the skb..it's always over-written.
> 
> I don't know if this is a bug or a feature, but it explains the behavior
> with tx and rx queues
> that I saw when using pktgen on mac-vlans...
> 

We have many bugs in this area :)

So we probably need to reset skb_set_queue_mapping(skb, queue_map);
each time skb is transmitted by pktgen.

Or else, pktgen will break if using bonding driver -> ixgbe, since bonding
uses only one txqueue (it is not yet multiqueue aware)

Thanks, I'll take care of official patches submission


^ permalink raw reply

* Re: pktgen and spin_lock_bh in xmit path
From: Eric Dumazet @ 2009-10-21  3:59 UTC (permalink / raw)
  To: Ben Greear; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE7C0D.5070208@gmail.com>

Eric Dumazet a écrit :
> Ben Greear a écrit :
>> Ben Greear wrote:
>>> On 10/20/2009 02:57 PM, Eric Dumazet wrote:
>>>> Ben Greear a écrit :
>>>>> That's definitely a nasty little issue.  Using skb_set_queue_mapping
>>>>> in pktgen makes it run for me, but may just be getting lucky with the
>>>>> mac-vlan interfaces which will do the dev_queue_xmit (but, I don't
>>>>> so much
>>>>> care exactly what queue is used as long as things don't crash and the
>>>>> link doesn't reset).
>>>>>
>>>>> Don't worry about a quick patch on my account.  I seem to have it
>>>>> working
>>>>> to at least some degree (no funny crashes, no link watchdog timeouts).
>>>>>
>>>> Could you try following patch ?
>>>>
>>>> This makes queue_mapping invariant if set in range [0 ...
>>>> real_num_tx_queues-1]
>>> Yes, that runs w/out causing link resets and without crashes (just
>>> tested it for
>>> a few minutes).
>>>
>>> Interestingly, the pkts sent by pktgen on the mac-vlans end up in
>>> tx-queues that match processor ID, even though I'm on .31 where mac-vlans
>>> have only one tx-queue and pktgen is setting the queue to 0 in the skb
>>> (per your previous patch).
>> Ok, this is because ixgbe implements the ndo_select_queue, which is
>> called from
>> dev_pick_tx.
>>
>> So, as far as I can tell, as long as you are using ixgbe with 82559, it
>> doesn't matter what you set
>> for the queue-map in the skb..it's always over-written.
>>
>> I don't know if this is a bug or a feature, but it explains the behavior
>> with tx and rx queues
>> that I saw when using pktgen on mac-vlans...
>>
> 
> We have many bugs in this area :)
> 
> So we probably need to reset skb_set_queue_mapping(skb, queue_map);
> each time skb is transmitted by pktgen.
> 
> Or else, pktgen will break if using bonding driver -> ixgbe, since bonding
> uses only one txqueue (it is not yet multiqueue aware)
> 

After some thoughts, I believe user is in error :)

pktgen should not use "clone XXX" pkts if macvlan is used (or any other driver
that ultimatly calls dev_queue_xmit() and queue packet), since skb queue anchor
is shared and would be overwritten.

1) Only way to use "clone XXXX" pkts is when using real device.

2) Also, using macvlan in pktgen is sub-optimal, since you can already put any
MAC addresses in pktgen pkts, you dont need to go through macvlan layer.

3) If ixgbe overwrites skb->queue_mapping to current cpu, you should setup pktgen
 queue_map_min and queue_map_max to match you cpu number, or use QUEUE_MAP_CPU pktgen flag
Or else, pktgen wont get  the appropriate txq (and lock) before calling driver start_xmit()

Unfortunatly, the (queue_map_min==queue_map_max) case needs a patch that might be not present in 2.6.31
(commit 896a7cf8d846a9e86fb823be16f4f14ffeb7f074 : pktgen: Fix multiqueue handling)


^ permalink raw reply

* [PATCH] pktgen: initialize IP ID field
From: Eric Dumazet @ 2009-10-21  4:22 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List

While playing with pktgen, I realized IP ID was not filled and a random value
was taken, possibly leaking 2 bytes of kernel memory.

We can use an increasing ID, this can help diagnostics anyway.


Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 1da0e03..59396a5 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -335,6 +335,7 @@ struct pktgen_dev {
 	__u32 cur_src_mac_offset;
 	__be32 cur_saddr;
 	__be32 cur_daddr;
+	__u16 ip_id;
 	__u16 cur_udp_dst;
 	__u16 cur_udp_src;
 	__u16 cur_queue_map;
@@ -2630,6 +2631,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	iph->protocol = IPPROTO_UDP;	/* UDP */
 	iph->saddr = pkt_dev->cur_saddr;
 	iph->daddr = pkt_dev->cur_daddr;
+	iph->id = htons(pkt_dev->ip_id);
+	pkt_dev->ip_id++;
 	iph->frag_off = 0;
 	iplen = 20 + 8 + datalen;
 	iph->tot_len = htons(iplen);

^ permalink raw reply related

* Re: pktgen and spin_lock_bh in xmit path
From: Ben Greear @ 2009-10-21  5:00 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE873F.3030903@gmail.com>

Eric Dumazet wrote:
> pktgen should not use "clone XXX" pkts if macvlan is used (or any other driver
> that ultimatly calls dev_queue_xmit() and queue packet), since skb queue anchor
> is shared and would be overwritten.
>   
> After some thoughts, I believe user is in error :)
I tried to explain in my original post:  The problem arises when
when the hard-start-xmit fails with NETDEV_TX_BUSY.  Part of the
hard-start-xmit logic for virtual devices can call dev_queue_xmit, which 
can ultimately
change the queue mapping and yet may still return NETDEV_TX_BUSY.

pktgen would try to resend this skb next loop, and this is where it would
blow up.

I have a patched macvlan logic and a patched dev queue xmit logic that 
allows
me to return NETDEV_TX_BUSY when underlying device fails to transmit.

It may be that my hacked macvlan is the only virtual device that could ever
return NETDEV_TX_BUSY, and if that is the case, I don't think the bug
could ever be hit in official kernel code.  My opinion is that the 
current pktgen code makes
too many assumptions, so unless there is a performance penalty, I still
think it should be cleaned up.  But, I may be too paranoid.
> 1) Only way to use "clone XXXX" pkts is when using real device.
>   
Agreed, and I was not cloning pkts on the mac-vlan interface.
> 2) Also, using macvlan in pktgen is sub-optimal, since you can already put any
> MAC addresses in pktgen pkts, you dont need to go through macvlan layer.
>   
It's sub-optimal for massive pkt pushing, but still useful for sending 
multiple distinct flows
across a single physical wire.
> 3) If ixgbe overwrites skb->queue_mapping to current cpu, you should setup pktgen
>  queue_map_min and queue_map_max to match you cpu number, or use QUEUE_MAP_CPU pktgen flag
> Or else, pktgen wont get  the appropriate txq (and lock) before calling driver start_xmit()
>   
The hard-start-xmit path doesn't call the driver's queue-mapping logic, 
so you
only get that fun when transmitting through mac-vlans (or .1q vlans, 
etc).  There appears to be
no watchdog for virtual devices, and the dev_queue_xmit path updates the 
proper txq, so, as long as you
aren't using that +1 variant of the skb set queue map logic in pktgen, I 
think you will be fine.  The
current code is fine in this manner, but your patch broke it w/out the 
second patch to remove the +1
logic.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com> 
Candela Technologies Inc  http://www.candelatech.com



^ permalink raw reply

* [PATCH] pktgen: Dont leak kernel memory
From: Eric Dumazet @ 2009-10-21  5:01 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List
In-Reply-To: <4ADE8C85.6020809@gmail.com>

Eric Dumazet a écrit :
> While playing with pktgen, I realized IP ID was not filled and a random value
> was taken, possibly leaking 2 bytes of kernel memory.
> 
> We can use an increasing ID, this can help diagnostics anyway.
> 
> 

Here is a more complete version of the patch, since we leak lot of kernel
memory :(

[PATCH] pktgen: Dont leak kernel memory

While playing with pktgen, I realized IP ID was not filled and a random value
was taken, possibly leaking 2 bytes of kernel memory.
 
We can use an increasing ID, this can help diagnostics anyway.

Also clear packet payload, instead of leaking kernel memory.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 1da0e03..5ce017b 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -335,6 +335,7 @@ struct pktgen_dev {
 	__u32 cur_src_mac_offset;
 	__be32 cur_saddr;
 	__be32 cur_daddr;
+	__u16 ip_id;
 	__u16 cur_udp_dst;
 	__u16 cur_udp_src;
 	__u16 cur_queue_map;
@@ -2630,6 +2631,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	iph->protocol = IPPROTO_UDP;	/* UDP */
 	iph->saddr = pkt_dev->cur_saddr;
 	iph->daddr = pkt_dev->cur_daddr;
+	iph->id = htons(pkt_dev->ip_id);
+	pkt_dev->ip_id++;
 	iph->frag_off = 0;
 	iplen = 20 + 8 + datalen;
 	iph->tot_len = htons(iplen);
@@ -2641,24 +2644,26 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	skb->dev = odev;
 	skb->pkt_type = PACKET_HOST;
 
-	if (pkt_dev->nfrags <= 0)
+	if (pkt_dev->nfrags <= 0) {
 		pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
-	else {
+		memset(pgh + 1, 0, datalen - sizeof(struct pktgen_hdr));
+	} else {
 		int frags = pkt_dev->nfrags;
-		int i;
+		int i, len;
 
 		pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
 
 		if (frags > MAX_SKB_FRAGS)
 			frags = MAX_SKB_FRAGS;
 		if (datalen > frags * PAGE_SIZE) {
-			skb_put(skb, datalen - frags * PAGE_SIZE);
+			len = datalen - frags * PAGE_SIZE;
+			memset(skb_put(skb, len), 0, len);
 			datalen = frags * PAGE_SIZE;
 		}
 
 		i = 0;
 		while (datalen > 0) {
-			struct page *page = alloc_pages(GFP_KERNEL, 0);
+			struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
 			skb_shinfo(skb)->frags[i].page = page;
 			skb_shinfo(skb)->frags[i].page_offset = 0;
 			skb_shinfo(skb)->frags[i].size =


^ permalink raw reply related

* Re: pktgen and spin_lock_bh in xmit path
From: Eric Dumazet @ 2009-10-21  5:14 UTC (permalink / raw)
  To: Ben Greear; +Cc: NetDev, robert, David S. Miller
In-Reply-To: <4ADE9560.5050500@candelatech.com>

Ben Greear a écrit :
> Eric Dumazet wrote:
>> pktgen should not use "clone XXX" pkts if macvlan is used (or any
>> other driver
>> that ultimatly calls dev_queue_xmit() and queue packet), since skb
>> queue anchor
>> is shared and would be overwritten.
>>   After some thoughts, I believe user is in error :)
> I tried to explain in my original post:  The problem arises when
> when the hard-start-xmit fails with NETDEV_TX_BUSY.  Part of the
> hard-start-xmit logic for virtual devices can call dev_queue_xmit, which
> can ultimately
> change the queue mapping and yet may still return NETDEV_TX_BUSY.
> 
> pktgen would try to resend this skb next loop, and this is where it would
> blow up.
> 
> I have a patched macvlan logic and a patched dev queue xmit logic that
> allows
> me to return NETDEV_TX_BUSY when underlying device fails to transmit.
> 
> It may be that my hacked macvlan is the only virtual device that could ever
> return NETDEV_TX_BUSY, and if that is the case, I don't think the bug
> could ever be hit in official kernel code.  My opinion is that the
> current pktgen code makes
> too many assumptions, so unless there is a performance penalty, I still
> think it should be cleaned up.  But, I may be too paranoid.

If a virtual device changes skb->queue_map, it must consume skb,
or it breaks caller.

Alternative would be to restore queue_map to its initial value in
your hacked macvlan when it wants to return NETDEV_TX_BUSY status.


We could add a WARN_ON(skb_get_queue_mapping(pkt_dev->skb) != queue_map);
in pktgen, to catch driver errors but pktgen assumption is right IMHO

@@ -3466,6 +3471,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		/* fallthru */
 	case NETDEV_TX_LOCKED:
 	case NETDEV_TX_BUSY:
+		WARN_ON(skb_get_queue_mapping(pkt_dev->skb) != queue_map);
 		/* Retry it next time */
 		atomic_dec(&(pkt_dev->skb->users));
 		pkt_dev->last_ok = 0;

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox