Netdev List
 help / color / mirror / Atom feed
* [PATCH 1/3] Bruce's original tcp friend V3
From: Weiping Pan @ 2012-12-05  2:54 UTC (permalink / raw)
  To: netdev; +Cc: brutus, Weiping Pan
In-Reply-To: <cover.1354674151.git.wpan@redhat.com>

http://patchwork.ozlabs.org/patch/184523/

Rebase on top of commit 03f52a0a5542 ("ip6mr: Add sizeof verification to
MRT6_ASSERT and MT6_PIM").

Signed-off-by: Weiping Pan <wpan@redhat.com>
---
 Documentation/networking/ip-sysctl.txt |    8 +
 include/linux/skbuff.h                 |    2 +
 include/net/request_sock.h             |    1 +
 include/net/sock.h                     |   32 ++-
 include/net/tcp.h                      |   13 +-
 net/core/skbuff.c                      |    1 +
 net/core/sock.c                        |    1 +
 net/core/stream.c                      |   36 ++
 net/ipv4/inet_connection_sock.c        |   20 +
 net/ipv4/sysctl_net_ipv4.c             |    7 +
 net/ipv4/tcp.c                         |  604 +++++++++++++++++++++++++++-----
 net/ipv4/tcp_input.c                   |   22 +-
 net/ipv4/tcp_ipv4.c                    |    2 +
 net/ipv4/tcp_minisocks.c               |    4 +
 net/ipv4/tcp_output.c                  |   16 +-
 net/ipv6/tcp_ipv6.c                    |    1 +
 16 files changed, 679 insertions(+), 91 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 98ac0d7..152f488 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -214,6 +214,14 @@ tcp_fack - BOOLEAN
 	Enable FACK congestion avoidance and fast retransmission.
 	The value is not used, if tcp_sack is not enabled.
 
+tcp_friends - BOOLEAN
+	If set, TCP loopback socket pair stack bypass is enabled such
+	that all data sent will be directly queued to the receiver's
+	socket for receive. Note, normal connection establishment and
+	finish is used to make friends so any loopback interpose, e.g.
+	tcpdump, will see these TCP segments but no data segments.
+	Default: 1
+
 tcp_fin_timeout - INTEGER
 	Time to hold socket in state FIN-WAIT-2, if it was closed
 	by our side. Peer can be broken and never close its side,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f2af494..c890f65 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -334,6 +334,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@cb: Control buffer. Free for use by every layer. Put private vars here
  *	@_skb_refdst: destination entry (with norefcount bit)
  *	@sp: the security path, used for xfrm
+ *	@friend: loopback friend socket
  *	@len: Length of actual data
  *	@data_len: Data length
  *	@mac_len: Length of link layer header
@@ -409,6 +410,7 @@ struct sk_buff {
 #ifdef CONFIG_XFRM
 	struct	sec_path	*sp;
 #endif
+	struct sock		*friend;
 	unsigned int		len,
 				data_len;
 	__u16			mac_len,
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index a51dbd1..c6dfa26 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -66,6 +66,7 @@ struct request_sock {
 	unsigned long			expires;
 	const struct request_sock_ops	*rsk_ops;
 	struct sock			*sk;
+	struct sock			*friend;
 	u32				secid;
 	u32				peer_secid;
 };
diff --git a/include/net/sock.h b/include/net/sock.h
index c945fba..778d8dd 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -197,6 +197,7 @@ struct cg_proto;
   *	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
   *	@sk_lock:	synchronizer
   *	@sk_rcvbuf: size of receive buffer in bytes
+  *	@sk_friend: loopback friend socket
   *	@sk_wq: sock wait queue and async head
   *	@sk_rx_dst: receive input route used by early tcp demux
   *	@sk_dst_cache: destination cache
@@ -286,6 +287,14 @@ struct sock {
 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
 	/*
+	 * If socket has a friend (sk_friend != NULL) then a send skb is
+	 * enqueued directly to the friend's sk_receive_queue such that:
+	 *
+	 *        sk_sndbuf -> sk_sndbuf + sk_friend->sk_rcvbuf
+	 *   sk_wmem_queued -> sk_friend->sk_rmem_alloc
+	 */
+	struct sock		*sk_friend;
+	/*
 	 * The backlog queue is special, it is always used with
 	 * the per-socket spinlock held and requires low latency
 	 * access. Therefore we special case it's implementation.
@@ -703,24 +712,40 @@ static inline bool sk_acceptq_is_full(const struct sock *sk)
 	return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
 }
 
+static inline int sk_wmem_queued_get(const struct sock *sk)
+{
+	if (sk->sk_friend)
+		return atomic_read(&sk->sk_friend->sk_rmem_alloc);
+	else
+		return sk->sk_wmem_queued;
+}
+
+static inline int sk_sndbuf_get(const struct sock *sk)
+{
+	if (sk->sk_friend)
+		return sk->sk_sndbuf + sk->sk_friend->sk_rcvbuf;
+	else
+		return sk->sk_sndbuf;
+}
+
 /*
  * Compute minimal free write space needed to queue new packets.
  */
 static inline int sk_stream_min_wspace(const struct sock *sk)
 {
-	return sk->sk_wmem_queued >> 1;
+	return sk_wmem_queued_get(sk) >> 1;
 }
 
 static inline int sk_stream_wspace(const struct sock *sk)
 {
-	return sk->sk_sndbuf - sk->sk_wmem_queued;
+	return sk_sndbuf_get(sk) - sk_wmem_queued_get(sk);
 }
 
 extern void sk_stream_write_space(struct sock *sk);
 
 static inline bool sk_stream_memory_free(const struct sock *sk)
 {
-	return sk->sk_wmem_queued < sk->sk_sndbuf;
+	return sk_wmem_queued_get(sk) < sk_sndbuf_get(sk);
 }
 
 /* OOB backlog add */
@@ -829,6 +854,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
 	})
 
 extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
+extern int sk_stream_wait_friend(struct sock *sk, long *timeo_p);
 extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
 extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
 extern int sk_stream_error(struct sock *sk, int flags, int err);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3202bde..5f82770 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -292,6 +292,7 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
+extern int sysctl_tcp_friends;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -687,6 +688,15 @@ void tcp_send_window_probe(struct sock *sk);
 #define TCPHDR_ECE 0x40
 #define TCPHDR_CWR 0x80
 
+/* If skb_get_friend() != NULL, TCP friends per packet state.
+ */
+struct friend_skb_parm {
+	bool	tail_inuse;		/* In use by skb_get_friend() send while */
+					/* on sk_receive_queue for tail put */
+};
+
+#define TCP_FRIEND_CB(tcb) (&(tcb)->header.hf)
+
 /* This is what the send packet queuing engine uses to pass
  * TCP per-packet control information to the transmission code.
  * We also store the host-order sequence numbers in here too.
@@ -699,6 +709,7 @@ struct tcp_skb_cb {
 #if IS_ENABLED(CONFIG_IPV6)
 		struct inet6_skb_parm	h6;
 #endif
+		struct friend_skb_parm	hf;
 	} header;	/* For incoming frames		*/
 	__u32		seq;		/* Starting sequence number	*/
 	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
@@ -1041,7 +1052,7 @@ static inline bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (sysctl_tcp_low_latency || !tp->ucopy.task)
+	if (sysctl_tcp_low_latency || !tp->ucopy.task || sk->sk_friend)
 		return false;
 
 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 880722e2..665826a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -690,6 +690,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
 #endif
+	new->friend		= old->friend;
 	memcpy(new->cb, old->cb, sizeof(old->cb));
 	new->csum		= old->csum;
 	new->local_df		= old->local_df;
diff --git a/net/core/sock.c b/net/core/sock.c
index a692ef4..a8f59a9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2225,6 +2225,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 #ifdef CONFIG_NET_DMA
 	skb_queue_head_init(&sk->sk_async_wait_queue);
 #endif
+	sk->sk_friend		=	NULL;
 
 	sk->sk_send_head	=	NULL;
 
diff --git a/net/core/stream.c b/net/core/stream.c
index f5df85d..85e5b03 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -83,6 +83,42 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 EXPORT_SYMBOL(sk_stream_wait_connect);
 
 /**
+ * sk_stream_wait_friend - Wait for a socket to make friends
+ * @sk: sock to wait on
+ * @timeo_p: for how long to wait
+ *
+ * Must be called with the socket locked.
+ */
+int sk_stream_wait_friend(struct sock *sk, long *timeo_p)
+{
+	struct task_struct *tsk = current;
+	DEFINE_WAIT(wait);
+	int done;
+
+	do {
+		int err = sock_error(sk);
+		if (err)
+			return err;
+		if (!sk->sk_friend)
+			return -EBADFD;
+		if (!*timeo_p)
+			return -EAGAIN;
+		if (signal_pending(tsk))
+			return sock_intr_errno(*timeo_p);
+
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		sk->sk_write_pending++;
+		done = sk_wait_event(sk, timeo_p,
+				     !sk->sk_err &&
+				     sk->sk_friend->sk_friend);
+		finish_wait(sk_sleep(sk), &wait);
+		sk->sk_write_pending--;
+	} while (!done);
+	return 0;
+}
+EXPORT_SYMBOL(sk_stream_wait_friend);
+
+/**
  * sk_stream_closing - Return 1 if we still have things to send in our buffers.
  * @sk: socket to verify
  */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 2026542..ce4b79b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -659,6 +659,26 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 	if (newsk != NULL) {
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 
+		if (req->friend) {
+			/*
+			 * Make friends with the requestor but the ACK of
+			 * the request is already in-flight so the race is
+			 * on to make friends before the ACK is processed.
+			 * If the requestor's sk_friend value is != NULL
+			 * then the requestor has already processed the
+			 * ACK so indicate state change to wake'm up.
+			 */
+			struct sock *was;
+
+			sock_hold(req->friend);
+			newsk->sk_friend = req->friend;
+			sock_hold(newsk);
+			was = xchg(&req->friend->sk_friend, newsk);
+			/* If requester already connect()ed, maybe sleeping */
+			if (was && !sock_flag(req->friend, SOCK_DEAD))
+				sk->sk_state_change(req->friend);
+		}
+
 		newsk->sk_state = TCP_SYN_RECV;
 		newicsk->icsk_bind_hash = NULL;
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d84400b..4ca53db 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -796,6 +796,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero
 	},
+	{
+		.procname	= "tcp_friends",
+		.data		= &sysctl_tcp_friends,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e6eace1..4327deb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -310,6 +310,56 @@ struct tcp_splice_state {
 };
 
 /*
+ * Validate friendp, if not a friend return 0, else if friend is also a
+ * friend return 1, else friendp points to a listen()er so wait for our
+ * friend to be ready then update friendp with pointer to the real friend
+ * and return 1, else an error has occurred so return a -errno.
+ */
+static inline int tcp_friend_validate(struct sock *sk, struct sock **friendp,
+			      long *timeo)
+{
+	struct sock *friend = *friendp;
+
+	if (!friend)
+		return 0;
+	if (unlikely(!friend->sk_friend)) {
+		/* Friendship not complete, wait? */
+		int err;
+
+		if (!timeo)
+			return -EAGAIN;
+		err = sk_stream_wait_friend(sk, timeo);
+		if (err < 0)
+			return err;
+		*friendp = sk->sk_friend;
+	}
+	return 1;
+}
+
+static inline int tcp_friend_send_lock(struct sock *friend)
+{
+	int err = 0;
+
+	spin_lock_bh(&friend->sk_lock.slock);
+	if (unlikely(friend->sk_shutdown & RCV_SHUTDOWN)) {
+		spin_unlock_bh(&friend->sk_lock.slock);
+		err = -ECONNRESET;
+	}
+
+	return err;
+}
+
+static inline void tcp_friend_recv_lock(struct sock *friend)
+{
+	spin_lock_bh(&friend->sk_lock.slock);
+}
+
+static void tcp_friend_unlock(struct sock *friend)
+{
+	spin_unlock_bh(&friend->sk_lock.slock);
+}
+
+/*
  * Pressure flag: try to collapse.
  * Technical note: it is used by multiple contexts non atomically.
  * All the __sk_mem_schedule() is of this nature: accounting
@@ -589,6 +639,76 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 }
 EXPORT_SYMBOL(tcp_ioctl);
 
+/*
+ * Friend receive_queue tail skb space? If true, set tail_inuse.
+ * Else if RCV_SHUTDOWN, return *copy = -ECONNRESET.
+ */
+static inline struct sk_buff *tcp_friend_tail(struct sock *friend, int *copy)
+{
+	struct sk_buff	*skb = NULL;
+	int		sz = 0;
+
+	if (skb_peek_tail(&friend->sk_receive_queue)) {
+		sz = tcp_friend_send_lock(friend);
+		if (!sz) {
+			skb = skb_peek_tail(&friend->sk_receive_queue);
+			if (skb && skb->friend) {
+				if (!*copy)
+					sz = skb_tailroom(skb);
+				else {
+					sz = *copy - skb->len;
+					if (sz < 0)
+						sz = 0;
+				}
+				if (sz > 0)
+					TCP_FRIEND_CB(TCP_SKB_CB(skb))->
+							tail_inuse = true;
+			}
+			tcp_friend_unlock(friend);
+		}
+	}
+
+	*copy = sz;
+	return skb;
+}
+
+static inline void tcp_friend_seq(struct sock *sk, int copy, int charge)
+{
+	struct sock	*friend = sk->sk_friend;
+	struct tcp_sock *tp = tcp_sk(friend);
+
+	if (charge) {
+		sk_mem_charge(friend, charge);
+		atomic_add(charge, &friend->sk_rmem_alloc);
+	}
+	tp->rcv_nxt += copy;
+	tp->rcv_wup += copy;
+	tcp_friend_unlock(friend);
+
+	tp = tcp_sk(sk);
+	tp->snd_nxt += copy;
+	tp->pushed_seq += copy;
+	tp->snd_una += copy;
+	tp->snd_up += copy;
+}
+
+static inline bool tcp_friend_push(struct sock *sk, struct sk_buff *skb)
+{
+	struct sock	*friend = sk->sk_friend;
+	int		wait = false;
+
+	skb_set_owner_r(skb, friend);
+	__skb_queue_tail(&friend->sk_receive_queue, skb);
+	if (!sk_rmem_schedule(friend, skb, skb->truesize))
+		wait = true;
+
+	tcp_friend_seq(sk, skb->len, 0);
+	if (skb == skb_peek(&friend->sk_receive_queue))
+		friend->sk_data_ready(friend, 0);
+
+	return wait;
+}
+
 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
@@ -605,8 +725,13 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
-	skb->csum    = 0;
 	tcb->seq     = tcb->end_seq = tp->write_seq;
+	if (sk->sk_friend) {
+		skb->friend = sk;
+		TCP_FRIEND_CB(tcb)->tail_inuse = false;
+		return;
+	}
+	skb->csum    = 0;
 	tcb->tcp_flags = TCPHDR_ACK;
 	tcb->sacked  = 0;
 	skb_header_release(skb);
@@ -626,7 +751,10 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
 			    int nonagle)
 {
-	if (tcp_send_head(sk)) {
+	if (sk->sk_friend) {
+		if (skb_peek(&sk->sk_friend->sk_receive_queue))
+			sk->sk_friend->sk_data_ready(sk->sk_friend, 0);
+	} else if (tcp_send_head(sk)) {
 		struct tcp_sock *tp = tcp_sk(sk);
 
 		if (!(flags & MSG_MORE) || forced_push(tp))
@@ -758,6 +886,21 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 }
 EXPORT_SYMBOL(tcp_splice_read);
 
+static inline struct sk_buff *tcp_friend_alloc_skb(struct sock *sk, int size)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_skb(size, sk->sk_allocation);
+	if (skb)
+		skb->avail_size = skb_tailroom(skb);
+	else {
+		sk->sk_prot->enter_memory_pressure(sk);
+		sk_stream_moderate_sndbuf(sk);
+	}
+
+	return skb;
+}
+
 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
 {
 	struct sk_buff *skb;
@@ -821,12 +964,53 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 	return max(xmit_size_goal, mss_now);
 }
 
+static unsigned int tcp_friend_xmit_size_goal(struct sock *sk, int size_goal)
+{
+	u32 size = SKB_DATA_ALIGN(size_goal);
+	u32 overhead = sizeof(struct skb_shared_info) + sizeof(struct sk_buff);
+
+	/*
+	 * If alloc >= largest skb use largest order, else check
+	 * for optimal tail fill size, else use largest order.
+	 */
+	if (size >= SKB_MAX_ORDER(0, 4))
+		size = SKB_MAX_ORDER(0, 4);
+	else if (size <= (SKB_MAX_ORDER(0, 0) >> 3))
+		size = SKB_MAX_ORDER(0, 0);
+	else if (size <= (SKB_MAX_ORDER(0, 1) >> 3))
+		size = SKB_MAX_ORDER(0, 1);
+	else if (size <= (SKB_MAX_ORDER(0, 0) >> 1))
+		size = SKB_MAX_ORDER(0, 0);
+	else if (size <= (SKB_MAX_ORDER(0, 1) >> 1))
+		size = SKB_MAX_ORDER(0, 1);
+	else if (size <= (SKB_MAX_ORDER(0, 2) >> 1))
+		size = SKB_MAX_ORDER(0, 2);
+	else if (size <= (SKB_MAX_ORDER(0, 3) >> 1))
+		size = SKB_MAX_ORDER(0, 3);
+	else
+		size = SKB_MAX_ORDER(0, 4);
+
+	/* At least 2 true sized in sk_buf */
+	if (size + overhead > (sk_sndbuf_get(sk) >> 1))
+		size = (sk_sndbuf_get(sk) >> 1) - overhead;
+
+	return size;
+}
+
 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 {
 	int mss_now;
+	int tmp;
+
+	if (sk->sk_friend) {
+		mss_now = tcp_friend_xmit_size_goal(sk, *size_goal);
+		tmp = mss_now;
+	} else {
+		mss_now = tcp_current_mss(sk);
+		tmp = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+	}
 
-	mss_now = tcp_current_mss(sk);
-	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+	*size_goal = tmp;
 
 	return mss_now;
 }
@@ -834,8 +1018,9 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 			 size_t psize, int flags)
 {
+	struct sock *friend = sk->sk_friend;
 	struct tcp_sock *tp = tcp_sk(sk);
-	int mss_now, size_goal;
+	int mss_now, size_goal = psize;
 	int err;
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -850,6 +1035,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 			goto out_err;
 	}
 
+	err = tcp_friend_validate(sk, &friend, &timeo);
+	if (err < 0)
+		goto out_err;
+
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
 	mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -860,25 +1049,47 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 		goto out_err;
 
 	while (psize > 0) {
-		struct sk_buff *skb = tcp_write_queue_tail(sk);
+		struct sk_buff *skb;
+		struct tcp_skb_cb *tcb;
 		struct page *page = pages[poffset / PAGE_SIZE];
 		int copy, i;
 		int offset = poffset % PAGE_SIZE;
 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
 		bool can_coalesce;
 
-		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+		if (friend) {
+			copy = size_goal;
+			skb = tcp_friend_tail(friend, &copy);
+			if (copy < 0) {
+				sk->sk_err = -copy;
+				err = -EPIPE;
+				goto out_err;
+			}
+		} else if (!tcp_send_head(sk)) {
+			skb = NULL;
+			copy = 0;
+		} else {
+			skb = tcp_write_queue_tail(sk);
+			copy = size_goal - skb->len;
+		}
+
+		if (copy <= 0) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
-			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+			if (friend)
+				skb = tcp_friend_alloc_skb(sk, 0);
+			else
+				skb = sk_stream_alloc_skb(sk, 0,
+							  sk->sk_allocation);
 			if (!skb)
 				goto wait_for_memory;
 
 			skb_entail(sk, skb);
 			copy = size_goal;
 		}
+		tcb = TCP_SKB_CB(skb);
 
 		if (copy > size)
 			copy = size;
@@ -886,10 +1097,14 @@ new_segment:
 		i = skb_shinfo(skb)->nr_frags;
 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
-			tcp_mark_push(tp, skb);
+			if (friend) {
+				if (TCP_FRIEND_CB(tcb)->tail_inuse)
+					TCP_FRIEND_CB(tcb)->tail_inuse = false;
+			} else
+				tcp_mark_push(tp, skb);
 			goto new_segment;
 		}
-		if (!sk_wmem_schedule(sk, copy))
+		if (!friend && !sk_wmem_schedule(sk, copy))
 			goto wait_for_memory;
 
 		if (can_coalesce) {
@@ -902,19 +1117,41 @@ new_segment:
 		skb->len += copy;
 		skb->data_len += copy;
 		skb->truesize += copy;
-		sk->sk_wmem_queued += copy;
-		sk_mem_charge(sk, copy);
-		skb->ip_summed = CHECKSUM_PARTIAL;
 		tp->write_seq += copy;
-		TCP_SKB_CB(skb)->end_seq += copy;
-		skb_shinfo(skb)->gso_segs = 0;
-
-		if (!copied)
-			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
 
 		copied += copy;
 		poffset += copy;
-		if (!(psize -= copy))
+		psize -= copy;
+
+		if (friend) {
+			err = tcp_friend_send_lock(friend);
+			if (err) {
+				sk->sk_err = -err;
+				err = -EPIPE;
+				goto out_err;
+			}
+			tcb->end_seq += copy;
+			if (TCP_FRIEND_CB(tcb)->tail_inuse) {
+				TCP_FRIEND_CB(tcb)->tail_inuse = false;
+				tcp_friend_seq(sk, copy, copy);
+			} else {
+				if (tcp_friend_push(sk, skb))
+					goto wait_for_sndbuf;
+			}
+			if (!psize)
+				goto out;
+			continue;
+		}
+
+		tcb->end_seq += copy;
+		skb_shinfo(skb)->gso_segs = 0;
+		sk->sk_wmem_queued += copy;
+		sk_mem_charge(sk, copy);
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		if (copied == copy)
+			tcb->tcp_flags &= ~TCPHDR_PSH;
+
+		if (!psize)
 			goto out;
 
 		if (skb->len < size_goal || (flags & MSG_OOB))
@@ -935,7 +1172,8 @@ wait_for_memory:
 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 			goto do_error;
 
-		mss_now = tcp_send_mss(sk, &size_goal, flags);
+		if (!friend)
+			mss_now = tcp_send_mss(sk, &size_goal, flags);
 	}
 
 out:
@@ -1026,10 +1264,12 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
 	struct iovec *iov;
+	struct sock *friend = sk->sk_friend;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
+	struct tcp_skb_cb *tcb;
 	int iovlen, flags, err, copied = 0;
-	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
+	int mss_now = 0, size_goal = size, copied_syn = 0, offset = 0;
 	bool sg;
 	long timeo;
 
@@ -1057,6 +1297,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			goto do_error;
 	}
 
+	err = tcp_friend_validate(sk, &friend, &timeo);
+	if (err < 0)
+		goto out;
+
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
 			copied = tcp_send_rcvq(sk, msg, size);
@@ -1105,24 +1349,38 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			int copy = 0;
 			int max = size_goal;
 
-			skb = tcp_write_queue_tail(sk);
-			if (tcp_send_head(sk)) {
-				if (skb->ip_summed == CHECKSUM_NONE)
-					max = mss_now;
-				copy = max - skb->len;
+			if (friend) {
+				skb = tcp_friend_tail(friend, &copy);
+				if (copy < 0) {
+					sk->sk_err = -copy;
+					err = -EPIPE;
+					goto out_err;
+				}
+			} else {
+				skb = tcp_write_queue_tail(sk);
+				if (tcp_send_head(sk)) {
+					if (skb->ip_summed == CHECKSUM_NONE)
+						max = mss_now;
+					copy = max - skb->len;
+				}
 			}
 
 			if (copy <= 0) {
 new_segment:
-				/* Allocate new segment. If the interface is SG,
-				 * allocate skb fitting to single page.
-				 */
 				if (!sk_stream_memory_free(sk))
 					goto wait_for_sndbuf;
 
-				skb = sk_stream_alloc_skb(sk,
-							  select_size(sk, sg),
-							  sk->sk_allocation);
+				if (friend)
+					skb = tcp_friend_alloc_skb(sk, max);
+				else {
+					/* Allocate new segment. If the
+					 * interface is SG, allocate skb
+					 * fitting to single page.
+					 */
+					skb = sk_stream_alloc_skb(sk,
+							select_size(sk, sg),
+							sk->sk_allocation);
+				}
 				if (!skb)
 					goto wait_for_memory;
 
@@ -1136,6 +1394,7 @@ new_segment:
 				copy = size_goal;
 				max = size_goal;
 			}
+			tcb = TCP_SKB_CB(skb);
 
 			/* Try to append data to the end of skb. */
 			if (copy > seglen)
@@ -1153,6 +1412,8 @@ new_segment:
 				int i = skb_shinfo(skb)->nr_frags;
 				struct page_frag *pfrag = sk_page_frag(sk);
 
+				BUG_ON(friend);
+
 				if (!sk_page_frag_refill(sk, pfrag))
 					goto wait_for_memory;
 
@@ -1188,16 +1449,37 @@ new_segment:
 				pfrag->offset += copy;
 			}
 
-			if (!copied)
-				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
-
 			tp->write_seq += copy;
-			TCP_SKB_CB(skb)->end_seq += copy;
-			skb_shinfo(skb)->gso_segs = 0;
 
 			from += copy;
 			copied += copy;
-			if ((seglen -= copy) == 0 && iovlen == 0)
+			seglen -= copy;
+
+			if (friend) {
+				err = tcp_friend_send_lock(friend);
+				if (err) {
+					sk->sk_err = -err;
+					err = -EPIPE;
+					goto out_err;
+				}
+				tcb->end_seq += copy;
+				if (TCP_FRIEND_CB(tcb)->tail_inuse) {
+					TCP_FRIEND_CB(tcb)->tail_inuse = false;
+					tcp_friend_seq(sk, copy, 0);
+				} else {
+					if (tcp_friend_push(sk, skb))
+						goto wait_for_sndbuf;
+				}
+				continue;
+			}
+
+			tcb->end_seq += copy;
+			skb_shinfo(skb)->gso_segs = 0;
+
+			if (copied == copy)
+				tcb->tcp_flags &= ~TCPHDR_PSH;
+
+			if (seglen == 0 && iovlen == 0)
 				goto out;
 
 			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
@@ -1219,7 +1501,8 @@ wait_for_memory:
 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 				goto do_error;
 
-			mss_now = tcp_send_mss(sk, &size_goal, flags);
+			if (!friend)
+				mss_now = tcp_send_mss(sk, &size_goal, flags);
 		}
 	}
 
@@ -1230,7 +1513,12 @@ out:
 	return copied + copied_syn;
 
 do_fault:
-	if (!skb->len) {
+	if (skb->friend) {
+		if (TCP_FRIEND_CB(tcb)->tail_inuse)
+			TCP_FRIEND_CB(tcb)->tail_inuse = false;
+		else
+			__kfree_skb(skb);
+	} else if (!skb->len) {
 		tcp_unlink_write_queue(skb, sk);
 		/* It is the one place in all of TCP, except connection
 		 * reset, where we can be unlinking the send_head.
@@ -1249,6 +1537,13 @@ out_err:
 }
 EXPORT_SYMBOL(tcp_sendmsg);
 
+static inline void tcp_friend_write_space(struct sock *sk)
+{
+	/* Queued data below 1/4th of sndbuf? */
+	if ((sk_sndbuf_get(sk) >> 2) > sk_wmem_queued_get(sk))
+		sk->sk_friend->sk_write_space(sk->sk_friend);
+}
+
 /*
  *	Handle reading urgent data. BSD has very simple semantics for
  *	this, no blocking and very strange errors 8)
@@ -1327,7 +1622,12 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
 	struct tcp_sock *tp = tcp_sk(sk);
 	bool time_to_ack = false;
 
-	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+	struct sk_buff *skb;
+
+	if (sk->sk_friend)
+		return;
+
+	skb = skb_peek(&sk->sk_receive_queue);
 
 	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
 	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
@@ -1431,17 +1731,27 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
 }
 #endif
 
-static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off,
+					   size_t *len)
 {
 	struct sk_buff *skb;
 	u32 offset;
+	size_t avail;
 
 	skb_queue_walk(&sk->sk_receive_queue, skb) {
-		offset = seq - TCP_SKB_CB(skb)->seq;
-		if (tcp_hdr(skb)->syn)
-			offset--;
-		if (offset < skb->len || tcp_hdr(skb)->fin) {
+		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+		offset = seq - tcb->seq;
+		if (skb->friend)
+			avail = (u32)(tcb->end_seq - seq);
+		else {
+			if (tcp_hdr(skb)->syn)
+				offset--;
+			avail = skb->len - offset;
+		}
+		if (avail > 0 || (!skb->friend && tcp_hdr(skb)->fin)) {
 			*off = offset;
+			*len = avail;
 			return skb;
 		}
 	}
@@ -1467,15 +1777,23 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 	u32 seq = tp->copied_seq;
 	u32 offset;
 	int copied = 0;
+	size_t len;
+	int err;
+	struct sock *friend = sk->sk_friend;
+	long timeo = sock_rcvtimeo(sk, false);
 
 	if (sk->sk_state == TCP_LISTEN)
 		return -ENOTCONN;
-	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
-		if (offset < skb->len) {
-			int used;
-			size_t len;
+	err = tcp_friend_validate(sk, &friend, &timeo);
+	if (err < 0)
+		return err;
+	if (friend)
+		tcp_friend_recv_lock(sk);
 
-			len = skb->len - offset;
+	while ((skb = tcp_recv_skb(sk, seq, &offset, &len)) != NULL) {
+		if (len > 0) {
+			int used;
+	again:
 			/* Stop reading if we hit a patch of urgent data */
 			if (tp->urg_data) {
 				u32 urg_offset = tp->urg_seq - seq;
@@ -1484,6 +1802,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 				if (!len)
 					break;
 			}
+
+			if (friend)
+				tcp_friend_unlock(sk);
+
 			used = recv_actor(desc, skb, offset, len);
 			if (used < 0) {
 				if (!copied)
@@ -1494,33 +1816,65 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 				copied += used;
 				offset += used;
 			}
-			/*
-			 * If recv_actor drops the lock (e.g. TCP splice
-			 * receive) the skb pointer might be invalid when
-			 * getting here: tcp_collapse might have deleted it
-			 * while aggregating skbs from the socket queue.
-			 */
-			skb = tcp_recv_skb(sk, seq-1, &offset);
-			if (!skb || (offset+1 != skb->len))
-				break;
+
+			if (friend)
+				tcp_friend_recv_lock(sk);
+			if (skb->friend) {
+				len = (u32)(TCP_SKB_CB(skb)->end_seq - seq);
+				if (len > 0) {
+					/*
+					 * Friend did an skb_put() while we
+					 * were away so process the same skb.
+					 */
+					if (!desc->count)
+						break;
+					tp->copied_seq = seq;
+					goto again;
+				}
+			} else {
+				/*
+				 * If recv_actor drops the lock (e.g. TCP
+				 * splice receive) the skb pointer might be
+				 * invalid when getting here: tcp_collapse
+				 * might have deleted it while aggregating
+				 * skbs from the socket queue.
+				 */
+				skb = tcp_recv_skb(sk, seq-1, &offset, &len);
+				if (!skb || (offset+1 != skb->len))
+					break;
+			}
 		}
-		if (tcp_hdr(skb)->fin) {
+		if (!skb->friend && tcp_hdr(skb)->fin) {
 			sk_eat_skb(sk, skb, false);
 			++seq;
 			break;
 		}
-		sk_eat_skb(sk, skb, false);
+		if (skb->friend) {
+			if (!TCP_FRIEND_CB(TCP_SKB_CB(skb))->tail_inuse) {
+				__skb_unlink(skb, &sk->sk_receive_queue);
+				__kfree_skb(skb);
+				tcp_friend_write_space(sk);
+			}
+			tcp_friend_unlock(sk);
+			tcp_friend_recv_lock(sk);
+		} else
+			sk_eat_skb(sk, skb, 0);
 		if (!desc->count)
 			break;
 		tp->copied_seq = seq;
 	}
 	tp->copied_seq = seq;
 
-	tcp_rcv_space_adjust(sk);
+	if (friend) {
+		tcp_friend_unlock(sk);
+		tcp_friend_write_space(sk);
+	} else {
+		tcp_rcv_space_adjust(sk);
 
-	/* Clean up data we have read: This will do ACK frames. */
-	if (copied > 0)
-		tcp_cleanup_rbuf(sk, copied);
+		/* Clean up data we have read: This will do ACK frames. */
+		if (copied > 0)
+			tcp_cleanup_rbuf(sk, copied);
+	}
 	return copied;
 }
 EXPORT_SYMBOL(tcp_read_sock);
@@ -1536,6 +1890,7 @@ EXPORT_SYMBOL(tcp_read_sock);
 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t len, int nonblock, int flags, int *addr_len)
 {
+	struct sock *friend = sk->sk_friend;
 	struct tcp_sock *tp = tcp_sk(sk);
 	int copied = 0;
 	u32 peek_seq;
@@ -1548,6 +1903,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	bool copied_early = false;
 	struct sk_buff *skb;
 	u32 urg_hole = 0;
+	bool locked = false;
 
 	lock_sock(sk);
 
@@ -1557,6 +1913,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	timeo = sock_rcvtimeo(sk, nonblock);
 
+	err = tcp_friend_validate(sk, &friend, &timeo);
+	if (err < 0)
+		goto out;
+
 	/* Urgent data needs to be handled specially. */
 	if (flags & MSG_OOB)
 		goto recv_urg;
@@ -1595,7 +1955,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
 		if ((available < target) &&
 		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
-		    !sysctl_tcp_low_latency &&
+		    !sysctl_tcp_low_latency && !friend &&
 		    net_dma_find_channel()) {
 			preempt_enable_no_resched();
 			tp->ucopy.pinned_list =
@@ -1606,7 +1966,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	}
 #endif
 
+	err = 0;
+
 	do {
+		struct tcp_skb_cb *tcb;
 		u32 offset;
 
 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
@@ -1614,37 +1977,77 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			if (copied)
 				break;
 			if (signal_pending(current)) {
-				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+				err = timeo ? sock_intr_errno(timeo) : -EAGAIN;
 				break;
 			}
 		}
 
-		/* Next get a buffer. */
+		/*
+		 * Next get a buffer. Note, for friends sendmsg() queues
+		 * data directly to our sk_receive_queue by holding our
+		 * slock and either tail queuing a new skb or adding new
+		 * data to the tail skb. In the latter case tail_inuse is
+		 * set, slock dropped, copyin, skb->len updated, re-hold
+		 * slock, end_seq updated, so we can only use the bytes
+		 * from *seq to end_seq!
+		 */
+		if (friend && !locked) {
+			tcp_friend_recv_lock(sk);
+			locked = true;
+		}
 
 		skb_queue_walk(&sk->sk_receive_queue, skb) {
+			tcb = TCP_SKB_CB(skb);
+			offset = *seq - tcb->seq;
+			if (friend) {
+				if (skb->friend) {
+					used = (u32)(tcb->end_seq - *seq);
+					if (used > 0) {
+						tcp_friend_unlock(sk);
+						locked = false;
+						/* Can use it all */
+						goto found_ok_skb;
+					}
+					/* No data to copyout */
+					if (flags & MSG_PEEK)
+						continue;
+					if (!TCP_FRIEND_CB(tcb)->tail_inuse)
+						goto unlink;
+					break;
+				}
+				tcp_friend_unlock(sk);
+				locked = false;
+			}
+
 			/* Now that we have two receive queues this
 			 * shouldn't happen.
 			 */
-			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
+			if (WARN(before(*seq, tcb->seq),
 				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
-				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
-				 flags))
+				 *seq, tcb->seq, tp->rcv_nxt, flags))
 				break;
 
-			offset = *seq - TCP_SKB_CB(skb)->seq;
 			if (tcp_hdr(skb)->syn)
 				offset--;
-			if (offset < skb->len)
+			if (offset < skb->len) {
+				/* Ok so how much can we use? */
+				used = skb->len - offset;
 				goto found_ok_skb;
+			}
 			if (tcp_hdr(skb)->fin)
 				goto found_fin_ok;
 			WARN(!(flags & MSG_PEEK),
 			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
-			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
+			     *seq, tcb->seq, tp->rcv_nxt, flags);
 		}
 
 		/* Well, if we have backlog, try to process it now yet. */
 
+		if (friend && locked) {
+			tcp_friend_unlock(sk);
+			locked = false;
+		}
+
 		if (copied >= target && !sk->sk_backlog.tail)
 			break;
 
@@ -1691,7 +2094,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 		tcp_cleanup_rbuf(sk, copied);
 
-		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
+		if (!sysctl_tcp_low_latency && !friend &&
+		    tp->ucopy.task == user_recv) {
 			/* Install new reader */
 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
 				user_recv = current;
@@ -1791,8 +2195,6 @@ do_prequeue:
 		continue;
 
 	found_ok_skb:
-		/* Ok so how much can we use? */
-		used = skb->len - offset;
 		if (len < used)
 			used = len;
 
@@ -1849,7 +2251,7 @@ do_prequeue:
 				if (err) {
 					/* Exception. Bailout! */
 					if (!copied)
-						copied = -EFAULT;
+						copied = err;
 					break;
 				}
 			}
@@ -1858,6 +2260,7 @@ do_prequeue:
 		*seq += used;
 		copied += used;
 		len -= used;
+		offset += used;
 
 		tcp_rcv_space_adjust(sk);
 
@@ -1866,10 +2269,43 @@ skip_copy:
 			tp->urg_data = 0;
 			tcp_fast_path_check(sk);
 		}
-		if (used + offset < skb->len)
+
+		if (skb->friend) {
+			tcp_friend_recv_lock(sk);
+			locked = true;
+			used = (u32)(tcb->end_seq - *seq);
+			if (used) {
+				/*
+				 * Friend did an skb_put() while we were away
+				 * so if more to do process the same skb.
+				 */
+				if (len > 0) {
+					tcp_friend_unlock(sk);
+					locked = false;
+					goto found_ok_skb;
+				}
+				continue;
+			}
+			if (TCP_FRIEND_CB(tcb)->tail_inuse) {
+				/* Give sendmsg a chance */
+				tcp_friend_unlock(sk);
+				locked = false;
+				continue;
+			}
+			if (!(flags & MSG_PEEK)) {
+		unlink:
+				__skb_unlink(skb, &sk->sk_receive_queue);
+				__kfree_skb(skb);
+				tcp_friend_unlock(sk);
+				locked = false;
+				tcp_friend_write_space(sk);
+			}
 			continue;
+		}
 
-		if (tcp_hdr(skb)->fin)
+		if (offset < skb->len)
+			continue;
+		else if (tcp_hdr(skb)->fin)
 			goto found_fin_ok;
 		if (!(flags & MSG_PEEK)) {
 			sk_eat_skb(sk, skb, copied_early);
@@ -1887,6 +2323,9 @@ skip_copy:
 		break;
 	} while (len > 0);
 
+	if (friend && locked)
+		tcp_friend_unlock(sk);
+
 	if (user_recv) {
 		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 			int chunk;
@@ -2065,6 +2504,9 @@ void tcp_close(struct sock *sk, long timeout)
 		goto adjudge_to_death;
 	}
 
+	if (sk->sk_friend)
+		sock_put(sk->sk_friend);
+
 	/*  We need to flush the recv. buffs.  We do this only on the
 	 *  descriptor close, not protocol-sourced closes, because the
 	 *  reader process may not have drained the data yet!
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fc67831..9640a81 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -530,6 +530,9 @@ void tcp_rcv_space_adjust(struct sock *sk)
 	int time;
 	int space;
 
+	if (sk->sk_friend)
+		return;
+
 	if (tp->rcvq_space.time == 0)
 		goto new_measure;
 
@@ -4350,8 +4353,9 @@ static int tcp_prune_queue(struct sock *sk);
 static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 				 unsigned int size)
 {
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    !sk_rmem_schedule(sk, skb, size)) {
+	if (!sk->sk_friend &&
+	    (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+	    !sk_rmem_schedule(sk, skb, size))) {
 
 		if (tcp_prune_queue(sk) < 0)
 			return -1;
@@ -5722,6 +5726,16 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 *    state to ESTABLISHED..."
 		 */
 
+		if (skb->friend) {
+			/*
+			 * If friends haven't been made yet, our sk_friend
+			 * still == NULL, then update with the ACK's friend
+			 * value (the listen()er's sock addr) which is used
+			 * as a place holder.
+			 */
+			cmpxchg(&sk->sk_friend, NULL, skb->friend);
+		}
+
 		TCP_ECN_rcv_synack(tp, th);
 
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
@@ -5797,9 +5811,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		    tcp_rcv_fastopen_synack(sk, skb, &foc))
 			return -1;
 
-		if (sk->sk_write_pending ||
+		if (!skb->friend && (sk->sk_write_pending ||
 		    icsk->icsk_accept_queue.rskq_defer_accept ||
-		    icsk->icsk_ack.pingpong) {
+		    icsk->icsk_ack.pingpong)) {
 			/* Save one ACK. Data will be ready after
 			 * several ticks, if write_pending is set.
 			 *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1ed2307..f494914 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1512,6 +1512,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
 #endif
 
+	req->friend = skb->friend;
+
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f35f2df..36d832a 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -270,6 +270,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	bool recycle_ok = false;
 
+	if (sk->sk_friend)
+		goto out;
+
 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tcp_remember_stamp(sk);
 
@@ -349,6 +352,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	}
 
 	tcp_update_metrics(sk);
+out:
 	tcp_done(sk);
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8ac0855..509c5e3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
+/* By default, TCP loopback bypass */
+int sysctl_tcp_friends __read_mostly = 1;
+
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
@@ -1025,9 +1028,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
-	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
+	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
+		/* Only try to make friends if enabled */
+		if (sysctl_tcp_friends)
+			skb->friend = sk;
+
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
-	else
+	} else
 		tcp_options_size = tcp_established_options(sk, skb, &opts,
 							   &md5);
 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
@@ -2725,6 +2732,11 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	}
 
 	memset(&opts, 0, sizeof(opts));
+
+	/* Only try to make friends if enabled */
+	if (sysctl_tcp_friends)
+		skb->friend = sk;
+
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
 		TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6565cf5..828d5f7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -969,6 +969,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
 #endif
 
+	req->friend = skb->friend;
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH 2/3] fix panic in tcp_close()
From: Weiping Pan @ 2012-12-05  2:54 UTC (permalink / raw)
  To: netdev; +Cc: brutus, Weiping Pan
In-Reply-To: <cover.1354674151.git.wpan@redhat.com>

For tcp friends data skb, it has no tcp header,
and its transport_header is NULL, so it will panic if we dereference tcp_hdr(skb)
in tcp_close().

So I add a check before we use tcp_hdr().

Signed-off-by: Weiping Pan <wpan@redhat.com>
---
 net/ipv4/tcp.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4327deb..e9d82e0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2512,8 +2512,12 @@ void tcp_close(struct sock *sk, long timeout)
 	 *  reader process may not have drained the data yet!
 	 */
 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
-		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
+		u32 len;
+		if (tcp_hdr(skb))
+			len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
 			  tcp_hdr(skb)->fin;
+		else
+			len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
 		data_was_unread += len;
 		__kfree_skb(skb);
 	}
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH 3/3] delete request_sock->friend
From: Weiping Pan @ 2012-12-05  2:54 UTC (permalink / raw)
  To: netdev; +Cc: brutus, Weiping Pan
In-Reply-To: <cover.1354674151.git.wpan@redhat.com>

The sock pointed to by request_sock->friend may be freed since it does not have a
lock to protect it.
I just delete request_sock->friend since I think it is useless.

For sk_buff->friend, it has the same problem, and we use
"atomic_add(skb->truesize, &sk->sk_wmem_alloc)" to guarantee that the sock can
not be freed before the skb is freed.

Then for 3-way handshake with tcp friends enabled,
SYN->friend is NULL, SYN/ACK->friend is set in tcp_make_synack(),
and ACK->friend is set in tcp_send_ack().

Signed-off-by: Weiping Pan <wpan@redhat.com>
---
 include/net/inet_connection_sock.h |    4 ++
 include/net/request_sock.h         |    1 -
 net/ipv4/inet_connection_sock.c    |   58 +++++++++++++++++++++++------------
 net/ipv4/tcp_input.c               |   10 ------
 net/ipv4/tcp_ipv4.c                |    7 +++-
 net/ipv4/tcp_minisocks.c           |    7 ++++-
 net/ipv4/tcp_output.c              |   21 ++++++++-----
 net/ipv6/tcp_ipv6.c                |    1 -
 8 files changed, 66 insertions(+), 43 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index ba1d361..883e029 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -147,6 +147,10 @@ static inline void *inet_csk_ca(const struct sock *sk)
 extern struct sock *inet_csk_clone_lock(const struct sock *sk,
 					const struct request_sock *req,
 					const gfp_t priority);
+extern struct sock *inet_csk_friend_clone_lock(const struct sock *sk,
+					const struct request_sock *req,
+					const struct sk_buff *skb,
+					const gfp_t priority);
 
 enum inet_csk_ack_state_t {
 	ICSK_ACK_SCHED	= 1,
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index c6dfa26..a51dbd1 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -66,7 +66,6 @@ struct request_sock {
 	unsigned long			expires;
 	const struct request_sock_ops	*rsk_ops;
 	struct sock			*sk;
-	struct sock			*friend;
 	u32				secid;
 	u32				peer_secid;
 };
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ce4b79b..7af92ed 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -659,26 +659,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 	if (newsk != NULL) {
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 
-		if (req->friend) {
-			/*
-			 * Make friends with the requestor but the ACK of
-			 * the request is already in-flight so the race is
-			 * on to make friends before the ACK is processed.
-			 * If the requestor's sk_friend value is != NULL
-			 * then the requestor has already processed the
-			 * ACK so indicate state change to wake'm up.
-			 */
-			struct sock *was;
-
-			sock_hold(req->friend);
-			newsk->sk_friend = req->friend;
-			sock_hold(newsk);
-			was = xchg(&req->friend->sk_friend, newsk);
-			/* If requester already connect()ed, maybe sleeping */
-			if (was && !sock_flag(req->friend, SOCK_DEAD))
-				sk->sk_state_change(req->friend);
-		}
-
 		newsk->sk_state = TCP_SYN_RECV;
 		newicsk->icsk_bind_hash = NULL;
 
@@ -700,6 +680,44 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
 
+/**
+ *	inet_csk_friend_clone_lock - clone an inet socket, and lock its clone
+ *	@sk: the socket to clone
+ *	@req: request_sock
+ *	@skb: who sends the request
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *inet_csk_friend_clone_lock(const struct sock *sk,
+				 const struct request_sock *req,
+				 const struct sk_buff *skb,
+				 const gfp_t priority)
+{
+	struct sock *newsk = inet_csk_clone_lock(sk, req, priority);
+
+	if (newsk) {
+		struct sock *friend = skb->friend;
+		if (friend) {
+			/*
+			 * Make friends.
+			 */
+			struct sock *was;
+
+			sock_hold(friend);
+			newsk->sk_friend = friend;
+			sock_hold(newsk);
+			was = xchg(&friend->sk_friend, newsk);
+			/* If requester already connect()ed, maybe sleeping */
+			if (was && !sock_flag(friend, SOCK_DEAD))
+				sk->sk_state_change(friend);
+		}
+	}
+
+	return newsk;
+}
+EXPORT_SYMBOL_GPL(inet_csk_friend_clone_lock);
+
 /*
  * At this point, there should be no process reference to this
  * socket, and thus no user references at all.  Therefore we
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9640a81..39db09d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5726,16 +5726,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 *    state to ESTABLISHED..."
 		 */
 
-		if (skb->friend) {
-			/*
-			 * If friends haven't been made yet, our sk_friend
-			 * still == NULL, then update with the ACK's friend
-			 * value (the listen()er's sock addr) which is used
-			 * as a place holder.
-			 */
-			cmpxchg(&sk->sk_friend, NULL, skb->friend);
-		}
-
 		TCP_ECN_rcv_synack(tp, th);
 
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f494914..8d61e4c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1512,8 +1512,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
 #endif
 
-	req->friend = skb->friend;
-
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
@@ -1873,6 +1871,11 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
 		goto csum_err;
 
+	if (sysctl_tcp_friends && skb->friend) {
+		skb->sk = skb->friend;
+		skb->destructor = sock_wfree;
+	}
+
 	if (sk->sk_state == TCP_LISTEN) {
 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
 		if (!nsk)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 36d832a..753126e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -383,7 +383,12 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
  */
 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
 {
-	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
+	struct sock *newsk = NULL;
+
+	if (sysctl_tcp_friends && skb->friend)
+		newsk = inet_csk_friend_clone_lock(sk, req, skb, GFP_ATOMIC);
+	else
+		newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
 
 	if (newsk != NULL) {
 		const struct inet_request_sock *ireq = inet_rsk(req);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 509c5e3..4d71549 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1028,13 +1028,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
-	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
-		/* Only try to make friends if enabled */
-		if (sysctl_tcp_friends)
-			skb->friend = sk;
-
+	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
-	} else
+	else
 		tcp_options_size = tcp_established_options(sk, skb, &opts,
 							   &md5);
 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
@@ -1050,7 +1046,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_orphan(skb);
 	skb->sk = sk;
-	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+
+	if (skb->friend)
+		skb->destructor = NULL;
+	else
+		skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
 			  tcp_wfree : sock_wfree;
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
@@ -2734,8 +2734,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	memset(&opts, 0, sizeof(opts));
 
 	/* Only try to make friends if enabled */
-	if (sysctl_tcp_friends)
+	if (sysctl_tcp_friends) {
 		skb->friend = sk;
+		atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+	}
 
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
@@ -3120,6 +3122,9 @@ void tcp_send_ack(struct sock *sk)
 
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+
+	if (sysctl_tcp_friends)
+		buff->friend = sk;
 	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
 }
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 828d5f7..6565cf5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -969,7 +969,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
 #endif
 
-	req->friend = skb->friend;
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
-- 
1.7.4.4

^ permalink raw reply related

* new portal
From: marknmel05 @ 2012-12-05  3:18 UTC (permalink / raw)
  To: ewokandiam

Dear Colleague,  http://elysianwhitetails.com/images/stories/nadomu.html

^ permalink raw reply

* Re: [Patch net-next] netlink: add missing netlink message types to selinux perm table
From: Cong Wang @ 2012-12-05  3:23 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller
In-Reply-To: <1354593208-21939-1-git-send-email-amwang@redhat.com>

On Tue, 2012-12-04 at 11:53 +0800, Cong Wang wrote:
> RTM_NEWNETCONF and RTM_GETNETCONF are missing in this table.

This patch has conflicts with "v3 bridge: export multicast database via
netlink", please drop this for now, I will resend it later.

Thanks.

^ permalink raw reply

* Re: [Suggestion] net/atm : for sprintf, need check the total write length whether larger than a page.
From: Chas Williams (CONTRACTOR) @ 2012-12-05  3:57 UTC (permalink / raw)
  To: Chen Gang; +Cc: David Miller, netdev
In-Reply-To: <50BEA2CB.9000800@asianux.com>

In message <50BEA2CB.9000800@asianux.com>,Chen Gang writes:
>> -	for (i = 0; i < (ESI_LEN - 1); i++)
>> -		pos += sprintf(pos, "%02x:", adev->esi[i]);
>> -	pos += sprintf(pos, "%02x\n", adev->esi[i]);
>>  
>> -	return pos - buf;
>> +	return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi);
>>  }
>>  
>
>  "%p" seems print a pointer, not contents of pointer (is it correct ?)
>  will it change the original display format to outside ?

%pM means format this pointer as a mac address.  it didnt exist when the
atm stack was originally written but can be used now to save a bit of
messy code.

>> -		pos += sprintf(pos, "\n");
>> +		count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
...
>  need we judge whether count >= PAGE_SIZE ?

count will eventually make PAGE_SIZE - count reach 0 at which point,
scnprintf() won't be able to write into the buffer.

^ permalink raw reply

* [PATCH net-next 1/2] net: neighbour: prohibit negative value for unres_qlen_bytes parameter
From: Shan Wei @ 2012-12-05  4:49 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, NetDev; +Cc: Shan Wei

From: Shan Wei <davidshan@tencent.com>

unres_qlen_bytes and unres_qlen are both of type int.
But the multiplicative relation (unres_qlen_bytes = unres_qlen * SKB_TRUESIZE(ETH_FRAME_LEN))
will cause an integer overflow when setting unres_qlen, e.g.:

$ echo 1027506 > /proc/sys/net/ipv4/neigh/eth1/unres_qlen
$ cat /proc/sys/net/ipv4/neigh/eth1/unres_qlen
1182657265
$ cat /proc/sys/net/ipv4/neigh/eth1/unres_qlen_bytes 
-2147479756

The value read back is not the one we set,
but the user/administrator has no way of knowing that this is caused by int overflow.

What's more, it is meaningless and even dangerous to set unres_qlen_bytes to a
negative number, because for an unresolved neighbour address the kernel will then cache packets
practically without limit in __neigh_event_send() (e.g. (u32)-1 is almost 4GB).


Signed-off-by: Shan Wei <davidshan@tencent.com>
---
 net/core/neighbour.c |   17 ++++++++++++-----
 1 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f1c0c2e..36fc692 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -62,6 +62,9 @@ static void __neigh_notify(struct neighbour *n, int type, int flags);
 static void neigh_update_notify(struct neighbour *neigh);
 static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
 
+static int zero;
+static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
+
 static struct neigh_table *neigh_tables;
 #ifdef CONFIG_PROC_FS
 static const struct file_operations neigh_stat_seq_fops;
@@ -1787,8 +1790,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
 	    nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes) ||
 	    /* approximative value for deprecated QUEUE_LEN (in packets) */
 	    nla_put_u32(skb, NDTPA_QUEUE_LEN,
-			DIV_ROUND_UP(parms->queue_len_bytes,
-				     SKB_TRUESIZE(ETH_FRAME_LEN))) ||
+			parms->queue_len_bytes / SKB_TRUESIZE(ETH_FRAME_LEN)) ||
 	    nla_put_u32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen) ||
 	    nla_put_u32(skb, NDTPA_APP_PROBES, parms->app_probes) ||
 	    nla_put_u32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes) ||
@@ -2777,9 +2779,13 @@ static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,
 	int size, ret;
 	ctl_table tmp = *ctl;
 
+	tmp.extra1 = &zero;
+	tmp.extra2 = &unres_qlen_max;
 	tmp.data = &size;
-	size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN));
-	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+	size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
 	if (write && !ret)
 		*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
 	return ret;
@@ -2865,7 +2871,8 @@ static struct neigh_sysctl_table {
 			.procname	= "unres_qlen_bytes",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
-			.proc_handler	= proc_dointvec,
+			.extra1		= &zero,
+			.proc_handler   = proc_dointvec_minmax,
 		},
 		[NEIGH_VAR_PROXY_QLEN] = {
 			.procname	= "proxy_qlen",
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 2/2] net: doc: add default value for neighbour parameters
From: Shan Wei @ 2012-12-05  4:50 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, NetDev, Shan Wei

From: Shan Wei <davidshan@tencent.com>

Signed-off-by: Shan Wei <davidshan@tencent.com>
---
 Documentation/networking/ip-sysctl.txt |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index c6d5fee..0462a71 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -30,16 +30,24 @@ neigh/default/gc_thresh3 - INTEGER
 	Maximum number of neighbor entries allowed.  Increase this
 	when using large numbers of interfaces and when communicating
 	with large numbers of directly-connected peers.
+	Default: 1024
 
 neigh/default/unres_qlen_bytes - INTEGER
 	The maximum number of bytes which may be used by packets
 	queued for each	unresolved address by other network layers.
 	(added in linux 3.3)
+	Setting a negative value is meaningless and will return an error.
+	Default: 65536 Bytes(64KB)
 
 neigh/default/unres_qlen - INTEGER
 	The maximum number of packets which may be queued for each
 	unresolved address by other network layers.
 	(deprecated in linux 3.3) : use unres_qlen_bytes instead.
+	Prior to linux 3.3, the default value was 3, which could cause
+	unexpected packet loss. The current default value is calculated
+	according to default value of unres_qlen_bytes and true size of
+	packet.
+	Default: 31
 
 mtu_expires - INTEGER
 	Time, in seconds, that cached PMTU information is kept.
-- 
1.7.1

^ permalink raw reply related

* Re: [Suggestion] net/atm : for sprintf, need check the total write length whether larger than a page.
From: Chen Gang @ 2012-12-05  4:56 UTC (permalink / raw)
  To: Chas Williams (CONTRACTOR); +Cc: David Miller, netdev
In-Reply-To: <201212050357.qB53vHvT022706@thirdoffive.cmf.nrl.navy.mil>

于 2012年12月05日 11:57, Chas Williams (CONTRACTOR) 写道:
> In message <50BEA2CB.9000800@asianux.com>,Chen Gang writes:
>>> -	for (i = 0; i < (ESI_LEN - 1); i++)
>>> -		pos += sprintf(pos, "%02x:", adev->esi[i]);
>>> -	pos += sprintf(pos, "%02x\n", adev->esi[i]);
>>>  
>>> -	return pos - buf;
>>> +	return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi);
>>>  }
>>>  
>>
>>  "%p" seems print a pointer, not contents of pointer (is it correct ?)
>>  will it change the original display format to outside ?
> 
> %pM means format this pointer as a mac address.  it didnt exist when the
> atm stack was originally written but can be used now to save a bit of
> messy code.
> 

  it is my fault. thank you

  :-)


>>> -		pos += sprintf(pos, "\n");
>>> +		count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
> ..
>>  need we judge whether count >= PAGE_SIZE ?
> 
> count will eventually make PAGE_SIZE - count reach 0 at which point,
> scnprintf() won't be able to write into the buffer.

  I also think so.

  I think it may be better to break out of the loop once we already
know that "count >= PAGE_SIZE" (it saves wasted looping, although it
seems unlikely to happen; for example, by using unlikely(...) ).

By the way:
  will it be better that always let "\n" at the end ?
  (if count == PAGE_SIZE in a loop, we can not let "\n" at the end).



  I think what I said above are minor, if you think, for this patch, do
not need consider them, it is ok (at least for me, it is true).

  :-)

> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


-- 
Chen Gang

Asianux Corporation

^ permalink raw reply

* Re: [Suggestion] net/atm : for sprintf, need check the total write length whether larger than a page.
From: Chen Gang @ 2012-12-05  5:40 UTC (permalink / raw)
  To: Chas Williams (CONTRACTOR); +Cc: David Miller, netdev
In-Reply-To: <50BED40D.9080100@asianux.com>

于 2012年12月05日 12:56, Chen Gang 写道:
>>>> >>> -		pos += sprintf(pos, "\n");
>>>> >>> +		count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
>> > ..
>>> >>  need we judge whether count >= PAGE_SIZE ?
>> > 
>> > count will eventually make PAGE_SIZE - count reach 0 at which point,
>> > scnprintf() won't be able to write into the buffer.
>   I also think so.
> 
>   I think, maybe it will be better to break the loop when we already
> know that "count >= PAGE_SIZE" (it can save waste looping, although it
> seems unlikly happen, for example, using unlikly(...) ).
> 
> By the way:
>   will it be better that always let "\n" at the end ?
>   (if count == PAGE_SIZE in a loop, we can not let "\n" at the end).

   oh, sorry ! count will never >= PAGE_SIZE.

   I think let "PAGE_SIZE - 2" instead of "PAGE_SIZE" in the loop, so we
can make the room for the end of "\n".



-- 
Chen Gang

Asianux Corporation

^ permalink raw reply

* Re: [Patch net-next] xfrm: add missing xfrm message types to selinux perm table
From: Steffen Klassert @ 2012-12-05  5:41 UTC (permalink / raw)
  To: David Miller; +Cc: amwang, netdev, herbert
In-Reply-To: <20121204.130257.1178992077066719400.davem@davemloft.net>

On Tue, Dec 04, 2012 at 01:02:57PM -0500, David Miller wrote:
> From: Cong Wang <amwang@redhat.com>
> Date: Tue,  4 Dec 2012 13:39:31 +0800
> 
> > Cc: Steffen Klassert <steffen.klassert@secunet.com>
> > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > Cc: "David S. Miller" <davem@davemloft.net>
> > Signed-off-by: Cong Wang <amwang@redhat.com>
> 
> Steffen will you take this into your IPSEC tree?
> 

I would do so, but I have not seen this patch by now.
Was it really sent to me? I have not even seen it on the
netdev list.

Anyway, I don't have it, please resend.

Thanks.

^ permalink raw reply

* Re: [RFC PATCH 2/2] tun: fix LSM/SELinux labeling of tun/tap devices
From: Jason Wang @ 2012-12-05  5:44 UTC (permalink / raw)
  To: Paul Moore; +Cc: netdev, linux-security-module, selinux, mst
In-Reply-To: <8577392.82G063LYx2@sifl>

On Tuesday, December 04, 2012 11:18:57 AM Paul Moore wrote:
> On Tuesday, December 04, 2012 09:24:43 PM Jason Wang wrote:
> > On Monday, December 03, 2012 11:22:29 AM Paul Moore wrote:
> > > It may be that I'm misunderstanding TUNSETQUEUE and/or TUNSETIFF.  Can
> > > you
> > > elaborate as to why they should be different?
> > 
> > If I understand correctly, before multiqueue patchset, TUNSETIFF is used
> > to:
> > 
> > 1) Create the tun/tap network device
> > 2) For persistent device, re-attach the fd to the network device / socket.
> > In this case, we call selinux_tun_dev_attch() to relabel the socket sid
> > (in
> > fact also the device's since the socket were persistent also) to the sid
> > of
> > process that calls TUNSETIFF.
> > 
> > So, after the changes of multiqueue, we need try to preserve those policy.
> > The interesting part is the introducing of TUNSETQUEUE, it's used to
> > attach
> > more file descriptors/sockets to a tun/tap device after at least one file
> > descriptor were attached to the tun/tap device through TUNSETIFF. So I
> > think maybe we need differ those two ioctls. This patch looks fine for
> > TUNSETQUEUE, but for TUNSETIFF, we need relabel the tunsec to the process
> > that calling TUNSETIFF for persistent device?
> 
> Okay, based on your explanation of TUNSETQUEUE, the steps below are what I
> believe we need to do ... if you disagree speak up quickly please.
> 
> A. TUNSETIFF (new, non-persistent device)
> 
> [Allocate and initialize the tun_struct LSM state based on the calling
> process, use this state to label the TUN socket.]
> 
> 1. Call security_tun_dev_create() which authorizes the action.
> 2. Call security_tun_dev_alloc_security() which allocates the tun_struct LSM
> blob and SELinux sets some internal blob state to record the label of the
> calling process.
> 3. Call security_tun_dev_attach() which sets the label of the TUN socket to
> match the label stored in the tun_struct LSM blob during A2.  No
> authorization is done at this point since the socket is new/unlabeled.
> 
> B. TUNSETIFF (existing, persistent device)
> 
> [Relabel the existing tun_struct LSM state based on the calling process, use
> this state to label the TUN socket.]
> 
> 1. Attempt to relabel/reset the tun_struct LSM blob from the currently
> stored value, set during A2, to the label of the current calling process.
> *** THIS IS NOT CURRENTLY DONE IN THE RFC PATCH ***
> 2. Call security_tun_dev_attach() which sets the label of the TUN socket to
> match the label stored in the tun_struct LSM blob during B1. No
> authorization is done at this point since the socket is new/unlabeled.
> 
> C. TUNSETQUEUE
> 
> [Use the existing tun_struct LSM state to label the new TUN socket.]
> 
> 1. Call security_tun_dev_attach() which sets the label of the TUN socket to
> match the label stored in the tun_struct LSM blob set during either A2 or
> B1. No authorization is done at this point since the socket is
> new/unlabeled.

This looks fine to me.
> > btw. Current code does allow calling TUNSETQUEUE to a persistent tun/tap
> > device with no file attached. It should be a bug and need to be fixed.
> 
> Since you wrote that code will you be submitting a patch to fix that
> problem?

Yes, I will fix it.
> > > One thing that I think we probably should change is the relabelto/from
> > > permissions in the function above (selinux_tun_dev_attach()); in the
> > > case
> > > where the socket does not yet have a label, e.g. 'sksec->sid == 0', we
> > > should probably skip the relabel permissions since we want to assign the
> > > TUN device label regardless in this case.
> > 
> > I'm not familiar with the selinux, have a quick glance of the code, looks
> > like the label has been initialized to SECINITSID_KERNEL in
> > selinux_socket_post_create().
> 
> Unless I've missed something in your changes, the multiqueue code never
> calls any socket code which ends up calling
> {security,selinux}_socket_post_create(); I believe you only call sk_alloc()
> which ends up calling
> {security,selinux}_sk_alloc() which sets SECINITSID_UNLABELED (I mistakenly
> wrote 0 instead in my earlier email which is techincally SECSID_NULL). 
> Either way, I still think the logic I originally described above is
> correct.

Yes, I was wrong. Thanks for the checking.



^ permalink raw reply

* Re: [Suggestion] net/atm : for sprintf, need check the total write length whether larger than a page.
From: Chen Gang @ 2012-12-05  5:59 UTC (permalink / raw)
  To: Chas Williams (CONTRACTOR); +Cc: David Miller, netdev
In-Reply-To: <50BEDE4E.8010408@asianux.com>

于 2012年12月05日 13:40, Chen Gang 写道:
> 于 2012年12月05日 12:56, Chen Gang 写道:
>>>>>>>> -		pos += sprintf(pos, "\n");
>>>>>>>> +		count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
>>>> ..
>>>>>>  need we judge whether count >= PAGE_SIZE ?
>>>>
>>>> count will eventually make PAGE_SIZE - count reach 0 at which point,
>>>> scnprintf() won't be able to write into the buffer.
>>   I also think so.
>>
>>   I think, maybe it will be better to break the loop when we already
>> know that "count >= PAGE_SIZE" (it can save waste looping, although it
>> seems unlikly happen, for example, using unlikly(...) ).
>>
>> By the way:
>>   will it be better that always let "\n" at the end ?
>>   (if count == PAGE_SIZE in a loop, we can not let "\n" at the end).
> 
>    oh, sorry ! count will never >= PAGE_SIZE.
> 
>    I think let "PAGE_SIZE - 2" instead of "PAGE_SIZE" in the loop, so we
> can make the room for the end of "\n".
> 
> 
> 
   sorry, "PAGE_SIZE - 1" is enough, not need "PAGE_SIZE - 2".


-- 
Chen Gang

Asianux Corporation

^ permalink raw reply

* Re: [RFC PATCH 2/2] tun: fix LSM/SELinux labeling of tun/tap devices
From: Jason Wang @ 2012-12-05  6:17 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Paul Moore, netdev, linux-security-module, selinux
In-Reply-To: <20121204152420.GJ7499@redhat.com>

On 12/04/2012 11:24 PM, Michael S. Tsirkin wrote:
> On Tue, Dec 04, 2012 at 09:24:43PM +0800, Jason Wang wrote:
>> On Monday, December 03, 2012 11:22:29 AM Paul Moore wrote:
>>> On Monday, December 03, 2012 06:15:42 PM Jason Wang wrote:
>>>> On 11/30/2012 06:06 AM, Paul Moore wrote:
>>>>> This patch corrects some problems with LSM/SELinux that were introduced
>>>>> with the multiqueue patchset.  The problem stems from the fact that the
>>>>> multiqueue work changed the relationship between the tun device and its
>>>>> associated socket; before the socket persisted for the life of the
>>>>> device, however after the multiqueue changes the socket only persisted
>>>>> for the life of the userspace connection (fd open).  For non-persistent
>>>>> devices this is not an issue, but for persistent devices this can cause
>>>>> the tun device to lose its SELinux label.
>>>>>
>>>>> We correct this problem by adding an opaque LSM security blob to the
>>>>> tun device struct which allows us to have the LSM security state, e.g.
>>>>> SELinux labeling information, persist for the lifetime of the tun
>>>>> device.
>>> ...
>>>
>>>>> -static int selinux_tun_dev_attach(struct sock *sk)
>>>>> +static int selinux_tun_dev_attach(struct sock *sk, void *security)
>>>>>
>>>>>  {
>>>>>
>>>>> +	struct tun_security_struct *tunsec = security;
>>>>>
>>>>>  	struct sk_security_struct *sksec = sk->sk_security;
>>>>>  	u32 sid = current_sid();
>>>>>  	int err;
>>>>>
>>>>> +	/* we don't currently perform any NetLabel based labeling here ...
>>>>>
>>>>>  	err = avc_has_perm(sid, sksec->sid, SECCLASS_TUN_SOCKET,
>>>>>  	
>>>>>  			   TUN_SOCKET__RELABELFROM, NULL);
>>>>>  	
>>>>>  	if (err)
>>>>>  	
>>>>>  		return err;
>>>>>
>>>>> -	err = avc_has_perm(sid, sid, SECCLASS_TUN_SOCKET,
>>>>> +	err = avc_has_perm(sid, tunsec->sid, SECCLASS_TUN_SOCKET,
>>>>>
>>>>>  			   TUN_SOCKET__RELABELTO, NULL);
>>>>>  	
>>>>>  	if (err)
>>>>>  	
>>>>>  		return err;
>>>>>
>>>>> -	sksec->sid = sid;
>>>>> +	sksec->sid = tunsec->sid;
>>>>> +	sksec->sclass = SECCLASS_TUN_SOCKET;
>>>> I'm not sure whether this is correct, looks like we need to differ between
>>>> TUNSETQUEUE and TUNSETIFF. When userspace call TUNSETIFF for persistent
>>>> device, looks like we need change the sid of tunsec like in the past.
>>> It may be that I'm misunderstanding TUNSETQUEUE and/or TUNSETIFF.  Can you
>>> elaborate as to why they should be different?
>> If I understand correctly, before multiqueue patchset, TUNSETIFF is used to:
>>
>> 1) Create the tun/tap network device
>> 2) For persistent device, re-attach the fd to the network device / socket. In 
>> this case, we call selinux_tun_dev_attch() to relabel the socket sid (in fact 
>> also the device's since the socket were persistent also) to the sid of process 
>> that calls TUNSETIFF.
>>
>> So, after the changes of multiqueue, we need try to preserve those policy. The 
>> interesting part is the introducing of TUNSETQUEUE, it's used to attach more 
>> file descriptors/sockets to a tun/tap device after at least one file descriptor 
>> were attached to the tun/tap device through TUNSETIFF. So I think maybe we 
>> need differ those two ioctls. This patch looks fine for TUNSETQUEUE, but for 
>> TUNSETIFF, we need relabel the tunsec to the process that calling TUNSETIFF 
>> for persistent device?
> Basically, it looks like currently once you get a tun fd,
> you can attach it to any device even if normally
> selinux would prevent you from accessing it.

Yes some checking during TUNSETQUEUE is missed.
> If we reuse selinux_tun_dev_attach, we won't need to
> change selinux policy, with a new capability we will need to change it
> to allow libvirt to do TUNSETQUEUE.
>

Also needed for qemu too since it may call TUNSETQUEUE when guest wants
to change the number of queues.
>> btw. Current code does allow calling TUNSETQUEUE to a persistent tun/tap 
>> device with no file attached. It should be a bug and need to be fixed.
> Is this a problem? You can always
> attach
> set queue
> detach
>
> and it would be hard to prevent this ...

Currently, the following steps are allowed:

1. fd1 = open("/dev/net/tun");
2. tunsetiff(fd1, "tap0");
3. tunsetpersistent("tap0");
4. close(fd1);
5. fd2 = open("/dev/net/tun");
6. tunsetqueue(fd2, "tap0");

Looks like step 6 should be forbidden since:

- no fd/sockets were attached to the device, we need use TUNSETIFF
instead to keep the API as we used do in single queue tun
- we need update the security information in tun_struct just like what
we discussed in this mail
- it may also miss checks in TUNSETIFF

>>> One thing that I think we probably should change is the relabelto/from
>>> permissions in the function above (selinux_tun_dev_attach()); in the case
>>> where the socket does not yet have a label, e.g. 'sksec->sid == 0', we
>>> should probably skip the relabel permissions since we want to assign the
>>> TUN device label regardless in this case.
>> I'm not familiar with the selinux, have a quick glance of the code, looks like 
>> the label has been initialized to SECINITSID_KERNEL in 
>> selinux_socket_post_create().
>>
>> Thanks

^ permalink raw reply

* Re: [RFC PATCH 2/2] tun: fix LSM/SELinux labeling of tun/tap devices
From: Jason Wang @ 2012-12-05  6:19 UTC (permalink / raw)
  To: Paul Moore; +Cc: Michael S. Tsirkin, netdev, linux-security-module, selinux
In-Reply-To: <338664993.8Hd16PoY2S@sifl>

On 12/05/2012 02:17 AM, Paul Moore wrote:
> On Tuesday, December 04, 2012 07:36:26 PM Michael S. Tsirkin wrote:
>> On Tue, Dec 04, 2012 at 11:18:57AM -0500, Paul Moore wrote:
>>> Okay, based on your explanation of TUNSETQUEUE, the steps below are what I
>>> believe we need to do ... if you disagree speak up quickly please.
>>>
>>> A. TUNSETIFF (new, non-persistent device)
>>>
>>> [Allocate and initialize the tun_struct LSM state based on the calling
>>> process, use this state to label the TUN socket.]
>>>
>>> 1. Call security_tun_dev_create() which authorizes the action.
>>> 2. Call security_tun_dev_alloc_security() which allocates the tun_struct
>>> LSM blob and SELinux sets some internal blob state to record the label of
>>> the calling process.
>>> 3. Call security_tun_dev_attach() which sets the label of the TUN socket
>>> to match the label stored in the tun_struct LSM blob during A2.  No
>>> authorization is done at this point since the socket is new/unlabeled.
>>>
>>> B. TUNSETIFF (existing, persistent device)
>>>
>>> [Relabel the existing tun_struct LSM state based on the calling process,
>>> use this state to label the TUN socket.]
>>>
>>> 1. Attempt to relabel/reset the tun_struct LSM blob from the currently
>>> stored value, set during A2, to the label of the current calling process.
>>> *** THIS IS NOT CURRENTLY DONE IN THE RFC PATCH ***
>>> 2. Call security_tun_dev_attach() which sets the label of the TUN socket
>>> to match the label stored in the tun_struct LSM blob during B1. No
>>> authorization is done at this point since the socket is new/unlabeled.
>>>
>>> C. TUNSETQUEUE
>>>
>>> [Use the existing tun_struct LSM state to label the new TUN socket.]
>>>
>>> 1. Call security_tun_dev_attach() which sets the label of the TUN socket
>>> to match the label stored in the tun_struct LSM blob set during either A2
>>> or B1. No authorization is done at this point since the socket is
>>> new/unlabeled.
>> Here's what bothers me. libvirt currently opens tun and passes
>> fd to qemu. What would prevent qemu from attaching fd using TUNSETQUEUE
>> to another device it does not own?
> True, assuming all the above is correct and that I'm understanding it 
> correctly (Jason?), we should probably add a new SELinux access control for 
> TUNSETQUEUE.

Yes, we need make sure qemu can call TUNSETQUEUE for the device it does
not own.
>
> The current DAC code exists in tun_not_capable().
>

^ permalink raw reply

* Re: [PATCH net-next 2/3] virtio_net: multiqueue support
From: Jason Wang @ 2012-12-05  6:33 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: krkumar2, kvm, netdev, linux-kernel, virtualization, bhutchings,
	jwhan, davem, shiyer
In-Reply-To: <20121204151108.GI7499@redhat.com>

On 12/04/2012 11:11 PM, Michael S. Tsirkin wrote:
> On Tue, Dec 04, 2012 at 10:45:33PM +0800, Jason Wang wrote:
>> On Tuesday, December 04, 2012 03:24:22 PM Michael S. Tsirkin wrote:
>>> I found some bugs, see below.
>>> Also some style nitpicking, this is not mandatory to address.
>> Thanks for the reviewing.
>>> On Tue, Dec 04, 2012 at 07:07:57PM +0800, Jason Wang wrote:
>>>
[...]
>>>> +		set = false;
>>> This will overwrite affinity if it was set by userspace.
>>> Just
>>> 	if (set)
>>> 		return;
>>> will not have this problem.
>> But we need handle the situtaiton when switch back to sq from mq mode. 
>> Otherwise we may still get the affinity hint used in mq.
>>  This kind of overwrite 
>> is unavoidable or is there some method to detect whether userspac write 
>> something new?
> If we didn't set the affinity originally we should not overwrite it.
> I think this means we need a flag that tells us that
> virtio set the affinity.

Ok.

[...]
>>>> +
>>>> +	/* Parameters for control virtqueue, if any */
>>>> +	if (vi->has_cvq) {
>>>> +		callbacks[total_vqs - 1] = NULL;
>>>> +		names[total_vqs - 1] = kasprintf(GFP_KERNEL, "control");
>>>> +	}
>>>>
>>>> +	/* Allocate/initialize parameters for send/receive virtqueues */
>>>> +	for (i = 0; i < vi->max_queue_pairs; i++) {
>>>> +		callbacks[rxq2vq(i)] = skb_recv_done;
>>>> +		callbacks[txq2vq(i)] = skb_xmit_done;
>>>> +		names[rxq2vq(i)] = kasprintf(GFP_KERNEL, "input.%d", i);
>>>> +		names[txq2vq(i)] = kasprintf(GFP_KERNEL, "output.%d", i);
>>>> +	}
>>> We would need to check kasprintf return value.
>> Looks like a better method is to make the name as a memeber of receive_queue 
>> and send_queue, and use sprintf here.
>>> Also if you allocate names from slab we'll need to free them
>>> later.
>> Then it could be freed when the send_queue and receive_queue is freed.
>>> It's probably easier to just use fixed names for now -
>>> it's not like the index is really useful.
>> Looks useful for debugging e.g. check whether the irq distribution is as 
>> expected.
> Well it doesn't really matter which one goes where, right?
> As long as interrupts are distributed well.

Yes, anyway, we decide to store the name in the send/receive queue, so I
will keep the index.
>
>>>> +
>>>> +	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
>>>> +					 (const char **)names);
>>> Please avoid casts, use a proper type for names.
>> I'm consider we need a minor change in this api, we need allocate the names 
>> dynamically which could not be a const char **.
> I don't see why. Any use that allocates on the fly as
> you did would leak memory. Any use like you suggest
> e.g. allocating as part of send/receive structure
> would be fine.

True
>>>> +	if (ret)
>>>> +		goto err_names;
>>>> +
>>>> +	if (vi->has_cvq) {
>>>> +		vi->cvq = vqs[total_vqs - 1];
>>>>
>>>>  		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
>>>>  		
>>>>  			vi->dev->features |= NETIF_F_HW_VLAN_FILTER;
>>>>  	
>>>>  	}
>>>>
>>>> +
>>>> +	for (i = 0; i < vi->max_queue_pairs; i++) {
>>>> +		vi->rq[i].vq = vqs[rxq2vq(i)];
>>>> +		vi->sq[i].vq = vqs[txq2vq(i)];
>>>> +	}
>>>> +
>>>> +	kfree(callbacks);
>>>> +	kfree(vqs);
>>> Who frees names if there's no error?
>>>
>> The virtio core does not copy the name, so it need this and only used for 
>> debugging if I'm reading the code correctly.
> No, virtio core does not free either individual vq name or the names
> array passed in. So this leaks memory.

Yes, so when we use the names in receive/send queue, it can be freed
during queue destroying.

[...]
> @@ -1276,24 +1531,29 @@ static int virtnet_freeze(struct virtio_device
> *vdev)> 
>  static int virtnet_restore(struct virtio_device *vdev)
>  {
>  
>  	struct virtnet_info *vi = vdev->priv;
>
> -	int err;
> +	int err, i;
>
>  	err = init_vqs(vi);
>  	if (err)
>  	
>  		return err;
>  	
>  	if (netif_running(vi->dev))
>
> -		virtnet_napi_enable(&vi->rq);
> +		for (i = 0; i < vi->max_queue_pairs; i++)
> +			virtnet_napi_enable(&vi->rq[i]);
>
>  	netif_device_attach(vi->dev);
>
> -	if (!try_fill_recv(&vi->rq, GFP_KERNEL))
> -		schedule_delayed_work(&vi->refill, 0);
> +	for (i = 0; i < vi->max_queue_pairs; i++)
> +		if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
> +			schedule_delayed_work(&vi->refill, 0);
>
>  	mutex_lock(&vi->config_lock);
>  	vi->config_enable = true;
>  	mutex_unlock(&vi->config_lock);
>
> +	if (vi->has_cvq && virtio_has_feature(vi->vdev, VIRTIO_NET_F_RFS))
> +		virtnet_set_queues(vi);
> +
>>> I think it's easier to test
>>> if (curr_queue_pairs == max_queue_pairs)
>>> within virtnet_set_queues and make it
>>> a NOP if so.
>> Still need to send the command during restore since we reset the device during 
>> freezing.
>
> Then maybe check vi->has_cvq && virtio_has_feature(vi->vdev,
> VIRTIO_NET_F_RFS) in there?

Right.
>
>>>>  
[...]

^ permalink raw reply

* Re: [Patch net-next] xfrm: add missing xfrm message types to selinux perm table
From: Cong Wang @ 2012-12-05  6:56 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: David Miller, netdev, herbert
In-Reply-To: <20121205054152.GO22290@secunet.com>

On Wed, 2012-12-05 at 06:41 +0100, Steffen Klassert wrote:
> On Tue, Dec 04, 2012 at 01:02:57PM -0500, David Miller wrote:
> > From: Cong Wang <amwang@redhat.com>
> > Date: Tue,  4 Dec 2012 13:39:31 +0800
> > 
> > > Cc: Steffen Klassert <steffen.klassert@secunet.com>
> > > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > > Cc: "David S. Miller" <davem@davemloft.net>
> > > Signed-off-by: Cong Wang <amwang@redhat.com>
> > 
> > Steffen will you take this into your IPSEC tree?
> > 
> 
> I would do so, but I have not seen this patch by now.
> Was it really send to me? I have not even seen it on the
> netdev list.
> 
> Anyway, I don't have it, please resend.
> 

Weird, you are already Cc'ed in the original patch, Cc: Steffen Klassert
<steffen.klassert@secunet.com>.

^ permalink raw reply

* [PATCH v3] iproute2: add mdb command to bridge
From: Cong Wang @ 2012-12-05  7:14 UTC (permalink / raw)
  To: netdev
  Cc: Cong Wang, bridge, Herbert Xu, Jesper Dangaard Brouer,
	Thomas Graf, Stephen Hemminger, David S. Miller
In-Reply-To: <1354675804-13310-1-git-send-email-amwang@redhat.com>

V3: improve the output, display router info only for -d
    fix router parsing code

V2: sync with the kernel patch
    handle IPv6 addr
    a few cleanup

Sample output:

	# ./bridge/bridge mdb
	bridge br0:
	port eth0, group 224.8.8.9
	port eth1, group 224.8.8.8

	# ./bridge/bridge -d mdb
	bridge br0:
	port eth0, group 224.8.8.9
	port eth1, group 224.8.8.8
	router ports: eth0

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 bridge/Makefile    |    2 +-
 bridge/br_common.h |    3 +-
 bridge/bridge.c    |    1 +
 bridge/mdb.c       |  177 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 181 insertions(+), 2 deletions(-)

diff --git a/bridge/Makefile b/bridge/Makefile
index 9a6743e..67aceb4 100644
--- a/bridge/Makefile
+++ b/bridge/Makefile
@@ -1,4 +1,4 @@
-BROBJ = bridge.o fdb.o monitor.o link.o
+BROBJ = bridge.o fdb.o monitor.o link.o mdb.o
 
 include ../Config
 
diff --git a/bridge/br_common.h b/bridge/br_common.h
index 718ecb9..892fb76 100644
--- a/bridge/br_common.h
+++ b/bridge/br_common.h
@@ -5,10 +5,11 @@ extern int print_fdb(const struct sockaddr_nl *who,
 		     struct nlmsghdr *n, void *arg);
 
 extern int do_fdb(int argc, char **argv);
+extern int do_mdb(int argc, char **argv);
 extern int do_monitor(int argc, char **argv);
 
 extern int preferred_family;
 extern int show_stats;
-extern int show_detail;
+extern int show_details;
 extern int timestamp;
 extern struct rtnl_handle rth;
diff --git a/bridge/bridge.c b/bridge/bridge.c
index e2c33b0..1fcd365 100644
--- a/bridge/bridge.c
+++ b/bridge/bridge.c
@@ -43,6 +43,7 @@ static const struct cmd {
 	int (*func)(int argc, char **argv);
 } cmds[] = {
 	{ "fdb", 	do_fdb },
+	{ "mdb", 	do_mdb },
 	{ "monitor",	do_monitor },
 	{ "help",	do_help },
 	{ 0 }
diff --git a/bridge/mdb.c b/bridge/mdb.c
new file mode 100644
index 0000000..b39b535
--- /dev/null
+++ b/bridge/mdb.c
@@ -0,0 +1,177 @@
+/*
+ * Get mdb table with netlink
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <linux/if_bridge.h>
+#include <linux/if_ether.h>
+#include <linux/neighbour.h>
+#include <linux/if_bridge.h>
+#include <string.h>
+#include <arpa/inet.h>
+
+#include "libnetlink.h"
+#include "br_common.h"
+#include "rt_names.h"
+#include "utils.h"
+
+#ifndef MDBA_RTA
+#define MDBA_RTA(r) \
+	((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct br_port_msg))))
+#endif
+
+int filter_index;
+
+static void usage(void)
+{
+	fprintf(stderr, "       bridge mdb {show} [ dev DEV ]\n");
+	exit(-1);
+}
+
+static void br_print_router_ports(FILE *f, struct rtattr *attr)
+{
+	uint32_t *port_ifindex;
+	struct rtattr *i;
+	int rem;
+
+	rem = RTA_PAYLOAD(attr);
+	for (i = RTA_DATA(attr); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) {
+		port_ifindex = RTA_DATA(i);
+		fprintf(f, "%s ", ll_index_to_name(*port_ifindex));
+	}
+
+	fprintf(f, "\n");
+}
+
+static void print_mdb_entry(FILE *f, struct br_mdb_entry *e)
+{
+	SPRINT_BUF(abuf);
+
+	if (e->addr.proto == htons(ETH_P_IP))
+		fprintf(f, "port %s, group %s\n", ll_index_to_name(e->ifindex),
+			inet_ntop(AF_INET, &e->addr.u.ip4, abuf, sizeof(abuf)));
+	else
+		fprintf(f, "port %s, group %s\n", ll_index_to_name(e->ifindex),
+			inet_ntop(AF_INET6, &e->addr.u.ip6, abuf, sizeof(abuf)));
+}
+
+static void br_print_mdb_entry(FILE *f, struct rtattr *attr)
+{
+	struct rtattr *i;
+	int rem;
+	struct br_mdb_entry *e;
+
+	rem = RTA_PAYLOAD(attr);
+	for (i = RTA_DATA(attr); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) {
+		e = RTA_DATA(i);
+		print_mdb_entry(f, e);
+	}
+}
+
+int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = arg;
+	struct br_port_msg *r = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * tb[MDBA_MAX+1];
+
+	if (n->nlmsg_type != RTM_GETMDB) {
+		fprintf(stderr, "Not RTM_GETMDB: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+
+		return 0;
+	}
+
+	len -= NLMSG_LENGTH(sizeof(*r));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	if (filter_index && filter_index != r->ifindex)
+		return 0;
+
+	if (!filter_index && r->ifindex)
+		fprintf(fp, "bridge %s:\n", ll_index_to_name(r->ifindex));
+
+	parse_rtattr(tb, MDBA_MAX, MDBA_RTA(r), n->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
+
+	if (tb[MDBA_MDB]) {
+		struct rtattr *i;
+		int rem = RTA_PAYLOAD(tb[MDBA_MDB]);
+
+		for (i = RTA_DATA(tb[MDBA_MDB]); RTA_OK(i, rem); i = RTA_NEXT(i, rem))
+			br_print_mdb_entry(fp, i);
+	}
+
+	if (tb[MDBA_ROUTER]) {
+		if (show_details) {
+			fprintf(fp, "router ports: ");
+			br_print_router_ports(fp, tb[MDBA_ROUTER]);
+		}
+	}
+
+	return 0;
+}
+
+static int mdb_show(int argc, char **argv)
+{
+	char *filter_dev = NULL;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			if (filter_dev)
+				duparg("dev", *argv);
+			filter_dev = *argv;
+		}
+		argc--; argv++;
+	}
+
+	if (filter_dev) {
+		filter_index = if_nametoindex(filter_dev);
+		if (filter_index == 0) {
+			fprintf(stderr, "Cannot find device \"%s\"\n",
+				filter_dev);
+			return -1;
+		}
+	}
+
+	if (rtnl_wilddump_request(&rth, PF_BRIDGE, RTM_GETMDB) < 0) {
+		perror("Cannot send dump request");
+		exit(1);
+	}
+
+	if (rtnl_dump_filter(&rth, print_mdb, stdout) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		exit(1);
+	}
+
+	return 0;
+}
+
+int do_mdb(int argc, char **argv)
+{
+	ll_init_map(&rth);
+
+	if (argc > 0) {
+		if (matches(*argv, "show") == 0 ||
+		    matches(*argv, "lst") == 0 ||
+		    matches(*argv, "list") == 0)
+			return mdb_show(argc-1, argv+1);
+		if (matches(*argv, "help") == 0)
+			usage();
+	} else
+		return mdb_show(0, NULL);
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"bridge mdb help\".\n", *argv);
+	exit(-1);
+}

^ permalink raw reply related

* Re: [Patch net-next] xfrm: add missing xfrm message types to selinux perm table
From: Steffen Klassert @ 2012-12-05  7:39 UTC (permalink / raw)
  To: Cong Wang; +Cc: David Miller, netdev, herbert
In-Reply-To: <1354690585.28951.10.camel@cr0>

On Wed, Dec 05, 2012 at 02:56:25PM +0800, Cong Wang wrote:
> On Wed, 2012-12-05 at 06:41 +0100, Steffen Klassert wrote:
> > On Tue, Dec 04, 2012 at 01:02:57PM -0500, David Miller wrote:
> > > From: Cong Wang <amwang@redhat.com>
> > > Date: Tue,  4 Dec 2012 13:39:31 +0800
> > > 
> > > > Cc: Steffen Klassert <steffen.klassert@secunet.com>
> > > > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > > > Cc: "David S. Miller" <davem@davemloft.net>
> > > > Signed-off-by: Cong Wang <amwang@redhat.com>
> > > 
> > > Steffen will you take this into your IPSEC tree?
> > > 
> > 
> > I would do so, but I have not seen this patch by now.
> > Was it really send to me? I have not even seen it on the
> > netdev list.
> > 
> > Anyway, I don't have it, please resend.
> > 
> 
> Weird, you are already Cc'ed in the original patch, Cc: Steffen Klassert
> <steffen.klassert@secunet.com>.
> 

Even patchwork does not list this patch, I can only guess that
something went wrong with your submission.

Please resend it,

Thanks.

^ permalink raw reply

* [Patch net-next] xfrm: add missing xfrm message types to selinux perm table
From: Cong Wang @ 2012-12-05  7:42 UTC (permalink / raw)
  To: netdev; +Cc: Steffen Klassert, Herbert Xu, David S. Miller, Cong Wang

SElinux perm table is not up-to-date.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>

---
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index d309e7f..cc191bc 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -93,6 +93,13 @@ static struct nlmsg_perm nlmsg_xfrm_perms[] =
 	{ XFRM_MSG_FLUSHPOLICY,	NETLINK_XFRM_SOCKET__NLMSG_WRITE },
 	{ XFRM_MSG_NEWAE,	NETLINK_XFRM_SOCKET__NLMSG_WRITE },
 	{ XFRM_MSG_GETAE,	NETLINK_XFRM_SOCKET__NLMSG_READ  },
+	{ XFRM_MSG_REPORT,	NETLINK_XFRM_SOCKET__NLMSG_READ  },
+	{ XFRM_MSG_MIGRATE,	NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+	{ XFRM_MSG_NEWSADINFO,	NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+	{ XFRM_MSG_GETSADINFO,	NETLINK_XFRM_SOCKET__NLMSG_READ  },
+	{ XFRM_MSG_NEWSPDINFO,	NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+	{ XFRM_MSG_GETSPDINFO,	NETLINK_XFRM_SOCKET__NLMSG_READ  },
+	{ XFRM_MSG_MAPPING,	NETLINK_XFRM_SOCKET__NLMSG_READ  },
 };
 
 static struct nlmsg_perm nlmsg_audit_perms[] =

^ permalink raw reply related

* [PATCH -next v2] ipw2200: return error code on error in ipw_wx_get_auth()
From: Wei Yongjun @ 2012-12-05  8:08 UTC (permalink / raw)
  To: stas.yakovlev, linville; +Cc: yongjun_wei, linux-wireless, netdev

From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

We assign an error code to 'ret' when the requested auth option
is not supported, but never use it. Return the error code
directly instead.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
---
 drivers/net/wireless/ipw2x00/ipw2200.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c
index 482f505..b0879ad 100644
--- a/drivers/net/wireless/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/ipw2x00/ipw2200.c
@@ -6812,7 +6812,6 @@ static int ipw_wx_get_auth(struct net_device *dev,
 	struct libipw_device *ieee = priv->ieee;
 	struct lib80211_crypt_data *crypt;
 	struct iw_param *param = &wrqu->param;
-	int ret = 0;
 
 	switch (param->flags & IW_AUTH_INDEX) {
 	case IW_AUTH_WPA_VERSION:
@@ -6822,8 +6821,7 @@ static int ipw_wx_get_auth(struct net_device *dev,
 		/*
 		 * wpa_supplicant will control these internally
 		 */
-		ret = -EOPNOTSUPP;
-		break;
+		return -EOPNOTSUPP;
 
 	case IW_AUTH_TKIP_COUNTERMEASURES:
 		crypt = priv->ieee->crypt_info.crypt[priv->ieee->crypt_info.tx_keyidx];

^ permalink raw reply related

* Re: WARNING: drivers/net/ethernet/dlink/sundance.o(.text+0x2e87): Section mismatch in reference from the function sundance_probe1() to the variable .devinit.rodata:sundance_pci_tbl
From: Denis Kirjanov @ 2012-12-05  8:12 UTC (permalink / raw)
  To: kbuild test robot; +Cc: Bill Pemberton, netdev, Greg Kroah-Hartman
In-Reply-To: <50be77ca.95imDYp6GlyHjRuw%fengguang.wu@intel.com>

I'll fix it.

Thanks.

On 12/5/12, kbuild test robot <fengguang.wu@intel.com> wrote:
> tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git
> master
> head:   193c1e478cc496844fcbef402a10976c95a634ff
> commit: 64bc40de134bb5c7826ff384016f654219ed3956 dlink: remove __dev*
> attributes
> date:   27 hours ago
> config: make ARCH=x86_64 allmodconfig
>
> All warnings:
>
> WARNING: drivers/net/ethernet/dlink/sundance.o(.text+0x2e87): Section
> mismatch in reference from the function sundance_probe1() to the variable
> .devinit.rodata:sundance_pci_tbl
> The function sundance_probe1() references
> the variable __devinitconst sundance_pci_tbl.
> This is often because sundance_probe1 lacks a __devinitconst
> annotation or the annotation of sundance_pci_tbl is wrong.
>
> ---
> 0-DAY kernel build testing backend         Open Source Technology Center
> Fengguang Wu, Yuanhan Liu                              Intel Corporation
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [Patch net-next] xfrm: add missing xfrm message types to selinux perm table
From: Steffen Klassert @ 2012-12-05  8:14 UTC (permalink / raw)
  To: Cong Wang
  Cc: netdev, Herbert Xu, David S. Miller, Eric Paris,
	linux-security-module
In-Reply-To: <1354693368-19494-1-git-send-email-amwang@redhat.com>

Ccing Eric Paris and linux-security-module.

On Wed, Dec 05, 2012 at 03:42:48PM +0800, Cong Wang wrote:
> SElinux perm table is not up-to-date.
> 
> Cc: Steffen Klassert <steffen.klassert@secunet.com>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: "David S. Miller" <davem@davemloft.net>
> Signed-off-by: Cong Wang <amwang@redhat.com>
> 
> ---
> diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
> index d309e7f..cc191bc 100644
> --- a/security/selinux/nlmsgtab.c
> +++ b/security/selinux/nlmsgtab.c
> @@ -93,6 +93,13 @@ static struct nlmsg_perm nlmsg_xfrm_perms[] =
>  	{ XFRM_MSG_FLUSHPOLICY,	NETLINK_XFRM_SOCKET__NLMSG_WRITE },
>  	{ XFRM_MSG_NEWAE,	NETLINK_XFRM_SOCKET__NLMSG_WRITE },
>  	{ XFRM_MSG_GETAE,	NETLINK_XFRM_SOCKET__NLMSG_READ  },
> +	{ XFRM_MSG_REPORT,	NETLINK_XFRM_SOCKET__NLMSG_READ  },
> +	{ XFRM_MSG_MIGRATE,	NETLINK_XFRM_SOCKET__NLMSG_WRITE },
> +	{ XFRM_MSG_NEWSADINFO,	NETLINK_XFRM_SOCKET__NLMSG_WRITE },
> +	{ XFRM_MSG_GETSADINFO,	NETLINK_XFRM_SOCKET__NLMSG_READ  },
> +	{ XFRM_MSG_NEWSPDINFO,	NETLINK_XFRM_SOCKET__NLMSG_WRITE },
> +	{ XFRM_MSG_GETSPDINFO,	NETLINK_XFRM_SOCKET__NLMSG_READ  },
> +	{ XFRM_MSG_MAPPING,	NETLINK_XFRM_SOCKET__NLMSG_READ  },
>  };
>  
>  static struct nlmsg_perm nlmsg_audit_perms[] =

I'm not the maintainer of the file which this patch changes,
but I could take it into the ipsec-next tree if the selinux
people are fine with that.

^ permalink raw reply

* Re: ip_rt_min_pmtu
From: Rami Rosen @ 2012-12-05  8:46 UTC (permalink / raw)
  To: Christopher Schramm; +Cc: netdev
In-Reply-To: <50BE654D.2010602@shakaweb.org>

Hi,
Just a short note:
RFC 791 indeed set 68 for internet module MTU.

But RFC 791 also declares 576 as PMTU:
"All hosts must be prepared to accept datagrams of up to 576 octets".
and it says also:
"The number 576 is selected to allow a reasonable sized data block to
be transmitted in addition to the required header information."

It seems that there is a distinction between a module sending MTU and
hosts receiving MTU.

Regarding the historical details of why it was set that way at the time -
I have no idea.

Regards,
Rami Rosen
http://ramirose.wix.com/ramirosen



On Tue, Dec 4, 2012 at 11:04 PM, Christopher Schramm
<netdev@shakaweb.org> wrote:
> Hi,
>
> I'm looking into an interesting detail of the Linux IPv4 implementation I
> stumbled upon during a University course.
>
> In route.c there's a value ip_rt_min_pmtu, defined as 512 + 20 + 20, that
> tells Linux a minimum PMTU to use, even if e. g. an ICMP message tells it to
> set a smaller one.
>
> Of course, this is not a problem in real world, but not standard-compliant,
> since RFC 791 defines a minimum MTU of 68 for IPv4. So I wonder what's the
> reason for the restriction.
>
> I looked into it and found that it appeared in Linux 2.3.15 with the
> following ID in route.c:
>
> v 1.71 1999/08/20 11:05:58 davem
>
> While it was not present in Linux 2.3.14 with:
>
> v 1.69 1999/06/09 10:11:02 davem
>
> I couldn't find any related discussion or patch on the LKML around those
> dates, so I'm asking you for any hints to find out the reason for
> implementing this lower bound.
>
> What I've found on the LKML is a topic around February 15th, 2001, titled
> "MTU and 2.4.x kernel", where Alexey Kuznetsov points out that the handling
> of "DF on syn frames" is broken for MTUs smaller than 128 and "Preventing
> DoSes requires to block pmtu discovery at 576 or at least 552".
>
> Does anybody know the actual reason for the change in 2.3.15? I first
> thought it's the common misinterpretation that 576 would be the lower bound
> for MTUs in IPv4, but I wonder why it was put in place as a patch years
> after the IPv4 implementation was already done. There seems to have been
> some clear reason for it. I also wonder why it has never been removed up to
> today if it's really nothing more than a mistake.
>
> Would be great if someone could help me shed some light on this.
>
> Regards
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [net-next PATCH V3-evictor] net: frag evictor, avoid killing warm frag queues
From: Jesper Dangaard Brouer @ 2012-12-05  9:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Florian Westphal, netdev, Thomas Graf,
	Paul E. McKenney, Cong Wang, Herbert Xu
In-Reply-To: <20121204133007.20215.52566.stgit@dragon>


First of all, this patch contains a small bug (see below), which
resulted in me not testing the correct patch...

Second, this patch does NOT behave as I expected and claimed.  Thus, the
conclusions in my previous response might be wrong!

The previous evictor patch, which let new fragments enter, worked
amazingly well.  But I suspect this might also be related to a
bug/problem in the evictor loop (which was being hidden by that patch).

My new *theory* is that the evictor loop will be looping too much if
it finds a fragment which is INET_FRAG_COMPLETE ... in that case, we
don't advance the LRU list, and thus will pick up the exact same
inet_frag_queue again in the loop... to get out of the loop we need
another CPU or packet to change the LRU list for us... I'll test that
theory... (it could also be CPUs fighting over the same LRU head
element that causes this) ... more to come...


On Tue, 2012-12-04 at 14:30 +0100, Jesper Dangaard Brouer wrote:
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 4750d2b..d8bf59b 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -178,6 +178,16 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
>  
>  		q = list_first_entry(&nf->lru_list,
>  				struct inet_frag_queue, lru_list);
> +
> +		/* When head of LRU is very new/warm, then the head is
> +		 * most likely the one with most fragments and the
> +		 * tail with least, thus drop tail
> +		 */
> +		if (!force && q->creation_ts == (u32) jiffies) {
> +			q = list_entry(&nf->lru_list.prev,

Remove the "&" in &nf->lru_list.prev

> +				struct inet_frag_queue, lru_list);
> +		}
> +
>  		atomic_inc(&q->refcnt);
>  		read_unlock(&f->lock);

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox