Netdev List
 help / color / mirror / Atom feed
* Re: suspicious RCU usage in net/ipv4/ip_tunnel.c:80
From: Eric Dumazet @ 2014-01-13  8:20 UTC (permalink / raw)
  To: Cong Wang; +Cc: Tom Herbert, netdev
In-Reply-To: <CAHA+R7Pe_pGFw5cA3v_RnGvOurAykJNVH5=DeeEVHG84faWi2Q@mail.gmail.com>

On Sun, 2014-01-12 at 22:36 -0800, Cong Wang wrote:
> > Please read rcu_dereference_protected() documentation in
> > include/linux/rcupdate.h
> 
> I did before I replied.



> 
> >
> > Also you can run sparse, with CONFIG_SPARSE_RCU_POINTER=y in
> > your .config
> >
> > make C=2 net/ipv4/ip_tunnel.o
> >
> > And then you'll know the answer to this question.
> >
> 
> Sounds like it is only to shut up a sparse warning, then its name
> is misleading, we clearly don't dereference it here.

Historical reasons, you should have been there when Paul invented the
name and lazy people like us let him do so !

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=b62730baea32f86fe91a7930e4b7ee8d82778b79

You are lucky, there is plenty of documentation, maybe too much..

^ permalink raw reply

* [PATCH v4 3/3] Send cgroup_path in SCM_CGROUP
From: Jan Kaluza @ 2014-01-13  8:01 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: Jan Kaluza, rgb-H+wXaHxf7aLQT0dZR+AlfA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
	eparis-H+wXaHxf7aLQT0dZR+AlfA,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, tj-DgEjT+Ai2ygdnm+yROfE0A,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1389600109-30739-1-git-send-email-jkaluza-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Server-like processes in many cases need credentials and other
metadata of the peer, to decide if the calling process is allowed to
request a specific action, or the server just wants to log away this
type of information for auditing tasks.

The current practice to retrieve such process metadata is to look that
information up in procfs with the $PID received over SCM_CREDENTIALS.
This is sufficient for long-running tasks, but introduces a race which
cannot be worked around for short-lived processes; the calling
process and all the information in /proc/$PID/ may be gone before the
receiver of the socket message can look it up.

This introduces a new SCM type called SCM_CGROUP to allow the direct
attaching of "cgroup_path" to SCM, which is significantly more
efficient and will reliably avoid the race with the round-trip over
procfs.

Signed-off-by: Jan Kaluza <jkaluza-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
 include/linux/socket.h |  1 +
 include/net/af_unix.h  |  1 +
 include/net/scm.h      | 15 +++++++++++++++
 net/core/scm.c         | 18 ++++++++++++++++++
 net/unix/af_unix.c     | 20 ++++++++++++++++++++
 5 files changed, 55 insertions(+)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 5a41f35..b015ed4 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -133,6 +133,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr
 #define SCM_AUDIT	0x04		/* rw: struct uaudit		*/
 #define SCM_PROCINFO	0x05	/* rw: comm + cmdline (NULL terminated
 					   array of char *) */
+#define SCM_CGROUP	0x06		/* rw: cgroup path */
 
 struct ucred {
 	__u32	pid;
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 05c7678..c49bf35 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -32,6 +32,7 @@ struct unix_skb_parms_scm {
 	unsigned int sessionid;
 	char *procinfo;
 	int procinfo_len;
+	char *cgroup_path;
 };
 
 struct unix_skb_parms {
diff --git a/include/net/scm.h b/include/net/scm.h
index f084e19..359048d 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -41,6 +41,7 @@ struct scm_cookie {
 	struct scm_creds	creds;		/* Skb credentials	*/
 	struct scm_audit	audit;		/* Skb audit	*/
 	struct scm_procinfo	procinfo;	/* Skb procinfo */
+	char *cgroup_path;
 #ifdef CONFIG_SECURITY_NETWORK
 	u32			secid;		/* Passed security ID 	*/
 #endif
@@ -52,6 +53,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm);
 void __scm_destroy(struct scm_cookie *scm);
 struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl);
 int scm_get_current_procinfo(char **procinfo);
+int scm_get_current_cgroup_path(char **cgroup_path);
 
 #ifdef CONFIG_SECURITY_NETWORK
 static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
@@ -86,6 +88,12 @@ static inline void scm_set_procinfo(struct scm_cookie *scm,
 	scm->procinfo.len = len;
 }
 
+static inline void scm_set_cgroup_path(struct scm_cookie *scm,
+				    char *cgroup_path)
+{
+	scm->cgroup_path = cgroup_path;
+}
+
 static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
 {
 	put_pid(scm->pid);
@@ -140,6 +148,9 @@ static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct sc
 			security_release_secctx(secdata, seclen);
 		}
 	}
+
+	kfree(scm->cgroup_path);
+	scm->cgroup_path = NULL;
 }
 #else
 static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
@@ -172,6 +183,10 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 		put_cmsg(msg, SOL_SOCKET, SCM_AUDIT, sizeof(uaudits), &uaudits);
 		put_cmsg(msg, SOL_SOCKET, SCM_PROCINFO, scm->procinfo.len,
 				 scm->procinfo.procinfo);
+		if (scm->cgroup_path) {
+			put_cmsg(msg, SOL_SOCKET, SCM_CGROUP,
+				 strlen(scm->cgroup_path), scm->cgroup_path);
+		}
 	}
 
 	scm_destroy_cred(scm);
diff --git a/net/core/scm.c b/net/core/scm.c
index 4accb07..78e206a 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -404,3 +404,21 @@ out:
 	return res;
 }
 EXPORT_SYMBOL(scm_get_current_procinfo);
+
+int scm_get_current_cgroup_path(char **cgroup_path)
+{
+	int ret = 0;
+
+	*cgroup_path = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!(*cgroup_path))
+		return -ENOMEM;
+
+	ret = task_cgroup_path(current, *cgroup_path, PATH_MAX);
+	if (ret < 0) {
+		kfree(*cgroup_path);
+		*cgroup_path = NULL;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(scm_get_current_cgroup_path);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 35ab97f0..b04f55e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1364,6 +1364,7 @@ static void unix_destruct_scm(struct sk_buff *skb)
 	if (UNIXCB(skb).scm) {
 		scm.procinfo.procinfo = UNIXSCM(skb).procinfo;
 		scm.procinfo.len = UNIXSCM(skb).procinfo_len;
+		scm.cgroup_path = UNIXSCM(skb).cgroup_path;
 	}
 	if (UNIXCB(skb).fp)
 		unix_detach_fds(&scm, skb);
@@ -1440,6 +1441,14 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
 			return -ENOMEM;
 	}
 
+	UNIXSCM(skb).cgroup_path = NULL;
+	if (scm->cgroup_path) {
+		UNIXSCM(skb).cgroup_path = kstrdup(scm->cgroup_path,
+						   GFP_KERNEL);
+		if (!UNIXSCM(skb).cgroup_path)
+			return -ENOMEM;
+	}
+
 	skb->destructor = unix_destruct_scm;
 	return err;
 }
@@ -1463,6 +1472,7 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
 		UNIXSCM(skb).sessionid = audit_get_sessionid(current);
 		UNIXSCM(skb).procinfo_len = scm_get_current_procinfo(
 			&UNIXSCM(skb).procinfo);
+		scm_get_current_cgroup_path(&UNIXSCM(skb).cgroup_path);
 	}
 }
 
@@ -1866,6 +1876,11 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 						 GFP_KERNEL),
 					 UNIXSCM(skb).procinfo_len);
 		}
+		if (UNIXSCM(skb).cgroup_path) {
+			scm_set_cgroup_path(siocb->scm,
+					    kstrdup(UNIXSCM(skb).cgroup_path,
+						    GFP_KERNEL));
+		}
 	}
 	unix_set_secdata(siocb->scm, skb);
 
@@ -2057,6 +2072,11 @@ again:
 						GFP_KERNEL),
 						UNIXSCM(skb).procinfo_len);
 				}
+				if (UNIXSCM(skb).cgroup_path) {
+					scm_set_cgroup_path(siocb->scm,
+							    kstrdup(UNIXSCM(skb).cgroup_path,
+							    GFP_KERNEL));
+				}
 			}
 			check_creds = 1;
 		}
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 2/3] Send comm and cmdline in SCM_PROCINFO
From: Jan Kaluza @ 2014-01-13  8:01 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: Jan Kaluza, rgb-H+wXaHxf7aLQT0dZR+AlfA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
	eparis-H+wXaHxf7aLQT0dZR+AlfA,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, tj-DgEjT+Ai2ygdnm+yROfE0A,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1389600109-30739-1-git-send-email-jkaluza-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Server-like processes in many cases need credentials and other
metadata of the peer, to decide if the calling process is allowed to
request a specific action, or the server just wants to log away this
type of information for auditing tasks.

The current practice to retrieve such process metadata is to look that
information up in procfs with the $PID received over SCM_CREDENTIALS.
This is sufficient for long-running tasks, but introduces a race which
cannot be worked around for short-lived processes; the calling
process and all the information in /proc/$PID/ may be gone before the
receiver of the socket message can look it up.

This introduces a new SCM type called SCM_PROCINFO to allow the direct
attaching of "comm" and "cmdline" to SCM, which is significantly more
efficient and will reliably avoid the race with the round-trip over
procfs.

To achieve that, a new struct called unix_skb_parms_scm had to be created,
because otherwise unix_skb_parms would be too big.

scm_get_current_procinfo is inspired by ./fs/proc/base.c.

Signed-off-by: Jan Kaluza <jkaluza-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
 include/linux/socket.h |  2 ++
 include/net/af_unix.h  | 11 +++++++--
 include/net/scm.h      | 24 +++++++++++++++++++
 net/core/scm.c         | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/unix/af_unix.c     | 57 +++++++++++++++++++++++++++++++++++++------
 5 files changed, 150 insertions(+), 9 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index eeac565..5a41f35 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -131,6 +131,8 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr
 #define SCM_CREDENTIALS 0x02		/* rw: struct ucred		*/
 #define SCM_SECURITY	0x03		/* rw: security label		*/
 #define SCM_AUDIT	0x04		/* rw: struct uaudit		*/
+#define SCM_PROCINFO	0x05	/* rw: comm + cmdline (NULL terminated
+					   array of char *) */
 
 struct ucred {
 	__u32	pid;
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 3b9d22a..05c7678 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -27,6 +27,13 @@ struct unix_address {
 	struct sockaddr_un name[0];
 };
 
+struct unix_skb_parms_scm {
+	kuid_t loginuid;
+	unsigned int sessionid;
+	char *procinfo;
+	int procinfo_len;
+};
+
 struct unix_skb_parms {
 	struct pid		*pid;		/* Skb credentials	*/
 	kuid_t			uid;
@@ -36,12 +43,12 @@ struct unix_skb_parms {
 	u32			secid;		/* Security ID		*/
 #endif
 	u32			consumed;
-	kuid_t			loginuid;
-	unsigned int		sessionid;
+	struct unix_skb_parms_scm *scm;
 };
 
 #define UNIXCB(skb) 	(*(struct unix_skb_parms *)&((skb)->cb))
 #define UNIXSID(skb)	(&UNIXCB((skb)).secid)
+#define UNIXSCM(skb)	(*(UNIXCB((skb)).scm))
 
 #define unix_state_lock(s)	spin_lock(&unix_sk(s)->lock)
 #define unix_state_unlock(s)	spin_unlock(&unix_sk(s)->lock)
diff --git a/include/net/scm.h b/include/net/scm.h
index 67de64f..f084e19 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -30,11 +30,17 @@ struct scm_fp_list {
 	struct file		*fp[SCM_MAX_FD];
 };
 
+struct scm_procinfo {
+	char *procinfo;
+	int len;
+};
+
 struct scm_cookie {
 	struct pid		*pid;		/* Skb credentials */
 	struct scm_fp_list	*fp;		/* Passed files		*/
 	struct scm_creds	creds;		/* Skb credentials	*/
 	struct scm_audit	audit;		/* Skb audit	*/
+	struct scm_procinfo	procinfo;	/* Skb procinfo */
 #ifdef CONFIG_SECURITY_NETWORK
 	u32			secid;		/* Passed security ID 	*/
 #endif
@@ -45,6 +51,7 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm);
 int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm);
 void __scm_destroy(struct scm_cookie *scm);
 struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl);
+int scm_get_current_procinfo(char **procinfo);
 
 #ifdef CONFIG_SECURITY_NETWORK
 static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
@@ -72,10 +79,20 @@ static inline void scm_set_audit(struct scm_cookie *scm,
 	scm->audit.sessionid = sessionid;
 }
 
+static inline void scm_set_procinfo(struct scm_cookie *scm,
+				    char *procinfo, int len)
+{
+	scm->procinfo.procinfo = procinfo;
+	scm->procinfo.len = len;
+}
+
 static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
 {
 	put_pid(scm->pid);
 	scm->pid  = NULL;
+	kfree(scm->procinfo.procinfo);
+	scm->procinfo.procinfo = NULL;
+	scm->procinfo.len = 0;
 }
 
 static __inline__ void scm_destroy(struct scm_cookie *scm)
@@ -88,6 +105,8 @@ static __inline__ void scm_destroy(struct scm_cookie *scm)
 static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
 			       struct scm_cookie *scm, bool forcecreds)
 {
+	char *procinfo;
+	int len;
 	memset(scm, 0, sizeof(*scm));
 	scm->creds.uid = INVALID_UID;
 	scm->creds.gid = INVALID_GID;
@@ -96,6 +115,9 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
 			     current_gid());
 		scm_set_audit(scm, audit_get_loginuid(current),
 			      audit_get_sessionid(current));
+		len = scm_get_current_procinfo(&procinfo);
+		if (len > 0)
+			scm_set_procinfo(scm, procinfo, len);
 	}
 	unix_get_peersec_dgram(sock, scm);
 	if (msg->msg_controllen <= 0)
@@ -148,6 +170,8 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 		};
 		put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
 		put_cmsg(msg, SOL_SOCKET, SCM_AUDIT, sizeof(uaudits), &uaudits);
+		put_cmsg(msg, SOL_SOCKET, SCM_PROCINFO, scm->procinfo.len,
+				 scm->procinfo.procinfo);
 	}
 
 	scm_destroy_cred(scm);
diff --git a/net/core/scm.c b/net/core/scm.c
index b442e7e..4accb07 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -339,3 +339,68 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 	return new_fpl;
 }
 EXPORT_SYMBOL(scm_fp_dup);
+
+int scm_get_current_procinfo(char **procinfo)
+{
+	int res = 0;
+	unsigned int len;
+	char *buffer = NULL;
+	struct mm_struct *mm;
+	int comm_len = strlen(current->comm);
+
+	*procinfo = NULL;
+
+	buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	mm = get_task_mm(current);
+	if (!mm)
+		goto out;
+	if (!mm->arg_end)
+		goto out_mm;    /* Shh! No looking before we're done */
+
+	len = mm->arg_end - mm->arg_start;
+
+	if (len > PAGE_SIZE)
+		len = PAGE_SIZE;
+
+	res = access_process_vm(current, mm->arg_start, buffer, len, 0);
+
+	/* If the nul at the end of args has been overwritten, then
+	 * assume application is using setproctitle(3).
+	 */
+	if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
+		len = strnlen(buffer, res);
+		if (len < res) {
+			res = len;
+		} else {
+			len = mm->env_end - mm->env_start;
+			if (len > PAGE_SIZE - res)
+				len = PAGE_SIZE - res;
+			res += access_process_vm(current, mm->env_start,
+						 buffer+res, len, 0);
+			res = strnlen(buffer, res);
+		}
+	}
+
+	/* strlen(comm) + \0 + len of cmdline */
+	len = comm_len + 1 + res;
+	*procinfo = kmalloc(len, GFP_KERNEL);
+	if (!*procinfo) {
+		res = -ENOMEM;
+		goto out_mm;
+	}
+
+	memcpy(*procinfo, current->comm, comm_len + 1); /* include \0 */
+	if (res > 0)
+		memcpy(*procinfo + comm_len + 1, buffer, res);
+	res = len;
+
+out_mm:
+	mmput(mm);
+out:
+	kfree(buffer);
+	return res;
+}
+EXPORT_SYMBOL(scm_get_current_procinfo);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index bc02a25..35ab97f0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1361,9 +1361,14 @@ static void unix_destruct_scm(struct sk_buff *skb)
 	struct scm_cookie scm;
 	memset(&scm, 0, sizeof(scm));
 	scm.pid  = UNIXCB(skb).pid;
+	if (UNIXCB(skb).scm) {
+		scm.procinfo.procinfo = UNIXSCM(skb).procinfo;
+		scm.procinfo.len = UNIXSCM(skb).procinfo_len;
+	}
 	if (UNIXCB(skb).fp)
 		unix_detach_fds(&scm, skb);
 
+	kfree(UNIXCB(skb).scm);
 	/* Alas, it calls VFS */
 	/* So fscking what? fput() had been SMP-safe since the last Summer */
 	scm_destroy(&scm);
@@ -1410,15 +1415,31 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
 {
 	int err = 0;
 
+	if (!UNIXCB(skb).scm) {
+		UNIXCB(skb).scm = kmalloc(sizeof(struct unix_skb_parms_scm),
+					  GFP_KERNEL);
+		if (!UNIXCB(skb).scm)
+			return -ENOMEM;
+	}
+
 	UNIXCB(skb).pid  = get_pid(scm->pid);
 	UNIXCB(skb).uid = scm->creds.uid;
 	UNIXCB(skb).gid = scm->creds.gid;
-	UNIXCB(skb).loginuid = scm->audit.loginuid;
-	UNIXCB(skb).sessionid = scm->audit.sessionid;
+	UNIXSCM(skb).loginuid = scm->audit.loginuid;
+	UNIXSCM(skb).sessionid = scm->audit.sessionid;
 	UNIXCB(skb).fp = NULL;
 	if (scm->fp && send_fds)
 		err = unix_attach_fds(scm, skb);
 
+	UNIXSCM(skb).procinfo = NULL;
+	if (scm->procinfo.procinfo) {
+		UNIXSCM(skb).procinfo_len = scm->procinfo.len;
+		UNIXSCM(skb).procinfo = kmemdup(scm->procinfo.procinfo,
+					scm->procinfo.len, GFP_KERNEL);
+		if (!UNIXSCM(skb).procinfo)
+			return -ENOMEM;
+	}
+
 	skb->destructor = unix_destruct_scm;
 	return err;
 }
@@ -1438,8 +1459,10 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
 	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
-		UNIXCB(skb).loginuid = audit_get_loginuid(current);
-		UNIXCB(skb).sessionid = audit_get_sessionid(current);
+		UNIXSCM(skb).loginuid = audit_get_loginuid(current);
+		UNIXSCM(skb).sessionid = audit_get_sessionid(current);
+		UNIXSCM(skb).procinfo_len = scm_get_current_procinfo(
+			&UNIXSCM(skb).procinfo);
 	}
 }
 
@@ -1833,7 +1856,17 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
 	}
 	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
-	scm_set_audit(siocb->scm, UNIXCB(skb).loginuid, UNIXCB(skb).sessionid);
+	if (UNIXCB(skb).scm) {
+		scm_set_audit(siocb->scm, UNIXSCM(skb).loginuid,
+			      UNIXSCM(skb).sessionid);
+		if (UNIXSCM(skb).procinfo) {
+			scm_set_procinfo(siocb->scm,
+					 kmemdup(UNIXSCM(skb).procinfo,
+						 UNIXSCM(skb).procinfo_len,
+						 GFP_KERNEL),
+					 UNIXSCM(skb).procinfo_len);
+		}
+	}
 	unix_set_secdata(siocb->scm, skb);
 
 	if (!(flags & MSG_PEEK)) {
@@ -2013,8 +2046,18 @@ again:
 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
 			/* Copy credentials */
 			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
-			scm_set_audit(siocb->scm, UNIXCB(skb).loginuid,
-				      UNIXCB(skb).sessionid);
+			if (UNIXCB(skb).scm) {
+				scm_set_audit(siocb->scm,
+					      UNIXSCM(skb).loginuid,
+					      UNIXSCM(skb).sessionid);
+				if (UNIXSCM(skb).procinfo) {
+					scm_set_procinfo(siocb->scm,
+						kmemdup(UNIXSCM(skb).procinfo,
+						UNIXSCM(skb).procinfo_len,
+						GFP_KERNEL),
+						UNIXSCM(skb).procinfo_len);
+				}
+			}
 			check_creds = 1;
 		}
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 1/3] Send loginuid and sessionid in SCM_AUDIT
From: Jan Kaluza @ 2014-01-13  8:01 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: Jan Kaluza, rgb-H+wXaHxf7aLQT0dZR+AlfA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
	eparis-H+wXaHxf7aLQT0dZR+AlfA,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, tj-DgEjT+Ai2ygdnm+yROfE0A,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1389600109-30739-1-git-send-email-jkaluza-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Server-like processes in many cases need credentials and other
metadata of the peer, to decide if the calling process is allowed to
request a specific action, or the server just wants to log away this
type of information for auditing tasks.

The current practice to retrieve such process metadata is to look that
information up in procfs with the $PID received over SCM_CREDENTIALS.
This is sufficient for long-running tasks, but introduces a race which
cannot be worked around for short-lived processes; the calling
process and all the information in /proc/$PID/ may be gone before the
receiver of the socket message can look it up.

This introduces a new SCM type called SCM_AUDIT to allow the direct
attaching of "loginuid" and "sessionid" to SCM, which is significantly more
efficient and will reliably avoid the race with the round-trip over
procfs.

Signed-off-by: Jan Kaluza <jkaluza-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
 include/linux/socket.h |  6 ++++++
 include/net/af_unix.h  |  2 ++
 include/net/scm.h      | 28 ++++++++++++++++++++++++++--
 net/unix/af_unix.c     |  7 +++++++
 4 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 5d488a6..eeac565 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -130,6 +130,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr
 #define	SCM_RIGHTS	0x01		/* rw: access rights (array of int) */
 #define SCM_CREDENTIALS 0x02		/* rw: struct ucred		*/
 #define SCM_SECURITY	0x03		/* rw: security label		*/
+#define SCM_AUDIT	0x04		/* rw: struct uaudit		*/
 
 struct ucred {
 	__u32	pid;
@@ -137,6 +138,11 @@ struct ucred {
 	__u32	gid;
 };
 
+struct uaudit {
+	__u32	loginuid;
+	__u32	sessionid;
+};
+
 /* Supported address families. */
 #define AF_UNSPEC	0
 #define AF_UNIX		1	/* Unix domain sockets 		*/
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index a175ba4..3b9d22a 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -36,6 +36,8 @@ struct unix_skb_parms {
 	u32			secid;		/* Security ID		*/
 #endif
 	u32			consumed;
+	kuid_t			loginuid;
+	unsigned int		sessionid;
 };
 
 #define UNIXCB(skb) 	(*(struct unix_skb_parms *)&((skb)->cb))
diff --git a/include/net/scm.h b/include/net/scm.h
index 262532d..67de64f 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -6,6 +6,7 @@
 #include <linux/security.h>
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
+#include <linux/audit.h>
 
 /* Well, we should have at least one descriptor open
  * to accept passed FDs 8)
@@ -18,6 +19,11 @@ struct scm_creds {
 	kgid_t	gid;
 };
 
+struct scm_audit {
+	kuid_t loginuid;
+	unsigned int sessionid;
+};
+
 struct scm_fp_list {
 	short			count;
 	short			max;
@@ -28,6 +34,7 @@ struct scm_cookie {
 	struct pid		*pid;		/* Skb credentials */
 	struct scm_fp_list	*fp;		/* Passed files		*/
 	struct scm_creds	creds;		/* Skb credentials	*/
+	struct scm_audit	audit;		/* Skb audit	*/
 #ifdef CONFIG_SECURITY_NETWORK
 	u32			secid;		/* Passed security ID 	*/
 #endif
@@ -58,6 +65,13 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm,
 	scm->creds.gid = gid;
 }
 
+static inline void scm_set_audit(struct scm_cookie *scm,
+				    kuid_t loginuid, unsigned int sessionid)
+{
+	scm->audit.loginuid = loginuid;
+	scm->audit.sessionid = sessionid;
+}
+
 static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
 {
 	put_pid(scm->pid);
@@ -77,8 +91,12 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
 	memset(scm, 0, sizeof(*scm));
 	scm->creds.uid = INVALID_UID;
 	scm->creds.gid = INVALID_GID;
-	if (forcecreds)
-		scm_set_cred(scm, task_tgid(current), current_uid(), current_gid());
+	if (forcecreds) {
+		scm_set_cred(scm, task_tgid(current), current_uid(),
+			     current_gid());
+		scm_set_audit(scm, audit_get_loginuid(current),
+			      audit_get_sessionid(current));
+	}
 	unix_get_peersec_dgram(sock, scm);
 	if (msg->msg_controllen <= 0)
 		return 0;
@@ -123,7 +141,13 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 			.uid = from_kuid_munged(current_ns, scm->creds.uid),
 			.gid = from_kgid_munged(current_ns, scm->creds.gid),
 		};
+		struct uaudit uaudits = {
+			.loginuid = from_kuid_munged(current_ns,
+						     scm->audit.loginuid),
+			.sessionid = scm->audit.sessionid,
+		};
 		put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
+		put_cmsg(msg, SOL_SOCKET, SCM_AUDIT, sizeof(uaudits), &uaudits);
 	}
 
 	scm_destroy_cred(scm);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 800ca61..bc02a25 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1413,6 +1413,8 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
 	UNIXCB(skb).pid  = get_pid(scm->pid);
 	UNIXCB(skb).uid = scm->creds.uid;
 	UNIXCB(skb).gid = scm->creds.gid;
+	UNIXCB(skb).loginuid = scm->audit.loginuid;
+	UNIXCB(skb).sessionid = scm->audit.sessionid;
 	UNIXCB(skb).fp = NULL;
 	if (scm->fp && send_fds)
 		err = unix_attach_fds(scm, skb);
@@ -1436,6 +1438,8 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
 	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
+		UNIXCB(skb).loginuid = audit_get_loginuid(current);
+		UNIXCB(skb).sessionid = audit_get_sessionid(current);
 	}
 }
 
@@ -1829,6 +1833,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
 	}
 	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
+	scm_set_audit(siocb->scm, UNIXCB(skb).loginuid, UNIXCB(skb).sessionid);
 	unix_set_secdata(siocb->scm, skb);
 
 	if (!(flags & MSG_PEEK)) {
@@ -2008,6 +2013,8 @@ again:
 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
 			/* Copy credentials */
 			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
+			scm_set_audit(siocb->scm, UNIXCB(skb).loginuid,
+				      UNIXCB(skb).sessionid);
 			check_creds = 1;
 		}
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v4 0/3] Send audit/procinfo/cgroup data in socket-level control message
From: Jan Kaluza @ 2014-01-13  8:01 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: Jan Kaluza, rgb-H+wXaHxf7aLQT0dZR+AlfA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, LKML,
	eparis-H+wXaHxf7aLQT0dZR+AlfA,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, tj-DgEjT+Ai2ygdnm+yROfE0A,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1377614400-27122-1-git-send-email-jkaluza-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Hi,

this patchset against net-next (applies also to linux-next) adds 3 new types
of "Socket"-level control message (SCM_AUDIT, SCM_PROCINFO and SCM_CGROUP).

Server-like processes in many cases need credentials and other
metadata of the peer, to decide if the calling process is allowed to
request a specific action, or the server just wants to log away this
type of information for auditing tasks.

The current practice to retrieve such process metadata is to look that
information up in procfs with the $PID received over SCM_CREDENTIALS.
This is sufficient for long-running tasks, but introduces a race which
cannot be worked around for short-lived processes; the calling
process and all the information in /proc/$PID/ may be gone before the
receiver of the socket message can look it up.

Changes introduced in this patchset can also increase the performance
of such server-like processes, because the current way of opening and
parsing /proc/$PID/* files is much more expensive than receiving this
metadata using SCM.

Changes in v4:
- Rebased to work with the latest net-next tree

Changes in v3:
- Better description of patches (Thanks to Kay Sievers)

Changes in v2:
- use PATH_MAX instead of PAGE_SIZE in SCM_CGROUP patch
- describe each patch individually

Jan Kaluza (3):
  Send loginuid and sessionid in SCM_AUDIT
  Send comm and cmdline in SCM_PROCINFO
  Send cgroup_path in SCM_CGROUP

 include/linux/socket.h |  9 ++++++
 include/net/af_unix.h  | 10 ++++++
 include/net/scm.h      | 67 ++++++++++++++++++++++++++++++++++++++--
 net/core/scm.c         | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/unix/af_unix.c     | 70 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 237 insertions(+), 2 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH net-next 0/3] bonding: cleanup bond_3ad.c
From: Veaceslav Falico @ 2014-01-13  7:57 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, fubar, andy
In-Reply-To: <20140112.144454.2121293412351121018.davem@davemloft.net>

On Sun, Jan 12, 2014 at 02:44:54PM -0800, David Miller wrote:
>From: David Miller <davem@davemloft.net>
>Date: Sun, 12 Jan 2014 14:36:21 -0800 (PST)
>
>> I'll apply them again, thanks for noticing.
>
>They should really be there now, let me know if there are any
>problems.

Yep, they're there already, thank you, and sorry for bugging :).

^ permalink raw reply

* [PATCH net-next 4/4] flowcache: Bring net/core/flow.c under IPsec maintain scope
From: Fan Du @ 2014-01-13  7:49 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev
In-Reply-To: <1389599348-5214-1-git-send-email-fan.du@windriver.com>

The flow cache is mainly manipulated from IPsec, so list
net/core/flow.c under the IPsec entry in MAINTAINERS.

Signed-off-by: Fan Du <fan.du@windriver.com>
---
 MAINTAINERS |    1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index e11d495..14ad385 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5916,6 +5916,7 @@ L:	netdev@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git
 S:	Maintained
+F:	net/core/flow.c
 F:	net/xfrm/
 F:	net/key/
 F:	net/ipv4/xfrm*
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH net-next 3/4] flowcache: Fixup flow cache part in xfrm policy
From: Fan Du @ 2014-01-13  7:49 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev
In-Reply-To: <1389599348-5214-1-git-send-email-fan.du@windriver.com>

Bumping the flow cache genid and flushing the flow cache should also
be done on a per-net-namespace basis.

Signed-off-by: Fan Du <fan.du@windriver.com>
---
 net/xfrm/xfrm_policy.c |    7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index e205c4b..d39c90f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -661,7 +661,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 		hlist_add_head(&policy->bydst, chain);
 	xfrm_pol_hold(policy);
 	net->xfrm.policy_count[dir]++;
-	atomic_inc(&flow_cache_genid);
+	atomic_inc(&net->xfrm.flow_cache_genid);
 
 	/* After previous checking, family can either be AF_INET or AF_INET6 */
 	if (policy->family == AF_INET)
@@ -2567,14 +2567,14 @@ static void __xfrm_garbage_collect(struct net *net)
 
 void xfrm_garbage_collect(struct net *net)
 {
-	flow_cache_flush();
+	flow_cache_flush(net);
 	__xfrm_garbage_collect(net);
 }
 EXPORT_SYMBOL(xfrm_garbage_collect);
 
 static void xfrm_garbage_collect_deferred(struct net *net)
 {
-	flow_cache_flush_deferred();
+	flow_cache_flush_deferred(net);
 	__xfrm_garbage_collect(net);
 }
 
@@ -2947,6 +2947,7 @@ static int __net_init xfrm_net_init(struct net *net)
 	spin_lock_init(&net->xfrm.xfrm_policy_sk_bundle_lock);
 	mutex_init(&net->xfrm.xfrm_cfg_mutex);
 
+	flow_cache_init(net);
 	return 0;
 
 out_sysctl:
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH net-next 2/4] flowcache: Make flowcache entry inserting/flushing in per-net style
From: Fan Du @ 2014-01-13  7:49 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev
In-Reply-To: <1389599348-5214-1-git-send-email-fan.du@windriver.com>

Inserting an entry into the flow cache, or flushing the flow cache, should
be done on a per-net-namespace basis. The reason is that, in the original
implementation, a flush triggered from a fat netns crammed with flow entries
also makes the entries of a slim netns with only a few flow cache entries
go away.

Signed-off-by: Fan Du <fan.du@windriver.com>
---
 include/net/flow.h |    5 ++-
 net/core/flow.c    |  127 +++++++++++++++++++++++-----------------------------
 2 files changed, 60 insertions(+), 72 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index d23e7fa..bee3741 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -218,9 +218,10 @@ struct flow_cache_object *flow_cache_lookup(struct net *net,
 					    const struct flowi *key, u16 family,
 					    u8 dir, flow_resolve_t resolver,
 					    void *ctx);
+int flow_cache_init(struct net *net);
 
-void flow_cache_flush(void);
-void flow_cache_flush_deferred(void);
+void flow_cache_flush(struct net *net);
+void flow_cache_flush_deferred(struct net *net);
 extern atomic_t flow_cache_genid;
 
 #endif
diff --git a/net/core/flow.c b/net/core/flow.c
index dfa602c..344a184 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -24,6 +24,7 @@
 #include <net/flow.h>
 #include <linux/atomic.h>
 #include <linux/security.h>
+#include <net/net_namespace.h>
 
 struct flow_cache_entry {
 	union {
@@ -38,37 +39,12 @@ struct flow_cache_entry {
 	struct flow_cache_object	*object;
 };
 
-struct flow_cache_percpu {
-	struct hlist_head		*hash_table;
-	int				hash_count;
-	u32				hash_rnd;
-	int				hash_rnd_recalc;
-	struct tasklet_struct		flush_tasklet;
-};
-
 struct flow_flush_info {
 	struct flow_cache		*cache;
 	atomic_t			cpuleft;
 	struct completion		completion;
 };
 
-struct flow_cache {
-	u32				hash_shift;
-	struct flow_cache_percpu __percpu *percpu;
-	struct notifier_block		hotcpu_notifier;
-	int				low_watermark;
-	int				high_watermark;
-	struct timer_list		rnd_timer;
-};
-
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-EXPORT_SYMBOL(flow_cache_genid);
-static struct flow_cache flow_cache_global;
-static struct kmem_cache *flow_cachep __read_mostly;
-
-static DEFINE_SPINLOCK(flow_cache_gc_lock);
-static LIST_HEAD(flow_cache_gc_list);
-
 #define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
 #define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
 
@@ -84,46 +60,50 @@ static void flow_cache_new_hashrnd(unsigned long arg)
 	add_timer(&fc->rnd_timer);
 }
 
-static int flow_entry_valid(struct flow_cache_entry *fle)
+static int flow_entry_valid(struct flow_cache_entry *fle,
+				struct netns_xfrm *xfrm)
 {
-	if (atomic_read(&flow_cache_genid) != fle->genid)
+	if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
 		return 0;
 	if (fle->object && !fle->object->ops->check(fle->object))
 		return 0;
 	return 1;
 }
 
-static void flow_entry_kill(struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache_entry *fle,
+				struct netns_xfrm *xfrm)
 {
 	if (fle->object)
 		fle->object->ops->delete(fle->object);
-	kmem_cache_free(flow_cachep, fle);
+	kmem_cache_free(xfrm->flow_cachep, fle);
 }
 
 static void flow_cache_gc_task(struct work_struct *work)
 {
 	struct list_head gc_list;
 	struct flow_cache_entry *fce, *n;
+	struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+						flow_cache_gc_work);
 
 	INIT_LIST_HEAD(&gc_list);
-	spin_lock_bh(&flow_cache_gc_lock);
-	list_splice_tail_init(&flow_cache_gc_list, &gc_list);
-	spin_unlock_bh(&flow_cache_gc_lock);
+	spin_lock_bh(&xfrm->flow_cache_gc_lock);
+	list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
+	spin_unlock_bh(&xfrm->flow_cache_gc_lock);
 
 	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
-		flow_entry_kill(fce);
+		flow_entry_kill(fce, xfrm);
 }
-static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
 
 static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
-				     int deleted, struct list_head *gc_list)
+				     int deleted, struct list_head *gc_list,
+				     struct netns_xfrm *xfrm)
 {
 	if (deleted) {
 		fcp->hash_count -= deleted;
-		spin_lock_bh(&flow_cache_gc_lock);
-		list_splice_tail(gc_list, &flow_cache_gc_list);
-		spin_unlock_bh(&flow_cache_gc_lock);
-		schedule_work(&flow_cache_gc_work);
+		spin_lock_bh(&xfrm->flow_cache_gc_lock);
+		list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
+		spin_unlock_bh(&xfrm->flow_cache_gc_lock);
+		schedule_work(&xfrm->flow_cache_gc_work);
 	}
 }
 
@@ -135,6 +115,8 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 	struct hlist_node *tmp;
 	LIST_HEAD(gc_list);
 	int i, deleted = 0;
+	struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+						flow_cache_global);
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		int saved = 0;
@@ -142,7 +124,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 		hlist_for_each_entry_safe(fle, tmp,
 					  &fcp->hash_table[i], u.hlist) {
 			if (saved < shrink_to &&
-			    flow_entry_valid(fle)) {
+			    flow_entry_valid(fle, xfrm)) {
 				saved++;
 			} else {
 				deleted++;
@@ -152,7 +134,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 		}
 	}
 
-	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+	flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
 }
 
 static void flow_cache_shrink(struct flow_cache *fc,
@@ -208,7 +190,7 @@ struct flow_cache_object *
 flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
 		  flow_resolve_t resolver, void *ctx)
 {
-	struct flow_cache *fc = &flow_cache_global;
+	struct flow_cache *fc = &net->xfrm.flow_cache_global;
 	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, *tfle;
 	struct flow_cache_object *flo;
@@ -248,7 +230,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
 		if (fcp->hash_count > fc->high_watermark)
 			flow_cache_shrink(fc, fcp);
 
-		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
+		fle = kmem_cache_alloc(net->xfrm.flow_cachep, GFP_ATOMIC);
 		if (fle) {
 			fle->net = net;
 			fle->family = family;
@@ -258,7 +240,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
 			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
 			fcp->hash_count++;
 		}
-	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+	} else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
 		flo = fle->object;
 		if (!flo)
 			goto ret_object;
@@ -279,7 +261,7 @@ nocache:
 	}
 	flo = resolver(net, key, family, dir, flo, ctx);
 	if (fle) {
-		fle->genid = atomic_read(&flow_cache_genid);
+		fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
 		if (!IS_ERR(flo))
 			fle->object = flo;
 		else
@@ -303,12 +285,14 @@ static void flow_cache_flush_tasklet(unsigned long data)
 	struct hlist_node *tmp;
 	LIST_HEAD(gc_list);
 	int i, deleted = 0;
+	struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+						flow_cache_global);
 
 	fcp = this_cpu_ptr(fc->percpu);
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		hlist_for_each_entry_safe(fle, tmp,
 					  &fcp->hash_table[i], u.hlist) {
-			if (flow_entry_valid(fle))
+			if (flow_entry_valid(fle, xfrm))
 				continue;
 
 			deleted++;
@@ -317,7 +301,7 @@ static void flow_cache_flush_tasklet(unsigned long data)
 		}
 	}
 
-	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+	flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
 
 	if (atomic_dec_and_test(&info->cpuleft))
 		complete(&info->completion);
@@ -351,10 +335,9 @@ static void flow_cache_flush_per_cpu(void *data)
 	tasklet_schedule(tasklet);
 }
 
-void flow_cache_flush(void)
+void flow_cache_flush(struct net *net)
 {
 	struct flow_flush_info info;
-	static DEFINE_MUTEX(flow_flush_sem);
 	cpumask_var_t mask;
 	int i, self;
 
@@ -365,8 +348,8 @@ void flow_cache_flush(void)
 
 	/* Don't want cpus going down or up during this. */
 	get_online_cpus();
-	mutex_lock(&flow_flush_sem);
-	info.cache = &flow_cache_global;
+	mutex_lock(&net->xfrm.flow_flush_sem);
+	info.cache = &net->xfrm.flow_cache_global;
 	for_each_online_cpu(i)
 		if (!flow_cache_percpu_empty(info.cache, i))
 			cpumask_set_cpu(i, mask);
@@ -386,21 +369,23 @@ void flow_cache_flush(void)
 	wait_for_completion(&info.completion);
 
 done:
-	mutex_unlock(&flow_flush_sem);
+	mutex_unlock(&net->xfrm.flow_flush_sem);
 	put_online_cpus();
 	free_cpumask_var(mask);
 }
 
 static void flow_cache_flush_task(struct work_struct *work)
 {
-	flow_cache_flush();
-}
+	struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+						flow_cache_gc_work);
+	struct net *net = container_of(xfrm, struct net, xfrm);
 
-static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
+	flow_cache_flush(net);
+}
 
-void flow_cache_flush_deferred(void)
+void flow_cache_flush_deferred(struct net *net)
 {
-	schedule_work(&flow_cache_flush_work);
+	schedule_work(&net->xfrm.flow_cache_flush_work);
 }
 
 static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
@@ -425,7 +410,8 @@ static int flow_cache_cpu(struct notifier_block *nfb,
 			  unsigned long action,
 			  void *hcpu)
 {
-	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+	struct flow_cache *fc = container_of(nfb, struct flow_cache,
+						hotcpu_notifier);
 	int res, cpu = (unsigned long) hcpu;
 	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
 
@@ -444,9 +430,20 @@ static int flow_cache_cpu(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static int __init flow_cache_init(struct flow_cache *fc)
+int flow_cache_init(struct net *net)
 {
 	int i;
+	struct flow_cache *fc = &net->xfrm.flow_cache_global;
+
+	/* Initialize per-net flow cache global variables here */
+	net->xfrm.flow_cachep = kmem_cache_create("flow_cache",
+					sizeof(struct flow_cache_entry),
+					0, SLAB_PANIC, NULL);
+	spin_lock_init(&net->xfrm.flow_cache_gc_lock);
+	INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
+	INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
+	INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
+	mutex_init(&net->xfrm.flow_flush_sem);
 
 	fc->hash_shift = 10;
 	fc->low_watermark = 2 * flow_cache_hash_size(fc);
@@ -484,14 +481,4 @@ err:
 
 	return -ENOMEM;
 }
-
-static int __init flow_cache_init_global(void)
-{
-	flow_cachep = kmem_cache_create("flow_cache",
-					sizeof(struct flow_cache_entry),
-					0, SLAB_PANIC, NULL);
-
-	return flow_cache_init(&flow_cache_global);
-}
-
-module_init(flow_cache_init_global);
+EXPORT_SYMBOL(flow_cache_init);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH net-next 1/4] flowcache: Namespacify flowcache global parameters with xfrm
From: Fan Du @ 2014-01-13  7:49 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev
In-Reply-To: <1389599348-5214-1-git-send-email-fan.du@windriver.com>

Since the flow cache is tightly coupled with IPsec, it is easiest to
put the flow cache global parameters into the xfrm part of the
network namespace.

Signed-off-by: Fan Du <fan.du@windriver.com>
---
 include/net/netns/xfrm.h |   11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 1006a26..52d0086 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -6,6 +6,7 @@
 #include <linux/workqueue.h>
 #include <linux/xfrm.h>
 #include <net/dst_ops.h>
+#include <net/flowcache.h>
 
 struct ctl_table_header;
 
@@ -61,6 +62,16 @@ struct netns_xfrm {
 	spinlock_t xfrm_policy_sk_bundle_lock;
 	rwlock_t xfrm_policy_lock;
 	struct mutex xfrm_cfg_mutex;
+
+	/* flow cache part */
+	struct flow_cache	flow_cache_global;
+	struct kmem_cache	*flow_cachep;
+	atomic_t		flow_cache_genid;
+	struct list_head	flow_cache_gc_list;
+	spinlock_t		flow_cache_gc_lock;
+	struct work_struct	flow_cache_gc_work;
+	struct work_struct	flow_cache_flush_work;
+	struct mutex		flow_flush_sem;
 };
 
 #endif
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH net-next 0/4] Make flow cache name space aware
From: Fan Du @ 2014-01-13  7:49 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev

Hi,

This patch set aims to make the flow cache operate in a per-net style
when inserting a flow cache entry or flushing the flow cache. The reason
for doing so is not very compelling but reasonable: flushing the flow
cache in the original implementation has a global effect, and the
collateral damage is that a netns with only a few flow cache entries
loses them as well.

So this patch set makes the flow cache run in a per-net scope. Operations
from different netns won't interfere with each other, and a flush
only affects the netns it is intended for.

Fan Du (4):
  flowcache: Namespacify flowcache global parameters with xfrm
  flowcache: Make flowcache entry inserting/flushing in per-net style
  flowcache: fixup flow cache part in xfrm policy
  flowcache: Bring net/core/flow.c under IPsec maintain scope

 MAINTAINERS              |    1 +
 include/net/flow.h       |    5 +-
 include/net/netns/xfrm.h |   11 ++++
 net/core/flow.c          |  127 +++++++++++++++++++++-------------------------
 net/xfrm/xfrm_policy.c   |    7 +--
 5 files changed, 76 insertions(+), 75 deletions(-)

-- 
1.7.9.5

^ permalink raw reply

* [PATCHv2 net-next 5/5] xfrm: Don't prohibit AH from using ESN feature
From: Fan Du @ 2014-01-13  7:48 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev
In-Reply-To: <1389599324-5174-1-git-send-email-fan.du@windriver.com>

Relax the check applied when a user tries to use ESN through the netlink
key manager so that AH is accepted, as only ESP and AH support the ESN
feature according to the RFC.

Signed-off-by: Fan Du <fan.du@windriver.com>
---
 net/xfrm/xfrm_user.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 97681a3..dbd287d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -142,7 +142,8 @@ static inline int verify_replay(struct xfrm_usersa_info *p,
 	if (!rt)
 		return 0;
 
-	if (p->id.proto != IPPROTO_ESP)
+	/* As only ESP and AH support ESN feature. */
+	if ((p->id.proto != IPPROTO_ESP) && (p->id.proto != IPPROTO_AH))
 		return -EINVAL;
 
 	if (p->replay_window != 0)
-- 
1.7.9.5

^ permalink raw reply related

* [PATCHv2 net-next 4/5] {IPv6,xfrm} Add ESN support for AH ingress part
From: Fan Du @ 2014-01-13  7:48 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev
In-Reply-To: <1389599324-5174-1-git-send-email-fan.du@windriver.com>

This patch adds ESN support to the AH input stage by attaching the upper
32 bits of the sequence number right after the packet payload, as
specified by RFC 4302.

The ICV value then guards the upper 32 bits of the sequence number as
well when a packet comes in.

Signed-off-by: Fan Du <fan.du@windriver.com>
---
 net/ipv6/ah6.c |   28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 87826a5..7f5c822 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -532,6 +532,10 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
 	int nexthdr;
 	int nfrags;
 	int err = -ENOMEM;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
 
 	if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
 		goto out;
@@ -568,14 +572,22 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb_push(skb, hdr_len);
 
-	work_iph = ah_alloc_tmp(ahash, nfrags, hdr_len + ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
+
+	work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len +
+				ahp->icv_trunc_len + seqhi_len);
 	if (!work_iph)
 		goto out;
 
-	auth_data = ah_tmp_auth(work_iph, hdr_len);
-	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+	auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len);
+	seqhi = auth_data + ahp->icv_trunc_len;
+	icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
 	req = ah_tmp_req(ahash, icv);
 	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
 
 	memcpy(work_iph, ip6h, hdr_len);
 	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
@@ -593,7 +605,15 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	ahash_request_set_crypt(req, sg, icv, skb->len);
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_unmark_end(&sg[nfrags - 1]);
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_init_table(seqhisg, sglists);
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
 	ahash_request_set_callback(req, 0, ah6_input_done, skb);
 
 	AH_SKB_CB(skb)->tmp = work_iph;
-- 
1.7.9.5

^ permalink raw reply related

* [PATCHv2 net-next 3/5] {IPv6,xfrm} Add ESN support for AH egress part
From: Fan Du @ 2014-01-13  7:48 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev
In-Reply-To: <1389599324-5174-1-git-send-email-fan.du@windriver.com>

This patch adds ESN support to the AH output stage by attaching the upper
32 bits of the sequence number right after the packet payload, as
specified by RFC 4302.

The ICV value then guards the upper 32 bits of the sequence number as
well when a packet goes out.

Signed-off-by: Fan Du <fan.du@windriver.com>
---
 net/ipv6/ah6.c |   24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 81e496a..87826a5 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -346,6 +346,10 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
 	struct tmp_ext *iph_ext;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
 
 	ahp = x->data;
 	ahash = ahp->ahash;
@@ -359,15 +363,22 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
 	if (extlen)
 		extlen += sizeof(*iph_ext);
 
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
 	err = -ENOMEM;
-	iph_base = ah_alloc_tmp(ahash, nfrags, IPV6HDR_BASELEN + extlen);
+	iph_base = ah_alloc_tmp(ahash, nfrags + sglists, IPV6HDR_BASELEN +
+				extlen + seqhi_len);
 	if (!iph_base)
 		goto out;
 
 	iph_ext = ah_tmp_ext(iph_base);
-	icv = ah_tmp_icv(ahash, iph_ext, extlen);
+	seqhi = (char *)iph_ext + extlen;
+	icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
 	req = ah_tmp_req(ahash, icv);
 	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
 
 	ah = ip_auth_hdr(skb);
 	memset(ah->auth_data, 0, ahp->icv_trunc_len);
@@ -414,7 +425,14 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	ahash_request_set_crypt(req, sg, icv, skb->len);
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_unmark_end(&sg[nfrags - 1]);
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_init_table(seqhisg, sglists);
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
 	ahash_request_set_callback(req, 0, ah6_output_done, skb);
 
 	AH_SKB_CB(skb)->tmp = iph_base;
-- 
1.7.9.5

^ permalink raw reply related

* [PATCHv2 net-next 1/5] {IPv4,xfrm} Add ESN support for AH egress part
From: Fan Du @ 2014-01-13  7:48 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev
In-Reply-To: <1389599324-5174-1-git-send-email-fan.du@windriver.com>

This patch adds ESN support to the AH output stage by attaching the upper
32 bits of the sequence number right after the packet payload, as
specified by RFC 4302.

The ICV value then guards the upper 32 bits of the sequence number as
well when a packet goes out.

Signed-off-by: Fan Du <fan.du@windriver.com>
---
 net/ipv4/ah4.c |   25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 7179026..a7fac03 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -12,6 +12,7 @@
 #include <linux/scatterlist.h>
 #include <net/icmp.h>
 #include <net/protocol.h>
+#include <crypto/scatterwalk.h>
 
 struct ah_skb_cb {
 	struct xfrm_skb_cb xfrm;
@@ -155,6 +156,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct iphdr *iph, *top_iph;
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
 
 	ahp = x->data;
 	ahash = ahp->ahash;
@@ -167,14 +172,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah = ip_auth_hdr(skb);
 	ihl = ip_hdrlen(skb);
 
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
 	err = -ENOMEM;
-	iph = ah_alloc_tmp(ahash, nfrags, ihl);
+	iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
 	if (!iph)
 		goto out;
-
-	icv = ah_tmp_icv(ahash, iph, ihl);
+	seqhi = (__be32 *)((char *)iph + ihl);
+	icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
 	req = ah_tmp_req(ahash, icv);
 	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
 
 	memset(ah->auth_data, 0, ahp->icv_trunc_len);
 
@@ -213,7 +223,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	ahash_request_set_crypt(req, sg, icv, skb->len);
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_unmark_end(&sg[nfrags - 1]);
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_init_table(seqhisg, sglists);
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
 	ahash_request_set_callback(req, 0, ah_output_done, skb);
 
 	AH_SKB_CB(skb)->tmp = iph;
-- 
1.7.9.5

^ permalink raw reply related

* [PATCHv2 net-next 2/5] {IPv4,xfrm} Add ESN support for AH ingress part
From: Fan Du @ 2014-01-13  7:48 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev
In-Reply-To: <1389599324-5174-1-git-send-email-fan.du@windriver.com>

This patch adds ESN support to the AH input stage by attaching the upper
32 bits of the sequence number right after the packet payload, as
specified by RFC 4302.

The ICV value then guards the upper 32 bits of the sequence number as
well when a packet comes in.

Signed-off-by: Fan Du <fan.du@windriver.com>
---
 net/ipv4/ah4.c |   25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index a7fac03..984dbd6 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -312,6 +312,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
 	int err = -ENOMEM;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
 
 	if (!pskb_may_pull(skb, sizeof(*ah)))
 		goto out;
@@ -352,14 +356,22 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	iph = ip_hdr(skb);
 	ihl = ip_hdrlen(skb);
 
-	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
+
+	work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+				ahp->icv_trunc_len + seqhi_len);
 	if (!work_iph)
 		goto out;
 
-	auth_data = ah_tmp_auth(work_iph, ihl);
+	seqhi = (__be32 *)((char *)work_iph + ihl);
+	auth_data = ah_tmp_auth(seqhi, seqhi_len);
 	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
 	req = ah_tmp_req(ahash, icv);
 	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
 
 	memcpy(work_iph, iph, ihl);
 	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
@@ -381,7 +393,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	ahash_request_set_crypt(req, sg, icv, skb->len);
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_unmark_end(&sg[nfrags - 1]);
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_init_table(seqhisg, sglists);
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
 	ahash_request_set_callback(req, 0, ah_input_done, skb);
 
 	AH_SKB_CB(skb)->tmp = work_iph;
-- 
1.7.9.5

^ permalink raw reply related

* [PATCHv2 net-next 0/5] xfrm: Add ESN support for AH
From: Fan Du @ 2014-01-13  7:48 UTC (permalink / raw)
  To: steffen.klassert; +Cc: davem, netdev

Hi,

This is the initial Extended Sequence Number (ESN) support for AH based on IPv4.
The rationale comes entirely from RFC 4302, which states:

3.3.3.2.2.  Implicit Packet Padding and ESN

   If the ESN option is elected for an SA, then the high-order 32 bits
   of the ESN must be included in the ICV computation.  For purposes of
   ICV computation, these bits are appended (implicitly) immediately
   after the end of the payload, and before any implicit packet padding.

So we attach the high-order 32 bits as a scatterlist entry right after the
packet payload when computing the ICV value.

Test:
I added a knob in iproute2/ip/xfrm_state.c to enable ESN when setting up an
SA, which makes it possible to test both with-ESN and without-ESN scenarios.
Both cases work fine with ping using packet sizes (-s) from the default up to 32768.

v2:
  - Patch3/5 and Patch4/5 add IPv6 part as requested by Steffen.
  - Patch5/5 restrict ESN feature only to ESP and AH.

Fan Du (5):
  {IPv4,xfrm} Add ESN support for AH egress part
  {IPv4,xfrm} Add ESN support for AH ingress part
  {IPv6,xfrm} Add ESN support for AH egress part
  {IPv6,xfrm} Add ESN support for AH ingress part
  xfrm: Don't prohibit AH from using ESN feature

 net/ipv4/ah4.c       |   50 +++++++++++++++++++++++++++++++++++++++++-------
 net/ipv6/ah6.c       |   52 +++++++++++++++++++++++++++++++++++++++++++-------
 net/xfrm/xfrm_user.c |    3 ++-
 3 files changed, 90 insertions(+), 15 deletions(-)

-- 
1.7.9.5

^ permalink raw reply

* Re: [PATCH net-next v2 4/4] virtio-net: initial debugfs support, export mergeable rx buffer size
From: Jason Wang @ 2014-01-13  7:36 UTC (permalink / raw)
  To: Michael Dalton, Michael S. Tsirkin
  Cc: netdev, Eric Dumazet, David S. Miller, lf-virt
In-Reply-To: <CANJ5vP+Vu8=haCetZkcefPgR8pGe0iDZ03EZhJHLSgunCaHJsQ@mail.gmail.com>

On 01/13/2014 07:32 AM, Michael Dalton wrote:
> Hi Michael,
>
> On Sun, Jan 12, 2014 at 9:09 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> Can't we add struct attribute * to netdevice, and pass that in when
>> creating the kobj?
> I like that idea, I think that will work and should be better than
> the alternatives. The actual kobjs for RX queues (struct netdev_rx_queue)
> are allocated and deallocated by calls to net_rx_queue_update_kobjects,
> which resizes RX queue kobjects when the netdev RX queues are resized.
>
> Is this what you had in mind:
> (1) Add a pointer to an attribute group to struct net_device, used for
>     per-netdev rx queue attributes and initialized before the call to
>     register_netdevice().
> (2) Declare an attribute group containing the mergeable_rx_buffer_size
>     attribute in virtio-net, and initialize the per-netdevice group pointer
>     to the address of this group in virtnet_probe before register_netdevice
> (3) In net-sysfs, modify net_rx_queue_update_kobjects
>     (or rx_queue_add_kobject) to call sysfs_create_group on the
>     per-netdev attribute group (if non-NULL), adding the attributes in
>     the group to the RX queue kobject.
>
> That should allow us to have per-RX queue attributes that are
> device-specific. I'm not a sysfs expert, but it seems that rx_queue_ktype
> and rx_queue_sysfs_ops presume that all rx queue sysfs operations are
> performed on attributes of type rx_queue_attribute. That type will need
> to be moved from net-sysfs.c to a header file like netdevice.h so that
> the type can be used in virtio-net when we declare the
> mergeable_rx_buffer_size attribute.

There's a possible issue: the rx queue sysfs depends on CONFIG_RPS. So we
probably need a dedicated attribute just for virtio-net.
>
> The last issue is how the rx_queue_attribute 'show' function
> implementation for mergeable_rx_buffer_size will access the appropriate
> per-receive queue EWMA data. The arguments to the show function will be
> the netdev_rx_queue and the attribute itself. We can get to the
> struct net_device from the netdev_rx_queue.  If we extended
> netdev_rx_queue to indicate the queue_index or to store a void *priv_data
> pointer, that would be sufficient to allow us to resolve this issue.
>
> Please let me know if the above sounds good or if you see a better way
> to accomplish this goal. Thanks!
>
> Best,
>
> Mike

^ permalink raw reply

* RE: Use of ENOTSUPP in drivers?
From: Shahed Shaikh @ 2014-01-13  6:45 UTC (permalink / raw)
  To: Sabrina Dubroca, netdev; +Cc: Ben Hutchings
In-Reply-To: <20140112185715.GB5405@kria>

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Sabrina Dubroca
> Sent: Monday, January 13, 2014 12:27 AM
> To: netdev
> Cc: Ben Hutchings
> Subject: Use of ENOTSUPP in drivers?
> 
> 
> Thu, 2 Jan 2014 12:01:31 +0000, Ben Hutchings wrote:
> > Never return error code ENOTSUPP; it's *not* the same thing as ENOTSUP
> > in userland and is not part of the userland ABI.  I would use EINVAL
> > here.
> 
> 
> I've found a few ethernet drivers that return -ENOTSUPP in various
> functions. In particular, some ethtool functions or ioctl's.
> Ben's message makes me think that the ethtool functions and ioctl's should
> be modified.
> 
> There are other occurences, mostly in functions related to device
> initialization. I didn't manage to track down exactly from where some of
> them are called, and I don't know if ENOTSUPP is okay in these.
> 
> I've included the complete list of occurences (based on net-next) from
> drivers/net/ethernet in patch form at the end, if that's more convenient
> than the file/function list. This is not meant to be applied.
> 
> 
> Do these (or part of them) need to be patched? Or is there something I'm
> missing?
> 
> 
> Thanks,
> Sabrina
> 
> 
> ---
> 

Hi Sabrina,

With -EOPNOTSUPP instead of -ENOTSUPP, ethtool gives the expected error message.

# ethtool -w p6p1
Can not get dump level
: Operation not supported

> 
> diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
> b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
> index 03eb2ad..bb9f4ec 100644
> --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
> +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
> @@ -1641,7 +1641,7 @@ int qlcnic_83xx_loopback_test(struct net_device
> *netdev, u8 mode)
>  	if (ahw->op_mode == QLCNIC_NON_PRIV_FUNC) {
>  		netdev_warn(netdev,
>  			    "Loopback test not supported in non privileged
> mode\n");
> -		return -ENOTSUPP;
> +		return -EINVAL;

Please use -EOPNOTSUPP

>  	}
> 
>  	if (test_bit(__QLCNIC_RESETTING, &adapter->state)) { diff --git
> a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
> b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
> index 45fa6ef..727be4e 100644
> --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
> +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
> @@ -1681,7 +1681,7 @@ qlcnic_get_dump_flag(struct net_device *netdev,
> struct ethtool_dump *dump)
> 
>  	if (!fw_dump->tmpl_hdr) {
>  		netdev_err(adapter->netdev, "FW Dump not supported\n");
> -		return -ENOTSUPP;
> +		return -EINVAL;

Please use -EOPNOTSUPP

>  	}
> 
>  	if (fw_dump->clr)
> @@ -1710,7 +1710,7 @@ qlcnic_get_dump_data(struct net_device *netdev,
> struct ethtool_dump *dump,
> 
>  	if (!fw_dump->tmpl_hdr) {
>  		netdev_err(netdev, "FW Dump not supported\n");
> -		return -ENOTSUPP;
> +		return -EINVAL;

Please use -EOPNOTSUPP

>  	}
> 
>  	if (!fw_dump->clr) {
> 

Thanks,
Shahed
  

^ permalink raw reply

* Re: suspicious RCU usage in net/ipv4/ip_tunnel.c:80
From: Cong Wang @ 2014-01-13  6:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tom Herbert, netdev
In-Reply-To: <1389548697.31367.184.camel@edumazet-glaptop2.roam.corp.google.com>

On Sun, Jan 12, 2014 at 9:44 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Sat, 2014-01-11 at 11:15 -0800, Cong Wang wrote:
>> On Fri, Jan 10, 2014 at 5:21 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> >
>> > Nope, the synchronize_rcu() is not needed here.
>>
>> OK.
>>
>> >
>> > Please use following sparse ready patch, thanks :
>> >
>> > diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
>> > index d3929a69f008..6eda759b5c4b 100644
>> > --- a/net/ipv4/ip_tunnel.c
>> > +++ b/net/ipv4/ip_tunnel.c
>> > @@ -77,10 +77,11 @@ static inline void __tunnel_dst_set(struct ip_tunnel_dst *idst,
>> >                 dst = NULL;
>> >
>> >         spin_lock_bh(&idst->lock);
>> > -       old_dst = rcu_dereference(idst->dst);
>> > +       old_dst = rcu_dereference_protected(idst->dst,
>> > +                                           lockdep_is_held(&idst->lock));
>> >         rcu_assign_pointer(idst->dst, dst);
>> > -       dst_release(old_dst);
>> >         spin_unlock_bh(&idst->lock);
>> > +       dst_release(old_dst);
>>
>> Do you really need a rcu_dereference*() here? We don't dereference
>> it inside spin_lock protection.
>
> Please read rcu_dereference_protected() documentation in
> include/linux/rcupdate.h

I did before I replied.

>
> Also you can run sparse, with CONFIG_SPARSE_RCU_POINTER=y in
> your .config
>
> make C=2 net/ipv4/ip_tunnel.o
>
> And then you'll know the answer to this question.
>

Sounds like it is only there to shut up a sparse warning; in that case its
name is misleading, since we clearly don't dereference it here.

^ permalink raw reply

* Re: [PATCH net-next] bgmac: propagate error codes in bgmac_probe()
From: Rafał Miłecki @ 2014-01-13  5:58 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Network Development, David Miller, Hauke Mehrtens, Nathan Hintz,
	sd, bhutchings
In-Reply-To: <1389582355-27514-1-git-send-email-f.fainelli@gmail.com>

2014/1/13 Florian Fainelli <f.fainelli@gmail.com>:
> bgmac_mii_register() and register_netdev() both return appropriate error
> codes for the failures they would encounter, propagate this error code
> instead of overriding the value with -ENOTSUPP which is not the correct
> error code to return.
>
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Acked-by: Rafał Miłecki <zajec5@gmail.com>

-- 
Rafał

^ permalink raw reply

* Re: [PATCH net-next 3/3] packet: use percpu mmap tx frame pending refcount
From: Cong Wang @ 2014-01-13  5:51 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: David Miller, netdev
In-Reply-To: <1389543768-20234-4-git-send-email-dborkman@redhat.com>

On Sun, Jan 12, 2014 at 8:22 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
> +static void packet_inc_pending(struct packet_ring_buffer *rb)
> +{
> +       this_cpu_inc(*rb->pending_refcnt);
> +}
> +
> +static void packet_dec_pending(struct packet_ring_buffer *rb)
> +{
> +       this_cpu_dec(*rb->pending_refcnt);
> +}
> +
> +static int packet_read_pending(const struct packet_ring_buffer *rb)
> +{
> +       int i, refcnt = 0;
> +
> +       /* We don't use pending refcount in rx_ring. */
> +       if (rb->pending_refcnt == NULL)
> +               return 0;
> +
> +       for_each_possible_cpu(i)
> +               refcnt += *per_cpu_ptr(rb->pending_refcnt, i);
> +
> +       return refcnt;
> +}

How is this supposed to work? Since there is no lock,
you can't read an accurate refcnt. Take a look at lib/percpu_counter.c.

I guess for some reason you don't care about the accuracy?
Then you at least need to add a comment in the code saying so.

Thanks.

^ permalink raw reply

* Re: linux-next: manual merge of the tip tree with the net-next tree
From: Stephen Rothwell @ 2014-01-13  3:20 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Peter Zijlstra,
	David Miller, netdev
  Cc: linux-next, linux-kernel
In-Reply-To: <20140113141824.4301cb76e8e5540d42d74015@canb.auug.org.au>

[-- Attachment #1: Type: text/plain, Size: 1309 bytes --]

Hi all,

On Mon, 13 Jan 2014 14:18:24 +1100 Stephen Rothwell <sfr@canb.auug.org.au> wrote:
>
> Today's linux-next merge of the tip tree got conflicts in
> arch/arc/include/asm/Kbuild, arch/cris/include/asm/Kbuild,
> arch/hexagon/include/asm/Kbuild, arch/microblaze/include/asm/Kbuild,
> arch/parisc/include/asm/Kbuild and arch/score/include/asm/Kbuild between
> commit e3fec2f74f7f ("lib: Add missing arch generic-y entries for
> asm-generic/hash.h") from the net-next tree and commit 93ea02bb8435
> ("arch: Clean up asm/barrier.h implementations using
> asm-generic/barrier.h") from the tip tree.
> 
> I fixed it up (see below) and can carry the fix as necessary (no action
> is required).
> 
> BTW: thanks for not keeping the Kbuild files sorted :-(

I missed arch/mn10300/include/asm/Kbuild the first time round.

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

diff --cc arch/mn10300/include/asm/Kbuild
index bc42f14c9c2e,367ef399ddf7..000000000000
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@@ -1,6 -1,6 +1,7 @@@
  
++generic-y += barrier.h
  generic-y += clkdev.h
  generic-y += exec.h
++generic-y += hash.h
  generic-y += trace_clock.h
  generic-y += preempt.h
- generic-y += hash.h
 -generic-y += barrier.h

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* linux-next: manual merge of the tip tree with the net-next tree
From: Stephen Rothwell @ 2014-01-13  3:18 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Peter Zijlstra,
	David Miller, netdev
  Cc: linux-next, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 4196 bytes --]

Hi all,

Today's linux-next merge of the tip tree got conflicts in
arch/arc/include/asm/Kbuild, arch/cris/include/asm/Kbuild,
arch/hexagon/include/asm/Kbuild, arch/microblaze/include/asm/Kbuild,
arch/parisc/include/asm/Kbuild and arch/score/include/asm/Kbuild between
commit e3fec2f74f7f ("lib: Add missing arch generic-y entries for
asm-generic/hash.h") from the net-next tree and commit 93ea02bb8435
("arch: Clean up asm/barrier.h implementations using
asm-generic/barrier.h") from the tip tree.

I fixed it up (see below) and can carry the fix as necessary (no action
is required).

BTW: thanks for not keeping the Kbuild files sorted :-(

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

diff --cc arch/arc/include/asm/Kbuild
index 93e6ca919620,e07c786011af..000000000000
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@@ -1,4 -1,4 +1,5 @@@
  generic-y += auxvec.h
++generic-y += barrier.h
  generic-y += bugs.h
  generic-y += bitsperlong.h
  generic-y += clkdev.h
@@@ -11,6 -11,6 +12,7 @@@ generic-y += fcntl.
  generic-y += fb.h
  generic-y += ftrace.h
  generic-y += hardirq.h
++generic-y += hash.h
  generic-y += hw_irq.h
  generic-y += ioctl.h
  generic-y += ioctls.h
@@@ -47,4 -47,4 +49,3 @@@ generic-y += user.
  generic-y += vga.h
  generic-y += xor.h
  generic-y += preempt.h
- generic-y += hash.h
 -generic-y += barrier.h
diff --cc arch/cris/include/asm/Kbuild
index c5963b3e4624,35ec2e5ca832..000000000000
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@@ -3,8 -3,8 +3,10 @@@ header-y += arch-v10
  header-y += arch-v32/
  
  
++generic-y += barrier.h
  generic-y += clkdev.h
  generic-y += exec.h
++generic-y += hash.h
  generic-y += kvm_para.h
  generic-y += linkage.h
  generic-y += module.h
@@@ -12,4 -12,4 +14,3 @@@ generic-y += trace_clock.
  generic-y += vga.h
  generic-y += xor.h
  generic-y += preempt.h
- generic-y += hash.h
 -generic-y += barrier.h
diff --cc arch/hexagon/include/asm/Kbuild
index 469d223950ff,a614ec9747a6..000000000000
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@@ -2,6 -2,6 +2,7 @@@
  header-y += ucontext.h
  
  generic-y += auxvec.h
++generic-y += barrier.h
  generic-y += bug.h
  generic-y += bugs.h
  generic-y += clkdev.h
@@@ -15,6 -15,6 +16,7 @@@ generic-y += fb.
  generic-y += fcntl.h
  generic-y += ftrace.h
  generic-y += hardirq.h
++generic-y += hash.h
  generic-y += hw_irq.h
  generic-y += ioctl.h
  generic-y += ioctls.h
@@@ -54,4 -54,4 +56,3 @@@ generic-y += ucontext.
  generic-y += unaligned.h
  generic-y += xor.h
  generic-y += preempt.h
- generic-y += hash.h
 -generic-y += barrier.h
diff --cc arch/microblaze/include/asm/Kbuild
index 43eec338ff50,f77fb6630b11..000000000000
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@@ -1,7 -1,7 +1,8 @@@
  
++generic-y += barrier.h
  generic-y += clkdev.h
  generic-y += exec.h
++generic-y += hash.h
  generic-y += trace_clock.h
  generic-y += syscalls.h
  generic-y += preempt.h
- generic-y += hash.h
 -generic-y += barrier.h
diff --cc arch/parisc/include/asm/Kbuild
index 75edd5fcc6ff,8df06d0074f4..000000000000
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@@ -1,8 -1,8 +1,9 @@@
  
++generic-y += barrier.h
  generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \
  	  segment.h topology.h vga.h device.h percpu.h hw_irq.h mutex.h \
  	  div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \
  	  poll.h xor.h clkdev.h exec.h
++generic-y += hash.h
  generic-y += trace_clock.h
  generic-y += preempt.h
- generic-y += hash.h
 -generic-y += barrier.h
diff --cc arch/score/include/asm/Kbuild
index 099e7ba40599,ee2993b6e5d1..000000000000
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@@ -1,9 -1,8 +1,9 @@@
  
  header-y +=
  
++generic-y += barrier.h
  generic-y += clkdev.h
++generic-y += hash.h
  generic-y += trace_clock.h
  generic-y += xor.h
  generic-y += preempt.h
- generic-y += hash.h
- 
 -generic-y += barrier.h

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH 1/5] net: mvneta: increase the 64-bit rx/tx stats out of the hot path
From: Willy Tarreau @ 2014-01-13  3:06 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev, Thomas Petazzoni, Gregory CLEMENT
In-Reply-To: <1389574192.31367.194.camel@edumazet-glaptop2.roam.corp.google.com>

On Sun, Jan 12, 2014 at 04:49:52PM -0800, Eric Dumazet wrote:
> On Sun, 2014-01-12 at 10:31 +0100, Willy Tarreau wrote:
> > Better count packets and bytes in the stack and on 32 bit then
> > accumulate them at the end for once. This saves two memory writes
> > and two memory barriers per packet. The incoming packet rate was
> > increased by 4.7% on the Openblocks AX3 thanks to this.
> > 
> > Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
> > Cc: Gregory CLEMENT <gregory.clement@free-electrons.com>
> > Signed-off-by: Willy Tarreau <w@1wt.eu>
> > ---
> >  drivers/net/ethernet/marvell/mvneta.c | 15 +++++++++++----
> >  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> 
> Reviewed-by: Eric Dumazet <edumazet@google.com>
> 
> Note that with such a cost, one has to wonder why we keep 64bit stats
> for this NIC on 32bit hosts...

At least this avoids wrapping if stats are not retrieved often enough.
As someone who had to support 32-bit stats in production on a firewall
running on kernel 2.4, I can say it really becomes a problem to graph
activity if stats are not collected as often as every 30 seconds, which
is short in certain environments.

Thanks,
Willy

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox