Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH net-next v5 3/4] tunnels: advertise link netns via netlink
From: Nicolas Dichtel @ 2015-01-15 14:11 UTC (permalink / raw)
  To: netdev, containers, linux-kernel, linux-api
  Cc: davem, ebiederm, stephen, akpm, luto, cwang, Nicolas Dichtel
In-Reply-To: <1421331078-21622-1-git-send-email-nicolas.dichtel@6wind.com>

Implement rtnl_link_ops->get_link_net() callback so that IFLA_LINK_NETNSID is
added to rtnetlink messages.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 drivers/net/vxlan.c      | 8 ++++++++
 include/net/ip6_tunnel.h | 1 +
 include/net/ip_tunnels.h | 1 +
 net/ipv4/ip_gre.c        | 2 ++
 net/ipv4/ip_tunnel.c     | 8 ++++++++
 net/ipv4/ip_vti.c        | 1 +
 net/ipv4/ipip.c          | 1 +
 net/ipv6/ip6_gre.c       | 1 +
 net/ipv6/ip6_tunnel.c    | 9 +++++++++
 net/ipv6/ip6_vti.c       | 1 +
 net/ipv6/sit.c           | 1 +
 11 files changed, 34 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 6b6b45622a0a..88dbb1edea6e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2922,6 +2922,13 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static struct net *vxlan_get_link_net(const struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	return vxlan->net;
+}
+
 static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
 	.kind		= "vxlan",
 	.maxtype	= IFLA_VXLAN_MAX,
@@ -2933,6 +2940,7 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
 	.dellink	= vxlan_dellink,
 	.get_size	= vxlan_get_size,
 	.fill_info	= vxlan_fill_info,
+	.get_link_net	= vxlan_get_link_net,
 };
 
 static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 9326c41c2d7f..76c091b53dae 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -70,6 +70,7 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
 __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw);
 __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr,
 			     const struct in6_addr *raddr);
+struct net *ip6_tnl_get_link_net(const struct net_device *dev);
 
 static inline void ip6tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index ce4db3cc5647..2c47061a6954 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -141,6 +141,7 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op,
 int ip_tunnel_init(struct net_device *dev);
 void ip_tunnel_uninit(struct net_device *dev);
 void  ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
+struct net *ip_tunnel_get_link_net(const struct net_device *dev);
 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname);
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 942576e27df1..6e7727f27393 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -829,6 +829,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
 	.dellink	= ip_tunnel_dellink,
 	.get_size	= ipgre_get_size,
 	.fill_info	= ipgre_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
@@ -843,6 +844,7 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
 	.dellink	= ip_tunnel_dellink,
 	.get_size	= ipgre_get_size,
 	.fill_info	= ipgre_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static int __net_init ipgre_tap_init_net(struct net *net)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d3e447936720..2cd08280c77b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -972,6 +972,14 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
 
+struct net *ip_tunnel_get_link_net(const struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	return tunnel->net;
+}
+EXPORT_SYMBOL(ip_tunnel_get_link_net);
+
 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 				  struct rtnl_link_ops *ops, char *devname)
 {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 1a7e979e80ba..94efe148181c 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -531,6 +531,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
 	.dellink        = ip_tunnel_dellink,
 	.get_size	= vti_get_size,
 	.fill_info	= vti_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static int __init vti_init(void)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 40403114f00a..b58d6689874c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -498,6 +498,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = {
 	.dellink	= ip_tunnel_dellink,
 	.get_size	= ipip_get_size,
 	.fill_info	= ipip_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static struct xfrm_tunnel ipip_handler __read_mostly = {
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 13cda4c6313b..9306a5ff9149 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1662,6 +1662,7 @@ static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
 	.dellink	= ip6gre_dellink,
 	.get_size	= ip6gre_get_size,
 	.fill_info	= ip6gre_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
 };
 
 static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 92b3da571980..266a264ec212 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1760,6 +1760,14 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+struct net *ip6_tnl_get_link_net(const struct net_device *dev)
+{
+	struct ip6_tnl *tunnel = netdev_priv(dev);
+
+	return tunnel->net;
+}
+EXPORT_SYMBOL(ip6_tnl_get_link_net);
+
 static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_LINK]		= { .type = NLA_U32 },
 	[IFLA_IPTUN_LOCAL]		= { .len = sizeof(struct in6_addr) },
@@ -1783,6 +1791,7 @@ static struct rtnl_link_ops ip6_link_ops __read_mostly = {
 	.dellink	= ip6_tnl_dellink,
 	.get_size	= ip6_tnl_get_size,
 	.fill_info	= ip6_tnl_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
 };
 
 static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index ace10d0b3aac..5fb9e212eca8 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1016,6 +1016,7 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = {
 	.changelink	= vti6_changelink,
 	.get_size	= vti6_get_size,
 	.fill_info	= vti6_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
 };
 
 static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n)
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 213546bd6d5d..3cc197c72b59 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1763,6 +1763,7 @@ static struct rtnl_link_ops sit_link_ops __read_mostly = {
 	.get_size	= ipip6_get_size,
 	.fill_info	= ipip6_fill_info,
 	.dellink	= ipip6_dellink,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static struct xfrm_tunnel sit_handler __read_mostly = {
-- 
2.2.2

^ permalink raw reply related

* [PATCH net-next v5 2/4] rtnl: add link netns id to interface messages
From: Nicolas Dichtel @ 2015-01-15 14:11 UTC (permalink / raw)
  To: netdev, containers, linux-kernel, linux-api
  Cc: davem, ebiederm, stephen, akpm, luto, cwang, Nicolas Dichtel
In-Reply-To: <1421331078-21622-1-git-send-email-nicolas.dichtel@6wind.com>

This patch adds a new attribute (IFLA_LINK_NETNSID) which contains the 'link'
netns id when this netns is different from the netns where the interface
stands (for example for x-netns interfaces like ip tunnels).
With this attribute, it's possible to correctly interpret all advertised
information (like IFLA_LINK, etc.).

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/net/rtnetlink.h      |  2 ++
 include/uapi/linux/if_link.h |  1 +
 net/core/rtnetlink.c         | 13 +++++++++++++
 3 files changed, 16 insertions(+)

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index e21b9f9653c0..6c6d5393fc34 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -46,6 +46,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh)
  *			    to create when creating a new device.
  *	@get_num_rx_queues: Function to determine number of receive queues
  *			    to create when creating a new device.
+ *	@get_link_net: Function to get the i/o netns of the device
  */
 struct rtnl_link_ops {
 	struct list_head	list;
@@ -93,6 +94,7 @@ struct rtnl_link_ops {
 	int			(*fill_slave_info)(struct sk_buff *skb,
 						   const struct net_device *dev,
 						   const struct net_device *slave_dev);
+	struct net		*(*get_link_net)(const struct net_device *dev);
 };
 
 int __rtnl_link_register(struct rtnl_link_ops *ops);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 2a8380edbb7e..0deee3eeddbf 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -146,6 +146,7 @@ enum {
 	IFLA_PHYS_PORT_ID,
 	IFLA_CARRIER_CHANGES,
 	IFLA_PHYS_SWITCH_ID,
+	IFLA_LINK_NETNSID,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 6a6cdade1676..ab78ba9a34e8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -875,6 +875,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
 	       + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+	       + nla_total_size(4) /* IFLA_LINK_NETNSID */
 	       + nla_total_size(ext_filter_mask
 			        & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
 	       + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
@@ -1169,6 +1170,18 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			goto nla_put_failure;
 	}
 
+	if (dev->rtnl_link_ops &&
+	    dev->rtnl_link_ops->get_link_net) {
+		struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
+
+		if (!net_eq(dev_net(dev), link_net)) {
+			int id = peernet2id(dev_net(dev), link_net);
+
+			if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
+				goto nla_put_failure;
+		}
+	}
+
 	if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC)))
 		goto nla_put_failure;
 
-- 
2.2.2

^ permalink raw reply related

* [PATCH net-next v5 1/4] netns: add rtnl cmd to add and get peer netns ids
From: Nicolas Dichtel @ 2015-01-15 14:11 UTC (permalink / raw)
  To: netdev, containers, linux-kernel, linux-api
  Cc: davem, ebiederm, stephen, akpm, luto, cwang, Nicolas Dichtel
In-Reply-To: <1421331078-21622-1-git-send-email-nicolas.dichtel@6wind.com>

With this patch, a user can define an id for a peer netns by providing a FD or a
PID. These ids are local to the netns where they are added (i.e. valid only in
this netns).

The main function (i.e. the one exported to other modules), peernet2id(), returns
the id of a peer netns. If no id has been assigned by the user, this
function allocates one.

These ids will be used in netlink messages to point to a peer netns, for example
in case of an x-netns interface.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 MAINTAINERS                        |   1 +
 include/net/net_namespace.h        |   4 +
 include/uapi/linux/Kbuild          |   1 +
 include/uapi/linux/net_namespace.h |  23 ++++
 include/uapi/linux/rtnetlink.h     |   5 +
 net/core/net_namespace.c           | 210 +++++++++++++++++++++++++++++++++++++
 6 files changed, 244 insertions(+)
 create mode 100644 include/uapi/linux/net_namespace.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 9de900572633..9b91d9f0257e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6578,6 +6578,7 @@ F:	include/linux/netdevice.h
 F:	include/uapi/linux/in.h
 F:	include/uapi/linux/net.h
 F:	include/uapi/linux/netdevice.h
+F:	include/uapi/linux/net_namespace.h
 F:	tools/net/
 F:	tools/testing/selftests/net/
 F:	lib/random32.c
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 2e8756b8c775..36faf4990c4b 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -60,6 +60,7 @@ struct net {
 	struct list_head	exit_list;	/* Use only net_mutex */
 
 	struct user_namespace   *user_ns;	/* Owning user namespace */
+	struct idr		netns_ids;
 
 	struct ns_common	ns;
 
@@ -290,6 +291,9 @@ static inline struct net *read_pnet(struct net * const *pnet)
 #define __net_initconst	__initconst
 #endif
 
+int peernet2id(struct net *net, struct net *peer);
+struct net *get_net_ns_by_id(struct net *net, int id);
+
 struct pernet_operations {
 	struct list_head list;
 	int (*init)(struct net *net);
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 00b100023c47..14b7b6e44c77 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -283,6 +283,7 @@ header-y += net.h
 header-y += netlink_diag.h
 header-y += netlink.h
 header-y += netrom.h
+header-y += net_namespace.h
 header-y += net_tstamp.h
 header-y += nfc.h
 header-y += nfs2.h
diff --git a/include/uapi/linux/net_namespace.h b/include/uapi/linux/net_namespace.h
new file mode 100644
index 000000000000..778cd2c3ebf4
--- /dev/null
+++ b/include/uapi/linux/net_namespace.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2015 6WIND S.A.
+ * Author: Nicolas Dichtel <nicolas.dichtel@6wind.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+#ifndef _UAPI_LINUX_NET_NAMESPACE_H_
+#define _UAPI_LINUX_NET_NAMESPACE_H_
+
+/* Attributes of RTM_NEWNSID/RTM_GETNSID messages */
+enum {
+	NETNSA_NONE,
+#define NETNSA_NSID_NOT_ASSIGNED -1
+	NETNSA_NSID,
+	NETNSA_PID,
+	NETNSA_FD,
+	__NETNSA_MAX,
+};
+
+#define NETNSA_MAX		(__NETNSA_MAX - 1)
+
+#endif /* _UAPI_LINUX_NET_NAMESPACE_H_ */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index a1d18593f41e..5cc5d66bf519 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -132,6 +132,11 @@ enum {
 	RTM_GETMDB = 86,
 #define RTM_GETMDB RTM_GETMDB
 
+	RTM_NEWNSID = 88,
+#define RTM_NEWNSID RTM_NEWNSID
+	RTM_GETNSID = 90,
+#define RTM_GETNSID RTM_GETNSID
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index ce780c722e48..edf089dd792a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -15,6 +15,10 @@
 #include <linux/file.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
+#include <linux/net_namespace.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+#include <net/netlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
@@ -144,6 +148,77 @@ static void ops_free_list(const struct pernet_operations *ops,
 	}
 }
 
+static int alloc_netid(struct net *net, struct net *peer, int reqid)
+{
+	int min = 0, max = 0;
+
+	ASSERT_RTNL();
+
+	if (reqid >= 0) {
+		min = reqid;
+		max = reqid + 1;
+	}
+
+	return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL);
+}
+
+/* This function is used by idr_for_each(). If net is equal to peer, the
+ * function returns the id so that idr_for_each() stops. Because we cannot
+ * return the id 0 (idr_for_each() would not stop), we return the magic value
+ * NET_ID_ZERO (-1) for it.
+ */
+#define NET_ID_ZERO -1
+static int net_eq_idr(int id, void *net, void *peer)
+{
+	if (net_eq(net, peer))
+		return id ? : NET_ID_ZERO;
+	return 0;
+}
+
+static int __peernet2id(struct net *net, struct net *peer, bool alloc)
+{
+	int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
+
+	ASSERT_RTNL();
+
+	/* Magic value for id 0. */
+	if (id == NET_ID_ZERO)
+		return 0;
+	if (id > 0)
+		return id;
+
+	if (alloc)
+		return alloc_netid(net, peer, -1);
+
+	return -ENOENT;
+}
+
+/* This function returns the id of a peer netns. If no id is assigned, one will
+ * be allocated and returned.
+ */
+int peernet2id(struct net *net, struct net *peer)
+{
+	int id = __peernet2id(net, peer, true);
+
+	return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
+}
+
+struct net *get_net_ns_by_id(struct net *net, int id)
+{
+	struct net *peer;
+
+	if (id < 0)
+		return NULL;
+
+	rcu_read_lock();
+	peer = idr_find(&net->netns_ids, id);
+	if (peer)
+		get_net(peer);
+	rcu_read_unlock();
+
+	return peer;
+}
+
 /*
  * setup_net runs the initializers for the network namespace object.
  */
@@ -158,6 +233,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	atomic_set(&net->passive, 1);
 	net->dev_base_seq = 1;
 	net->user_ns = user_ns;
+	idr_init(&net->netns_ids);
 
 #ifdef NETNS_REFCNT_DEBUG
 	atomic_set(&net->use_count, 0);
@@ -288,6 +364,14 @@ static void cleanup_net(struct work_struct *work)
 	list_for_each_entry(net, &net_kill_list, cleanup_list) {
 		list_del_rcu(&net->list);
 		list_add_tail(&net->exit_list, &net_exit_list);
+		for_each_net(tmp) {
+			int id = __peernet2id(tmp, net, false);
+
+			if (id >= 0)
+				idr_remove(&tmp->netns_ids, id);
+		}
+		idr_destroy(&net->netns_ids);
+
 	}
 	rtnl_unlock();
 
@@ -402,6 +486,129 @@ static struct pernet_operations __net_initdata net_ns_ops = {
 	.exit = net_ns_net_exit,
 };
 
+static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
+	[NETNSA_NONE]		= { .type = NLA_UNSPEC },
+	[NETNSA_NSID]		= { .type = NLA_S32 },
+	[NETNSA_PID]		= { .type = NLA_U32 },
+	[NETNSA_FD]		= { .type = NLA_U32 },
+};
+
+static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[NETNSA_MAX + 1];
+	struct net *peer;
+	int nsid, err;
+
+	err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
+			  rtnl_net_policy);
+	if (err < 0)
+		return err;
+	if (!tb[NETNSA_NSID])
+		return -EINVAL;
+	nsid = nla_get_s32(tb[NETNSA_NSID]);
+
+	if (tb[NETNSA_PID])
+		peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
+	else if (tb[NETNSA_FD])
+		peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
+	else
+		return -EINVAL;
+	if (IS_ERR(peer))
+		return PTR_ERR(peer);
+
+	if (__peernet2id(net, peer, false) >= 0) {
+		err = -EEXIST;
+		goto out;
+	}
+
+	err = alloc_netid(net, peer, nsid);
+	if (err > 0)
+		err = 0;
+out:
+	put_net(peer);
+	return err;
+}
+
+static int rtnl_net_get_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct rtgenmsg))
+	       + nla_total_size(sizeof(s32)) /* NETNSA_NSID */
+	       ;
+}
+
+static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
+			 int cmd, struct net *net, struct net *peer)
+{
+	struct nlmsghdr *nlh;
+	struct rtgenmsg *rth;
+	int id;
+
+	ASSERT_RTNL();
+
+	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	rth = nlmsg_data(nlh);
+	rth->rtgen_family = AF_UNSPEC;
+
+	id = __peernet2id(net, peer, false);
+	if  (id < 0)
+		id = NETNSA_NSID_NOT_ASSIGNED;
+	if (nla_put_s32(skb, NETNSA_NSID, id))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[NETNSA_MAX + 1];
+	struct sk_buff *msg;
+	int err = -ENOBUFS;
+	struct net *peer;
+
+	err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
+			  rtnl_net_policy);
+	if (err < 0)
+		return err;
+	if (tb[NETNSA_PID])
+		peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
+	else if (tb[NETNSA_FD])
+		peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
+	else
+		return -EINVAL;
+
+	if (IS_ERR(peer))
+		return PTR_ERR(peer);
+
+	msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+	if (!msg) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+			    RTM_GETNSID, net, peer);
+	if (err < 0)
+		goto err_out;
+
+	err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
+	goto out;
+
+err_out:
+	nlmsg_free(msg);
+out:
+	put_net(peer);
+	return err;
+}
+
 static int __init net_ns_init(void)
 {
 	struct net_generic *ng;
@@ -435,6 +642,9 @@ static int __init net_ns_init(void)
 
 	register_pernet_subsys(&net_ns_ops);
 
+	rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, NULL, NULL);
+
 	return 0;
 }
 
-- 
2.2.2

^ permalink raw reply related

* [PATCH net-next v5 0/4] netns: allow to identify peer netns
From: Nicolas Dichtel @ 2015-01-15 14:11 UTC (permalink / raw)
  To: netdev, containers, linux-kernel, linux-api
  Cc: davem, ebiederm, stephen, akpm, luto, cwang
In-Reply-To: <87wq7g831b.fsf@x220.int.ebiederm.org>

The goal of this series is to be able to multicast netlink messages with an
attribute that identifies a peer netns.
This is needed by the userland to interpret some information contained in
netlink messages (like IFLA_LINK value, but also some other attributes in case
of x-netns netdevice (see also
http://thread.gmane.org/gmane.linux.network/315933/focus=316064 and
http://thread.gmane.org/gmane.linux.kernel.containers/28301/focus=4239)).

Ids of peer netns can be set by userland via a new rtnl cmd RTM_NEWNSID. When
the kernel needs an id for a peer (for example when advertising a new x-netns
interface via netlink), if the user didn't allocate an id, one will be
automatically allocated.
These ids are stored per netns and are local (ie only valid in the netns where
they are set). To avoid allocating an int for each peer netns, I use
idr_for_each() to retrieve the id of a peer netns. Note that it will be possible
to add a table (struct net -> id) later to optimize this lookup if needed.

Patch 1/4 introduces the rtnetlink API mechanism to set and get these ids.
Patches 2/4 and 3/4 implement an example of how to use these ids when advertising
information about an x-netns interface.
And patch 4/4 shows that the netlink messages can be symmetric between a GET and
a SET.

iproute2 patches are available, I can send them on demand.

Here is a small screenshot to show how it can be used by userland.

# Initialization:
$ ip netns add foo
$ ip netns del foo
$ ip netns
$ touch /var/run/netns/init_net
$ mount --bind /proc/1/ns/net /var/run/netns/init_net
$ ip netns add foo
$ ip -n foo netns
foo
init_net
$ ip -n foo netns set init_net 0
$ ip -n foo netns set foo 1

# Only netns seen from foo have an id:
$ ip netns
foo
init_net
$ ip -n foo netns
foo (id: 1)
init_net (id: 0)

# Add a 4in4 x-netns interface with a link-netnsid option and check the dump:
$ ip -n foo link add ipip1 link-netnsid 0 type ipip remote 10.16.0.121 local 10.16.0.249
$ ip -n foo link ls ipip1
6: ipip1@NONE: <POINTOPOINT,NOARP> mtu 1480 qdisc noop state DOWN mode DEFAULT group default 
    link/ipip 10.16.0.249 peer 10.16.0.121 link-netnsid 0
# The parameter link-netnsid shows us where the interface sends and receives
# packets (and thus we know where encapsulated addresses are set).

# Add a 4in4 x-netns interface without a link-netnsid option and check that an
# id is allocated in init_net for foo
$ ip netns
foo
init_net
$ ip -n foo link add ipip2 type ipip remote 10.16.0.121 local 10.16.0.249
$ ip -n foo link set ipip2 netns init_net
$ ip link ls ipip2
7: ipip2@NONE: <POINTOPOINT,NOARP> mtu 1480 qdisc noop state DOWN mode DEFAULT group default 
    link/ipip 10.16.0.249 peer 10.16.0.121 link-netnsid 0
$ ip netns
foo (id: 0)
init_net

v4 -> v5:
  use rtnetlink instead of genetlink
  allocate automatically an id if user didn't assign one
  rename include/uapi/linux/netns.h to include/uapi/linux/net_namespace.h
  add vxlan in patch #3

RFCv3 -> v4:
  rebase on net-next
  add copyright text in the new netns.h file

RFCv2 -> RFCv3:
  ids are now defined by userland (via netlink). Ids are stored in each netns
  (and they are local to this netns).
  add get_link_net support for ip6 tunnels
  netnsid is now a s32 instead of a u32

RFCv1 -> RFCv2:
  remove useless ()
  ids are now stored in the user ns. It's possible to get an id for a peer netns
  only if the current netns and the peer netns have the same user ns parent.

 MAINTAINERS                        |   1 +
 drivers/net/vxlan.c                |   8 ++
 include/net/ip6_tunnel.h           |   1 +
 include/net/ip_tunnels.h           |   1 +
 include/net/net_namespace.h        |   4 +
 include/net/rtnetlink.h            |   2 +
 include/uapi/linux/Kbuild          |   1 +
 include/uapi/linux/if_link.h       |   1 +
 include/uapi/linux/net_namespace.h |  23 ++++
 include/uapi/linux/rtnetlink.h     |   5 +
 net/core/net_namespace.c           | 210 +++++++++++++++++++++++++++++++++++++
 net/core/rtnetlink.c               |  38 ++++++-
 net/ipv4/ip_gre.c                  |   2 +
 net/ipv4/ip_tunnel.c               |   8 ++
 net/ipv4/ip_vti.c                  |   1 +
 net/ipv4/ipip.c                    |   1 +
 net/ipv6/ip6_gre.c                 |   1 +
 net/ipv6/ip6_tunnel.c              |   9 ++
 net/ipv6/ip6_vti.c                 |   1 +
 net/ipv6/sit.c                     |   1 +
 20 files changed, 316 insertions(+), 3 deletions(-)

Comments are welcome.

Regards,
Nicolas

^ permalink raw reply

* Re: [PATCH] virtio_balloon: coding style fixes
From: Michael S. Tsirkin @ 2015-01-15 13:44 UTC (permalink / raw)
  To: Michal Hocko; +Cc: linux-api, linux-kernel, virtualization
In-Reply-To: <20150115130642.GC7008@dhcp22.suse.cz>

On Thu, Jan 15, 2015 at 02:06:42PM +0100, Michal Hocko wrote:
> On Thu 15-01-15 13:39:06, Michael S. Tsirkin wrote:
> > Most of our code has
> > struct foo {
> > }
> > 
> > Fix two instances where balloon is inconsistent.
> 
> I hate to complain but is it really necessary to post such patches to
> linux-api?

Well it's human to err, so it seems wise to copy parties
interested in the ABI/API whenever we are changing files under include/uapi.
Whitespace changes should mostly be safe, but it's not unknown
e.g. to include unrelated changes in the same commit by mistake.

> I thought the list was primarily for API related discussions.

Basically this line in MAINTAINERS

ABI/API
L:      linux-api@vger.kernel.org
F:      Documentation/ABI/
F:      include/linux/syscalls.h
F:      include/uapi/
F:      kernel/sys_ni.c

normally means "send all patches affecting files under include/uapi/ to
this list", does it not?

Wasn't this the intent?

> This is not the only mail sent here which doesn't fall into that
> category IMO. It has been far from a low-volume list for quite some time.
> 
> Please let's get back to low-volume, API-only discussion!

Maybe send a patch dropping include/uapi/ from there;
that should help drive the volume down?

> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >  include/uapi/linux/virtio_balloon.h | 3 +--
> >  drivers/virtio/virtio_balloon.c     | 3 +--
> >  2 files changed, 2 insertions(+), 4 deletions(-)
> > 
> > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> > index be40f70..4b0488f 100644
> > --- a/include/uapi/linux/virtio_balloon.h
> > +++ b/include/uapi/linux/virtio_balloon.h
> > @@ -36,8 +36,7 @@
> >  /* Size of a PFN in the balloon interface. */
> >  #define VIRTIO_BALLOON_PFN_SHIFT 12
> >  
> > -struct virtio_balloon_config
> > -{
> > +struct virtio_balloon_config {
> >  	/* Number of pages host wants Guest to give up. */
> >  	__le32 num_pages;
> >  	/* Number of pages we've actually got in balloon. */
> > diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> > index 3176ea4..0413157 100644
> > --- a/drivers/virtio/virtio_balloon.c
> > +++ b/drivers/virtio/virtio_balloon.c
> > @@ -44,8 +44,7 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
> >  module_param(oom_pages, int, S_IRUSR | S_IWUSR);
> >  MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
> >  
> > -struct virtio_balloon
> > -{
> > +struct virtio_balloon {
> >  	struct virtio_device *vdev;
> >  	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
> >  
> > -- 
> > MST
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> 
> -- 
> Michal Hocko
> SUSE Labs

^ permalink raw reply

* Re: [PATCH] virtio_balloon: coding style fixes
From: Michal Hocko @ 2015-01-15 13:06 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: linux-api, linux-kernel, virtualization
In-Reply-To: <1421321941-21111-1-git-send-email-mst@redhat.com>

On Thu 15-01-15 13:39:06, Michael S. Tsirkin wrote:
> Most of our code has
> struct foo {
> }
> 
> Fix two instances where balloon is inconsistent.

I hate to complain but is it really necessary to post such patches to
linux-api?

I thought the list was primarily for API related discussions.
This is not the only mail sent here which doesn't fall into that
category IMO. It has been far from a low-volume list for quite some time.

Please let's get back to low-volume, API-only discussion!

> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  include/uapi/linux/virtio_balloon.h | 3 +--
>  drivers/virtio/virtio_balloon.c     | 3 +--
>  2 files changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index be40f70..4b0488f 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -36,8 +36,7 @@
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
>  
> -struct virtio_balloon_config
> -{
> +struct virtio_balloon_config {
>  	/* Number of pages host wants Guest to give up. */
>  	__le32 num_pages;
>  	/* Number of pages we've actually got in balloon. */
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 3176ea4..0413157 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -44,8 +44,7 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
>  module_param(oom_pages, int, S_IRUSR | S_IWUSR);
>  MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
>  
> -struct virtio_balloon
> -{
> +struct virtio_balloon {
>  	struct virtio_device *vdev;
>  	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
>  
> -- 
> MST
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* [PATCH] virtio_balloon: coding style fixes
From: Michael S. Tsirkin @ 2015-01-15 11:39 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-api, virtualization

Most of our code has
struct foo {
}

Fix two instances where balloon is inconsistent.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_balloon.h | 3 +--
 drivers/virtio/virtio_balloon.c     | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index be40f70..4b0488f 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -36,8 +36,7 @@
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
 
-struct virtio_balloon_config
-{
+struct virtio_balloon_config {
 	/* Number of pages host wants Guest to give up. */
 	__le32 num_pages;
 	/* Number of pages we've actually got in balloon. */
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 3176ea4..0413157 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -44,8 +44,7 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
-struct virtio_balloon
-{
+struct virtio_balloon {
 	struct virtio_device *vdev;
 	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
 
-- 
MST

^ permalink raw reply related

* Re: [PATCH v4 20/20] kbuild: add a new kselftest_install make target to install selftests
From: Michal Marek @ 2015-01-15 11:39 UTC (permalink / raw)
  To: Shuah Khan, masami.hiramatsu.pt-FCd8Q96Dh0JBDgjK7y7TUQ
  Cc: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	rostedt-nx8X9YLhiw1AfugRpC6u6w, mingo-H+wXaHxf7aLQT0dZR+AlfA,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, keescook-F7+t8E8rja9g9hUCZPvPmw,
	tranmanphong-Re5JQEeQqe8AvxtiuMwx3w, mpe-Gsx/Oe8HsFggBc27wqDAHg,
	cov-sgV2jX0FEOL9JmXXK+q4OQ, dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w,
	hughd-hpIqsD4AKlfQT0dZR+AlfA, bobby.prani-Re5JQEeQqe8AvxtiuMwx3w,
	serge.hallyn-GeWIH/nMZzLQT0dZR+AlfA,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w, tim.bird-/MT0OVThwyLZJqsBc5GL+g,
	josh-iaAMLnmF4UmaiuxdJuQwMA, koct9i-Re5JQEeQqe8AvxtiuMwx3w,
	linux-kbuild-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <54B69A1B.80006-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>

On 2015-01-14 17:32, Shuah Khan wrote:
> On 01/06/2015 12:43 PM, Shuah Khan wrote:
>> Add a new make target to install kernel selftests.
>> This new target will build and install selftests. kselftest
>> target now depends on kselftest_install and runs the generated
>> kselftest script to reduce duplicate work and for common look
>> and feel when running tests.
>>
>> make kselftest_target:
>> -- exports kselftest INSTALL_KSFT_PATH
>>    default $(INSTALL_MOD_PATH)/lib/kselftest/$(KERNELRELEASE)
>> -- exports INSTALL_KSFT_PATH
>> -- runs selftests make install target
>>
>> Signed-off-by: Shuah Khan <shuahkh-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>
>> ---
>>  Makefile | 14 +++++++++++++-
>>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> Hi Marek,
> 
> Could you please Ack this patch, if this version looks good,
> so I can take this through the kselftest tree.

Sorry, I thought I had acked v3.

Acked-by: Michal Marek <mmarek-AlSwsSmVLrQ@public.gmane.org>

Michal

^ permalink raw reply

* Re: [LSF/MM TOPIC] userfaultfd
From: Pavel Emelyanov @ 2015-01-15  9:01 UTC (permalink / raw)
  To: Andrea Arcangeli, lsf-pc; +Cc: linux-mm, linux-kernel, linux-api
In-Reply-To: <20150114230130.GR6103@redhat.com>

On 01/15/2015 02:01 AM, Andrea Arcangeli wrote:
> Hello,
> 
> I would like to attend this year (2015) LSF/MM summit. I'm
> particularly interested about the MM track, in order to get help in
> finalizing the userfaultfd feature I've been working on lately.

I'd like to +1 this. I'm also interested in this topic, especially
in the item 5 below.

> 5) postcopy live migration of binaries inside linux containers
>    (provided there is a userfaultfd command [not an external syscall
>    like the original implementation] that allows to copy memory
>    atomically in the userfaultfd "mm" and not in the manager "mm",
>    hence the main reason the external syscalls are going away, and in
>    turn MADV_USERFAULT fd-less is going away as well).

We've started to play with the userfaultfd in the CRIU project [1] to 
do the post-copy live migration of whole containers (and their parts).

One more use case I've seen on CRIU mailing list is the restore of
container from on-disk images w/o getting the whole memory in at the
restore time. The memory is to be put into tasks' address space in
on-demand manner later. It's claimed that such restore decreases the
restore time significantly.

One more thing that userfaultfd can help with is restoring COW areas.
Right now, if we have two tasks, that share a phys page, but have one
RO mapped to do the COW later we do complex tricks with restoring the
page in common ancestor, then inheriting one on fork()-s and mremap-ing
it. Probably it's an API misuse, but it seems to be much simpler if
the page could be just "sent" to the remote mm via userfaultfd.

[1] http://criu.org/Main_Page

Thanks,
Pavel

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [v8 2/5] ext4: adds project ID support
From: Li Xi @ 2015-01-15  7:52 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: linux-fsdevel@vger.kernel.org, Ext4 Developers List,
	linux-api@vger.kernel.org, Theodore Ts'o, Jan Kara,
	viro@zeniv.linux.org.uk, hch@infradead.org, Dmitry Monakhov
In-Reply-To: <0F8FAF43-54EB-4202-8120-C1D70CF330F0@dilger.ca>

On Thu, Jan 8, 2015 at 7:11 AM, Andreas Dilger <adilger@dilger.ca> wrote:
> On Dec 8, 2014, at 10:22 PM, Li Xi <pkuelelixi@gmail.com> wrote:
>> This patch adds a new internal field of ext4 inode to save project
>> identifier. Also a new flag EXT4_INODE_PROJINHERIT is added for
>> inheriting project ID from parent directory.
>>
>> Signed-off-by: Li Xi <lixi@ddn.com>
>> Reviewed-by: Jan Kara <jack@suse.cz>
>> ---
>> fs/ext4/ext4.h          |   21 +++++++++++++++++----
>> fs/ext4/ialloc.c        |    6 ++++++
>> fs/ext4/inode.c         |   29 +++++++++++++++++++++++++++++
>> fs/ext4/namei.c         |   17 +++++++++++++++++
>> fs/ext4/super.c         |    1 +
>> include/uapi/linux/fs.h |    1 +
>> 6 files changed, 71 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 29c43e7..8bd1da9 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -377,16 +377,18 @@ struct flex_groups {
>> #define EXT4_EA_INODE_FL              0x00200000 /* Inode used for large EA */
>> #define EXT4_EOFBLOCKS_FL             0x00400000 /* Blocks allocated beyond EOF */
>> #define EXT4_INLINE_DATA_FL           0x10000000 /* Inode has inline data. */
>> +#define EXT4_PROJINHERIT_FL          FS_PROJINHERIT_FL /* Create with parents projid */
>
> Sigh, only a single on-disk inode flag unused.
>
>> #define EXT4_RESERVED_FL              0x80000000 /* reserved for ext4 lib */
>>
>> -#define EXT4_FL_USER_VISIBLE         0x004BDFFF /* User visible flags */
>> -#define EXT4_FL_USER_MODIFIABLE              0x004380FF /* User modifiable flags */
>> +#define EXT4_FL_USER_VISIBLE         0x204BDFFF /* User visible flags */
>> +#define EXT4_FL_USER_MODIFIABLE              0x204380FF /* User modifiable flags */
>>
>> /* Flags that should be inherited by new inodes from their parent. */
>> #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
>>                          EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
>>                          EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
>> -                        EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
>> +                        EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
>> +                        EXT4_PROJINHERIT_FL)
>>
>> /* Flags that are appropriate for regular files (all but dir-specific ones). */
>> #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
>> @@ -434,6 +436,7 @@ enum {
>>       EXT4_INODE_EA_INODE     = 21,   /* Inode used for large EA */
>>       EXT4_INODE_EOFBLOCKS    = 22,   /* Blocks allocated beyond EOF */
>>       EXT4_INODE_INLINE_DATA  = 28,   /* Data in inode. */
>> +     EXT4_INODE_PROJINHERIT  = 29,   /* Create with parents projid */
>>       EXT4_INODE_RESERVED     = 31,   /* reserved for ext4 lib */
>> };
>>
>> @@ -683,6 +686,7 @@ struct ext4_inode {
>>       __le32  i_crtime;       /* File Creation time */
>>       __le32  i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
>>       __le32  i_version_hi;   /* high 32 bits for 64-bit version */
>> +     __le32  i_projid;       /* Project ID */
>> };
>>
>> struct move_extent {
>> @@ -934,6 +938,7 @@ struct ext4_inode_info {
>>
>>       /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
>>       __u32 i_csum_seed;
>> +     kprojid_t i_projid;
>> };
>>
>> /*
>> @@ -1518,6 +1523,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>>  * GDT_CSUM bits are mutually exclusive.
>>  */
>> #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM  0x0400
>> +#define EXT4_FEATURE_RO_COMPAT_PROJECT               0x1000 /* Project quota */
>
> I wonder if it makes sense to add EXT4_FEATURE_RO_COMPAT_METADATA_CSUM
> here as well, to make it clear why this is skipping a value?
>
>> #define EXT4_FEATURE_INCOMPAT_COMPRESSION     0x0001
>> #define EXT4_FEATURE_INCOMPAT_FILETYPE                0x0002
>> @@ -1567,7 +1573,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>>                                        EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
>>                                        EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
>>                                        EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
>> -                                      EXT4_FEATURE_RO_COMPAT_QUOTA)
>> +                                      EXT4_FEATURE_RO_COMPAT_QUOTA |\
>> +                                      EXT4_FEATURE_RO_COMPAT_PROJECT)
>>
>> /*
>>  * Default values for user and/or group using reserved blocks
>> @@ -1575,6 +1582,11 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>> #define       EXT4_DEF_RESUID         0
>> #define       EXT4_DEF_RESGID         0
>>
>> +/*
>> + * Default project ID
>> + */
>> +#define      EXT4_DEF_PROJID         0
>> +
>> #define EXT4_DEF_INODE_READAHEAD_BLKS 32
>>
>> /*
>> @@ -2131,6 +2143,7 @@ extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
>>                            loff_t lstart, loff_t lend);
>> extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
>> extern qsize_t *ext4_get_reserved_space(struct inode *inode);
>> +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
>> extern void ext4_da_update_reserve_space(struct inode *inode,
>>                                       int used, int quota_claim);
>>
>> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
>> index ac644c3..fefb948 100644
>> --- a/fs/ext4/ialloc.c
>> +++ b/fs/ext4/ialloc.c
>> @@ -756,6 +756,12 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
>>               inode->i_gid = dir->i_gid;
>>       } else
>>               inode_init_owner(inode, dir, mode);
>> +     if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
>> +         ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) {
>> +             ei->i_projid = EXT4_I(dir)->i_projid;
>> +     } else {
>> +             ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
>> +     }
>
> (style) no need for { } if only a single line in both if/else parts.
>
>>       dquot_initialize(inode);
>>
>>       if (!goal)
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 5653fa4..29204d4 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -3863,6 +3863,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
>>               EXT4_I(inode)->i_inline_off = 0;
>> }
>>
>> +int ext4_get_projid(struct inode *inode, kprojid_t *projid)
>> +{
>> +     if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
>> +             return -EOPNOTSUPP;
>> +     *projid = EXT4_I(inode)->i_projid;
>> +     return 0;
>> +}
>> +
>> struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>> {
>>       struct ext4_iloc iloc;
>> @@ -3874,6 +3882,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>>       int block;
>>       uid_t i_uid;
>>       gid_t i_gid;
>> +     projid_t i_projid;
>>
>>       inode = iget_locked(sb, ino);
>>       if (!inode)
>> @@ -3923,12 +3932,18 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
>>       inode->i_mode = le16_to_cpu(raw_inode->i_mode);
>>       i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
>>       i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
>> +     if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
>> +             i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
>> +     else
>> +             i_projid = EXT4_DEF_PROJID;
>> +
>>       if (!(test_opt(inode->i_sb, NO_UID32))) {
>>               i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
>>               i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
>>       }
>>       i_uid_write(inode, i_uid);
>>       i_gid_write(inode, i_gid);
>> +     ei->i_projid = make_kprojid(&init_user_ns, i_projid);;
>>       set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
>>
>>       ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
>> @@ -4158,6 +4173,7 @@ static int ext4_do_update_inode(handle_t *handle,
>>       int need_datasync = 0, set_large_file = 0;
>>       uid_t i_uid;
>>       gid_t i_gid;
>> +     projid_t i_projid;
>>
>>       spin_lock(&ei->i_raw_lock);
>>
>> @@ -4170,6 +4186,7 @@ static int ext4_do_update_inode(handle_t *handle,
>>       raw_inode->i_mode = cpu_to_le16(inode->i_mode);
>>       i_uid = i_uid_read(inode);
>>       i_gid = i_gid_read(inode);
>> +     i_projid = from_kprojid(&init_user_ns, ei->i_projid);
>>       if (!(test_opt(inode->i_sb, NO_UID32))) {
>>               raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
>>               raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
>> @@ -4249,6 +4266,18 @@ static int ext4_do_update_inode(handle_t *handle,
>>               }
>>       }
>>
>> +     BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
>> +                     EXT4_FEATURE_RO_COMPAT_PROJECT) &&
>> +            i_projid != EXT4_DEF_PROJID);
>> +     if (i_projid != EXT4_DEF_PROJID &&
>> +         (EXT4_INODE_SIZE(inode->i_sb) <= EXT4_GOOD_OLD_INODE_SIZE ||
>> +          (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)))) {
>> +             spin_unlock(&ei->i_raw_lock);
>> +             err = -EFBIG;
>
> I'm not sure if -EFBIG is the best error case, since that is a common
> filesystem error.  Maybe -EOVERFLOW would be better?
>
> Also, returning the error from ext4_mark_iloc_dirty->ext4_do_update_inode()
> is a bit late in the game, since the inode has already been modified at
> this point.  That callpath typically only returned an error if the disk
> was bad or the journal aborted (again normally because the disk was bad),
> so at that point the dirty in-memory data was lost anyway.
>
> It would be better to check this in the caller before the inode is changed
> so that an error can be returned without (essentially) corrupting the
> in-memory state.  Since the projid should only be set for new inodes (which
> will always have enough space, assuming RO_COMPAT_PROJECT cannot be set on
> filesystems with 128-byte inodes), or in case of rename into a project
> directory, it shouldn't be too big a change.
Sorry, I am not sure what I should do to improve this because of the reason that
Jan Kara mentioned. Please let me know if I need to do anything about this.

^ permalink raw reply

* [LSF/MM TOPIC] userfaultfd
From: Andrea Arcangeli @ 2015-01-14 23:01 UTC (permalink / raw)
  To: lsf-pc; +Cc: linux-mm, linux-kernel, linux-api

Hello,

I would like to attend this year (2015) LSF/MM summit. I'm
particularly interested about the MM track, in order to get help in
finalizing the userfaultfd feature I've been working on lately.

An overview on the userfaultfd feature can be read here:

   http://lwn.net/Articles/615086/

In essence the userfault feature could be imagined as an optimal
implementation for userland driven on demand paging similar to
PROT_NONE+SIGSEGV.

userfaultfd is fundamentally allowing to manage memory at the
pagetable level by delivering the page fault notification to userland
to handle it with proper userfaultfd commands that mangle the address
space, without involving heavyweight structures like vmas (in fact the
userfaultfd runtime load never takes the mmap_sem for writing, just
like its kernel counterpart wouldn't). The number of vmas is limited
too so they're not suitable if there are too many scattered faults and
the address space is not limited. userfaultfd allows all userfaults to
happen in parallel from different threads and it relies on userland to
use atomic copy or move commands to resolve the userfaults.

By adding more featured commands to the userfaultfd protocol (spoken
on the fd, like the basic atomic copy command that is needed to
resolve the userfault) in the future we can also mark regions readonly
and trap only wrprotect faults (or both wrprotect and non present
faults simultaneously).

Different userfaultfd can already be used independently by multiple
libraries and the main application within the same process.

The userfaultfd once opened, can also be passed using unix domain
sockets to a manager process (use case 5) below wants to do this), so the
same manager process could handle the userfaults of a multitude of
different processes without them being aware of what is going on
(well of course unless they later try to use the userfaultfd themself
on the same region the manager is already tracking, which is a corner
case the relevancy of which should be discussed).

There was interest from multiple users, hope I'm not forgetting some:

1) KVM postcopy live migration (one form of cloud memory
   externalization). KVM postcopy live migration is the primary driver
   of this work:
   http://blog.zhaw.ch/icclab/setting-up-post-copy-live-migration-in-openstack/
   )

2) KVM postcopy live snapshotting (allowing to limit/throttle the
   memory usage, unlike fork would).

3) KVM userfaults on shared memory (currently only anonymous memory
   is handled by the userfaultfd but there's nothing that prevents to
   extend it and allow to register a tmpfs region in the userfaultfd
   and fire an userfault if the tmpfs page is not present)

4) alternate mechanism to notify web browsers or apps on embedded
   devices that volatile pages have been reclaimed. This basically
   avoids the need to run a syscall before the app can access with the
   CPU the virtual regions marked volatile. This also requires point 3)
   to be fulfilled, as volatile pages happily apply to tmpfs.

5) postcopy live migration of binaries inside linux containers
   (provided there is a userfaultfd command [not an external syscall
   like the original implementation] that allows to copy memory
   atomically in the userfaultfd "mm" and not in the manager "mm",
   hence the main reason the external syscalls are going away, and in
   turn MADV_USERFAULT fd-less is going away as well).

6) qemu linux-user binary emulation was also briefly interested about
   the wrprotection fault notification for non-x86 archs. In this
   context the userfaultfd ""might"" (not sure) be useful to JIT
   emulation to efficiently protect the translated regions by only
   wrprotecting the page table without having to split or merge vmas
   (the risk of running out of vmas isn't there for this use case as
   the translated cache is probably limited in size and not heavily
   scattered).

7) distributed shared memory that could allow simultaneous mapping of
   regions marked readonly and collapse them on the first exclusive
   write. I'm mentioning it as a corollary, because I'm not aware of
   anybody who is planning to use it that way (still I'd like that
   this will be possible too just in case it finds its way later on).

The currently planned API (as hinted above) is already different to
the first version of the code posted a couple of months ago, thanks to
the valuable feedback received by the community so far.

As usual suggestions will be welcome, thanks!
Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] crypto: algif - change algif_skcipher to be asynchronous
From: Tadeusz Struk @ 2015-01-14 17:36 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, linux-crypto, qat-linux, linux-api
In-Reply-To: <20150114053805.GA15204@gondor.apana.org.au>

Hi Herbert,
On 01/13/2015 09:38 PM, Herbert Xu wrote:
> What you want is AIO so we should try to use that interface rather
> than creating some funky crypto-specific interface.
> 
> Dave, the AIO hooks in net/socket.c is currently simply pointing
> to the sync implementation.  What are your thoughts on allowing
> socket implementations to supply these hooks?
> 
> The algif interface can then use these hooks to implement AIO
> which is useful for maximising the hardware performance without
> resorting to loads of threads.

But then would you like to extend AIO interface to take the IV and
something that would indicate the encrypt/decrypt operation on
aio_write()? Also as far as I can see AIO doesn't support splice()
operation for zero copy, which is the main thing here.

On the other hand it shouldn't be a problem to add crypto specific
stuff to include/uapi/linux/if_alg.h, because it is all about crypto
anyway, is it not?

If you have a better way how to indicate that data processing should
start on the last page in sendpage() instead of ALG_OP_IN_PLACE I would
be happy to use it.
Thanks,
Tadeusz

^ permalink raw reply

* [PATCH v3 14/16] virtio_pci: macros for PCI layout offsets
From: Michael S. Tsirkin @ 2015-01-14 17:28 UTC (permalink / raw)
  To: linux-kernel; +Cc: Rusty Russell, cornelia.huck, virtualization, linux-api
In-Reply-To: <1421256142-11512-1-git-send-email-mst@redhat.com>

From: Rusty Russell <rusty@rustcorp.com.au>

QEMU wants it, so why not?  Trust, but verify.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_pci.h    | 30 ++++++++++++++++++++
 drivers/virtio/virtio_pci_modern.c | 58 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 4e05423..e841edd 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -159,6 +159,36 @@ struct virtio_pci_common_cfg {
 	__le32 queue_used_hi;		/* read-write */
 };
 
+/* Macro versions of offsets for the Old Timers! */
+#define VIRTIO_PCI_CAP_VNDR		0
+#define VIRTIO_PCI_CAP_NEXT		1
+#define VIRTIO_PCI_CAP_LEN		2
+#define VIRTIO_PCI_CAP_TYPE_AND_BAR	3
+#define VIRTIO_PCI_CAP_OFFSET		4
+#define VIRTIO_PCI_CAP_LENGTH		8
+
+#define VIRTIO_PCI_NOTIFY_CAP_MULT	12
+
+#define VIRTIO_PCI_COMMON_DFSELECT	0
+#define VIRTIO_PCI_COMMON_DF		4
+#define VIRTIO_PCI_COMMON_GFSELECT	8
+#define VIRTIO_PCI_COMMON_GF		12
+#define VIRTIO_PCI_COMMON_MSIX		16
+#define VIRTIO_PCI_COMMON_NUMQ		18
+#define VIRTIO_PCI_COMMON_STATUS	20
+#define VIRTIO_PCI_COMMON_CFGGENERATION	21
+#define VIRTIO_PCI_COMMON_Q_SELECT	22
+#define VIRTIO_PCI_COMMON_Q_SIZE	24
+#define VIRTIO_PCI_COMMON_Q_MSIX	26
+#define VIRTIO_PCI_COMMON_Q_ENABLE	28
+#define VIRTIO_PCI_COMMON_Q_NOFF	30
+#define VIRTIO_PCI_COMMON_Q_DESCLO	32
+#define VIRTIO_PCI_COMMON_Q_DESCHI	36
+#define VIRTIO_PCI_COMMON_Q_AVAILLO	40
+#define VIRTIO_PCI_COMMON_Q_AVAILHI	44
+#define VIRTIO_PCI_COMMON_Q_USEDLO	48
+#define VIRTIO_PCI_COMMON_Q_USEDHI	52
+
 #endif /* VIRTIO_PCI_NO_MODERN */
 
 #endif
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 38c0a11..5e0d309 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -469,9 +469,65 @@ static void virtio_pci_release_dev(struct device *_d)
 	kfree(vp_dev);
 }
 
-/* TODO: validate the ABI statically. */
+/* This is part of the ABI.  Don't screw with it. */
 static inline void check_offsets(void)
 {
+	/* Note: disk space was harmed in compilation of this function. */
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR !=
+		     offsetof(struct virtio_pci_cap, cap_vndr));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT !=
+		     offsetof(struct virtio_pci_cap, cap_next));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN !=
+		     offsetof(struct virtio_pci_cap, cap_len));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_TYPE_AND_BAR !=
+		     offsetof(struct virtio_pci_cap, type_and_bar));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET !=
+		     offsetof(struct virtio_pci_cap, offset));
+	BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH !=
+		     offsetof(struct virtio_pci_cap, length));
+	BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT !=
+		     offsetof(struct virtio_pci_notify_cap,
+			      notify_off_multiplier));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT !=
+		     offsetof(struct virtio_pci_common_cfg,
+			      device_feature_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF !=
+		     offsetof(struct virtio_pci_common_cfg, device_feature));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT !=
+		     offsetof(struct virtio_pci_common_cfg,
+			      guest_feature_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF !=
+		     offsetof(struct virtio_pci_common_cfg, guest_feature));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX !=
+		     offsetof(struct virtio_pci_common_cfg, msix_config));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ !=
+		     offsetof(struct virtio_pci_common_cfg, num_queues));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS !=
+		     offsetof(struct virtio_pci_common_cfg, device_status));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION !=
+		     offsetof(struct virtio_pci_common_cfg, config_generation));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT !=
+		     offsetof(struct virtio_pci_common_cfg, queue_select));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE !=
+		     offsetof(struct virtio_pci_common_cfg, queue_size));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX !=
+		     offsetof(struct virtio_pci_common_cfg, queue_msix_vector));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE !=
+		     offsetof(struct virtio_pci_common_cfg, queue_enable));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF !=
+		     offsetof(struct virtio_pci_common_cfg, queue_notify_off));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_desc_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_desc_hi));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_avail_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_avail_hi));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO !=
+		     offsetof(struct virtio_pci_common_cfg, queue_used_lo));
+	BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI !=
+		     offsetof(struct virtio_pci_common_cfg, queue_used_hi));
 }
 
 /* the PCI probing function */
-- 
MST

^ permalink raw reply related

* [PATCH v3 12/16] virtio-pci: define layout for virtio 1.0
From: Michael S. Tsirkin @ 2015-01-14 17:28 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-api, virtualization
In-Reply-To: <1421256142-11512-1-git-send-email-mst@redhat.com>

From: Rusty Russell <rusty@rustcorp.com.au>

Based on patches by Michael S. Tsirkin <mst@redhat.com>, but I found it
hard to follow so changed to use structures which are more
self-documenting.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_pci.h | 62 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 509d630..4e05423 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -99,4 +99,66 @@
 /* Vector value used to disable MSI for queue */
 #define VIRTIO_MSI_NO_VECTOR            0xffff
 
+#ifndef VIRTIO_PCI_NO_MODERN
+
+/* IDs for different capabilities.  Must all exist. */
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR access */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	__u8 cap_vndr;		/* Generic PCI field: PCI_CAP_ID_VNDR */
+	__u8 cap_next;		/* Generic PCI field: next ptr. */
+	__u8 cap_len;		/* Generic PCI field: capability length */
+	__u8 type_and_bar;	/* Upper 3 bits: bar.
+				 * Lower 3 is VIRTIO_PCI_CAP_*_CFG. */
+	__le32 offset;		/* Offset within bar. */
+	__le32 length;		/* Length. */
+};
+
+#define VIRTIO_PCI_CAP_BAR_SHIFT	5
+#define VIRTIO_PCI_CAP_BAR_MASK		0x7
+#define VIRTIO_PCI_CAP_TYPE_SHIFT	0
+#define VIRTIO_PCI_CAP_TYPE_MASK	0x7
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	__le32 notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	__le32 device_feature_select;	/* read-write */
+	__le32 device_feature;		/* read-only */
+	__le32 guest_feature_select;	/* read-write */
+	__le32 guest_feature;		/* read-write */
+	__le16 msix_config;		/* read-write */
+	__le16 num_queues;		/* read-only */
+	__u8 device_status;		/* read-write */
+	__u8 config_generation;		/* read-only */
+
+	/* About a specific virtqueue. */
+	__le16 queue_select;		/* read-write */
+	__le16 queue_size;		/* read-write, power of 2. */
+	__le16 queue_msix_vector;	/* read-write */
+	__le16 queue_enable;		/* read-write */
+	__le16 queue_notify_off;	/* read-only */
+	__le32 queue_desc_lo;		/* read-write */
+	__le32 queue_desc_hi;		/* read-write */
+	__le32 queue_avail_lo;		/* read-write */
+	__le32 queue_avail_hi;		/* read-write */
+	__le32 queue_used_lo;		/* read-write */
+	__le32 queue_used_hi;		/* read-write */
+};
+
+#endif /* VIRTIO_PCI_NO_MODERN */
+
 #endif
-- 
MST

^ permalink raw reply related

* [PATCH v3 01/16] virtio_pci: drop virtio_config dependency
From: Michael S. Tsirkin @ 2015-01-14 17:27 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-api, virtualization
In-Reply-To: <1421256142-11512-1-git-send-email-mst@redhat.com>

virtio_pci does not depend on virtio_config:
let's not include it, users can pull it in as necessary.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 35b552c..509d630 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -39,7 +39,7 @@
 #ifndef _LINUX_VIRTIO_PCI_H
 #define _LINUX_VIRTIO_PCI_H
 
-#include <linux/virtio_config.h>
+#include <linux/types.h>
 
 #ifndef VIRTIO_PCI_NO_LEGACY
 
-- 
MST

^ permalink raw reply related

* Re: [PATCH v10 0/2] crypto: AF_ALG: add AEAD and RNG support
From: Neil Horman @ 2015-01-14 17:26 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: Herbert Xu, Daniel Borkmann, 'Quentin Gouchet',
	'LKML', ABI/API, linux-crypto-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <5006702.rWOQQ5L1VO-PJstQz4BMNNP20K/wil9xYQuADTiUCJX@public.gmane.org>

On Wed, Jan 14, 2015 at 04:31:44PM +0100, Stephan Mueller wrote:
> Am Mittwoch, 14. Januar 2015, 22:00:11 schrieb Herbert Xu:
> 
> Hi Herbert,
> 
> > On Wed, Jan 14, 2015 at 04:46:31AM -0500, Neil Horman wrote:
> > > On Wed, Jan 14, 2015 at 04:52:29AM +0100, Stephan Mueller wrote:
> > > > Hi,
> > > > 
> > > > This patch set adds AEAD and RNG support to the AF_ALG interface
> > > > exported by the kernel crypto API. By extending AF_ALG with AEAD and RNG
> > > > support, all cipher types the kernel crypto API allows access to are
> > > > now accessible from userspace.
> > > > 
> > > >  crypto/Kconfig      |   9 +
> > > >  crypto/Makefile     |   1 +
> > > >  crypto/algif_aead.c | 680
> > > >  ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed,
> > > >  690 insertions(+)
> > > >  create mode 100644 crypto/algif_aead.c
> > > 
> > > Where are the RNG bits?
> > 
> > They've already been merged.
> 
> Please also see my change log for v7:
> ...
> * RNG: patch dropped as it was applied
> ...
> 
> To ensure consistency, I did not change the intro part of my description.
Ah, thanks for the clarification
Acked-by: Neil Horman <nhorman-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>

> 
> > 
> > Cheers,
> 
> 
> -- 
> Ciao
> Stephan
> --
> To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH v4 20/20] kbuild: add a new kselftest_install make target to install selftests
From: Shuah Khan @ 2015-01-14 16:32 UTC (permalink / raw)
  To: mmarek, masami.hiramatsu.pt
  Cc: gregkh, akpm, rostedt, mingo, davem, keescook, tranmanphong, mpe,
	cov, dh.herrmann, hughd, bobby.prani, serge.hallyn, ebiederm,
	tim.bird, josh, koct9i, linux-kbuild, linux-kernel, linux-api,
	netdev
In-Reply-To: <2c5a28faaa79d9c2415854a08817ada509fcb943.1420571615.git.shuahkh@osg.samsung.com>

On 01/06/2015 12:43 PM, Shuah Khan wrote:
> Add a new make target to install kernel selftests.
> This new target will build and install selftests. kselftest
> target now depends on kselftest_install and runs the generated
> kselftest script to reduce duplicate work and for common look
> and feel when running tests.
> 
> make kselftest_target:
> -- exports kselftest INSTALL_KSFT_PATH
>    default $(INSTALL_MOD_PATH)/lib/kselftest/$(KERNELRELEASE)
> -- exports INSTALL_KSFT_PATH
> -- runs selftests make install target
> 
> Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
> ---
>  Makefile | 14 +++++++++++++-
>  1 file changed, 13 insertions(+), 1 deletion(-)

Hi Marek,

Could you please Ack this patch, if this version looks good,
so I can take this through the kselftest tree.

thanks,
-- Shuah


-- 
Shuah Khan
Sr. Linux Kernel Developer
Open Source Innovation Group
Samsung Research America (Silicon Valley)
shuahkh@osg.samsung.com | (970) 217-8978

^ permalink raw reply

* Re: [PATCH v10 0/2] crypto: AF_ALG: add AEAD and RNG support
From: Stephan Mueller @ 2015-01-14 15:31 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Neil Horman, Daniel Borkmann, 'Quentin Gouchet',
	'LKML', ABI/API, linux-crypto
In-Reply-To: <20150114110011.GA18099@gondor.apana.org.au>

Am Mittwoch, 14. Januar 2015, 22:00:11 schrieb Herbert Xu:

Hi Herbert,

> On Wed, Jan 14, 2015 at 04:46:31AM -0500, Neil Horman wrote:
> > On Wed, Jan 14, 2015 at 04:52:29AM +0100, Stephan Mueller wrote:
> > > Hi,
> > > 
> > > This patch set adds AEAD and RNG support to the AF_ALG interface
> > > exported by the kernel crypto API. By extending AF_ALG with AEAD and RNG
> > > support, all cipher types the kernel crypto API allows access to are
> > > now accessible from userspace.
> > > 
> > >  crypto/Kconfig      |   9 +
> > >  crypto/Makefile     |   1 +
> > >  crypto/algif_aead.c | 680
> > >  ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed,
> > >  690 insertions(+)
> > >  create mode 100644 crypto/algif_aead.c
> > 
> > Where are the RNG bits?
> 
> They've already been merged.

Please also see my change log for v7:
...
* RNG: patch dropped as it was applied
...

To ensure consistency, I did not change the intro part of my description.

> 
> Cheers,


-- 
Ciao
Stephan

^ permalink raw reply

* Re: Linux GPIO internals
From: Linus Walleij @ 2015-01-14 12:18 UTC (permalink / raw)
  To: Michael Welling, linux-api-u79uwXL29TY76Z2rM5mHXA
  Cc: Alexandre Courbot, Greg Kroah-Hartman,
	acourbot-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org,
	Linux Kernel Mailing List,
	linux-gpio-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Roland Stigge,
	Wolfgang Grandegger, tstratman-ei31t81uNw5BDgjK7y7TUQ,
	Rojhalat Ibrahim
In-Reply-To: <20150105174723.GA11391@sysresccd>

On Mon, Jan 5, 2015 at 6:47 PM, Michael Welling <mwelling-ei31t81uNw5BDgjK7y7TUQ@public.gmane.org> wrote:
> On Sat, Jan 03, 2015 at 02:12:06PM +0100, Alexandre Courbot wrote:

>> It seems like your mail is coming just at the right time. We have
>> recently merged a patch that allows setting several GPIOs at the same
>> time, if the hardware supports it:
>>
>> http://patchwork.ozlabs.org/patch/406666/
>
> This looks like a good starting point. Being able to access to multiple
> GPIOs simultaneously on the same controller bank is great.

It's intended to handle the case when several GPIOs can be
switched at the same time by a single register write. It implies
being on the same controller.

> What is nice about the EMAC class is that the GPIOs do not necessarily
> have to be on the same bank (or controller) to be grouped together. The
> system call overhead of accessing a single IO is the same as multiple
> IOs in the same group.

You're comparing pears and apples now I think.

A userspace ABI making it possible to switch several GPIOs
on several controllers is perfectly doable both with and without
the above interface.

In this latter case you're more worried about the latency
incurred by the userspace/kernelspace switch, whereas
the former is about the delays incurred by several register
writes.

Userspace/kernelspace switch delay is a few orders of magnitude
larger than the delay between sequenced register writes, I
suspect.

> Still wondering what happened to the gpioblock patch.

Roland?

> The sysfs interface is great for command line and scripting languages
> but it has more overhead. It requires string conversion at both the
> kernel and userspace. More system calls are typically required for
> similar transactions.

Agree. I have a problem with it too.

>> Considering the constraints that we have (no GPIO integers for
>> exporting, sysfs-based, uses gpiod_*array()), do you think we could
>> satisfy your goals as well?
>
> This should be satisfactory for most use cases. I will try to support
> efforts toward modernizing the sysfs interface.
>
> Would a character interface to gpiolib ever be considered?

I like the character interface idea actually.

/dev/gpiochip0
/dev/gpiochip1
(...)

Then ioctl() operations to do all the magic to figure out what GPIOs
are there and how to read/write them etc. To me this reflects the
system properly and gives all kind of freedom to manipulate the
GPIOs with efficient context switches. Plus we can deliberately
avoid including any GPIO numbers anywhere, just allow names
and nothing else.

But I'm no userspace/kernelspace expert, let's hear what
the linux-api mailing list has to say.

Yours,
Linus Walleij

^ permalink raw reply

* Re: [PATCH v10 0/2] crypto: AF_ALG: add AEAD and RNG support
From: Herbert Xu @ 2015-01-14 11:00 UTC (permalink / raw)
  To: Neil Horman
  Cc: Stephan Mueller, Daniel Borkmann, 'Quentin Gouchet',
	'LKML', ABI/API, linux-crypto-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150114094631.GB15294-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>

On Wed, Jan 14, 2015 at 04:46:31AM -0500, Neil Horman wrote:
> On Wed, Jan 14, 2015 at 04:52:29AM +0100, Stephan Mueller wrote:
> > Hi,
> > 
> > This patch set adds AEAD and RNG support to the AF_ALG interface
> > exported by the kernel crypto API. By extending AF_ALG with AEAD and RNG
> > support, all cipher types the kernel crypto API allows access to are
> > now accessible from userspace.
> > 
> >  crypto/Kconfig      |   9 +
> >  crypto/Makefile     |   1 +
> >  crypto/algif_aead.c | 680 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 690 insertions(+)
> >  create mode 100644 crypto/algif_aead.c
> > 
> > -- 
> > 2.1.0
> > 
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
> > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> Where are the RNG bits?

They've already been merged.

Cheers,
-- 
Email: Herbert Xu <herbert-lOAM2aK0SrRLBo1qDEOMRrpzq4S04n8Q@public.gmane.org>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Should mmap MAP_LOCKED fail if mm_poppulate fails?
From: Michal Hocko @ 2015-01-14  9:50 UTC (permalink / raw)
  To: linux-mm
  Cc: Cyril Hrubis, Hugh Dickins, Michel Lespinasse, Andrew Morton,
	Linus Torvalds, Rik van Riel, Michael Kerrisk (man-pages), LKML,
	linux-api

Hi,
Cyril has encountered one of the LTP tests failing after 3.12 kernel.
To quote him:
"
What the test does is to set memory limit inside of memcg to PAGESIZE by
writing to memory.limit_in_bytes, then runs a subprocess that uses
mmap() with MAP_LOCKED which allocates 2 * PAGESIZE and expects that
it's killed by OOM. This does not happen and the call to mmap() returns
a correct pointer to a memory region, that when accessed finally causes
the OOM.
"

The difference came from the memcg OOM killer rework because OOM killer
is triggered only from the page fault path since 519e52473ebe (mm:
memcg: enable memcg OOM killer only for user faults). The rationale is
described in 3812c8c8f395 (mm: memcg: do not trap chargers with full
callstack on OOM).

This is _not_ the primary _issue_, though. It has just made a long
standing issue more visible, the same is possible even without memcg but
it is much less likely (it might get more visible once we start failing
GFP_KERNEL allocations more often). The primary issue is that mmap
doesn't report a failure if MAP_LOCKED fails to populate the area. Is
this the correct/expected behavior?

The man page says
"
MAP_LOCKED (since Linux 2.5.37)
      Lock the pages of the mapped region into memory in the manner of
      mlock(2).  This flag is ignored in older kernels.
"

and mlock is required to fail if the population fails.
"
       mlock() locks pages in the address range starting at addr and
       continuing for len bytes.  All pages that contain a part of the
       specified address range are guaranteed to be resident in RAM when
       the call returns successfully; the pages are guaranteed to stay
       in RAM until later unlocked.
"

I have checked the history and it seems we never reported an error, at
least not during the git era.

FWIW mlock behaves correctly and reports the error to the userspace.

I am not sure this is something to be fixed or rather documented in the
man page. I can imagine users who would prefer ENOMEM rather than seeing
a page fault later on - I would expect RT - but do those run inside memcg
controller or on heavily overcommitted systems?

On the other hand the fix sounds quite easy, we just have to use
__mm_populate and unmap the area on failure for VM_LOCKED vmas. Maybe
there is some historical reason for not doing that though.

Thanks!
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v10 0/2] crypto: AF_ALG: add AEAD and RNG support
From: Neil Horman @ 2015-01-14  9:46 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: 'Herbert Xu', Daniel Borkmann, 'Quentin Gouchet',
	'LKML', ABI/API, linux-crypto-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <5489736.JvWhZNrmuD-PJstQz4BMNNP20K/wil9xYQuADTiUCJX@public.gmane.org>

On Wed, Jan 14, 2015 at 04:52:29AM +0100, Stephan Mueller wrote:
> Hi,
> 
> This patch set adds AEAD and RNG support to the AF_ALG interface
> exported by the kernel crypto API. By extending AF_ALG with AEAD and RNG
> support, all cipher types the kernel crypto API allows access to are
> now accessible from userspace.
> 
>  crypto/Kconfig      |   9 +
>  crypto/Makefile     |   1 +
>  crypto/algif_aead.c | 680 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 690 insertions(+)
>  create mode 100644 crypto/algif_aead.c
> 
> -- 
> 2.1.0
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
Where are the RNG bits?
Neil

^ permalink raw reply

* Re: [PATCH] crypto: algif - change algif_skcipher to be asynchronous
From: Herbert Xu @ 2015-01-14  5:38 UTC (permalink / raw)
  To: Tadeusz Struk; +Cc: davem, linux-crypto, qat-linux, linux-api
In-Reply-To: <20150113202823.32244.45456.stgit@tstruk-mobl1>

On Tue, Jan 13, 2015 at 12:28:23PM -0800, Tadeusz Struk wrote:
> The way the algif_skcipher works currently is that on sendmsg/sendpage it
> builds an sgl for the input data and then on read/recvmsg it sends the job
> for encryption putting the user to sleep till the data is processed.
> This way it can only handle one job at a given time.
> This patch changes it to be asynchronous.
> The idea is to allow enqueue multiple jobs to get most of available crypto HW
> accelerators and then read when the data is processed without blocking.
> To allow that both the input and output sgl need to be known at sendmsg/sendpage
> or the operation needs to happen "in place" in the input sgl. The approach here
> is to use the "in place" operation and process the data in the sgl provided in
> sendmsg. To allow that new user visible flags are introduced:
> ALG_SET_OP_TYPE
> ALG_OP_OUTOF_PLACE
> ALG_OP_IN_PLACE
> By default the operation type is ALG_OP_OUTOF_PLACE, which works the same way as
> without the change and allows existing application working without any update.
> 
> Using the test application from https://lkml.org/lkml/2011/8/28/87 with small
> modification to support in place operation, and reading after every 16th
> sendmsg these are the results:

What you want is AIO so we should try to use that interface rather
than creating some funky crypto-specific interface.

Dave, the AIO hooks in net/socket.c are currently simply pointing
to the sync implementation.  What are your thoughts on allowing
socket implementations to supply these hooks?

The algif interface can then use these hooks to implement AIO
which is useful for maximising the hardware performance without
resorting to loads of threads.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH v10 2/2] crypto: AF_ALG: enable AEAD interface compilation
From: Stephan Mueller @ 2015-01-14  3:53 UTC (permalink / raw)
  To: 'Herbert Xu'
  Cc: Daniel Borkmann, 'Quentin Gouchet', 'LKML',
	ABI/API, linux-crypto-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <5489736.JvWhZNrmuD-PJstQz4BMNNP20K/wil9xYQuADTiUCJX@public.gmane.org>

Enable compilation of the AEAD AF_ALG support and provide a Kconfig
option to compile the AEAD AF_ALG support.

Signed-off-by: Stephan Mueller <smueller-T9tCv8IpfcWELgA04lAiVw@public.gmane.org>
---
 crypto/Kconfig  | 9 +++++++++
 crypto/Makefile | 1 +
 2 files changed, 10 insertions(+)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 50f4da4..41a3fc5 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1523,6 +1523,15 @@ config CRYPTO_USER_API_RNG
 	  This option enables the user-spaces interface for random
 	  number generator algorithms.
 
+config CRYPTO_USER_API_AEAD
+	tristate "User-space interface for AEAD cipher algorithms"
+	depends on NET
+	select CRYPTO_AEAD
+	select CRYPTO_USER_API
+	help
+	  This option enables the user-spaces interface for AEAD
+	  cipher algorithms.
+
 config CRYPTO_HASH_INFO
 	bool
 
diff --git a/crypto/Makefile b/crypto/Makefile
index ba19465..97b7d3a 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o
 obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
 obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
+obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
 
 #
 # generic algorithms and the async_tx api
-- 
2.1.0

^ permalink raw reply related

* [PATCH v10 1/2] crypto: AF_ALG: add AEAD support
From: Stephan Mueller @ 2015-01-14  3:53 UTC (permalink / raw)
  To: 'Herbert Xu'
  Cc: Daniel Borkmann, 'Quentin Gouchet', 'LKML',
	ABI/API, linux-crypto
In-Reply-To: <5489736.JvWhZNrmuD@tachyon.chronox.de>

This patch adds the AEAD support for AF_ALG.

The implementation is based on algif_skcipher, but contains heavy
modifications to streamline the interface for AEAD uses.

To use AEAD, the user space consumer has to use the salg_type named
"aead".

The AEAD implementation includes some overhead to calculate the size of
the ciphertext, because the AEAD implementation of the kernel crypto API
makes an implicit assumption about the location of the authentication tag. When
performing an encryption, the tag will be added to the created
ciphertext (note, the tag is placed adjacent to the ciphertext). For
decryption, the caller must hand in the ciphertext with the tag appended
to the ciphertext. Therefore, the selection of the used memory
needs to add/subtract the tag size from the source/destination buffers
depending on the encryption type. The code is provided with comments
explaining when and how that operation is performed.

A fully working example using all aspects of AEAD is provided at
http://www.chronox.de/libkcapi.html

Signed-off-by: Stephan Mueller <smueller@chronox.de>
---
 crypto/algif_aead.c | 680 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 680 insertions(+)
 create mode 100644 crypto/algif_aead.c

diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
new file mode 100644
index 0000000..8d1dda5
--- /dev/null
+++ b/crypto/algif_aead.c
@@ -0,0 +1,680 @@
+/*
+ * algif_aead: User-space interface for AEAD algorithms
+ *
+ * Copyright (C) 2014, Stephan Mueller <smueller@chronox.de>
+ *
+ * This file provides the user-space API for AEAD ciphers.
+ *
+ * This file is derived from algif_skcipher.c.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/scatterwalk.h>
+#include <crypto/if_alg.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <net/sock.h>
+
+struct aead_sg_list {
+	unsigned int cur;
+	struct scatterlist sg[ALG_MAX_PAGES];
+};
+
+struct aead_ctx {
+	struct aead_sg_list tsgl;
+	struct af_alg_sgl rsgl;
+
+	void *iv;
+
+	struct af_alg_completion completion;
+
+	unsigned long used;
+
+	unsigned int len;
+	bool more;
+	bool merge;
+	bool enc;
+	bool trunc;
+
+	size_t aead_assoclen;
+	struct aead_request aead_req;
+};
+
+static inline int aead_sndbuf(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+
+	return max_t(int, max_t(int, sk->sk_sndbuf & PAGE_MASK, PAGE_SIZE) -
+			  ctx->used, 0);
+}
+
+static inline bool aead_writable(struct sock *sk)
+{
+	return PAGE_SIZE <= aead_sndbuf(sk);
+}
+
+static inline bool aead_sufficient_data(struct aead_ctx *ctx)
+{
+	unsigned as = crypto_aead_authsize(crypto_aead_reqtfm(&ctx->aead_req));
+
+	return (ctx->used >= (ctx->aead_assoclen + (ctx->enc ? 0 : as)));
+}
+
+static void aead_put_sgl(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	struct scatterlist *sg = sgl->sg;
+	unsigned int i;
+
+	for (i = 0; i < sgl->cur; i++) {
+		if (!sg_page(sg + i))
+			continue;
+
+		put_page(sg_page(sg + i));
+		sg_assign_page(sg + i, NULL);
+	}
+	sgl->cur = 0;
+	ctx->used = 0;
+	ctx->more = 0;
+	ctx->merge = 0;
+	ctx->trunc = 0;
+}
+
+static int aead_wait_for_wmem(struct sock *sk, unsigned flags)
+{
+	long timeout;
+	DEFINE_WAIT(wait);
+	int err = -ERESTARTSYS;
+
+	if (flags & MSG_DONTWAIT)
+		return -EAGAIN;
+
+	set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	for (;;) {
+		if (signal_pending(current))
+			break;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		timeout = MAX_SCHEDULE_TIMEOUT;
+		if (sk_wait_event(sk, &timeout, aead_writable(sk))) {
+			err = 0;
+			break;
+		}
+	}
+	finish_wait(sk_sleep(sk), &wait);
+
+	return err;
+}
+
+static void aead_wmem_wakeup(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	if (!aead_writable(sk))
+		return;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
+							   POLLRDNORM |
+							   POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	rcu_read_unlock();
+}
+
+static int aead_wait_for_data(struct sock *sk, unsigned flags)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	long timeout;
+	DEFINE_WAIT(wait);
+	int err = -ERESTARTSYS;
+
+	if (flags & MSG_DONTWAIT) {
+		return -EAGAIN;
+	}
+
+	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+
+	for (;;) {
+		if (signal_pending(current))
+			break;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		timeout = MAX_SCHEDULE_TIMEOUT;
+		if (sk_wait_event(sk, &timeout, !ctx->more)) {
+			err = 0;
+			break;
+		}
+	}
+	finish_wait(sk_sleep(sk), &wait);
+
+	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+
+	return err;
+}
+
+static void aead_data_wakeup(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	struct socket_wq *wq;
+
+	if (ctx->more)
+		return;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
+							   POLLRDNORM |
+							   POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+	rcu_read_unlock();
+}
+
+static int aead_sendmsg(struct kiocb *unused, struct socket *sock,
+		        struct msghdr *msg, size_t size)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	unsigned ivsize =
+		crypto_aead_ivsize(crypto_aead_reqtfm(&ctx->aead_req));
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	struct af_alg_control con = {};
+	long copied = 0;
+	bool enc = 0;
+	bool init = 0;
+	int err = -EINVAL;
+
+	if (msg->msg_controllen) {
+		err = af_alg_cmsg_send(msg, &con);
+		if (err)
+			return err;
+
+		init = 1;
+		switch (con.op) {
+		case ALG_OP_ENCRYPT:
+			enc = 1;
+			break;
+		case ALG_OP_DECRYPT:
+			enc = 0;
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		if (con.iv && con.iv->ivlen != ivsize)
+			return -EINVAL;
+	}
+
+	lock_sock(sk);
+	if (!ctx->more && ctx->used)
+		goto unlock;
+
+	if (init) {
+		ctx->enc = enc;
+		if (con.iv)
+			memcpy(ctx->iv, con.iv->iv, ivsize);
+
+		ctx->aead_assoclen = con.aead_assoclen;
+	}
+
+	while (size) {
+		unsigned long len = size;
+		struct scatterlist *sg = NULL;
+
+		/* use the existing memory in an allocated page */
+		if (ctx->merge) {
+			sg = sgl->sg + sgl->cur - 1;
+			len = min_t(unsigned long, len,
+				    PAGE_SIZE - sg->offset - sg->length);
+			err = memcpy_from_msg(page_address(sg_page(sg)) +
+					      sg->offset + sg->length,
+					      msg, len);
+			if (err)
+				goto unlock;
+
+			sg->length += len;
+			ctx->merge = (sg->offset + sg->length) &
+				     (PAGE_SIZE - 1);
+
+			ctx->used += len;
+			copied += len;
+			size -= len;
+		}
+
+		if (!aead_writable(sk)) {
+			/*
+			 * If there is more data to be expected, but we cannot
+			 * write more data, forcefully define that we do not
+			 * expect more data to invoke the AEAD operation. This
+			 * prevents a deadlock in user space.
+			 */
+			ctx->more = 0;
+			ctx->trunc = 1;
+			err = aead_wait_for_wmem(sk, msg->msg_flags);
+			if (err)
+				goto unlock;
+		}
+
+		/* allocate a new page */
+		len = min_t(unsigned long, size, aead_sndbuf(sk));
+		while (len) {
+			int plen = 0;
+
+			if (sgl->cur >= ALG_MAX_PAGES) {
+				err = -E2BIG;
+				goto unlock;
+			}
+
+			sg = sgl->sg + sgl->cur;
+			plen = min_t(int, len, PAGE_SIZE);
+
+			sg_assign_page(sg, alloc_page(GFP_KERNEL));
+			err = -ENOMEM;
+			if (!sg_page(sg))
+				goto unlock;
+
+			err = memcpy_from_msg(page_address(sg_page(sg)),
+					      msg, plen);
+			if (err) {
+				__free_page(sg_page(sg));
+				sg_assign_page(sg, NULL);
+				goto unlock;
+			}
+
+			sg->offset = 0;
+			sg->length = plen;
+			len -= plen;
+			ctx->used += plen;
+			copied += plen;
+			sgl->cur++;
+			size -= plen;
+			ctx->merge = plen & (PAGE_SIZE - 1);
+		}
+	}
+
+	err = 0;
+
+	ctx->more = msg->msg_flags & MSG_MORE;
+	if (!ctx->more && !aead_sufficient_data(ctx)) {
+		aead_put_sgl(sk);
+		err = -EMSGSIZE;
+	}
+
+unlock:
+	aead_data_wakeup(sk);
+	release_sock(sk);
+
+	return err ?: copied;
+}
+
+static ssize_t aead_sendpage(struct socket *sock, struct page *page,
+			     int offset, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	int err = -EINVAL;
+
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		flags |= MSG_MORE;
+
+	if (sgl->cur >= ALG_MAX_PAGES)
+		return -E2BIG;
+
+	lock_sock(sk);
+	if (!ctx->more && ctx->used)
+		goto unlock;
+
+	if (!size)
+		goto done;
+
+	if (!aead_writable(sk)) {
+		/* see aead_sendmsg why more is set to 0 */
+		ctx->more = 0;
+		ctx->trunc = 1;
+		err = aead_wait_for_wmem(sk, flags);
+		if (err)
+			goto unlock;
+	}
+
+	ctx->merge = 0;
+
+	get_page(page);
+	sg_set_page(sgl->sg + sgl->cur, page, size, offset);
+	sgl->cur++;
+	ctx->used += size;
+
+	err = 0;
+
+done:
+	ctx->more = flags & MSG_MORE;
+	if (!ctx->more && !aead_sufficient_data(ctx)) {
+		aead_put_sgl(sk);
+		err = -EMSGSIZE;
+	}
+
+unlock:
+	aead_data_wakeup(sk);
+	release_sock(sk);
+
+	return err ?: size;
+}
+
+static int aead_recvmsg(struct kiocb *unused, struct socket *sock,
+			struct msghdr *msg, size_t ignored, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	unsigned bs = crypto_aead_blocksize(crypto_aead_reqtfm(&ctx->aead_req));
+	unsigned as = crypto_aead_authsize(crypto_aead_reqtfm(&ctx->aead_req));
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	struct scatterlist *sg = sgl->sg;
+	struct scatterlist assoc[ALG_MAX_PAGES];
+	size_t assoclen = 0;
+	unsigned int i = 0;
+	int err = -EINVAL;
+	unsigned long used = 0;
+	unsigned long outlen = 0;
+
+	/*
+	 * Require exactly one IOV block as the AEAD operation is a one shot
+	 * due to the authentication tag.
+	 */
+	if (msg->msg_iter.nr_segs != 1)
+		return -ENOMSG;
+
+	lock_sock(sk);
+	/*
+	* AEAD memory structure: For encryption, the tag is appended to the
+	* ciphertext which implies that the memory allocated for the ciphertext
+	* must be increased by the tag length. For decryption, the tag
+	* is expected to be concatenated to the ciphertext. The plaintext
+	* therefore has a memory size of the ciphertext minus the tag length.
+	*
+	* The memory structure for cipher operation has the following
+	* structure:
+	*	AEAD encryption input:  assoc data || plaintext
+	*	AEAD encryption output: ciphertext || auth tag
+	*	AEAD decryption input:  assoc data || ciphertext || auth tag
+	*	AEAD decryption output: plaintext
+	*/
+
+	if (ctx->more) {
+		err = aead_wait_for_data(sk, flags);
+		if (err)
+			goto unlock;
+	}
+
+	used = ctx->used;
+
+	/*
+	 * Make sure sufficient data is present -- note, the same check is
+	 * also present in sendmsg/sendpage. The checks in sendpage/sendmsg
+	 * shall provide an information to the data sender that something is
+	 * wrong, but they are irrelevant to maintain the kernel integrity.
+	 * We need this check here too in case user space decides to not honor
+	 * the error message in sendmsg/sendpage and still call recvmsg. This
+	 * check here protects the kernel integrity.
+	 */
+	if (!aead_sufficient_data(ctx))
+		goto unlock;
+
+	/*
+	 * The cipher operation input data is reduced by the associated data
+	 * length as this data is processed separately later on.
+	 */
+	used -= ctx->aead_assoclen;
+
+	if (ctx->enc) {
+		/* round up output buffer to multiple of block size */
+		outlen = ((used + bs - 1) / bs * bs);
+		/* add the size needed for the auth tag to be created */
+		outlen += as;
+	} else {
+		/* output data size is input without the authentication tag */
+		outlen = used - as;
+		/* round up output buffer to multiple of block size */
+		outlen = ((outlen + bs - 1) / bs * bs);
+	}
+
+	/* ensure output buffer is sufficiently large */
+	if (msg->msg_iter.iov->iov_len < outlen)
+		goto unlock;
+
+	outlen = af_alg_make_sg(&ctx->rsgl, msg->msg_iter.iov->iov_base,
+				outlen, 1);
+	err = outlen;
+	if (err < 0)
+		goto unlock;
+
+	err = -EINVAL;
+	sg_init_table(assoc, ALG_MAX_PAGES);
+	assoclen = ctx->aead_assoclen;
+	/*
+	 * Split scatterlist into two: first part becomes AD, second part
+	 * is plaintext / ciphertext. The first part is assigned to assoc
+	 * scatterlist. When this loop finishes, sg points to the start of the
+	 * plaintext / ciphertext.
+	 */
+	for (i = 0; i < ctx->tsgl.cur; i++) {
+		sg = sgl->sg + i;
+		if (sg->length <= assoclen) {
+			/* AD is larger than one page */
+			sg_set_page(assoc + i, sg_page(sg),
+				    sg->length, sg->offset);
+			assoclen -= sg->length;
+			if (i >= ctx->tsgl.cur)
+				goto unlock;
+		} else if (!assoclen) {
+			/* current page is to start of plaintext / ciphertext */
+			if (i)
+				/* AD terminates at page boundary */
+				sg_mark_end(assoc + i - 1);
+			else
+				/* AD size is zero */
+				sg_mark_end(assoc);
+			break;
+		} else {
+			/* AD does not terminate at page boundary */
+			sg_set_page(assoc + i, sg_page(sg),
+				    assoclen, sg->offset);
+			sg_mark_end(assoc + i);
+			/* plaintext / ciphertext starts after AD */
+			sg->length -= assoclen;
+			sg->offset += assoclen;
+			break;
+		}
+	}
+
+	aead_request_set_assoc(&ctx->aead_req, assoc, ctx->aead_assoclen);
+	aead_request_set_crypt(&ctx->aead_req, sg, ctx->rsgl.sg, used, ctx->iv);
+
+	err = af_alg_wait_for_completion(ctx->enc ?
+					 crypto_aead_encrypt(&ctx->aead_req) :
+					 crypto_aead_decrypt(&ctx->aead_req),
+					 &ctx->completion);
+
+	af_alg_free_sg(&ctx->rsgl);
+
+	/* indicate userspace that we processed incomplete data */
+	if (ctx->trunc)
+		msg->msg_flags |= MSG_TRUNC;
+
+	if (err) {
+		/* EBADMSG implies a valid cipher operation took place */
+		if (err == -EBADMSG)
+			aead_put_sgl(sk);
+		goto unlock;
+	}
+
+	aead_put_sgl(sk);
+
+	err = 0;
+
+unlock:
+	aead_wmem_wakeup(sk);
+	release_sock(sk);
+
+	return err ? err : outlen;
+}
+
+/*
+ * Poll handler for the AEAD socket.
+ *
+ * Registers the caller on the socket's wait queue and returns the current
+ * event mask: the socket is reported readable while ctx->more is clear and
+ * writable while aead_writable() holds.
+ */
+static unsigned int aead_poll(struct file *file, struct socket *sock,
+			      poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct aead_ctx *ctx = alg_sk(sk)->private;
+	unsigned int events = 0;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
+	if (!ctx->more)
+		events |= POLLIN | POLLRDNORM;
+	if (aead_writable(sk))
+		events |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+	return events;
+}
+
+/*
+ * Socket operations of the AEAD socket. release, sendmsg, sendpage, recvmsg
+ * and poll are wired to real handlers; every other operation is routed to
+ * the matching sock_no_*() helper.
+ */
+static struct proto_ops algif_aead_ops = {
+	.family		= PF_ALG,
+
+	/* operations with a real handler */
+	.release	= af_alg_release,
+	.sendmsg	= aead_sendmsg,
+	.sendpage	= aead_sendpage,
+	.recvmsg	= aead_recvmsg,
+	.poll		= aead_poll,
+
+	/* operations routed to the sock_no_*() helpers */
+	.bind		= sock_no_bind,
+	.connect	= sock_no_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.getname	= sock_no_getname,
+	.ioctl		= sock_no_ioctl,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_no_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+	.mmap		= sock_no_mmap,
+};
+
+/*
+ * ->bind hook: allocate the AEAD transform selected by @name, @type and
+ * @mask. The value returned by crypto_alloc_aead() is passed through
+ * unchanged.
+ */
+static void *aead_bind(const char *name, u32 type, u32 mask)
+{
+	struct crypto_aead *tfm = crypto_alloc_aead(name, type, mask);
+
+	return tfm;
+}
+
+/* ->release hook: free the AEAD transform passed in as @private. */
+static void aead_release(void *private)
+{
+	struct crypto_aead *tfm = private;
+
+	crypto_free_aead(tfm);
+}
+
+/*
+ * ->setauthsize hook: forward @authsize to the AEAD transform in @private
+ * and return the result of crypto_aead_setauthsize().
+ */
+static int aead_setauthsize(void *private, unsigned int authsize)
+{
+	struct crypto_aead *tfm = private;
+
+	return crypto_aead_setauthsize(tfm, authsize);
+}
+
+/*
+ * ->setkey hook: install @key (@keylen bytes) on the AEAD transform in
+ * @private and return the result of crypto_aead_setkey().
+ */
+static int aead_setkey(void *private, const u8 *key, unsigned int keylen)
+{
+	struct crypto_aead *tfm = private;
+
+	return crypto_aead_setkey(tfm, key, keylen);
+}
+
+/*
+ * Socket destructor installed by aead_accept_parent().
+ *
+ * Releases the pending scatterlist via aead_put_sgl(), frees the IV with
+ * sock_kzfree_s() and the context with sock_kfree_s(), then calls
+ * af_alg_release_parent(). The IV length is read before anything is freed
+ * because it is derived from the request embedded in the context.
+ */
+static void aead_sock_destruct(struct sock *sk)
+{
+	struct aead_ctx *ctx = alg_sk(sk)->private;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(&ctx->aead_req);
+	unsigned int ivsize = crypto_aead_ivsize(tfm);
+
+	aead_put_sgl(sk);
+	sock_kzfree_s(sk, ctx->iv, ivsize);
+	sock_kfree_s(sk, ctx, ctx->len);
+	af_alg_release_parent(sk);
+}
+
+/*
+ * ->accept hook: set up the per-socket AEAD context for @sk.
+ *
+ * @private is handed to the crypto_aead_*() helpers as the transform the
+ * embedded request is bound to. The context (struct aead_ctx plus the
+ * transform's request size) and the IV buffer are both allocated with
+ * sock_kmalloc() and zeroed. aead_sock_destruct() is installed as the
+ * socket destructor to release them.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails.
+ */
+static int aead_accept_parent(void *private, struct sock *sk)
+{
+	struct crypto_aead *tfm = private;
+	struct alg_sock *ask = alg_sk(sk);
+	unsigned int ctxsize = sizeof(struct aead_ctx) +
+			       crypto_aead_reqsize(tfm);
+	unsigned int ivsize = crypto_aead_ivsize(tfm);
+	struct aead_ctx *ctx;
+	void *iv;
+
+	ctx = sock_kmalloc(sk, ctxsize, GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	memset(ctx, 0, ctxsize);
+
+	iv = sock_kmalloc(sk, ivsize, GFP_KERNEL);
+	if (!iv) {
+		/* undo the context allocation before bailing out */
+		sock_kfree_s(sk, ctx, ctxsize);
+		return -ENOMEM;
+	}
+	memset(iv, 0, ivsize);
+	ctx->iv = iv;
+
+	/* remember the allocation size for sock_kfree_s() in the destructor */
+	ctx->len = ctxsize;
+
+	/* explicit initial state of the context */
+	ctx->used = 0;
+	ctx->more = 0;
+	ctx->merge = 0;
+	ctx->enc = 0;
+	ctx->trunc = 0;
+	ctx->aead_assoclen = 0;
+	ctx->tsgl.cur = 0;
+	sg_init_table(ctx->tsgl.sg, ALG_MAX_PAGES);
+	af_alg_init_completion(&ctx->completion);
+
+	ask->private = ctx;
+
+	/* bind the embedded request to the transform and completion */
+	aead_request_set_tfm(&ctx->aead_req, tfm);
+	aead_request_set_callback(&ctx->aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				  af_alg_complete, &ctx->completion);
+
+	sk->sk_destruct = aead_sock_destruct;
+
+	return 0;
+}
+
+/*
+ * af_alg type descriptor for "aead": ties the hooks defined in this file
+ * and the algif_aead_ops socket operations together under one name.
+ */
+static const struct af_alg_type algif_type_aead = {
+	.name		= "aead",
+	.owner		= THIS_MODULE,
+	.ops		= &algif_aead_ops,
+
+	.bind		= aead_bind,
+	.release	= aead_release,
+	.setkey		= aead_setkey,
+	.setauthsize	= aead_setauthsize,
+	.accept		= aead_accept_parent,
+};
+
+/*
+ * Module init: register algif_type_aead with af_alg and return the result
+ * of af_alg_register_type().
+ */
+static int __init algif_aead_init(void)
+{
+	int ret = af_alg_register_type(&algif_type_aead);
+
+	return ret;
+}
+
+/*
+ * Module exit: unregister algif_type_aead. A non-zero result from
+ * af_alg_unregister_type() triggers BUG_ON().
+ */
+static void __exit algif_aead_exit(void)
+{
+	int ret;
+
+	ret = af_alg_unregister_type(&algif_type_aead);
+	BUG_ON(ret);
+}
+
+module_init(algif_aead_init);	/* registers algif_type_aead */
+module_exit(algif_aead_exit);	/* unregisters algif_type_aead */
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>");
+MODULE_DESCRIPTION("AEAD kernel crypto API user space interface");
-- 
2.1.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox