Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH v3 03/41] virtio: add virtio 1.0 feature bit
From: Michael S. Tsirkin @ 2014-11-24 11:52 UTC (permalink / raw)
  To: linux-kernel; +Cc: rusty, linux-api, virtualization, pbonzini, David Miller
In-Reply-To: <1416829787-14252-1-git-send-email-mst@redhat.com>

Based on original patches by Rusty Russell, Thomas Huth
and Cornelia Huck.

Note: at this time, we do not negotiate this feature bit
in core, drivers have to declare VERSION_1 support explicitly.

After all drivers are converted, we will be able to
move VERSION_1 to core and drop it from all drivers.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
---
 include/uapi/linux/virtio_config.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 3ce768c..f3fe33a 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -41,11 +41,11 @@
 /* We've given up on this device. */
 #define VIRTIO_CONFIG_S_FAILED		0x80
 
-/* Some virtio feature bits (currently bits 28 through 31) are reserved for the
+/* Some virtio feature bits (currently bits 28 through 32) are reserved for the
  * transport being used (eg. virtio_ring), the rest are per-device feature
  * bits. */
 #define VIRTIO_TRANSPORT_F_START	28
-#define VIRTIO_TRANSPORT_F_END		32
+#define VIRTIO_TRANSPORT_F_END		33
 
 /* Do we get callbacks when the ring is completely used, even if we've
  * suppressed them? */
@@ -54,4 +54,7 @@
 /* Can the device handle any descriptor layout? */
 #define VIRTIO_F_ANY_LAYOUT		27
 
+/* v1.0 compliant. */
+#define VIRTIO_F_VERSION_1		32
+
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
-- 
MST

^ permalink raw reply related

* [PATCH v3 04/41] virtio: memory access APIs
From: Michael S. Tsirkin @ 2014-11-24 11:52 UTC (permalink / raw)
  To: linux-kernel
  Cc: Bjarke Istrup Pedersen, Anup Patel, rusty, Greg Kroah-Hartman,
	virtualization, Geert Uytterhoeven, Sakari Ailus, linux-api,
	pbonzini, Pranavkumar Sawargaonkar, David Miller,
	Alexei Starovoitov
In-Reply-To: <1416829787-14252-1-git-send-email-mst@redhat.com>

virtio 1.0 makes all memory structures LE, so
we need APIs to conditionally do a byteswap on BE
architectures.

To make it easier to check code statically,
add virtio specific types for multi-byte integers
in memory.

Add low level wrappers that do a byteswap conditionally, these will be
useful e.g. for vhost.  Add high level wrappers that
query device endian-ness and act accordingly.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_byteorder.h  | 59 +++++++++++++++++++++++++++++++++++++++
 include/linux/virtio_config.h     | 32 +++++++++++++++++++++
 include/uapi/linux/virtio_ring.h  | 45 ++++++++++++++---------------
 include/uapi/linux/virtio_types.h | 48 +++++++++++++++++++++++++++++++
 include/uapi/linux/Kbuild         |  1 +
 5 files changed, 163 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/virtio_byteorder.h
 create mode 100644 include/uapi/linux/virtio_types.h

diff --git a/include/linux/virtio_byteorder.h b/include/linux/virtio_byteorder.h
new file mode 100644
index 0000000..824ed0b
--- /dev/null
+++ b/include/linux/virtio_byteorder.h
@@ -0,0 +1,59 @@
+#ifndef _LINUX_VIRTIO_BYTEORDER_H
+#define _LINUX_VIRTIO_BYTEORDER_H
+#include <linux/types.h>
+#include <uapi/linux/virtio_types.h>
+
+/*
+ * Memory accessors for handling virtio in modern little endian and in
+ * compatibility native endian format.
+ */
+
+static inline u16 __virtio16_to_cpu(bool little_endian, __virtio16 val)
+{
+	if (little_endian)
+		return le16_to_cpu((__force __le16)val);
+	else
+		return (__force u16)val;
+}
+
+static inline __virtio16 __cpu_to_virtio16(bool little_endian, u16 val)
+{
+	if (little_endian)
+		return (__force __virtio16)cpu_to_le16(val);
+	else
+		return (__force __virtio16)val;
+}
+
+static inline u32 __virtio32_to_cpu(bool little_endian, __virtio32 val)
+{
+	if (little_endian)
+		return le32_to_cpu((__force __le32)val);
+	else
+		return (__force u32)val;
+}
+
+static inline __virtio32 __cpu_to_virtio32(bool little_endian, u32 val)
+{
+	if (little_endian)
+		return (__force __virtio32)cpu_to_le32(val);
+	else
+		return (__force __virtio32)val;
+}
+
+static inline u64 __virtio64_to_cpu(bool little_endian, __virtio64 val)
+{
+	if (little_endian)
+		return le64_to_cpu((__force __le64)val);
+	else
+		return (__force u64)val;
+}
+
+static inline __virtio64 __cpu_to_virtio64(bool little_endian, u64 val)
+{
+	if (little_endian)
+		return (__force __virtio64)cpu_to_le64(val);
+	else
+		return (__force __virtio64)val;
+}
+
+#endif /* _LINUX_VIRTIO_BYTEORDER */
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 022d904..b9cd689 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -4,6 +4,7 @@
 #include <linux/err.h>
 #include <linux/bug.h>
 #include <linux/virtio.h>
+#include <linux/virtio_byteorder.h>
 #include <uapi/linux/virtio_config.h>
 
 /**
@@ -152,6 +153,37 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
 	return 0;
 }
 
+/* Memory accessors */
+static inline u16 virtio16_to_cpu(struct virtio_device *vdev, __virtio16 val)
+{
+	return __virtio16_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+}
+
+static inline __virtio16 cpu_to_virtio16(struct virtio_device *vdev, u16 val)
+{
+	return __cpu_to_virtio16(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+}
+
+static inline u32 virtio32_to_cpu(struct virtio_device *vdev, __virtio32 val)
+{
+	return __virtio32_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+}
+
+static inline __virtio32 cpu_to_virtio32(struct virtio_device *vdev, u32 val)
+{
+	return __cpu_to_virtio32(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+}
+
+static inline u64 virtio64_to_cpu(struct virtio_device *vdev, __virtio64 val)
+{
+	return __virtio64_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+}
+
+static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val)
+{
+	return __cpu_to_virtio64(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+}
+
 /* Config space accessors. */
 #define virtio_cread(vdev, structname, member, ptr)			\
 	do {								\
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index a99f9b7..61c818a 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -32,6 +32,7 @@
  *
  * Copyright Rusty Russell IBM Corporation 2007. */
 #include <linux/types.h>
+#include <linux/virtio_types.h>
 
 /* This marks a buffer as continuing via the next field. */
 #define VRING_DESC_F_NEXT	1
@@ -61,32 +62,32 @@
 /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
 struct vring_desc {
 	/* Address (guest-physical). */
-	__u64 addr;
+	__virtio64 addr;
 	/* Length. */
-	__u32 len;
+	__virtio32 len;
 	/* The flags as indicated above. */
-	__u16 flags;
+	__virtio16 flags;
 	/* We chain unused descriptors via this, too */
-	__u16 next;
+	__virtio16 next;
 };
 
 struct vring_avail {
-	__u16 flags;
-	__u16 idx;
-	__u16 ring[];
+	__virtio16 flags;
+	__virtio16 idx;
+	__virtio16 ring[];
 };
 
 /* u32 is used here for ids for padding reasons. */
 struct vring_used_elem {
 	/* Index of start of used descriptor chain. */
-	__u32 id;
+	__virtio32 id;
 	/* Total length of the descriptor chain which was used (written to) */
-	__u32 len;
+	__virtio32 len;
 };
 
 struct vring_used {
-	__u16 flags;
-	__u16 idx;
+	__virtio16 flags;
+	__virtio16 idx;
 	struct vring_used_elem ring[];
 };
 
@@ -109,25 +110,25 @@ struct vring {
  *	struct vring_desc desc[num];
  *
  *	// A ring of available descriptor heads with free-running index.
- *	__u16 avail_flags;
- *	__u16 avail_idx;
- *	__u16 available[num];
- *	__u16 used_event_idx;
+ *	__virtio16 avail_flags;
+ *	__virtio16 avail_idx;
+ *	__virtio16 available[num];
+ *	__virtio16 used_event_idx;
  *
  *	// Padding to the next align boundary.
  *	char pad[];
  *
  *	// A ring of used descriptor heads with free-running index.
- *	__u16 used_flags;
- *	__u16 used_idx;
+ *	__virtio16 used_flags;
+ *	__virtio16 used_idx;
  *	struct vring_used_elem used[num];
- *	__u16 avail_event_idx;
+ *	__virtio16 avail_event_idx;
  * };
  */
 /* We publish the used event index at the end of the available ring, and vice
  * versa. They are at the end for backwards compatibility. */
 #define vring_used_event(vr) ((vr)->avail->ring[(vr)->num])
-#define vring_avail_event(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num])
+#define vring_avail_event(vr) (*(__virtio16 *)&(vr)->used->ring[(vr)->num])
 
 static inline void vring_init(struct vring *vr, unsigned int num, void *p,
 			      unsigned long align)
@@ -135,15 +136,15 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p,
 	vr->num = num;
 	vr->desc = p;
 	vr->avail = p + num*sizeof(struct vring_desc);
-	vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__u16)
+	vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__virtio16)
 		+ align-1) & ~(align - 1));
 }
 
 static inline unsigned vring_size(unsigned int num, unsigned long align)
 {
-	return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num)
+	return ((sizeof(struct vring_desc) * num + sizeof(__virtio16) * (3 + num)
 		 + align - 1) & ~(align - 1))
-		+ sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num;
+		+ sizeof(__virtio16) * 3 + sizeof(struct vring_used_elem) * num;
 }
 
 /* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */
diff --git a/include/uapi/linux/virtio_types.h b/include/uapi/linux/virtio_types.h
new file mode 100644
index 0000000..b90385f
--- /dev/null
+++ b/include/uapi/linux/virtio_types.h
@@ -0,0 +1,48 @@
+#ifndef _UAPI_LINUX_VIRTIO_TYPES_H
+#define _UAPI_LINUX_VIRTIO_TYPES_H
+/* An interface for efficient virtio implementation, currently for use by KVM
+ * and lguest, but hopefully others soon.  Do NOT change this since it will
+ * break existing servers and clients.
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (C) 2014 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ */
+#include <linux/types.h>
+
+/*
+ * __virtio{16,32,64} have the following meaning:
+ * - __u{16,32,64} for virtio devices in legacy mode, accessed in native endian
+ * - __le{16,32,64} for standard-compliant virtio devices
+ */
+
+typedef __u16 __bitwise__ __virtio16;
+typedef __u32 __bitwise__ __virtio32;
+typedef __u64 __bitwise__ __virtio64;
+
+#endif /* _UAPI_LINUX_VIRTIO_TYPES_H */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 4c94f31..44a5581 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -423,6 +423,7 @@ header-y += virtio_blk.h
 header-y += virtio_config.h
 header-y += virtio_console.h
 header-y += virtio_ids.h
+header-y += virtio_types.h
 header-y += virtio_net.h
 header-y += virtio_pci.h
 header-y += virtio_ring.h
-- 
MST

^ permalink raw reply related

* [PATCH v3 08/41] virtio: set FEATURES_OK
From: Michael S. Tsirkin @ 2014-11-24 11:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, Rusty Russell,
	virtualization, linux-api
In-Reply-To: <1416829787-14252-1-git-send-email-mst@redhat.com>

set FEATURES_OK as per virtio 1.0 spec

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_config.h |  2 ++
 drivers/virtio/virtio.c            | 29 ++++++++++++++++++++++-------
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index f3fe33a..a6d0cde 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -38,6 +38,8 @@
 #define VIRTIO_CONFIG_S_DRIVER		2
 /* Driver has used its parts of the config, and is happy */
 #define VIRTIO_CONFIG_S_DRIVER_OK	4
+/* Driver has finished configuring features */
+#define VIRTIO_CONFIG_S_FEATURES_OK	8
 /* We've given up on this device. */
 #define VIRTIO_CONFIG_S_FAILED		0x80
 
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index d213567..a3df817 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -160,6 +160,7 @@ static int virtio_dev_probe(struct device *_d)
 	struct virtio_device *dev = dev_to_virtio(_d);
 	struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
 	u64 device_features;
+	unsigned status;
 
 	/* We have a driver! */
 	add_status(dev, VIRTIO_CONFIG_S_DRIVER);
@@ -183,18 +184,32 @@ static int virtio_dev_probe(struct device *_d)
 
 	dev->config->finalize_features(dev);
 
+	if (virtio_has_feature(dev, VIRTIO_F_VERSION_1)) {
+		add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
+		status = dev->config->get_status(dev);
+		if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) {
+			printk(KERN_ERR "virtio: device refuses features: %x\n",
+			       status);
+			err = -ENODEV;
+			goto err;
+		}
+	}
+
 	err = drv->probe(dev);
 	if (err)
-		add_status(dev, VIRTIO_CONFIG_S_FAILED);
-	else {
-		add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
-		if (drv->scan)
-			drv->scan(dev);
+		goto err;
 
-		virtio_config_enable(dev);
-	}
+	add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
+	if (drv->scan)
+		drv->scan(dev);
+
+	virtio_config_enable(dev);
 
+	return 0;
+err:
+	add_status(dev, VIRTIO_CONFIG_S_FAILED);
 	return err;
+
 }
 
 static int virtio_dev_remove(struct device *_d)
-- 
MST

^ permalink raw reply related

* [PATCH v3 11/41] virtio_net: v1.0 endianness
From: Michael S. Tsirkin @ 2014-11-24 11:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: rusty, netdev, virtualization, linux-api, pbonzini, David Miller
In-Reply-To: <1416829787-14252-1-git-send-email-mst@redhat.com>

Based on patches by Rusty Russell, Cornelia Huck.
Note: more code changes are needed for 1.0 support
(due to different header size).
So we don't advertize support for 1.0 yet.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_net.h | 15 ++++++++-------
 drivers/net/virtio_net.c        | 33 ++++++++++++++++++++-------------
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 172a7f0..b5f1677 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -28,6 +28,7 @@
 #include <linux/types.h>
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
+#include <linux/virtio_types.h>
 #include <linux/if_ether.h>
 
 /* The feature bitmap for virtio net */
@@ -84,17 +85,17 @@ struct virtio_net_hdr {
 #define VIRTIO_NET_HDR_GSO_TCPV6	4	// GSO frame, IPv6 TCP
 #define VIRTIO_NET_HDR_GSO_ECN		0x80	// TCP has ECN set
 	__u8 gso_type;
-	__u16 hdr_len;		/* Ethernet + IP + tcp/udp hdrs */
-	__u16 gso_size;		/* Bytes to append to hdr_len per frame */
-	__u16 csum_start;	/* Position to start checksumming from */
-	__u16 csum_offset;	/* Offset after that to place checksum */
+	__virtio16 hdr_len;		/* Ethernet + IP + tcp/udp hdrs */
+	__virtio16 gso_size;		/* Bytes to append to hdr_len per frame */
+	__virtio16 csum_start;	/* Position to start checksumming from */
+	__virtio16 csum_offset;	/* Offset after that to place checksum */
 };
 
 /* This is the version of the header to use when the MRG_RXBUF
  * feature has been negotiated. */
 struct virtio_net_hdr_mrg_rxbuf {
 	struct virtio_net_hdr hdr;
-	__u16 num_buffers;	/* Number of merged rx buffers */
+	__virtio16 num_buffers;	/* Number of merged rx buffers */
 };
 
 /*
@@ -149,7 +150,7 @@ typedef __u8 virtio_net_ctrl_ack;
  * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
  */
 struct virtio_net_ctrl_mac {
-	__u32 entries;
+	__virtio32 entries;
 	__u8 macs[][ETH_ALEN];
 } __attribute__((packed));
 
@@ -193,7 +194,7 @@ struct virtio_net_ctrl_mac {
  * specified.
  */
 struct virtio_net_ctrl_mq {
-	__u16 virtqueue_pairs;
+	__virtio16 virtqueue_pairs;
 };
 
 #define VIRTIO_NET_CTRL_MQ   4
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b0bc8ea..c07e030 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -347,13 +347,14 @@ err:
 }
 
 static struct sk_buff *receive_mergeable(struct net_device *dev,
+					 struct virtnet_info *vi,
 					 struct receive_queue *rq,
 					 unsigned long ctx,
 					 unsigned int len)
 {
 	void *buf = mergeable_ctx_to_buf_address(ctx);
 	struct skb_vnet_hdr *hdr = buf;
-	int num_buf = hdr->mhdr.num_buffers;
+	u16 num_buf = virtio16_to_cpu(rq->vq->vdev, hdr->mhdr.num_buffers);
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
 	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
@@ -369,7 +370,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len);
 		if (unlikely(!ctx)) {
 			pr_debug("%s: rx error: %d buffers out of %d missing\n",
-				 dev->name, num_buf, hdr->mhdr.num_buffers);
+				 dev->name, num_buf,
+				 virtio16_to_cpu(rq->vq->vdev,
+						 hdr->mhdr.num_buffers));
 			dev->stats.rx_length_errors++;
 			goto err_buf;
 		}
@@ -454,7 +457,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 	}
 
 	if (vi->mergeable_rx_bufs)
-		skb = receive_mergeable(dev, rq, (unsigned long)buf, len);
+		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len);
 	else if (vi->big_packets)
 		skb = receive_big(dev, rq, buf, len);
 	else
@@ -473,8 +476,8 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
 		pr_debug("Needs csum!\n");
 		if (!skb_partial_csum_set(skb,
-					  hdr->hdr.csum_start,
-					  hdr->hdr.csum_offset))
+			  virtio16_to_cpu(vi->vdev, hdr->hdr.csum_start),
+			  virtio16_to_cpu(vi->vdev, hdr->hdr.csum_offset)))
 			goto frame_err;
 	} else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) {
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -514,7 +517,8 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 		if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
 			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 
-		skb_shinfo(skb)->gso_size = hdr->hdr.gso_size;
+		skb_shinfo(skb)->gso_size = virtio16_to_cpu(vi->vdev,
+							    hdr->hdr.gso_size);
 		if (skb_shinfo(skb)->gso_size == 0) {
 			net_warn_ratelimited("%s: zero gso size.\n", dev->name);
 			goto frame_err;
@@ -876,16 +880,19 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-		hdr->hdr.csum_start = skb_checksum_start_offset(skb);
-		hdr->hdr.csum_offset = skb->csum_offset;
+		hdr->hdr.csum_start = cpu_to_virtio16(vi->vdev,
+						skb_checksum_start_offset(skb));
+		hdr->hdr.csum_offset = cpu_to_virtio16(vi->vdev,
+							 skb->csum_offset);
 	} else {
 		hdr->hdr.flags = 0;
 		hdr->hdr.csum_offset = hdr->hdr.csum_start = 0;
 	}
 
 	if (skb_is_gso(skb)) {
-		hdr->hdr.hdr_len = skb_headlen(skb);
-		hdr->hdr.gso_size = skb_shinfo(skb)->gso_size;
+		hdr->hdr.hdr_len = cpu_to_virtio16(vi->vdev, skb_headlen(skb));
+		hdr->hdr.gso_size = cpu_to_virtio16(vi->vdev,
+						    skb_shinfo(skb)->gso_size);
 		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
 			hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
 		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
@@ -1112,7 +1119,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
 	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
 		return 0;
 
-	s.virtqueue_pairs = queue_pairs;
+	s.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
 	sg_init_one(&sg, &s, sizeof(s));
 
 	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
@@ -1189,7 +1196,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 	sg_init_table(sg, 2);
 
 	/* Store the unicast list and count in the front of the buffer */
-	mac_data->entries = uc_count;
+	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
 	i = 0;
 	netdev_for_each_uc_addr(ha, dev)
 		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
@@ -1200,7 +1207,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
 	/* multicast list and count fill the end */
 	mac_data = (void *)&mac_data->macs[uc_count][0];
 
-	mac_data->entries = mc_count;
+	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
 	i = 0;
 	netdev_for_each_mc_addr(ha, dev)
 		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
-- 
MST

^ permalink raw reply related

* [PATCH v3 12/41] virtio_blk: v1.0 support
From: Michael S. Tsirkin @ 2014-11-24 11:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, Thomas Huth,
	David Hildenbrand, Rusty Russell, virtualization, linux-api
In-Reply-To: <1416829787-14252-1-git-send-email-mst@redhat.com>

Based on patch by Cornelia Huck.

Note: for consistency, and to avoid sparse errors,
      convert all fields, even those no longer in use
      for virtio v1.0.

Reviewed-by: Thomas Huth <thuth@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_blk.h | 15 ++++-----
 drivers/block/virtio_blk.c      | 70 ++++++++++++++++++++++++-----------------
 2 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index 9ad67b2..247c8ba 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h
@@ -28,6 +28,7 @@
 #include <linux/types.h>
 #include <linux/virtio_ids.h>
 #include <linux/virtio_config.h>
+#include <linux/virtio_types.h>
 
 /* Feature bits */
 #define VIRTIO_BLK_F_BARRIER	0	/* Does host support barriers? */
@@ -114,18 +115,18 @@ struct virtio_blk_config {
 /* This is the first element of the read scatter-gather list. */
 struct virtio_blk_outhdr {
 	/* VIRTIO_BLK_T* */
-	__u32 type;
+	__virtio32 type;
 	/* io priority. */
-	__u32 ioprio;
+	__virtio32 ioprio;
 	/* Sector (ie. 512 byte offset) */
-	__u64 sector;
+	__virtio64 sector;
 };
 
 struct virtio_scsi_inhdr {
-	__u32 errors;
-	__u32 data_len;
-	__u32 sense_len;
-	__u32 residual;
+	__virtio32 errors;
+	__virtio32 data_len;
+	__virtio32 sense_len;
+	__virtio32 residual;
 };
 
 /* And this is the final byte of the write scatter-gather list. */
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c6a27d5..f601f16 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -80,7 +80,7 @@ static int __virtblk_add_req(struct virtqueue *vq,
 {
 	struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
 	unsigned int num_out = 0, num_in = 0;
-	int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
+	__virtio32 type = vbr->out_hdr.type & ~cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT);
 
 	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
 	sgs[num_out++] = &hdr;
@@ -91,19 +91,19 @@ static int __virtblk_add_req(struct virtqueue *vq,
 	 * block, and before the normal inhdr we put the sense data and the
 	 * inhdr with additional status information.
 	 */
-	if (type == VIRTIO_BLK_T_SCSI_CMD) {
+	if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) {
 		sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
 		sgs[num_out++] = &cmd;
 	}
 
 	if (have_data) {
-		if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
+		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
 			sgs[num_out++] = data_sg;
 		else
 			sgs[num_out + num_in++] = data_sg;
 	}
 
-	if (type == VIRTIO_BLK_T_SCSI_CMD) {
+	if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) {
 		sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
 		sgs[num_out + num_in++] = &sense;
 		sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
@@ -119,12 +119,13 @@ static int __virtblk_add_req(struct virtqueue *vq,
 static inline void virtblk_request_done(struct request *req)
 {
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
+	struct virtio_blk *vblk = req->q->queuedata;
 	int error = virtblk_result(vbr);
 
 	if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
-		req->resid_len = vbr->in_hdr.residual;
-		req->sense_len = vbr->in_hdr.sense_len;
-		req->errors = vbr->in_hdr.errors;
+		req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
+		req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
+		req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
 	} else if (req->cmd_type == REQ_TYPE_SPECIAL) {
 		req->errors = (error != 0);
 	}
@@ -173,25 +174,25 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
 
 	vbr->req = req;
 	if (req->cmd_flags & REQ_FLUSH) {
-		vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
+		vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH);
 		vbr->out_hdr.sector = 0;
-		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+		vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
 	} else {
 		switch (req->cmd_type) {
 		case REQ_TYPE_FS:
 			vbr->out_hdr.type = 0;
-			vbr->out_hdr.sector = blk_rq_pos(vbr->req);
-			vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+			vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, blk_rq_pos(vbr->req));
+			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
 			break;
 		case REQ_TYPE_BLOCK_PC:
-			vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
+			vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_SCSI_CMD);
 			vbr->out_hdr.sector = 0;
-			vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
 			break;
 		case REQ_TYPE_SPECIAL:
-			vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID;
+			vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
 			vbr->out_hdr.sector = 0;
-			vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
+			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
 			break;
 		default:
 			/* We don't put anything else in the queue. */
@@ -204,9 +205,9 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
 	num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
 	if (num) {
 		if (rq_data_dir(vbr->req) == WRITE)
-			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
+			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
 		else
-			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
+			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
 	}
 
 	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
@@ -476,7 +477,8 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev)
 				   struct virtio_blk_config, wce,
 				   &writeback);
 	if (err)
-		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE);
+		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE) ||
+		            virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
 
 	return writeback;
 }
@@ -821,25 +823,35 @@ static const struct virtio_device_id id_table[] = {
 	{ 0 },
 };
 
-static unsigned int features[] = {
+static unsigned int features_legacy[] = {
 	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
 	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
 	VIRTIO_BLK_F_MQ,
+}
+;
+static unsigned int features[] = {
+	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
+	VIRTIO_BLK_F_TOPOLOGY,
+	VIRTIO_BLK_F_MQ,
+	VIRTIO_F_VERSION_1,
 };
 
 static struct virtio_driver virtio_blk = {
-	.feature_table		= features,
-	.feature_table_size	= ARRAY_SIZE(features),
-	.driver.name		= KBUILD_MODNAME,
-	.driver.owner		= THIS_MODULE,
-	.id_table		= id_table,
-	.probe			= virtblk_probe,
-	.remove			= virtblk_remove,
-	.config_changed		= virtblk_config_changed,
+	.feature_table			= features,
+	.feature_table_size		= ARRAY_SIZE(features),
+	.feature_table_legacy		= features_legacy,
+	.feature_table_size_legacy	= ARRAY_SIZE(features_legacy),
+	.driver.name			= KBUILD_MODNAME,
+	.driver.owner			= THIS_MODULE,
+	.id_table			= id_table,
+	.probe				= virtblk_probe,
+	.remove				= virtblk_remove,
+	.config_changed			= virtblk_config_changed,
 #ifdef CONFIG_PM_SLEEP
-	.freeze			= virtblk_freeze,
-	.restore		= virtblk_restore,
+	.freeze				= virtblk_freeze,
+	.restore			= virtblk_restore,
 #endif
 };
 
-- 
MST

^ permalink raw reply related

* [PATCHv10 0/5] syscalls,x86,sparc: Add execveat() system call
From: David Drysdale @ 2014-11-24 11:53 UTC (permalink / raw)
  To: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel, Andrew Morton, David Miller, Thomas Gleixner
  Cc: Stephen Rothwell, Oleg Nesterov, Michael Kerrisk, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86, linux-arch, linux-api, sparclinux,
	David Drysdale

This patch set adds execveat(2) for x86 and sparc, and is derived from
Meredydd Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).

The primary aim of adding an execveat syscall is to allow an
implementation of fexecve(3) that does not rely on the /proc
filesystem, at least for executables (rather than scripts).  The
current glibc version of fexecve(3) is implemented via /proc, which
causes problems in sandboxed or otherwise restricted environments.

Given the desire for a /proc-free fexecve() implementation, HPA
suggested (https://lkml.org/lkml/2006/7/11/556) that an execveat(2)
syscall would be an appropriate generalization.

Also, having a new syscall means that it can take a flags argument
without back-compatibility concerns.  The current implementation just
defines the AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW flags, but other
flags could be added in future -- for example, flags for new namespaces
(as suggested at https://lkml.org/lkml/2006/7/11/474).

Related history:
 - https://lkml.org/lkml/2006/12/27/123 is an example of someone
   realizing that fexecve() is likely to fail in a chroot environment.
 - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered
   documenting the /proc requirement of fexecve(3) in its manpage, to
   "prevent other people from wasting their time".
 - https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a
   problem where a process that did setuid() could not fexecve()
   because it no longer had access to /proc/self/fd; this has since
   been fixed.


Changes since v9:
 - Add sparc syscall wrappers and use correct sparc 32b compatibility
   function [Stephen Rothwell, David S. Miller]

Changes since v8:
 - Split core/fs changes from x86 changes [Thomas Gleixner]

Changes since v7:
 - Speculatively wire up sparc version of syscall (untested)
 - Fix leak of pathbuf in mainline arm [Oleg Nesterov]
 - Add rcu_dereference_raw() on fdt access [sparse kbuild robot]
 - Realigned comment [Andrew Morton]
 - Merged up to v3.18-rc4

Changes since v6:
 - Remove special case for O_PATH file descriptors [Andy Lutomirski]
 - Use kasprintf rather than error-prone arithmetic [Kees Cook]
 - Add test for long name [Kees Cook]
 - Add test for non-executable O_PATH fd [Andy Lutomirski]

Changes since v5:
 - Set new flag in bprm->interp_flags for O_CLOEXEC fds, so that binfmts
   that invoke an interpreter fail the exec (as they will not be able
   to access the invoked file). [Andy Lutomirski]
 - Don't truncate long paths. [Andy Lutomirski]
 - Commonize code to open the executed file. [Eric W. Biederman]
 - Mark O_PATH file descriptors so they cannot be fexecve()ed.
 - Make self-test more helpful, and add additional cases:
     - file offset non-zero
     - binary file without execute bit
     - O_CLOEXEC fds

Changes since v4, suggested by Eric W. Biederman:
 - Use empty filename with AT_EMPTY_PATH flag rather than NULL
   pathname to request fexecve-like behaviour.
 - Build pathname as "/dev/fd/<fd>/<filename>" (or "/dev/fd/<fd>")
   rather than using d_path().
 - Patch against v3.17 (bfe01a5ba249)

Changes since Meredydd's v3 patch:
 - Added a selftest.
 - Added a man page.
 - Left open_exec() signature untouched to reduce patch impact
   elsewhere (as suggested by Al Viro).
 - Filled in bprm->filename with d_path() into a buffer, to avoid use
   of potentially-ephemeral dentry->d_name.
 - Patch against v3.14 (455c6fdbd21916).


David Drysdale (4):
  syscalls: implement execveat() system call
  x86: Hook up execveat system call.
  syscalls: add selftest for execveat(2)
  sparc: Hook up execveat system call.

 arch/sparc/include/uapi/asm/unistd.h    |   3 +-
 arch/sparc/kernel/systbls_32.S          |   1 +
 arch/sparc/kernel/systbls_64.S          |   2 +
 arch/x86/ia32/audit.c                   |   1 +
 arch/x86/ia32/ia32entry.S               |   1 +
 arch/x86/kernel/audit_64.c              |   1 +
 arch/x86/kernel/entry_64.S              |  28 +++
 arch/x86/syscalls/syscall_32.tbl        |   1 +
 arch/x86/syscalls/syscall_64.tbl        |   2 +
 arch/x86/um/sys_call_table_64.c         |   1 +
 fs/binfmt_em86.c                        |   4 +
 fs/binfmt_misc.c                        |   4 +
 fs/binfmt_script.c                      |  10 +
 fs/exec.c                               | 113 +++++++--
 fs/namei.c                              |   2 +-
 include/linux/binfmts.h                 |   4 +
 include/linux/compat.h                  |   3 +
 include/linux/fs.h                      |   1 +
 include/linux/sched.h                   |   4 +
 include/linux/syscalls.h                |   5 +
 include/uapi/asm-generic/unistd.h       |   4 +-
 kernel/sys_ni.c                         |   3 +
 lib/audit.c                             |   3 +
 tools/testing/selftests/Makefile        |   1 +
 tools/testing/selftests/exec/.gitignore |   9 +
 tools/testing/selftests/exec/Makefile   |  25 ++
 tools/testing/selftests/exec/execveat.c | 397 ++++++++++++++++++++++++++++++++
 27 files changed, 617 insertions(+), 16 deletions(-)
 create mode 100644 tools/testing/selftests/exec/.gitignore
 create mode 100644 tools/testing/selftests/exec/Makefile
 create mode 100644 tools/testing/selftests/exec/execveat.c

-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply

* [PATCHv10 1/5] syscalls: implement execveat() system call
From: David Drysdale @ 2014-11-24 11:53 UTC (permalink / raw)
  To: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel, Andrew Morton, David Miller, Thomas Gleixner
  Cc: Stephen Rothwell, Oleg Nesterov, Michael Kerrisk, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86, linux-arch, linux-api, sparclinux,
	David Drysdale
In-Reply-To: <1416830039-21952-1-git-send-email-drysdale@google.com>

Add a new execveat(2) system call. execveat() is to execve() as
openat() is to open(): it takes a file descriptor that refers to a
directory, and resolves the filename relative to that.

In addition, if the filename is empty and AT_EMPTY_PATH is specified,
execveat() executes the file to which the file descriptor refers. This
replicates the functionality of fexecve(), which is a system call in
other UNIXen, but in Linux glibc it depends on opening
"/proc/self/fd/<fd>" (and so relies on /proc being mounted).

The filename fed to the executed program as argv[0] (or the name of the
script fed to a script interpreter) will be of the form "/dev/fd/<fd>"
(for an empty filename) or "/dev/fd/<fd>/<filename>", effectively
reflecting how the executable was found.  This does however mean that
execution of a script in a /proc-less environment won't work; also,
script execution via an O_CLOEXEC file descriptor fails (as the file
will not be accessible after exec).

Based on patches by Meredydd Luff <meredydd@senatehouse.org>

Signed-off-by: David Drysdale <drysdale@google.com>
---
 fs/binfmt_em86.c                  |   4 ++
 fs/binfmt_misc.c                  |   4 ++
 fs/binfmt_script.c                |  10 ++++
 fs/exec.c                         | 113 +++++++++++++++++++++++++++++++++-----
 fs/namei.c                        |   2 +-
 include/linux/binfmts.h           |   4 ++
 include/linux/compat.h            |   3 +
 include/linux/fs.h                |   1 +
 include/linux/sched.h             |   4 ++
 include/linux/syscalls.h          |   5 ++
 include/uapi/asm-generic/unistd.h |   4 +-
 kernel/sys_ni.c                   |   3 +
 lib/audit.c                       |   3 +
 13 files changed, 145 insertions(+), 15 deletions(-)

diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index f37b08cea1f7..490538536cb4 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -42,6 +42,10 @@ static int load_em86(struct linux_binprm *bprm)
 			return -ENOEXEC;
 	}
 
+	/* Need to be able to load the file after exec */
+	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+		return -ENOENT;
+
 	allow_write_access(bprm->file);
 	fput(bprm->file);
 	bprm->file = NULL;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd8beb9657a2..85acb8c83a9a 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -142,6 +142,10 @@ static int load_misc_binary(struct linux_binprm *bprm)
 	if (!fmt)
 		goto _ret;
 
+	/* Need to be able to load the file after exec */
+	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+		return -ENOENT;
+
 	if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
 		retval = remove_arg_zero(bprm);
 		if (retval)
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 5027a3e14922..afdf4e3cafc2 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -24,6 +24,16 @@ static int load_script(struct linux_binprm *bprm)
 
 	if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
 		return -ENOEXEC;
+
+	/*
+	 * If the script filename will be inaccessible after exec, typically
+	 * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give
+	 * up now (on the assumption that the interpreter will want to load
+	 * this file).
+	 */
+	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+		return -ENOENT;
+
 	/*
 	 * This section does the #! interpretation.
 	 * Sorta complicated, but hopefully it will work.  -TYT
diff --git a/fs/exec.c b/fs/exec.c
index 7302b75a9820..6ce5cc47a201 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -747,18 +747,25 @@ EXPORT_SYMBOL(setup_arg_pages);
 
 #endif /* CONFIG_MMU */
 
-static struct file *do_open_exec(struct filename *name)
+static struct file *do_open_execat(int fd, struct filename *name, int flags)
 {
 	struct file *file;
 	int err;
-	static const struct open_flags open_exec_flags = {
+	struct open_flags open_exec_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 		.acc_mode = MAY_EXEC | MAY_OPEN,
 		.intent = LOOKUP_OPEN,
 		.lookup_flags = LOOKUP_FOLLOW,
 	};
 
-	file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
+	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return ERR_PTR(-EINVAL);
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_EMPTY_PATH)
+		open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
+
+	file = do_filp_open(fd, name, &open_exec_flags);
 	if (IS_ERR(file))
 		goto out;
 
@@ -769,12 +776,13 @@ static struct file *do_open_exec(struct filename *name)
 	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	fsnotify_open(file);
-
 	err = deny_write_access(file);
 	if (err)
 		goto exit;
 
+	if (name->name[0] != '\0')
+		fsnotify_open(file);
+
 out:
 	return file;
 
@@ -786,7 +794,7 @@ exit:
 struct file *open_exec(const char *name)
 {
 	struct filename tmp = { .name = name };
-	return do_open_exec(&tmp);
+	return do_open_execat(AT_FDCWD, &tmp, 0);
 }
 EXPORT_SYMBOL(open_exec);
 
@@ -1427,10 +1435,12 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int do_execve_common(struct filename *filename,
-				struct user_arg_ptr argv,
-				struct user_arg_ptr envp)
+static int do_execveat_common(int fd, struct filename *filename,
+			      struct user_arg_ptr argv,
+			      struct user_arg_ptr envp,
+			      int flags)
 {
+	char *pathbuf = NULL;
 	struct linux_binprm *bprm;
 	struct file *file;
 	struct files_struct *displaced;
@@ -1471,7 +1481,7 @@ static int do_execve_common(struct filename *filename,
 	check_unsafe_exec(bprm);
 	current->in_execve = 1;
 
-	file = do_open_exec(filename);
+	file = do_open_execat(fd, filename, flags);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out_unmark;
@@ -1479,7 +1489,28 @@ static int do_execve_common(struct filename *filename,
 	sched_exec();
 
 	bprm->file = file;
-	bprm->filename = bprm->interp = filename->name;
+	if (fd == AT_FDCWD || filename->name[0] == '/') {
+		bprm->filename = filename->name;
+	} else {
+		if (filename->name[0] == '\0')
+			pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
+		else
+			pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
+					    fd, filename->name);
+		if (!pathbuf) {
+			retval = -ENOMEM;
+			goto out_unmark;
+		}
+		/*
+		 * Record that a name derived from an O_CLOEXEC fd will be
+		 * inaccessible after exec. Relies on having exclusive access to
+		 * current->files (due to unshare_files above).
+		 */
+		if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
+			bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
+		bprm->filename = pathbuf;
+	}
+	bprm->interp = bprm->filename;
 
 	retval = bprm_mm_init(bprm);
 	if (retval)
@@ -1520,6 +1551,7 @@ static int do_execve_common(struct filename *filename,
 	acct_update_integrals(current);
 	task_numa_free(current);
 	free_bprm(bprm);
+	kfree(pathbuf);
 	putname(filename);
 	if (displaced)
 		put_files_struct(displaced);
@@ -1537,6 +1569,7 @@ out_unmark:
 
 out_free:
 	free_bprm(bprm);
+	kfree(pathbuf);
 
 out_files:
 	if (displaced)
@@ -1552,7 +1585,18 @@ int do_execve(struct filename *filename,
 {
 	struct user_arg_ptr argv = { .ptr.native = __argv };
 	struct user_arg_ptr envp = { .ptr.native = __envp };
-	return do_execve_common(filename, argv, envp);
+	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+int do_execveat(int fd, struct filename *filename,
+		const char __user *const __user *__argv,
+		const char __user *const __user *__envp,
+		int flags)
+{
+	struct user_arg_ptr argv = { .ptr.native = __argv };
+	struct user_arg_ptr envp = { .ptr.native = __envp };
+
+	return do_execveat_common(fd, filename, argv, envp, flags);
 }
 
 #ifdef CONFIG_COMPAT
@@ -1568,7 +1612,23 @@ static int compat_do_execve(struct filename *filename,
 		.is_compat = true,
 		.ptr.compat = __envp,
 	};
-	return do_execve_common(filename, argv, envp);
+	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+static int compat_do_execveat(int fd, struct filename *filename,
+			      const compat_uptr_t __user *__argv,
+			      const compat_uptr_t __user *__envp,
+			      int flags)
+{
+	struct user_arg_ptr argv = {
+		.is_compat = true,
+		.ptr.compat = __argv,
+	};
+	struct user_arg_ptr envp = {
+		.is_compat = true,
+		.ptr.compat = __envp,
+	};
+	return do_execveat_common(fd, filename, argv, envp, flags);
 }
 #endif
 
@@ -1608,6 +1668,20 @@ SYSCALL_DEFINE3(execve,
 {
 	return do_execve(getname(filename), argv, envp);
 }
+
+SYSCALL_DEFINE5(execveat,
+		int, fd, const char __user *, filename,
+		const char __user *const __user *, argv,
+		const char __user *const __user *, envp,
+		int, flags)
+{
+	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+	return do_execveat(fd,
+			   getname_flags(filename, lookup_flags, NULL),
+			   argv, envp, flags);
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
 	const compat_uptr_t __user *, argv,
@@ -1615,4 +1689,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
 {
 	return compat_do_execve(getname(filename), argv, envp);
 }
+
+COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
+		       const char __user *, filename,
+		       const compat_uptr_t __user *, argv,
+		       const compat_uptr_t __user *, envp,
+		       int,  flags)
+{
+	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+	return compat_do_execveat(fd,
+				  getname_flags(filename, lookup_flags, NULL),
+				  argv, envp, flags);
+}
 #endif
diff --git a/fs/namei.c b/fs/namei.c
index db5fe86319e6..ca814165d84c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -130,7 +130,7 @@ void final_putname(struct filename *name)
 
 #define EMBEDDED_NAME_MAX	(PATH_MAX - sizeof(struct filename))
 
-static struct filename *
+struct filename *
 getname_flags(const char __user *filename, int flags, int *empty)
 {
 	struct filename *result, *err;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 61f29e5ea840..576e4639ca60 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -53,6 +53,10 @@ struct linux_binprm {
 #define BINPRM_FLAGS_EXECFD_BIT 1
 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT)
 
+/* filename of the binary will be inaccessible after exec */
+#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
+#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
+
 /* Function parameter for binfmt->coredump */
 struct coredump_params {
 	const siginfo_t *siginfo;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index e6494261eaff..7450ca2ac1fc 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -357,6 +357,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int);
 
 asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
 		     const compat_uptr_t __user *envp);
+asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
+		     const compat_uptr_t __user *argv,
+		     const compat_uptr_t __user *envp, int flags);
 
 asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ab779e8a63c..133b60b1d4d0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2072,6 +2072,7 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *);
 extern struct file * dentry_open(const struct path *, int, const struct cred *);
 extern int filp_close(struct file *, fl_owner_t id);
 
+extern struct filename *getname_flags(const char __user *, int, int *);
 extern struct filename *getname(const char __user *);
 extern struct filename *getname_kernel(const char *);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5e344bbe63ec..344163d09efb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2441,6 +2441,10 @@ extern void do_group_exit(int);
 extern int do_execve(struct filename *,
 		     const char __user * const __user *,
 		     const char __user * const __user *);
+extern int do_execveat(int, struct filename *,
+		       const char __user * const __user *,
+		       const char __user * const __user *,
+		       int);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index bda9b81357cc..1ff5a4d09693 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -877,4 +877,9 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 asmlinkage long sys_getrandom(char __user *buf, size_t count,
 			      unsigned int flags);
 asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+
+asmlinkage long sys_execveat(int dfd, const char __user *filename,
+			const char __user *const __user *argv,
+			const char __user *const __user *envp, int flags);
+
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 22749c134117..e016bd9b1a04 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -707,9 +707,11 @@ __SYSCALL(__NR_getrandom, sys_getrandom)
 __SYSCALL(__NR_memfd_create, sys_memfd_create)
 #define __NR_bpf 280
 __SYSCALL(__NR_bpf, sys_bpf)
+#define __NR_execveat 281
+__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
 
 #undef __NR_syscalls
-#define __NR_syscalls 281
+#define __NR_syscalls 282
 
 /*
  * All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 02aa4185b17e..832fba6e2eb1 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -224,3 +224,6 @@ cond_syscall(sys_seccomp);
 
 /* access BPF programs and maps */
 cond_syscall(sys_bpf);
+
+/* execveat */
+cond_syscall(sys_execveat);
diff --git a/lib/audit.c b/lib/audit.c
index 1d726a22565b..b8fb5ee81e26 100644
--- a/lib/audit.c
+++ b/lib/audit.c
@@ -54,6 +54,9 @@ int audit_classify_syscall(int abi, unsigned syscall)
 	case __NR_socketcall:
 		return 4;
 #endif
+#ifdef __NR_execveat
+	case __NR_execveat:
+#endif
 	case __NR_execve:
 		return 5;
 	default:
-- 
2.1.0.rc2.206.gedb03e5


^ permalink raw reply related

* [PATCHv10 2/5] x86: Hook up execveat system call.
From: David Drysdale @ 2014-11-24 11:53 UTC (permalink / raw)
  To: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel, Andrew Morton, David Miller, Thomas Gleixner
  Cc: Stephen Rothwell, Oleg Nesterov, Michael Kerrisk, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86, linux-arch, linux-api, sparclinux,
	David Drysdale
In-Reply-To: <1416830039-21952-1-git-send-email-drysdale@google.com>

Hook up x86-64, i386 and x32 ABIs.

Signed-off-by: David Drysdale <drysdale@google.com>
---
 arch/x86/ia32/audit.c            |  1 +
 arch/x86/ia32/ia32entry.S        |  1 +
 arch/x86/kernel/audit_64.c       |  1 +
 arch/x86/kernel/entry_64.S       | 28 ++++++++++++++++++++++++++++
 arch/x86/syscalls/syscall_32.tbl |  1 +
 arch/x86/syscalls/syscall_64.tbl |  2 ++
 arch/x86/um/sys_call_table_64.c  |  1 +
 7 files changed, 35 insertions(+)

diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 5d7b381da692..2eccc8932ae6 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall)
 	case __NR_socketcall:
 		return 4;
 	case __NR_execve:
+	case __NR_execveat:
 		return 5;
 	default:
 		return 1;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ffe71228fc10..82e8a1d44658 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -480,6 +480,7 @@ GLOBAL(\label)
 	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
 	PTREGSCALL stub32_sigreturn, sys32_sigreturn
 	PTREGSCALL stub32_execve, compat_sys_execve
+	PTREGSCALL stub32_execveat, compat_sys_execveat
 	PTREGSCALL stub32_fork, sys_fork
 	PTREGSCALL stub32_vfork, sys_vfork
 
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 06d3e5a14d9d..f3672508b249 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall)
 	case __NR_openat:
 		return 3;
 	case __NR_execve:
+	case __NR_execveat:
 		return 5;
 	default:
 		return 0;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index df088bb03fb3..40d893c60fcc 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -652,6 +652,20 @@ ENTRY(stub_execve)
 	CFI_ENDPROC
 END(stub_execve)
 
+ENTRY(stub_execveat)
+	CFI_STARTPROC
+	addq $8, %rsp
+	PARTIAL_FRAME 0
+	SAVE_REST
+	FIXUP_TOP_OF_STACK %r11
+	call sys_execveat
+	RESTORE_TOP_OF_STACK %r11
+	movq %rax,RAX(%rsp)
+	RESTORE_REST
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+END(stub_execveat)
+
 /*
  * sigreturn is special because it needs to restore all registers on return.
  * This cannot be done with SYSRET, so use the IRET return path instead.
@@ -697,6 +711,20 @@ ENTRY(stub_x32_execve)
 	CFI_ENDPROC
 END(stub_x32_execve)
 
+ENTRY(stub_x32_execveat)
+	CFI_STARTPROC
+	addq $8, %rsp
+	PARTIAL_FRAME 0
+	SAVE_REST
+	FIXUP_TOP_OF_STACK %r11
+	call compat_sys_execveat
+	RESTORE_TOP_OF_STACK %r11
+	movq %rax,RAX(%rsp)
+	RESTORE_REST
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+END(stub_x32_execveat)
+
 #endif
 
 /*
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 9fe1b5d002f0..b3560ece1c9f 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -364,3 +364,4 @@
 355	i386	getrandom		sys_getrandom
 356	i386	memfd_create		sys_memfd_create
 357	i386	bpf			sys_bpf
+358	i386	execveat		sys_execveat			stub32_execveat
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 281150b539a2..8d656fbb57aa 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -328,6 +328,7 @@
 319	common	memfd_create		sys_memfd_create
 320	common	kexec_file_load		sys_kexec_file_load
 321	common	bpf			sys_bpf
+322	64	execveat		stub_execveat
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
@@ -366,3 +367,4 @@
 542	x32	getsockopt		compat_sys_getsockopt
 543	x32	io_setup		compat_sys_io_setup
 544	x32	io_submit		compat_sys_io_submit
+545	x32	execveat		stub_x32_execveat
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index f2f0723070ca..20c3649d0691 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -31,6 +31,7 @@
 #define stub_fork sys_fork
 #define stub_vfork sys_vfork
 #define stub_execve sys_execve
+#define stub_execveat sys_execveat
 #define stub_rt_sigreturn sys_rt_sigreturn
 
 #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-- 
2.1.0.rc2.206.gedb03e5


^ permalink raw reply related

* [PATCHv10 3/5] syscalls: add selftest for execveat(2)
From: David Drysdale @ 2014-11-24 11:53 UTC (permalink / raw)
  To: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel, Andrew Morton, David Miller, Thomas Gleixner
  Cc: Stephen Rothwell, Oleg Nesterov, Michael Kerrisk, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86, linux-arch, linux-api, sparclinux,
	David Drysdale
In-Reply-To: <1416830039-21952-1-git-send-email-drysdale@google.com>

Signed-off-by: David Drysdale <drysdale@google.com>
---
 tools/testing/selftests/Makefile        |   1 +
 tools/testing/selftests/exec/.gitignore |   9 +
 tools/testing/selftests/exec/Makefile   |  25 ++
 tools/testing/selftests/exec/execveat.c | 397 ++++++++++++++++++++++++++++++++
 4 files changed, 432 insertions(+)
 create mode 100644 tools/testing/selftests/exec/.gitignore
 create mode 100644 tools/testing/selftests/exec/Makefile
 create mode 100644 tools/testing/selftests/exec/execveat.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 45f145c6f843..c14893b501a9 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -15,6 +15,7 @@ TARGETS += user
 TARGETS += sysctl
 TARGETS += firmware
 TARGETS += ftrace
+TARGETS += exec
 
 TARGETS_HOTPLUG = cpu-hotplug
 TARGETS_HOTPLUG += memory-hotplug
diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore
new file mode 100644
index 000000000000..64073e050c6a
--- /dev/null
+++ b/tools/testing/selftests/exec/.gitignore
@@ -0,0 +1,9 @@
+subdir*
+script*
+execveat
+execveat.symlink
+execveat.moved
+execveat.path.ephemeral
+execveat.ephemeral
+execveat.denatured
+xxxxxxxx*
\ No newline at end of file
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
new file mode 100644
index 000000000000..66dfc2ce1788
--- /dev/null
+++ b/tools/testing/selftests/exec/Makefile
@@ -0,0 +1,25 @@
+CC = $(CROSS_COMPILE)gcc
+CFLAGS = -Wall
+BINARIES = execveat
+DEPS = execveat.symlink execveat.denatured script subdir
+all: $(BINARIES) $(DEPS)
+
+subdir:
+	mkdir -p $@
+script:
+	echo '#!/bin/sh' > $@
+	echo 'exit $$*' >> $@
+	chmod +x $@
+execveat.symlink: execveat
+	ln -s -f $< $@
+execveat.denatured: execveat
+	cp $< $@
+	chmod -x $@
+%: %.c
+	$(CC) $(CFLAGS) -o $@ $^
+
+run_tests: all
+	./execveat
+
+clean:
+	rm -rf $(BINARIES) $(DEPS) subdir.moved execveat.moved xxxxx*
diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c
new file mode 100644
index 000000000000..33a5c06d95ca
--- /dev/null
+++ b/tools/testing/selftests/exec/execveat.c
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2014 Google, Inc.
+ *
+ * Licensed under the terms of the GNU GPL License version 2
+ *
+ * Selftests for execveat(2).
+ */
+
+#define _GNU_SOURCE  /* to get O_PATH, AT_EMPTY_PATH */
+#include <sys/sendfile.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static char longpath[2 * PATH_MAX] = "";
+static char *envp[] = { "IN_TEST=yes", NULL, NULL };
+static char *argv[] = { "execveat", "99", NULL };
+
+static int execveat_(int fd, const char *path, char **argv, char **envp,
+		     int flags)
+{
+#ifdef __NR_execveat
+	return syscall(__NR_execveat, fd, path, argv, envp, flags);
+#else
+	errno = -ENOSYS;
+	return -1;
+#endif
+}
+
+#define check_execveat_fail(fd, path, flags, errno)	\
+	_check_execveat_fail(fd, path, flags, errno, #errno)
+static int _check_execveat_fail(int fd, const char *path, int flags,
+				int expected_errno, const char *errno_str)
+{
+	int rc;
+
+	errno = 0;
+	printf("Check failure of execveat(%d, '%s', %d) with %s... ",
+		fd, path?:"(null)", flags, errno_str);
+	rc = execveat_(fd, path, argv, envp, flags);
+
+	if (rc > 0) {
+		printf("[FAIL] (unexpected success from execveat(2))\n");
+		return 1;
+	}
+	if (errno != expected_errno) {
+		printf("[FAIL] (expected errno %d (%s) not %d (%s)\n",
+			expected_errno, strerror(expected_errno),
+			errno, strerror(errno));
+		return 1;
+	}
+	printf("[OK]\n");
+	return 0;
+}
+
+static int check_execveat_invoked_rc(int fd, const char *path, int flags,
+				     int expected_rc)
+{
+	int status;
+	int rc;
+	pid_t child;
+	int pathlen = path ? strlen(path) : 0;
+
+	if (pathlen > 40)
+		printf("Check success of execveat(%d, '%.20s...%s', %d)... ",
+			fd, path, (path + pathlen - 20), flags);
+	else
+		printf("Check success of execveat(%d, '%s', %d)... ",
+			fd, path?:"(null)", flags);
+	child = fork();
+	if (child < 0) {
+		printf("[FAIL] (fork() failed)\n");
+		return 1;
+	}
+	if (child == 0) {
+		/* Child: do execveat(). */
+		rc = execveat_(fd, path, argv, envp, flags);
+		printf("[FAIL]: execveat() failed, rc=%d errno=%d (%s)\n",
+			rc, errno, strerror(errno));
+		exit(1);  /* should not reach here */
+	}
+	/* Parent: wait for & check child's exit status. */
+	rc = waitpid(child, &status, 0);
+	if (rc != child) {
+		printf("[FAIL] (waitpid(%d,...) returned %d)\n", child, rc);
+		return 1;
+	}
+	if (!WIFEXITED(status)) {
+		printf("[FAIL] (child %d did not exit cleanly, status=%08x)\n",
+			child, status);
+		return 1;
+	}
+	if (WEXITSTATUS(status) != expected_rc) {
+		printf("[FAIL] (child %d exited with %d not %d)\n",
+			child, WEXITSTATUS(status), expected_rc);
+		return 1;
+	}
+	printf("[OK]\n");
+	return 0;
+}
+
+static int check_execveat(int fd, const char *path, int flags)
+{
+	return check_execveat_invoked_rc(fd, path, flags, 99);
+}
+
+static char *concat(const char *left, const char *right)
+{
+	char *result = malloc(strlen(left) + strlen(right) + 1);
+
+	strcpy(result, left);
+	strcat(result, right);
+	return result;
+}
+
+static int open_or_die(const char *filename, int flags)
+{
+	int fd = open(filename, flags);
+
+	if (fd < 0) {
+		printf("Failed to open '%s'; "
+			"check prerequisites are available\n", filename);
+		exit(1);
+	}
+	return fd;
+}
+
+static void exe_cp(const char *src, const char *dest)
+{
+	int in_fd = open_or_die(src, O_RDONLY);
+	int out_fd = open(dest, O_RDWR|O_CREAT|O_TRUNC, 0755);
+	struct stat info;
+
+	fstat(in_fd, &info);
+	sendfile(out_fd, in_fd, NULL, info.st_size);
+	close(in_fd);
+	close(out_fd);
+}
+
+#define XX_DIR_LEN 200
+static int check_execveat_pathmax(int dot_dfd, const char *src, int is_script)
+{
+	int fail = 0;
+	int ii, count, len;
+	char longname[XX_DIR_LEN + 1];
+	int fd;
+
+	if (*longpath == '\0') {
+		/* Create a filename close to PATH_MAX in length */
+		memset(longname, 'x', XX_DIR_LEN - 1);
+		longname[XX_DIR_LEN - 1] = '/';
+		longname[XX_DIR_LEN] = '\0';
+		count = (PATH_MAX - 3) / XX_DIR_LEN;
+		for (ii = 0; ii < count; ii++) {
+			strcat(longpath, longname);
+			mkdir(longpath, 0755);
+		}
+		len = (PATH_MAX - 3) - (count * XX_DIR_LEN);
+		if (len <= 0)
+			len = 1;
+		memset(longname, 'y', len);
+		longname[len] = '\0';
+		strcat(longpath, longname);
+	}
+	exe_cp(src, longpath);
+
+	/*
+	 * Execute as a pre-opened file descriptor, which works whether this is
+	 * a script or not (because the interpreter sees a filename like
+	 * "/dev/fd/20").
+	 */
+	fd = open(longpath, O_RDONLY);
+	if (fd > 0) {
+		printf("Invoke copy of '%s' via filename of length %lu:\n",
+			src, strlen(longpath));
+		fail += check_execveat(fd, "", AT_EMPTY_PATH);
+	} else {
+		printf("Failed to open length %lu filename, errno=%d (%s)\n",
+			strlen(longpath), errno, strerror(errno));
+		fail++;
+	}
+
+	/*
+	 * Execute as a long pathname relative to ".".  If this is a script,
+	 * the interpreter will launch but fail to open the script because its
+	 * name ("/dev/fd/5/xxx....") is bigger than PATH_MAX.
+	 */
+	if (is_script)
+		fail += check_execveat_invoked_rc(dot_dfd, longpath, 0, 127);
+	else
+		fail += check_execveat(dot_dfd, longpath, 0);
+
+	return fail;
+}
+
+static int run_tests(void)
+{
+	int fail = 0;
+	char *fullname = realpath("execveat", NULL);
+	char *fullname_script = realpath("script", NULL);
+	char *fullname_symlink = concat(fullname, ".symlink");
+	int subdir_dfd = open_or_die("subdir", O_DIRECTORY|O_RDONLY);
+	int subdir_dfd_ephemeral = open_or_die("subdir.ephemeral",
+					       O_DIRECTORY|O_RDONLY);
+	int dot_dfd = open_or_die(".", O_DIRECTORY|O_RDONLY);
+	int dot_dfd_path = open_or_die(".", O_DIRECTORY|O_RDONLY|O_PATH);
+	int dot_dfd_cloexec = open_or_die(".", O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+	int fd = open_or_die("execveat", O_RDONLY);
+	int fd_path = open_or_die("execveat", O_RDONLY|O_PATH);
+	int fd_symlink = open_or_die("execveat.symlink", O_RDONLY);
+	int fd_denatured = open_or_die("execveat.denatured", O_RDONLY);
+	int fd_denatured_path = open_or_die("execveat.denatured",
+					    O_RDONLY|O_PATH);
+	int fd_script = open_or_die("script", O_RDONLY);
+	int fd_ephemeral = open_or_die("execveat.ephemeral", O_RDONLY);
+	int fd_ephemeral_path = open_or_die("execveat.path.ephemeral",
+					    O_RDONLY|O_PATH);
+	int fd_script_ephemeral = open_or_die("script.ephemeral", O_RDONLY);
+	int fd_cloexec = open_or_die("execveat", O_RDONLY|O_CLOEXEC);
+	int fd_script_cloexec = open_or_die("script", O_RDONLY|O_CLOEXEC);
+
+	/* Change file position to confirm it doesn't affect anything */
+	lseek(fd, 10, SEEK_SET);
+
+	/* Normal executable file: */
+	/*   dfd + path */
+	fail += check_execveat(subdir_dfd, "../execveat", 0);
+	fail += check_execveat(dot_dfd, "execveat", 0);
+	fail += check_execveat(dot_dfd_path, "execveat", 0);
+	/*   absolute path */
+	fail += check_execveat(AT_FDCWD, fullname, 0);
+	/*   absolute path with nonsense dfd */
+	fail += check_execveat(99, fullname, 0);
+	/*   fd + no path */
+	fail += check_execveat(fd, "", AT_EMPTY_PATH);
+	/*   O_CLOEXEC fd + no path */
+	fail += check_execveat(fd_cloexec, "", AT_EMPTY_PATH);
+	/*   O_PATH fd */
+	fail += check_execveat(fd_path, "", AT_EMPTY_PATH);
+
+	/* Mess with executable file that's already open: */
+	/*   fd + no path to a file that's been renamed */
+	rename("execveat.ephemeral", "execveat.moved");
+	fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH);
+	/*   fd + no path to a file that's been deleted */
+	unlink("execveat.moved"); /* remove the file now fd open */
+	fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH);
+
+	/* Mess with executable file that's already open with O_PATH */
+	/*   fd + no path to a file that's been deleted */
+	unlink("execveat.path.ephemeral");
+	fail += check_execveat(fd_ephemeral_path, "", AT_EMPTY_PATH);
+
+	/* Invalid argument failures */
+	fail += check_execveat_fail(fd, "", 0, ENOENT);
+	fail += check_execveat_fail(fd, NULL, AT_EMPTY_PATH, EFAULT);
+
+	/* Symlink to executable file: */
+	/*   dfd + path */
+	fail += check_execveat(dot_dfd, "execveat.symlink", 0);
+	fail += check_execveat(dot_dfd_path, "execveat.symlink", 0);
+	/*   absolute path */
+	fail += check_execveat(AT_FDCWD, fullname_symlink, 0);
+	/*   fd + no path, even with AT_SYMLINK_NOFOLLOW (already followed) */
+	fail += check_execveat(fd_symlink, "", AT_EMPTY_PATH);
+	fail += check_execveat(fd_symlink, "",
+			       AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW);
+
+	/* Symlink fails when AT_SYMLINK_NOFOLLOW set: */
+	/*   dfd + path */
+	fail += check_execveat_fail(dot_dfd, "execveat.symlink",
+				    AT_SYMLINK_NOFOLLOW, ELOOP);
+	fail += check_execveat_fail(dot_dfd_path, "execveat.symlink",
+				    AT_SYMLINK_NOFOLLOW, ELOOP);
+	/*   absolute path */
+	fail += check_execveat_fail(AT_FDCWD, fullname_symlink,
+				    AT_SYMLINK_NOFOLLOW, ELOOP);
+
+	/* Shell script wrapping executable file: */
+	/*   dfd + path */
+	fail += check_execveat(subdir_dfd, "../script", 0);
+	fail += check_execveat(dot_dfd, "script", 0);
+	fail += check_execveat(dot_dfd_path, "script", 0);
+	/*   absolute path */
+	fail += check_execveat(AT_FDCWD, fullname_script, 0);
+	/*   fd + no path */
+	fail += check_execveat(fd_script, "", AT_EMPTY_PATH);
+	fail += check_execveat(fd_script, "",
+			       AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW);
+	/*   O_CLOEXEC fd fails for a script (as script file inaccessible) */
+	fail += check_execveat_fail(fd_script_cloexec, "", AT_EMPTY_PATH,
+				    ENOENT);
+	fail += check_execveat_fail(dot_dfd_cloexec, "script", 0, ENOENT);
+
+	/* Mess with script file that's already open: */
+	/*   fd + no path to a file that's been renamed */
+	rename("script.ephemeral", "script.moved");
+	fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH);
+	/*   fd + no path to a file that's been deleted */
+	unlink("script.moved"); /* remove the file while fd open */
+	fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH);
+
+	/* Rename a subdirectory in the path: */
+	rename("subdir.ephemeral", "subdir.moved");
+	fail += check_execveat(subdir_dfd_ephemeral, "../script", 0);
+	fail += check_execveat(subdir_dfd_ephemeral, "script", 0);
+	/* Remove the subdir and its contents */
+	unlink("subdir.moved/script");
+	unlink("subdir.moved");
+	/* Shell loads via deleted subdir OK because name starts with .. */
+	fail += check_execveat(subdir_dfd_ephemeral, "../script", 0);
+	fail += check_execveat_fail(subdir_dfd_ephemeral, "script", 0, ENOENT);
+
+	/* Flag values other than AT_SYMLINK_NOFOLLOW => EINVAL */
+	fail += check_execveat_fail(dot_dfd, "execveat", 0xFFFF, EINVAL);
+	/* Invalid path => ENOENT */
+	fail += check_execveat_fail(dot_dfd, "no-such-file", 0, ENOENT);
+	fail += check_execveat_fail(dot_dfd_path, "no-such-file", 0, ENOENT);
+	fail += check_execveat_fail(AT_FDCWD, "no-such-file", 0, ENOENT);
+	/* Attempt to execute directory => EACCES */
+	fail += check_execveat_fail(dot_dfd, "", AT_EMPTY_PATH, EACCES);
+	/* Attempt to execute non-executable => EACCES */
+	fail += check_execveat_fail(dot_dfd, "Makefile", 0, EACCES);
+	fail += check_execveat_fail(fd_denatured, "", AT_EMPTY_PATH, EACCES);
+	fail += check_execveat_fail(fd_denatured_path, "", AT_EMPTY_PATH,
+				    EACCES);
+	/* Attempt to execute nonsense FD => EBADF */
+	fail += check_execveat_fail(99, "", AT_EMPTY_PATH, EBADF);
+	fail += check_execveat_fail(99, "execveat", 0, EBADF);
+	/* Attempt to execute relative to non-directory => ENOTDIR */
+	fail += check_execveat_fail(fd, "execveat", 0, ENOTDIR);
+
+	fail += check_execveat_pathmax(dot_dfd, "execveat", 0);
+	fail += check_execveat_pathmax(dot_dfd, "script", 1);
+	return fail;
+}
+
+static void prerequisites(void)
+{
+	int fd;
+	const char *script = "#!/bin/sh\nexit $*\n";
+
+	/* Create ephemeral copies of files */
+	exe_cp("execveat", "execveat.ephemeral");
+	exe_cp("execveat", "execveat.path.ephemeral");
+	exe_cp("script", "script.ephemeral");
+	mkdir("subdir.ephemeral", 0755);
+
+	fd = open("subdir.ephemeral/script", O_RDWR|O_CREAT|O_TRUNC, 0755);
+	write(fd, script, strlen(script));
+	close(fd);
+}
+
+int main(int argc, char **argv)
+{
+	int ii;
+	int rc;
+	const char *verbose = getenv("VERBOSE");
+
+	if (argc >= 2) {
+		/* If we are invoked with an argument, don't run tests. */
+		const char *in_test = getenv("IN_TEST");
+
+		if (verbose) {
+			printf("  invoked with:");
+			for (ii = 0; ii < argc; ii++)
+				printf(" [%d]='%s'", ii, argv[ii]);
+			printf("\n");
+		}
+
+		/* Check expected environment transferred. */
+		if (!in_test || strcmp(in_test, "yes") != 0) {
+			printf("[FAIL] (no IN_TEST=yes in env)\n");
+			return 1;
+		}
+
+		/* Use the final argument as an exit code. */
+		rc = atoi(argv[argc - 1]);
+		fflush(stdout);
+	} else {
+		prerequisites();
+		if (verbose)
+			envp[1] = "VERBOSE=1";
+		rc = run_tests();
+		if (rc > 0)
+			printf("%d tests failed\n", rc);
+	}
+	return rc;
+}
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCHv10 4/5] sparc: Hook up execveat system call.
From: David Drysdale @ 2014-11-24 11:53 UTC (permalink / raw)
  To: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel, Andrew Morton, David Miller, Thomas Gleixner
  Cc: Stephen Rothwell, Oleg Nesterov, Michael Kerrisk, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86, linux-arch, linux-api, sparclinux,
	David Drysdale
In-Reply-To: <1416830039-21952-1-git-send-email-drysdale@google.com>

Signed-off-by: David Drysdale <drysdale@google.com>
---
 arch/sparc/include/uapi/asm/unistd.h |  3 ++-
 arch/sparc/kernel/syscalls.S         | 10 ++++++++++
 arch/sparc/kernel/systbls_32.S       |  1 +
 arch/sparc/kernel/systbls_64.S       |  2 ++
 4 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h
index 46d83842eddc..6f35f4df17f2 100644
--- a/arch/sparc/include/uapi/asm/unistd.h
+++ b/arch/sparc/include/uapi/asm/unistd.h
@@ -415,8 +415,9 @@
 #define __NR_getrandom		347
 #define __NR_memfd_create	348
 #define __NR_bpf		349
+#define __NR_execveat		350
 
-#define NR_syscalls		350
+#define NR_syscalls		351
 
 /* Bitmask values returned from kern_features system call.  */
 #define KERN_FEATURE_MIXED_MODE_STACK	0x00000001
diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S
index 33a17e7b3ccd..bb0008927598 100644
--- a/arch/sparc/kernel/syscalls.S
+++ b/arch/sparc/kernel/syscalls.S
@@ -6,6 +6,11 @@ sys64_execve:
 	jmpl	%g1, %g0
 	 flushw
 
+sys64_execveat:
+	set	sys_execveat, %g1
+	jmpl	%g1, %g0
+	 flushw
+
 #ifdef CONFIG_COMPAT
 sunos_execv:
 	mov	%g0, %o2
@@ -13,6 +18,11 @@ sys32_execve:
 	set	compat_sys_execve, %g1
 	jmpl	%g1, %g0
 	 flushw
+
+sys32_execveat:
+	set	compat_sys_execveat, %g1
+	jmpl	%g1, %g0
+	 flushw
 #endif
 
 	.align	32
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index ad0cdf497b78..e31a9056a303 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -87,3 +87,4 @@ sys_call_table:
 /*335*/	.long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
 /*340*/	.long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
 /*345*/	.long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
+/*350*/	.long sys_execveat
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index 580cde9370c9..d72f76ae70eb 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -88,6 +88,7 @@ sys_call_table32:
 	.word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev
 /*340*/	.word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
 	.word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
+/*350*/	.word sys32_execveat
 
 #endif /* CONFIG_COMPAT */
 
@@ -167,3 +168,4 @@ sys_call_table:
 	.word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
 /*340*/	.word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
 	.word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
+/*350*/	.word sys64_execveat
-- 
2.1.0.rc2.206.gedb03e5


^ permalink raw reply related

* [PATCHv10 man-pages 5/5] execveat.2: initial man page for execveat(2)
From: David Drysdale @ 2014-11-24 11:53 UTC (permalink / raw)
  To: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel, Andrew Morton, David Miller, Thomas Gleixner
  Cc: Stephen Rothwell, Oleg Nesterov, Michael Kerrisk, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86, linux-arch, linux-api, sparclinux,
	David Drysdale
In-Reply-To: <1416830039-21952-1-git-send-email-drysdale@google.com>

Signed-off-by: David Drysdale <drysdale@google.com>
---
 man2/execveat.2 | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 man2/execveat.2

diff --git a/man2/execveat.2 b/man2/execveat.2
new file mode 100644
index 000000000000..937d79e4c4f0
--- /dev/null
+++ b/man2/execveat.2
@@ -0,0 +1,153 @@
+.\" Copyright (c) 2014 Google, Inc.
+.\"
+.\" %%%LICENSE_START(VERBATIM)
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of this
+.\" manual under the conditions for verbatim copying, provided that the
+.\" entire resulting derived work is distributed under the terms of a
+.\" permission notice identical to this one.
+.\"
+.\" Since the Linux kernel and libraries are constantly changing, this
+.\" manual page may be incorrect or out-of-date.  The author(s) assume no
+.\" responsibility for errors or omissions, or for damages resulting from
+.\" the use of the information contained herein.  The author(s) may not
+.\" have taken the same level of care in the production of this manual,
+.\" which is licensed free of charge, as they might when working
+.\" professionally.
+.\"
+.\" Formatted or processed versions of this manual, if unaccompanied by
+.\" the source, must acknowledge the copyright and authors of this work.
+.\" %%%LICENSE_END
+.\"
+.TH EXECVEAT 2 2014-04-02 "Linux" "Linux Programmer's Manual"
+.SH NAME
+execveat \- execute program relative to a directory file descriptor
+.SH SYNOPSIS
+.B #include <unistd.h>
+.sp
+.BI "int execveat(int " fd ", const char *" pathname ","
+.br
+.BI "             char *const " argv "[],  char *const " envp "[],"
+.br
+.BI "             int " flags);
+.SH DESCRIPTION
+The
+.BR execveat ()
+system call executes the program pointed to by the combination of \fIfd\fP and \fIpathname\fP.
+The
+.BR execveat ()
+system call operates in exactly the same way as
+.BR execve (2),
+except for the differences described in this manual page.
+
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I fd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR execve (2)
+for a relative pathname).
+
+If
+.I pathname
+is relative and
+.I fd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR execve (2)).
+
+If
+.I pathname
+is absolute, then
+.I fd
+is ignored.
+
+If
+.I pathname
+is an empty string and the
+.BR AT_EMPTY_PATH
+flag is specified, then the file descriptor
+.I fd
+specifies the file to be executed.
+
+.I flags
+can either be 0, or include the following flags:
+.TP
+.BR AT_EMPTY_PATH
+If
+.I pathname
+is an empty string, operate on the file referred to by
+.IR fd
+(which may have been obtained using the
+.BR open (2)
+.B O_PATH
+flag).
+.TP
+.B AT_SYMLINK_NOFOLLOW
+If the file identified by
+.I fd
+and a non-NULL
+.I pathname
+is a symbolic link, then the call fails with the error
+.BR EINVAL .
+.SH "RETURN VALUE"
+On success,
+.BR execveat ()
+does not return. On error \-1 is returned, and
+.I errno
+is set appropriately.
+.SH ERRORS
+The same errors that occur for
+.BR execve (2)
+can also occur for
+.BR execveat ().
+The following additional errors can occur for
+.BR execveat ():
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.B ENOENT
+The program identified by \fIfd\fP and \fIpathname\fP requires the
+use of an interpreter program (such as a script starting with
+"#!") but the file descriptor
+.I fd
+was opened with the
+.B O_CLOEXEC
+flag and so the program file is inaccessible to the launched interpreter.
+.TP
+.B EINVAL
+Invalid flag specified in
+.IR flags .
+.TP
+.B ENOTDIR
+.I pathname
+is relative and
+.I fd
+is a file descriptor referring to a file other than a directory.
+.SH VERSIONS
+.BR execveat ()
+was added to Linux in kernel 3.???.
+.SH NOTES
+In addition to the reasons explained in
+.BR openat (2),
+the
+.BR execveat ()
+system call is also needed to allow
+.BR fexecve (3)
+to be implemented on systems that do not have the
+.I /proc
+filesystem mounted.
+.SH SEE ALSO
+.BR execve (2),
+.BR fexecve (3)
--
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH v3 32/41] tun: move internal flag defines out of uapi
From: Michael S. Tsirkin @ 2014-11-24 11:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, Jason Wang,
	Zhi Yong Wu, Tom Herbert, Ben Hutchings, Masatake YAMATO,
	Herbert Xu, Xi Wang, netdev, linux-api
In-Reply-To: <1416829787-14252-1-git-send-email-mst@redhat.com>

TUN_ flags are internal and never exposed
to userspace. Any application using it is almost
certainly buggy.

Move them out to tun.c, we'll remove them in follow-up patches.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/if_tun.h | 16 ++--------
 drivers/net/tun.c           | 74 ++++++++++++++-------------------------------
 2 files changed, 26 insertions(+), 64 deletions(-)

diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index e9502dd..277a260 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -22,21 +22,11 @@
 
 /* Read queue size */
 #define TUN_READQ_SIZE	500
-
-/* TUN device flags */
-#define TUN_TUN_DEV 	0x0001	
-#define TUN_TAP_DEV	0x0002
+/* TUN device type flags: deprecated. Use IFF_TUN/IFF_TAP instead. */
+#define TUN_TUN_DEV 	IFF_TUN
+#define TUN_TAP_DEV	IFF_TAP
 #define TUN_TYPE_MASK   0x000f
 
-#define TUN_FASYNC	0x0010
-#define TUN_NOCHECKSUM	0x0020
-#define TUN_NO_PI	0x0040
-/* This flag has no real effect */
-#define TUN_ONE_QUEUE	0x0080
-#define TUN_PERSIST 	0x0100	
-#define TUN_VNET_HDR 	0x0200
-#define TUN_TAP_MQ      0x0400
-
 /* Ioctl defines */
 #define TUNSETNOCSUM  _IOW('T', 200, int) 
 #define TUNSETDEBUG   _IOW('T', 201, int) 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9dd3746..bc89d07 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -103,6 +103,21 @@ do {								\
 } while (0)
 #endif
 
+/* TUN device flags */
+
+/* IFF_ATTACH_QUEUE is never stored in device flags,
+ * overload it to mean fasync when stored there.
+ */
+#define TUN_FASYNC	IFF_ATTACH_QUEUE
+#define TUN_NO_PI	IFF_NO_PI
+/* This flag has no real effect */
+#define TUN_ONE_QUEUE	IFF_ONE_QUEUE
+#define TUN_PERSIST 	IFF_PERSIST
+#define TUN_VNET_HDR 	IFF_VNET_HDR
+#define TUN_TAP_MQ      IFF_MULTI_QUEUE
+
+#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
+		      IFF_MULTI_QUEUE)
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -1521,32 +1536,7 @@ static struct proto tun_proto = {
 
 static int tun_flags(struct tun_struct *tun)
 {
-	int flags = 0;
-
-	if (tun->flags & TUN_TUN_DEV)
-		flags |= IFF_TUN;
-	else
-		flags |= IFF_TAP;
-
-	if (tun->flags & TUN_NO_PI)
-		flags |= IFF_NO_PI;
-
-	/* This flag has no real effect.  We track the value for backwards
-	 * compatibility.
-	 */
-	if (tun->flags & TUN_ONE_QUEUE)
-		flags |= IFF_ONE_QUEUE;
-
-	if (tun->flags & TUN_VNET_HDR)
-		flags |= IFF_VNET_HDR;
-
-	if (tun->flags & TUN_TAP_MQ)
-		flags |= IFF_MULTI_QUEUE;
-
-	if (tun->flags & TUN_PERSIST)
-		flags |= IFF_PERSIST;
-
-	return flags;
+	return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
 }
 
 static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr,
@@ -1706,28 +1696,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
 
-	if (ifr->ifr_flags & IFF_NO_PI)
-		tun->flags |= TUN_NO_PI;
-	else
-		tun->flags &= ~TUN_NO_PI;
-
-	/* This flag has no real effect.  We track the value for backwards
-	 * compatibility.
-	 */
-	if (ifr->ifr_flags & IFF_ONE_QUEUE)
-		tun->flags |= TUN_ONE_QUEUE;
-	else
-		tun->flags &= ~TUN_ONE_QUEUE;
-
-	if (ifr->ifr_flags & IFF_VNET_HDR)
-		tun->flags |= TUN_VNET_HDR;
-	else
-		tun->flags &= ~TUN_VNET_HDR;
-
-	if (ifr->ifr_flags & IFF_MULTI_QUEUE)
-		tun->flags |= TUN_TAP_MQ;
-	else
-		tun->flags &= ~TUN_TAP_MQ;
+	tun->flags = (tun->flags & ~TUN_FEATURES) |
+		(ifr->ifr_flags & TUN_FEATURES);
 
 	/* Make sure persistent devices do not get stuck in
 	 * xoff state.
@@ -1890,9 +1860,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	if (cmd == TUNGETFEATURES) {
 		/* Currently this just means: "what IFF flags are valid?".
 		 * This is needed because we never checked for invalid flags on
-		 * TUNSETIFF. */
-		return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE |
-				IFF_VNET_HDR | IFF_MULTI_QUEUE,
+		 * TUNSETIFF.  Why do we report IFF_TUN and IFF_TAP which are
+		 * not legal for TUNSETIFF here?  It's probably a bug, but it
+		 * doesn't seem to be worth fixing.
+		 */
+		return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES,
 				(unsigned int __user*)argp);
 	} else if (cmd == TUNSETQUEUE)
 		return tun_set_queue(file, &ifr);
-- 
MST

^ permalink raw reply related

* [PATCH v3 34/41] tun: add VNET_LE flag
From: Michael S. Tsirkin @ 2014-11-24 11:55 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, netdev,
	linux-api
In-Reply-To: <1416829787-14252-1-git-send-email-mst@redhat.com>

virtio 1.0 modified virtio net header format,
making all fields little endian.

Users can tweak header format before submitting it to tun,
but this means more data copies where none were necessary.
And if the iovec is in RO memory, this means we might
need to split iovec also means we might in theory overflow
iovec max size.

This patch adds a simpler way for applications to handle this,
using new "little endian" flag in tun.
As a result, tun simply byte-swaps header fields as appropriate.
This is a NOP on LE architectures.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/if_tun.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 277a260..18b2403 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -57,6 +57,7 @@
 #define IFF_ONE_QUEUE	0x2000
 #define IFF_VNET_HDR	0x4000
 #define IFF_TUN_EXCL	0x8000
+#define IFF_VNET_LE	0x10000
 #define IFF_MULTI_QUEUE 0x0100
 #define IFF_ATTACH_QUEUE 0x0200
 #define IFF_DETACH_QUEUE 0x0400
-- 
MST

^ permalink raw reply related

* [PATCH v3 39/41] virtio_scsi: export to userspace
From: Michael S. Tsirkin @ 2014-11-24 11:55 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, cornelia.huck, rusty, nab, pbonzini, Rusty Russell,
	Alexei Starovoitov, Greg Kroah-Hartman, David Drysdale,
	Geert Uytterhoeven, Bjarke Istrup Pedersen, stephen hemminger,
	Anup Patel, linux-api, virtualization
In-Reply-To: <1416829787-14252-1-git-send-email-mst@redhat.com>

Replace uXX by __uXX and _packed by __attribute((packed))
as seems to be the norm for userspace headers.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_scsi.h | 74 ++++++++++++++++++++--------------------
 include/uapi/linux/Kbuild        |  1 +
 2 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/include/uapi/linux/virtio_scsi.h b/include/uapi/linux/virtio_scsi.h
index af44864..42b9370 100644
--- a/include/uapi/linux/virtio_scsi.h
+++ b/include/uapi/linux/virtio_scsi.h
@@ -34,78 +34,78 @@
 
 /* SCSI command request, followed by data-out */
 struct virtio_scsi_cmd_req {
-	u8 lun[8];		/* Logical Unit Number */
+	__u8 lun[8];		/* Logical Unit Number */
 	__virtio64 tag;		/* Command identifier */
-	u8 task_attr;		/* Task attribute */
-	u8 prio;		/* SAM command priority field */
-	u8 crn;
-	u8 cdb[VIRTIO_SCSI_CDB_SIZE];
-} __packed;
+	__u8 task_attr;		/* Task attribute */
+	__u8 prio;		/* SAM command priority field */
+	__u8 crn;
+	__u8 cdb[VIRTIO_SCSI_CDB_SIZE];
+} __attribute__((packed));
 
 /* SCSI command request, followed by protection information */
 struct virtio_scsi_cmd_req_pi {
-	u8 lun[8];		/* Logical Unit Number */
+	__u8 lun[8];		/* Logical Unit Number */
 	__virtio64 tag;		/* Command identifier */
-	u8 task_attr;		/* Task attribute */
-	u8 prio;		/* SAM command priority field */
-	u8 crn;
+	__u8 task_attr;		/* Task attribute */
+	__u8 prio;		/* SAM command priority field */
+	__u8 crn;
 	__virtio32 pi_bytesout;	/* DataOUT PI Number of bytes */
 	__virtio32 pi_bytesin;		/* DataIN PI Number of bytes */
-	u8 cdb[VIRTIO_SCSI_CDB_SIZE];
-} __packed;
+	__u8 cdb[VIRTIO_SCSI_CDB_SIZE];
+} __attribute__((packed));
 
 /* Response, followed by sense data and data-in */
 struct virtio_scsi_cmd_resp {
 	__virtio32 sense_len;		/* Sense data length */
 	__virtio32 resid;		/* Residual bytes in data buffer */
 	__virtio16 status_qualifier;	/* Status qualifier */
-	u8 status;		/* Command completion status */
-	u8 response;		/* Response values */
-	u8 sense[VIRTIO_SCSI_SENSE_SIZE];
-} __packed;
+	__u8 status;		/* Command completion status */
+	__u8 response;		/* Response values */
+	__u8 sense[VIRTIO_SCSI_SENSE_SIZE];
+} __attribute__((packed));
 
 /* Task Management Request */
 struct virtio_scsi_ctrl_tmf_req {
 	__virtio32 type;
 	__virtio32 subtype;
-	u8 lun[8];
+	__u8 lun[8];
 	__virtio64 tag;
-} __packed;
+} __attribute__((packed));
 
 struct virtio_scsi_ctrl_tmf_resp {
-	u8 response;
-} __packed;
+	__u8 response;
+} __attribute__((packed));
 
 /* Asynchronous notification query/subscription */
 struct virtio_scsi_ctrl_an_req {
 	__virtio32 type;
-	u8 lun[8];
+	__u8 lun[8];
 	__virtio32 event_requested;
-} __packed;
+} __attribute__((packed));
 
 struct virtio_scsi_ctrl_an_resp {
 	__virtio32 event_actual;
-	u8 response;
-} __packed;
+	__u8 response;
+} __attribute__((packed));
 
 struct virtio_scsi_event {
 	__virtio32 event;
-	u8 lun[8];
+	__u8 lun[8];
 	__virtio32 reason;
-} __packed;
+} __attribute__((packed));
 
 struct virtio_scsi_config {
-	u32 num_queues;
-	u32 seg_max;
-	u32 max_sectors;
-	u32 cmd_per_lun;
-	u32 event_info_size;
-	u32 sense_size;
-	u32 cdb_size;
-	u16 max_channel;
-	u16 max_target;
-	u32 max_lun;
-} __packed;
+	__u32 num_queues;
+	__u32 seg_max;
+	__u32 max_sectors;
+	__u32 cmd_per_lun;
+	__u32 event_info_size;
+	__u32 sense_size;
+	__u32 cdb_size;
+	__u16 max_channel;
+	__u16 max_target;
+	__u32 max_lun;
+} __attribute__((packed));
 
 /* Feature Bits */
 #define VIRTIO_SCSI_F_INOUT                    0
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 44a5581..e61947c 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -428,6 +428,7 @@ header-y += virtio_net.h
 header-y += virtio_pci.h
 header-y += virtio_ring.h
 header-y += virtio_rng.h
+header-y += virtio_scsi.h
 header=y += vm_sockets.h
 header-y += vt.h
 header-y += wait.h
-- 
MST

^ permalink raw reply related

* Re: [PATCH v3 04/41] virtio: memory access APIs
From: Geert Uytterhoeven @ 2014-11-24 12:03 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Bjarke Istrup Pedersen, Anup Patel, rusty, Greg Kroah-Hartman,
	linux-kernel@vger.kernel.org, virtualization, Sakari Ailus,
	linux-api, Paolo Bonzini, Pranavkumar Sawargaonkar, David Miller,
	Alexei Starovoitov
In-Reply-To: <1416829787-14252-5-git-send-email-mst@redhat.com>

On Mon, Nov 24, 2014 at 12:52 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> virtio 1.0 makes all memory structures LE, so
> we need APIs to conditionally do a byteswap on BE
> architectures.
>
> To make it easier to check code statically,
> add virtio specific types for multi-byte integers
> in memory.
>
> Add low level wrappers that do a byteswap conditionally, these will be
> useful e.g. for vhost.  Add high level wrappers that
> query device endian-ness and act accordingly.

> diff --git a/include/linux/virtio_byteorder.h b/include/linux/virtio_byteorder.h
> new file mode 100644
> index 0000000..824ed0b
> --- /dev/null
> +++ b/include/linux/virtio_byteorder.h

> +static inline u16 __virtio16_to_cpu(bool little_endian, __virtio16 val)
> +{
> +       if (little_endian)
> +               return le16_to_cpu((__force __le16)val);
> +       else
> +               return (__force u16)val;
> +}

What's wrong with just using le16-to_cpu() ...

> --- a/include/uapi/linux/virtio_ring.h
> +++ b/include/uapi/linux/virtio_ring.h

>  /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
>  struct vring_desc {
>         /* Address (guest-physical). */
> -       __u64 addr;
> +       __virtio64 addr;

... and __le64?

There's already lots of precedence or this, even in include/uapi/.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH v3 04/41] virtio: memory access APIs
From: Michael S. Tsirkin @ 2014-11-24 12:15 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Bjarke Istrup Pedersen, Anup Patel, rusty, Greg Kroah-Hartman,
	linux-kernel@vger.kernel.org, virtualization, Sakari Ailus,
	linux-api, Paolo Bonzini, Pranavkumar Sawargaonkar, David Miller,
	Alexei Starovoitov
In-Reply-To: <CAMuHMdWiCqbaOs-oBW3QU+_d3GsH19A2Ynv9EB_eF2sTvcWy4w@mail.gmail.com>

On Mon, Nov 24, 2014 at 01:03:24PM +0100, Geert Uytterhoeven wrote:
> On Mon, Nov 24, 2014 at 12:52 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > virtio 1.0 makes all memory structures LE, so
> > we need APIs to conditionally do a byteswap on BE
> > architectures.
> >
> > To make it easier to check code statically,
> > add virtio specific types for multi-byte integers
> > in memory.
> >
> > Add low level wrappers that do a byteswap conditionally, these will be
> > useful e.g. for vhost.  Add high level wrappers that
> > query device endian-ness and act accordingly.
> 
> > diff --git a/include/linux/virtio_byteorder.h b/include/linux/virtio_byteorder.h
> > new file mode 100644
> > index 0000000..824ed0b
> > --- /dev/null
> > +++ b/include/linux/virtio_byteorder.h
> 
> > +static inline u16 __virtio16_to_cpu(bool little_endian, __virtio16 val)
> > +{
> > +       if (little_endian)
> > +               return le16_to_cpu((__force __le16)val);
> > +       else
> > +               return (__force u16)val;
> > +}
> 
> What's wrong with just using le16-to_cpu() ...

le16-to_cpu() is simply wrong: virtio needs to be
LE or native endian, depending on whether it's running
in 0.9 or 1.0 mode.

> > --- a/include/uapi/linux/virtio_ring.h
> > +++ b/include/uapi/linux/virtio_ring.h
> 
> >  /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
> >  struct vring_desc {
> >         /* Address (guest-physical). */
> > -       __u64 addr;
> > +       __virtio64 addr;
> 
> ... and __le64?
> 
> There's already lots of precedence or this, even in include/uapi/.
> 
> Gr{oetje,eeting}s,
> 
>                         Geert

__le would make people think they can use le16-to_cpu() which is wrong.

-- 
MST

^ permalink raw reply

* Re: [PATCHv10 2/5] x86: Hook up execveat system call.
From: Thomas Gleixner @ 2014-11-24 12:45 UTC (permalink / raw)
  To: David Drysdale
  Cc: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andrew Morton, David Miller,
	Stephen Rothwell, Oleg Nesterov, Michael Kerrisk, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86-DgEjT+Ai2ygdnm+yROfE0A,
	linux-arch-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	sparclinux-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1416830039-21952-3-git-send-email-drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

On Mon, 24 Nov 2014, David Drysdale wrote:

> Hook up x86-64, i386 and x32 ABIs.
> 
> Signed-off-by: David Drysdale <drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Reviewed-by: Thomas Gleixner <tglx-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org>

^ permalink raw reply

* Re: [PATCH v3 04/41] virtio: memory access APIs
From: Geert Uytterhoeven @ 2014-11-24 12:58 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	David Miller, cornelia.huck-tA70FqPdS9bQT0dZR+AlfA,
	rusty-8fk3Idey6ehBDgjK7y7TUQ, Nicholas Bellinger, Paolo Bonzini,
	Rusty Russell, Greg Kroah-Hartman, Alexei Starovoitov,
	Pranavkumar Sawargaonkar, Anup Patel, Sakari Ailus,
	Bjarke Istrup Pedersen, stephen hemminger,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20141124121503.GB14353-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On Mon, Nov 24, 2014 at 1:15 PM, Michael S. Tsirkin <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> On Mon, Nov 24, 2014 at 01:03:24PM +0100, Geert Uytterhoeven wrote:
>> On Mon, Nov 24, 2014 at 12:52 PM, Michael S. Tsirkin <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>> > virtio 1.0 makes all memory structures LE, so
>> > we need APIs to conditionally do a byteswap on BE
>> > architectures.
>> >
>> > To make it easier to check code statically,
>> > add virtio specific types for multi-byte integers
>> > in memory.
>> >
>> > Add low level wrappers that do a byteswap conditionally, these will be
>> > useful e.g. for vhost.  Add high level wrappers that
>> > query device endian-ness and act accordingly.
>>
>> > diff --git a/include/linux/virtio_byteorder.h b/include/linux/virtio_byteorder.h
>> > new file mode 100644
>> > index 0000000..824ed0b
>> > --- /dev/null
>> > +++ b/include/linux/virtio_byteorder.h
>>
>> > +static inline u16 __virtio16_to_cpu(bool little_endian, __virtio16 val)
>> > +{
>> > +       if (little_endian)
>> > +               return le16_to_cpu((__force __le16)val);
>> > +       else
>> > +               return (__force u16)val;
>> > +}
>>
>> What's wrong with just using le16-to_cpu() ...
>
> le16-to_cpu() is simply wrong: virtio needs to be
> LE or native endian, depending on whether it's running
> in 0.9 or 1.0 mode.

IC, that was not clear from the description for this patch.
I thought it was dependent on BE architectures.

Nevertheless, any chance you can get rid of the "conditional"?

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert-Td1EMuHUCqxL1ZNQvxDV9g@public.gmane.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH net-next v4 0/4] netns: allow to identify peer netns
From: Nicolas Dichtel @ 2014-11-24 13:45 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, luto-kltTT9wpgjJwATOyAt5JVQ,
	stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ,
	cwang-xCSkyg8dI+0RB7SZvlqPiA, linux-api-u79uwXL29TY76Z2rM5mHXA,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q
In-Reply-To: <871tpph03k.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>

Le 30/10/2014 19:41, Eric W. Biederman a écrit :
> Nicolas Dichtel <nicolas.dichtel@6wind.com> writes:
>
>> The goal of this serie is to be able to multicast netlink messages with an
>> attribute that identify a peer netns.
>> This is needed by the userland to interpret some informations contained in
>> netlink messages (like IFLA_LINK value, but also some other attributes in case
>> of x-netns netdevice (see also
>> http://thread.gmane.org/gmane.linux.network/315933/focus=316064 and
>> http://thread.gmane.org/gmane.linux.kernel.containers/28301/focus=4239)).
>>
>> Ids of peer netns are set by userland via a new genl messages. These ids are
>> stored per netns and are local (ie only valid in the netns where they are set).
>> To avoid allocating an int for each peer netns, I use idr_for_each() to retrieve
>> the id of a peer netns. Note that it will be possible to add a table (struct net
>> -> id) later to optimize this lookup if needed.
>>
>> Patch 1/4 introduces the netlink API mechanism to set and get these ids.
>> Patch 2/4 and 3/4 implements an example of how to use these ids in rtnetlink
>> messages. And patch 4/4 shows that the netlink messages can be symetric between
>> a GET and a SET.
>>
>> iproute2 patches are available, I can send them on demand.
>
> A quick reply.  I think this patchset is in the right general direction.
> There are some oddball details that seem odd/awkward to me such as using
> genetlink instead of rtnetlink to get and set the ids, and not having
> ids if they are not set (that feels like a maintenance/usability challenge).
>
> I would like to give your patches a deep review, but I won't be able to
> do that for a couple of weeks.  I am deep in the process of moving,
> and will be mostly offline until about the Nov 11th.
Eric, did you have a chance to look at this?


Regards,
Nicolas
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/containers

^ permalink raw reply

* Re: [PATCH v3 1/7] crypto: AF_ALG: add user space interface for AEAD
From: Herbert Xu @ 2014-11-24 14:26 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: Daniel Borkmann, 'Quentin Gouchet',
	lkml - Kernel Mailing List, linux-crypto-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <5694690.RURGUoE58b-PJstQz4BMNNP20K/wil9xYQuADTiUCJX@public.gmane.org>

On Fri, Nov 21, 2014 at 06:30:18AM +0100, Stephan Mueller wrote:
>
> @@ -421,6 +421,18 @@ int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con)
>  			con->op = *(u32 *)CMSG_DATA(cmsg);
>  			break;
>  
> +		case ALG_SET_AEAD_AUTHSIZE:
> +			if (cmsg->cmsg_len < CMSG_LEN(sizeof(u32)))
> +				return -EINVAL;
> +			con->aead_authsize = *(u32 *)CMSG_DATA(cmsg);
> +			break;

This is a tfm attribute so it should go into setsockopt.

Cheers,
-- 
Email: Herbert Xu <herbert-lOAM2aK0SrRLBo1qDEOMRrpzq4S04n8Q@public.gmane.org>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH v3 4/7] crypto: AF_ALG: add AEAD support
From: Herbert Xu @ 2014-11-24 14:29 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: Daniel Borkmann, 'Quentin Gouchet',
	lkml - Kernel Mailing List, linux-crypto-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <2175035.5IWBGpA0Ko-PJstQz4BMNNP20K/wil9xYQuADTiUCJX@public.gmane.org>

On Fri, Nov 21, 2014 at 06:32:16AM +0100, Stephan Mueller wrote:
> This patch adds the AEAD support for AF_ALG.
> 
> The AEAD implementation uses the entire memory handling and
> infrastructure of the existing skcipher implementation.
> 
> To use AEAD, the user space consumer has to use the salg_type named
> "aead". The AEAD extension only uses the bind callback as the key
> differentiator. The previously added functions that select whether to
> use AEAD or ablkcipher crypto API functions depend on the TFM type
> allocated during the bind() call.
> 
> The addition of AEAD brings a bit of overhead to calculate the size of
> the ciphertext, because the AEAD implementation of the kernel crypto API
> makes implied assumption on the location of the authentication tag. When
> performing an encryption, the tag will be added to the created
> ciphertext (note, the tag is placed adjacent to the ciphertext). For
> decryption, the caller must hand in the ciphertext with the tag appended
> to the ciphertext. Therefore, the selection of the used memory
> needs to add/subtract the tag size from the source/destination buffers
> depending on the encryption type. The code is provided with comments
> explainint when and how that operation is performed.
> 
> Note: The AF_ALG interface does not support zero length input data.
> Such zero length input data may be used if one wants to access the hash
> implementation of an AEAD directly (e.g. the GHASH of GCM or CMAC for
> CCM). However, this is a use case that is not of interest. GHASH or
> CMAC is directly available via the hash AF_ALG interface and we
> therefore do not need to take precautions for this use case.
> 
> A fully working example using all aspects of AEAD is provided at
> http://www.chronox.de/libkcapi.html
> 
> Signed-off-by: Stephan Mueller <smueller-T9tCv8IpfcWELgA04lAiVw@public.gmane.org>

I appreciate the effort to share code, but shoe-horning AEAD into
algif_skcipher is just too ugly.

How about let's just start with a separate algif_aead and then
add helpers to merge common code as applicable?

Thanks,
-- 
Email: Herbert Xu <herbert-lOAM2aK0SrRLBo1qDEOMRrpzq4S04n8Q@public.gmane.org>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH v3 5/7] crypto: AF_ALG: add random number generator support
From: Herbert Xu @ 2014-11-24 14:31 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: Daniel Borkmann, 'Quentin Gouchet',
	lkml - Kernel Mailing List, linux-crypto, linux-api
In-Reply-To: <2490914.nzfN9gyyGH@tachyon.chronox.de>

On Fri, Nov 21, 2014 at 06:32:52AM +0100, Stephan Mueller wrote:
> This patch adds the random number generator support for AF_ALG.
> 
> A random number generator's purpose is to generate data without
> requiring the caller to provide any data. Therefore, the AF_ALG
> interface handler for RNGs only implements a callback handler for
> recvmsg.
> 
> The following parameters provided with a recvmsg are processed by the
> RNG callback handler:
> 
>         * sock - to resolve the RNG context data structure accessing the
>           RNG instance private to the socket
> 
>         * len - this parameter allows userspace callers to specify how
>           many random bytes the RNG shall produce and return. As the
>           kernel context for the RNG allocates a buffer of 128 bytes to
>           store random numbers before copying them to userspace, the len
>           parameter is checked that it is not larger than 128. If a
>           caller wants more random numbers, a new request for recvmsg
>           shall be made.
> 
> The size of 128 bytes is chose because of the following considerations:
> 
>         * to increase the memory footprint of the kernel too much (note,
>           that would be 128 bytes per open socket)
> 
>         * 128 is divisible by any typical cryptographic block size an
>           RNG may have
> 
>         * A request for random numbers typically only shall supply small
>           amount of data like for keys or IVs that should only require
>           one invocation of the recvmsg function.
> 
> Note, during instantiation of the RNG, the code checks whether the RNG
> implementation requires seeding. If so, the RNG is seeded with output
> from get_random_bytes.
> 
> A fully working example using all aspects of the RNG interface is
> provided at http://www.chronox.de/libkcapi.html
> 
> Signed-off-by: Stephan Mueller <smueller@chronox.de>

Sorry but who is going to use this and for what purpose?

Every other algif interface exports real hardware features that
cannot otherwise be accessed from user-space.  All crypto RNGs
are by definition software-only, so what is the point of this?

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH 0/6] kernel tinification: optionally compile out splice family of syscalls (splice, vmsplice, tee and sendfile)
From: Josh Triplett @ 2014-11-24 14:54 UTC (permalink / raw)
  To: Pieter Smith
  Cc: Jeff Layton, David Miller,
	alexander.h.duyck-ral2JQCrhuEAvxtiuMwx3w,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, ast-uqk4Ao+rVK5Wk0Htik3J/w,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	beber-2YnHqweIUXrk1uMJSBkQmQ,
	catalina.mocanu-Re5JQEeQqe8AvxtiuMwx3w,
	dborkman-H+wXaHxf7aLQT0dZR+AlfA, edumazet-hpIqsD4AKlfQT0dZR+AlfA,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w, fabf-AgBVmzD5pcezQB+pC5nmwQ,
	fuse-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	geert-Td1EMuHUCqxL1ZNQvxDV9g, hughd-hpIqsD4AKlfQT0dZR+AlfA,
	iulia.manda21-Re5JQEeQqe8AvxtiuMwx3w, JBeulich-IBi9RG/b67k,
	bfields-uC3wQj2KruNg9hUCZPvPmw, linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, mcgrof-IBi9RG/b67k,
	mattst88-Re5JQEeQqe8AvxtiuMwx3w, mgorman-l3A5Bk7waGM,
	mst-H+wXaHxf7aLQT0dZR+AlfA, miklos-sUDqSbJrdHQHWmgEVkV9KA,
	netdev-u79uwXL29TY76Z2rM5mHXA, oleg-H+wXaHxf7aLQT0dZR+AlfA,
	Paul.Durrant-Sxgqhf6Nn4DQT0dZR+AlfA,
	paulmck-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	pefoley2-lY0TAiDIAFlBDgjK7y7TUQ, tgraf-G/eBtMaohhA,
	therbert-hpIqsD4AKlfQT0dZR+AlfA, willemb-hpIqsD4AKlfQT0dZR+AlfA,
	xiaoguangrong-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	zhenglong.cai-TJRtMXcVgQTM1kAEIRd3EQ
In-Reply-To: <20141124100138.GB1055@smipidev>

On Mon, Nov 24, 2014 at 11:01:38AM +0100, Pieter Smith wrote:
> On Sun, Nov 23, 2014 at 04:32:51PM -0800, Josh Triplett wrote:
> > On Sun, Nov 23, 2014 at 07:28:10PM -0500, Jeff Layton wrote:
> > > On Sun, 23 Nov 2014 15:36:37 -0800
> > > Josh Triplett <josh-iaAMLnmF4UmaiuxdJuQwMA@public.gmane.org> wrote:
> > > 
> > > > On Sun, Nov 23, 2014 at 09:30:40PM +0100, Pieter Smith wrote:
> > > > > On Sun, Nov 23, 2014 at 11:43:26AM -0800, Josh Triplett wrote:
> > > > > > On Sun, Nov 23, 2014 at 01:46:23PM -0500, David Miller wrote:
> > > > > > > Truly removing sendfile/sendpage means that you can't even compile NFS
> > > > > > > into the tree.
> > > > > > 
> > > > > > If you mean the in-kernel nfsd (CONFIG_NFSD), that already has a large
> > > > > > stack of "select" and "depends on", both directly and indirectly; adding
> > > > > > a "select SPLICE_SYSCALL" to it seems fine.  (That select does need
> > > > > > adding, though.  Pieter, you need to test-compile more than just
> > > > > > tinyconfig and defconfig.  Try an allyesconfig with *just* splice turned
> > > > > > off, and make sure that compiles.)
> > > > > 
> > > > > Did exacly that. Took forever on my hardware, but no problems.
> > > > 
> > > > Ah, I see.  Looking more closely at nfsd, it looks like it already has a
> > > > code path for filesystems that don't do splice.  I think, rather than
> > > > making nfsd select SPLICE_SYSCALL, that it would suffice to change the
> > > > "rqstp->rq_splice_ok = true;" in svc_process_common (net/sunrpc/svc.c)
> > > > to:
> > > > 
> > > > rqstp->rq_splice_ok = IS_ENABLED(CONFIG_SPLICE_SYSCALL);
> > > > 
> > > > Then nfsd should simply *always* fall back to its non-splice support.
> > > > 
> > > 
> > > I'd probably prefer the above, actually. We have to keep supporting
> > > non-splice enabled fs' for the forseeable future, so we may as well
> > > allow people to run nfsd in such configurations. It could even be
> > > useful for testing the non-splice-enabled codepaths.
> > 
> > Good point!
> > 
> > - Josh Triplett
> 
> I'll add this to svc_process_common. I can squash this into PATCH 3, which is
> where the syscalls can be compiled out. The log entry may however get a little
> crowded and multi-functional.
> 
> Should I keep this as a separate patch?

I'd keep it as a separate patch, yes.  It doesn't become necessary until
patch 6, so you can add it as a new patch between patches 3 and 6.

- Josh Triplett

^ permalink raw reply

* Re: [PATCH v3 4/7] crypto: AF_ALG: add AEAD support
From: Stephan Mueller @ 2014-11-24 14:58 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Daniel Borkmann, 'Quentin Gouchet',
	lkml - Kernel Mailing List, linux-crypto, linux-api
In-Reply-To: <20141124142945.GB31469@gondor.apana.org.au>

Am Montag, 24. November 2014, 22:29:46 schrieb Herbert Xu:

Hi Herbert,

>On Fri, Nov 21, 2014 at 06:32:16AM +0100, Stephan Mueller wrote:
>> This patch adds the AEAD support for AF_ALG.
>> 
>> The AEAD implementation uses the entire memory handling and
>> infrastructure of the existing skcipher implementation.
>> 
>> To use AEAD, the user space consumer has to use the salg_type named
>> "aead". The AEAD extension only uses the bind callback as the key
>> differentiator. The previously added functions that select whether to
>> use AEAD or ablkcipher crypto API functions depend on the TFM type
>> allocated during the bind() call.
>> 
>> The addition of AEAD brings a bit of overhead to calculate the size
>> of
>> the ciphertext, because the AEAD implementation of the kernel crypto
>> API makes implied assumption on the location of the authentication
>> tag. When performing an encryption, the tag will be added to the
>> created ciphertext (note, the tag is placed adjacent to the
>> ciphertext). For decryption, the caller must hand in the ciphertext
>> with the tag appended to the ciphertext. Therefore, the selection of
>> the used memory needs to add/subtract the tag size from the
>> source/destination buffers depending on the encryption type. The
>> code is provided with comments explainint when and how that
>> operation is performed.
>> 
>> Note: The AF_ALG interface does not support zero length input data.
>> Such zero length input data may be used if one wants to access the
>> hash implementation of an AEAD directly (e.g. the GHASH of GCM or
>> CMAC for CCM). However, this is a use case that is not of interest.
>> GHASH or CMAC is directly available via the hash AF_ALG interface
>> and we therefore do not need to take precautions for this use case.
>> 
>> A fully working example using all aspects of AEAD is provided at
>> http://www.chronox.de/libkcapi.html
>> 
>> Signed-off-by: Stephan Mueller <smueller@chronox.de>
>
>I appreciate the effort to share code, but shoe-horning AEAD into
>algif_skcipher is just too ugly.

Ok. But in the code you see that skcipher is a 100% subset of AEAD. For 
AEAD, all we need to do in addition to normal symmetric ciphers is to 
select the AEAD kernel crypto API calls, to locate and use the AD and to 
ensure we have the right memory size to process the tag.

How about the following:

Use algif_skcipher.c as the common code which requires:

- function pointers for the kernel crypto API backends

- function pointers for the AEAD specific handling which are invoked at 
the right places -- they are NULL in case of skcipher
>
>How about let's just start with a separate algif_aead and then
>add helpers to merge common code as applicable?

When we have a separate algif_aead, then I guess we want to allow 
skcipher and aead to be configured and compiled independently.

That raises the question where the common code should go? I would not 
suggest to put it into a header file. However, when adding it to a C 
file, how do you propose it should be considered in the Makefile. That C 
file needs to be compile once with either skcipher or aead.
>
>Thanks,


Ciao
Stephan

^ permalink raw reply

* Re: [PATCH v3 5/7] crypto: AF_ALG: add random number generator support
From: Stephan Mueller @ 2014-11-24 15:08 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Daniel Borkmann, 'Quentin Gouchet',
	lkml - Kernel Mailing List, linux-crypto, linux-api
In-Reply-To: <20141124143150.GC31469@gondor.apana.org.au>

Am Montag, 24. November 2014, 22:31:50 schrieb Herbert Xu:

Hi Herbert,

>On Fri, Nov 21, 2014 at 06:32:52AM +0100, Stephan Mueller wrote:
>> This patch adds the random number generator support for AF_ALG.
>> 
>> A random number generator's purpose is to generate data without
>> requiring the caller to provide any data. Therefore, the AF_ALG
>> interface handler for RNGs only implements a callback handler for
>> recvmsg.
>> 
>> The following parameters provided with a recvmsg are processed by the
>> 
>> RNG callback handler:
>>         * sock - to resolve the RNG context data structure accessing
>>         the
>>         
>>           RNG instance private to the socket
>>         
>>         * len - this parameter allows userspace callers to specify
>>         how
>>         
>>           many random bytes the RNG shall produce and return. As the
>>           kernel context for the RNG allocates a buffer of 128 bytes
>>           to
>>           store random numbers before copying them to userspace, the
>>           len
>>           parameter is checked that it is not larger than 128. If a
>>           caller wants more random numbers, a new request for recvmsg
>>           shall be made.
>> 
>> The size of 128 bytes is chose because of the following 
considerations:
>>         * to increase the memory footprint of the kernel too much
>>         (note,
>>         
>>           that would be 128 bytes per open socket)
>>         
>>         * 128 is divisible by any typical cryptographic block size an
>>         
>>           RNG may have
>>         
>>         * A request for random numbers typically only shall supply
>>         small
>>         
>>           amount of data like for keys or IVs that should only
>>           require
>>           one invocation of the recvmsg function.
>> 
>> Note, during instantiation of the RNG, the code checks whether the
>> RNG
>> implementation requires seeding. If so, the RNG is seeded with output
>> from get_random_bytes.
>> 
>> A fully working example using all aspects of the RNG interface is
>> provided at http://www.chronox.de/libkcapi.html
>> 
>> Signed-off-by: Stephan Mueller <smueller@chronox.de>
>
>Sorry but who is going to use this and for what purpose?
>
>Every other algif interface exports real hardware features that
>cannot otherwise be accessed from user-space.  All crypto RNGs
>are by definition software-only, so what is the point of this?


My idea is twofold: The software-RNGs currently available (X9.31 and 
DRBG) use other ciphers as backends. Therefore, they can be considered 
as transforms on top of these backend ciphers. Now, if these backend 
ciphers are available in kernel mode only, currently only these in-
kernel RNGs can use the hardware.

With the consideration of AEAD, all ciphers supported by the kernel 
crypto API are available to user space. That means, there is no need for 
an additional crypto library in user space in addition to provide 
hardware access. The RNG part is there to complement the case for not 
needing an additional crypto lib in user space.

Ciao
Stephan

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox