Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH v13 1/2] crypto: AF_ALG: add AEAD support
From: Stephan Mueller @ 2015-02-28 19:50 UTC (permalink / raw)
  To: 'Herbert Xu
  Cc: 'Quentin Gouchet', Daniel Borkmann, linux-api,
	linux-crypto, linux-kernel
In-Reply-To: <2180298.gqj58NYuqx@tachyon.chronox.de>

This patch adds the AEAD support for AF_ALG.

The implementation is based on algif_skcipher, but contains heavy
modifications to streamline the interface for AEAD uses.

To use AEAD, the user space consumer has to use the salg_type named
"aead".

The AEAD implementation includes some overhead to calculate the size of
the ciphertext, because the AEAD implementation of the kernel crypto API
makes implicit assumptions about the location of the authentication tag. When
performing an encryption, the tag will be added to the created
ciphertext (note, the tag is placed adjacent to the ciphertext). For
decryption, the caller must hand in the ciphertext with the tag appended
to the ciphertext. Therefore, the selection of the used memory
needs to add/subtract the tag size from the source/destination buffers
depending on the encryption type. The code is provided with comments
explaining when and how that operation is performed.

A fully working example using all aspects of AEAD is provided at
http://www.chronox.de/libkcapi.html

Signed-off-by: Stephan Mueller <smueller@chronox.de>
---
 crypto/algif_aead.c | 666 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 666 insertions(+)
 create mode 100644 crypto/algif_aead.c

diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
new file mode 100644
index 0000000..527d27b
--- /dev/null
+++ b/crypto/algif_aead.c
@@ -0,0 +1,666 @@
+/*
+ * algif_aead: User-space interface for AEAD algorithms
+ *
+ * Copyright (C) 2014, Stephan Mueller <smueller@chronox.de>
+ *
+ * This file provides the user-space API for AEAD ciphers.
+ *
+ * This file is derived from algif_skcipher.c.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/scatterwalk.h>
+#include <crypto/if_alg.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <net/sock.h>
+
+/*
+ * Transmit-side scatterlist: tracks the input pages queued by
+ * aead_sendmsg()/aead_sendpage() until aead_recvmsg() consumes them.
+ */
+struct aead_sg_list {
+	unsigned int cur;	/* number of sg[] entries currently in use */
+	struct scatterlist sg[ALG_MAX_PAGES];
+};
+
+/*
+ * Per-socket state of an "aead" operation socket. Allocated and initialised
+ * in aead_accept_parent(), released in aead_sock_destruct().
+ */
+struct aead_ctx {
+	struct aead_sg_list tsgl;	/* input queued by sendmsg/sendpage */
+	/*
+	 * RSGL_MAX_ENTRIES is an artificial limit where user space at maximum
+	 * can cause the kernel to allocate RSGL_MAX_ENTRIES * ALG_MAX_PAGES
+	 * bytes
+	 */
+#define RSGL_MAX_ENTRIES ALG_MAX_PAGES
+	struct af_alg_sgl rsgl[RSGL_MAX_ENTRIES]; /* output, mapped in recvmsg */
+
+	void *iv;	/* IV buffer of crypto_aead_ivsize() bytes */
+
+	struct af_alg_completion completion;	/* completion for aead_req */
+
+	unsigned long used;	/* number of bytes currently queued in tsgl */
+
+	unsigned int len;	/* allocation size of this context */
+	bool more;	/* last send had MSG_MORE set: input not yet complete */
+	bool merge;	/* last tsgl page is only partially filled */
+	bool enc;	/* true: encrypt, false: decrypt */
+
+	size_t aead_assoclen;	/* associated data length from the control msg */
+	/*
+	 * Keep this member last: the context is allocated with
+	 * crypto_aead_reqsize() additional bytes behind the structure
+	 * (presumably the request's private context).
+	 */
+	struct aead_request aead_req;
+};
+
+/*
+ * Number of bytes that may still be queued for sending: the page-aligned
+ * socket send buffer (at least one page) minus what is already queued,
+ * never less than zero.
+ */
+static inline int aead_sndbuf(struct sock *sk)
+{
+	struct aead_ctx *ctx = alg_sk(sk)->private;
+	int limit = max_t(int, sk->sk_sndbuf & PAGE_MASK, PAGE_SIZE);
+
+	return max_t(int, limit - ctx->used, 0);
+}
+
+/* True while at least one more full page of input can be queued. */
+static inline bool aead_writable(struct sock *sk)
+{
+	return aead_sndbuf(sk) >= PAGE_SIZE;
+}
+
+/*
+ * Check that the queued input covers the associated data and, for
+ * decryption, additionally the authentication tag.
+ */
+static inline bool aead_sufficient_data(struct aead_ctx *ctx)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(&ctx->aead_req);
+	size_t required = ctx->aead_assoclen;
+
+	if (!ctx->enc)
+		required += crypto_aead_authsize(tfm);
+
+	return ctx->used >= required;
+}
+
+/*
+ * Drop the page references held by the transmit scatterlist and reset the
+ * input bookkeeping of the socket context.
+ */
+static void aead_put_sgl(struct sock *sk)
+{
+	struct aead_ctx *ctx = alg_sk(sk)->private;
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	unsigned int idx;
+
+	for (idx = 0; idx < sgl->cur; idx++) {
+		struct scatterlist *entry = &sgl->sg[idx];
+		struct page *pg = sg_page(entry);
+
+		/* entries without a page have nothing to release */
+		if (pg) {
+			put_page(pg);
+			sg_assign_page(entry, NULL);
+		}
+	}
+
+	sgl->cur = 0;
+	ctx->used = 0;
+	ctx->more = 0;
+	ctx->merge = 0;
+}
+
+/*
+ * Wake up tasks sleeping on the socket once at least one page of send
+ * buffer is available (see aead_writable()). Called at the end of
+ * aead_recvmsg().
+ *
+ * NOTE(review): the waiters are signalled with POLLIN-type events although
+ * the condition is about writable space; presumably inherited from
+ * algif_skcipher.c -- confirm this is intended.
+ */
+static void aead_wmem_wakeup(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	/* nothing to announce while the send buffer is still exhausted */
+	if (!aead_writable(sk))
+		return;
+
+	/* sk->sk_wq is dereferenced under the RCU read lock */
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
+							   POLLRDNORM |
+							   POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	rcu_read_unlock();
+}
+
+/*
+ * Sleep until the sender has completed the message, i.e. until ctx->more
+ * is cleared.
+ *
+ * Returns 0 once the message is complete, -EAGAIN if MSG_DONTWAIT is set in
+ * @flags, or -ERESTARTSYS if a signal is pending.
+ */
+static int aead_wait_for_data(struct sock *sk, unsigned flags)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	long timeout;
+	DEFINE_WAIT(wait);
+	int err = -ERESTARTSYS;
+
+	/* non-blocking callers never sleep */
+	if (flags & MSG_DONTWAIT)
+		return -EAGAIN;
+
+	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+
+	for (;;) {
+		/* err still holds -ERESTARTSYS when we bail out here */
+		if (signal_pending(current))
+			break;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		timeout = MAX_SCHEDULE_TIMEOUT;
+		/* presumably releases the socket lock while sleeping -- confirm */
+		if (sk_wait_event(sk, &timeout, !ctx->more)) {
+			err = 0;
+			break;
+		}
+	}
+	finish_wait(sk_sleep(sk), &wait);
+
+	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+
+	return err;
+}
+
+/*
+ * Wake up tasks sleeping on the socket once a complete message is queued,
+ * i.e. the sender did not set MSG_MORE and input data is present. Called at
+ * the end of aead_sendmsg() and aead_sendpage().
+ */
+static void aead_data_wakeup(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	struct socket_wq *wq;
+
+	/* the sender announced more data: the message is not complete yet */
+	if (ctx->more)
+		return;
+	/* nothing queued, nothing to process */
+	if (!ctx->used)
+		return;
+
+	/* sk->sk_wq is dereferenced under the RCU read lock */
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
+							   POLLRDNORM |
+							   POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+	rcu_read_unlock();
+}
+
+/*
+ * aead_sendmsg - queue input data for the next cipher operation
+ *
+ * An optional control message selects the operation (encrypt/decrypt), the
+ * IV and the associated data length. The payload is copied into pages that
+ * are tracked in ctx->tsgl. Without MSG_MORE the message is considered
+ * complete and must satisfy aead_sufficient_data().
+ *
+ * Returns the number of bytes copied, or a negative errno: -EINVAL (unknown
+ * operation, wrong IV length, or a completed message is still pending),
+ * -EMSGSIZE (send buffer exhausted or too little data), -E2BIG (more than
+ * ALG_MAX_PAGES pages needed), -ENOMEM, or whatever af_alg_cmsg_send() /
+ * memcpy_from_msg() report.
+ */
+static int aead_sendmsg(struct kiocb *unused, struct socket *sock,
+			struct msghdr *msg, size_t size)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	unsigned ivsize =
+		crypto_aead_ivsize(crypto_aead_reqtfm(&ctx->aead_req));
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	struct af_alg_control con = {};
+	long copied = 0;
+	bool enc = 0;
+	bool init = 0;
+	int err = -EINVAL;
+
+	/* parse and validate the control message before taking the lock */
+	if (msg->msg_controllen) {
+		err = af_alg_cmsg_send(msg, &con);
+		if (err)
+			return err;
+
+		init = 1;
+		switch (con.op) {
+		case ALG_OP_ENCRYPT:
+			enc = 1;
+			break;
+		case ALG_OP_DECRYPT:
+			enc = 0;
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		if (con.iv && con.iv->ivlen != ivsize)
+			return -EINVAL;
+	}
+
+	lock_sock(sk);
+	/* a completed message is queued and has not been processed yet */
+	if (!ctx->more && ctx->used)
+		goto unlock;
+
+	/* apply the settings of the control message to the context */
+	if (init) {
+		ctx->enc = enc;
+		if (con.iv)
+			memcpy(ctx->iv, con.iv->iv, ivsize);
+
+		ctx->aead_assoclen = con.aead_assoclen;
+	}
+
+	while (size) {
+		unsigned long len = size;
+		struct scatterlist *sg = NULL;
+
+		/* use the existing memory in an allocated page */
+		if (ctx->merge) {
+			sg = sgl->sg + sgl->cur - 1;
+			len = min_t(unsigned long, len,
+				    PAGE_SIZE - sg->offset - sg->length);
+			err = memcpy_from_msg(page_address(sg_page(sg)) +
+					      sg->offset + sg->length,
+					      msg, len);
+			if (err)
+				goto unlock;
+
+			sg->length += len;
+			/* non-zero while the page still has room left */
+			ctx->merge = (sg->offset + sg->length) &
+				     (PAGE_SIZE - 1);
+
+			ctx->used += len;
+			copied += len;
+			size -= len;
+			continue;
+		}
+
+		if (!aead_writable(sk)) {
+			/* user space sent too much data */
+			aead_put_sgl(sk);
+			err = -EMSGSIZE;
+			goto unlock;
+		}
+
+		/* allocate a new page */
+		len = min_t(unsigned long, size, aead_sndbuf(sk));
+		while (len) {
+			int plen = 0;
+
+			/* all scatterlist slots are in use: drop everything */
+			if (sgl->cur >= ALG_MAX_PAGES) {
+				aead_put_sgl(sk);
+				err = -E2BIG;
+				goto unlock;
+			}
+
+			sg = sgl->sg + sgl->cur;
+			plen = min_t(int, len, PAGE_SIZE);
+
+			sg_assign_page(sg, alloc_page(GFP_KERNEL));
+			err = -ENOMEM;
+			if (!sg_page(sg))
+				goto unlock;
+
+			err = memcpy_from_msg(page_address(sg_page(sg)),
+					      msg, plen);
+			if (err) {
+				/* the page was never accounted: release it here */
+				__free_page(sg_page(sg));
+				sg_assign_page(sg, NULL);
+				goto unlock;
+			}
+
+			sg->offset = 0;
+			sg->length = plen;
+			len -= plen;
+			ctx->used += plen;
+			copied += plen;
+			sgl->cur++;
+			size -= plen;
+			/* remember whether the page was only partially filled */
+			ctx->merge = plen & (PAGE_SIZE - 1);
+		}
+	}
+
+	err = 0;
+
+	/* a message without MSG_MORE is final and must be large enough */
+	ctx->more = msg->msg_flags & MSG_MORE;
+	if (!ctx->more && !aead_sufficient_data(ctx)) {
+		aead_put_sgl(sk);
+		err = -EMSGSIZE;
+	}
+
+unlock:
+	aead_data_wakeup(sk);
+	release_sock(sk);
+
+	/* an error takes precedence over a partial copy count */
+	return err ?: copied;
+}
+
+/*
+ * aead_sendpage - queue one caller-provided page as cipher input
+ *
+ * Takes a reference on @page and appends @size bytes starting at @offset to
+ * the transmit scatterlist without copying. MSG_SENDPAGE_NOTLAST is treated
+ * like MSG_MORE. Without MSG_MORE the message is considered complete and
+ * must satisfy aead_sufficient_data().
+ *
+ * Returns @size on success, or a negative errno: -E2BIG (all ALG_MAX_PAGES
+ * scatterlist slots are in use), -EINVAL (a completed message is still
+ * pending), -EMSGSIZE (send buffer exhausted or too little data).
+ */
+static ssize_t aead_sendpage(struct socket *sock, struct page *page,
+			     int offset, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	int err = -EINVAL;
+
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		flags |= MSG_MORE;
+
+	lock_sock(sk);
+
+	/*
+	 * sgl->cur is updated by senders while they hold the socket lock, so
+	 * the free-slot check must be made under the lock as well. Otherwise
+	 * two concurrent callers could both pass the check and the second
+	 * one would write behind the end of sgl->sg[].
+	 */
+	if (sgl->cur >= ALG_MAX_PAGES) {
+		err = -E2BIG;
+		goto unlock;
+	}
+
+	/* a completed message is queued and has not been processed yet */
+	if (!ctx->more && ctx->used)
+		goto unlock;
+
+	/* nothing to queue, but the MSG_MORE state must still be updated */
+	if (!size)
+		goto done;
+
+	if (!aead_writable(sk)) {
+		/* user space sent too much data */
+		aead_put_sgl(sk);
+		err = -EMSGSIZE;
+		goto unlock;
+	}
+
+	/* a foreign page must never be appended to by aead_sendmsg() */
+	ctx->merge = 0;
+
+	get_page(page);
+	sg_set_page(sgl->sg + sgl->cur, page, size, offset);
+	sgl->cur++;
+	ctx->used += size;
+
+	err = 0;
+
+done:
+	/* a message without MSG_MORE is final and must be large enough */
+	ctx->more = flags & MSG_MORE;
+	if (!ctx->more && !aead_sufficient_data(ctx)) {
+		aead_put_sgl(sk);
+		err = -EMSGSIZE;
+	}
+
+unlock:
+	aead_data_wakeup(sk);
+	release_sock(sk);
+
+	return err ?: size;
+}
+
+/*
+ * aead_recvmsg - run the AEAD operation on the queued input
+ *
+ * Waits until the sender has completed the message (ctx->more cleared),
+ * maps the caller's output iovecs into scatterlists, splits the queued
+ * input into associated data and plaintext / ciphertext and invokes the
+ * cipher. The queued input is released after a successful operation and
+ * after an authentication failure (-EBADMSG).
+ *
+ * Returns the size of the output area handed to the cipher on success, or a
+ * negative errno: -ENOMSG (too many output segments), -EINVAL (too little
+ * input or output space), the result of aead_wait_for_data(),
+ * af_alg_make_sg() or the cipher operation.
+ */
+static int aead_recvmsg(struct kiocb *unused, struct socket *sock,
+			struct msghdr *msg, size_t ignored, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	unsigned bs = crypto_aead_blocksize(crypto_aead_reqtfm(&ctx->aead_req));
+	unsigned as = crypto_aead_authsize(crypto_aead_reqtfm(&ctx->aead_req));
+	struct aead_sg_list *sgl = &ctx->tsgl;
+	struct scatterlist *sg = NULL;
+	struct scatterlist assoc[ALG_MAX_PAGES];
+	size_t assoclen = 0;
+	unsigned int i = 0;
+	int err = -EINVAL;
+	unsigned long used = 0;
+	size_t outlen = 0;
+	size_t usedpages = 0;
+	unsigned int cnt = 0;
+
+	/* Limit number of IOV blocks to be accessed below */
+	if (msg->msg_iter.nr_segs > RSGL_MAX_ENTRIES)
+		return -ENOMSG;
+
+	lock_sock(sk);
+
+	/*
+	 * AEAD memory structure: For encryption, the tag is appended to the
+	 * ciphertext which implies that the memory allocated for the ciphertext
+	 * must be increased by the tag length. For decryption, the tag
+	 * is expected to be concatenated to the ciphertext. The plaintext
+	 * therefore has a memory size of the ciphertext minus the tag length.
+	 *
+	 * The memory structure for cipher operation has the following
+	 * structure:
+	 *	AEAD encryption input:  assoc data || plaintext
+	 *	AEAD encryption output: ciphertext || auth tag
+	 *	AEAD decryption input:  assoc data || ciphertext || auth tag
+	 *	AEAD decryption output: plaintext
+	 */
+
+	if (ctx->more) {
+		err = aead_wait_for_data(sk, flags);
+		if (err)
+			goto unlock;
+	}
+
+	used = ctx->used;
+
+	/*
+	 * Make sure sufficient data is present -- note, the same check is
+	 * also present in sendmsg/sendpage. The checks in sendpage/sendmsg
+	 * tell the data sender that something is wrong, but they are
+	 * irrelevant for maintaining the kernel integrity.
+	 * We need this check here too in case user space decides to not honor
+	 * the error message in sendmsg/sendpage and still call recvmsg. This
+	 * check here protects the kernel integrity.
+	 */
+	if (!aead_sufficient_data(ctx))
+		goto unlock;
+
+	/*
+	 * The cipher operation input data is reduced by the associated data
+	 * length as this data is processed separately later on.
+	 */
+	used -= ctx->aead_assoclen;
+
+	if (ctx->enc) {
+		/* round up output buffer to multiple of block size */
+		outlen = ((used + bs - 1) / bs * bs);
+		/* add the size needed for the auth tag to be created */
+		outlen += as;
+	} else {
+		/* output data size is input without the authentication tag */
+		outlen = used - as;
+		/* round up output buffer to multiple of block size */
+		outlen = ((outlen + bs - 1) / bs * bs);
+	}
+
+	/* convert iovecs of output buffers into scatterlists */
+	while (iov_iter_count(&msg->msg_iter)) {
+		size_t seglen = min_t(size_t, iov_iter_count(&msg->msg_iter),
+				      (outlen - usedpages));
+
+		/*
+		 * The nr_segs check above only bounds the number of iovecs.
+		 * If af_alg_make_sg() maps less than a full iovec, the loop
+		 * runs more often than there are segments, so guard the
+		 * rsgl[] index explicitly.
+		 */
+		if (cnt >= RSGL_MAX_ENTRIES) {
+			err = -ENOMSG;
+			goto unlock;
+		}
+
+		/* make one iovec available as scatterlist */
+		err = af_alg_make_sg(&ctx->rsgl[cnt], &msg->msg_iter,
+				     seglen);
+		if (err < 0)
+			goto unlock;
+		usedpages += err;
+		/* chain the new scatterlist with initial list */
+		if (cnt)
+			scatterwalk_crypto_chain(ctx->rsgl[0].sg,
+					ctx->rsgl[cnt].sg, 1,
+					sg_nents(ctx->rsgl[cnt-1].sg));
+		/*
+		 * Account for the entry before a possible break: cnt is the
+		 * number of mapped entries that the cleanup below releases
+		 * via af_alg_free_sg(). Counting it only after the break
+		 * would leave the last (often the only) entry mapped.
+		 */
+		cnt++;
+		/* we do not need more iovecs as we have sufficient memory */
+		if (outlen <= usedpages)
+			break;
+		iov_iter_advance(&msg->msg_iter, err);
+	}
+
+	err = -EINVAL;
+	/* ensure output buffer is sufficiently large */
+	if (usedpages < outlen)
+		goto unlock;
+
+	sg_init_table(assoc, ALG_MAX_PAGES);
+	assoclen = ctx->aead_assoclen;
+	/*
+	 * Split scatterlist into two: first part becomes AD, second part
+	 * is plaintext / ciphertext. The first part is assigned to assoc
+	 * scatterlist. When this loop finishes, sg points to the start of the
+	 * plaintext / ciphertext.
+	 */
+	for (i = 0; i < ctx->tsgl.cur; i++) {
+		sg = sgl->sg + i;
+		if (sg->length <= assoclen) {
+			/* AD covers this page completely */
+			sg_set_page(assoc + i, sg_page(sg),
+				    sg->length, sg->offset);
+			assoclen -= sg->length;
+		} else if (!assoclen) {
+			/* current page is the start of plaintext / ciphertext */
+			if (i)
+				/* AD terminates at page boundary */
+				sg_mark_end(assoc + i - 1);
+			else
+				/* AD size is zero */
+				sg_mark_end(assoc);
+			break;
+		} else {
+			/* AD does not terminate at page boundary */
+			sg_set_page(assoc + i, sg_page(sg),
+				    assoclen, sg->offset);
+			sg_mark_end(assoc + i);
+			/* plaintext / ciphertext starts after AD */
+			sg->length -= assoclen;
+			sg->offset += assoclen;
+			break;
+		}
+	}
+
+	aead_request_set_assoc(&ctx->aead_req, assoc, ctx->aead_assoclen);
+	aead_request_set_crypt(&ctx->aead_req, sg, ctx->rsgl[0].sg, used,
+			       ctx->iv);
+
+	err = af_alg_wait_for_completion(ctx->enc ?
+					 crypto_aead_encrypt(&ctx->aead_req) :
+					 crypto_aead_decrypt(&ctx->aead_req),
+					 &ctx->completion);
+
+	if (err) {
+		/* EBADMSG implies a valid cipher operation took place */
+		if (err == -EBADMSG)
+			aead_put_sgl(sk);
+		goto unlock;
+	}
+
+	aead_put_sgl(sk);
+
+	err = 0;
+
+unlock:
+	/* release every output scatterlist that was mapped above */
+	for (i = 0; i < cnt; i++)
+		af_alg_free_sg(&ctx->rsgl[i]);
+
+	aead_wmem_wakeup(sk);
+	release_sock(sk);
+
+	return err ? err : outlen;
+}
+
+/*
+ * Poll handler: the socket is reported readable as soon as the sender has
+ * finished its message (no MSG_MORE pending) and writable while
+ * aead_writable() holds.
+ */
+static unsigned int aead_poll(struct file *file, struct socket *sock,
+			      poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct aead_ctx *ctx = alg_sk(sk)->private;
+	unsigned int events = 0;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
+	if (!ctx->more)
+		events |= POLLIN | POLLRDNORM;
+
+	if (aead_writable(sk))
+		events |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+	return events;
+}
+
+/*
+ * Socket operations of an "aead" operation socket. Only release, sendmsg,
+ * sendpage, recvmsg and poll are implemented; all other operations are
+ * routed to the generic sock_no_*() handlers.
+ */
+static struct proto_ops algif_aead_ops = {
+	.family		=	PF_ALG,
+
+	.connect	=	sock_no_connect,
+	.socketpair	=	sock_no_socketpair,
+	.getname	=	sock_no_getname,
+	.ioctl		=	sock_no_ioctl,
+	.listen		=	sock_no_listen,
+	.shutdown	=	sock_no_shutdown,
+	.getsockopt	=	sock_no_getsockopt,
+	.mmap		=	sock_no_mmap,
+	.bind		=	sock_no_bind,
+	.accept		=	sock_no_accept,
+	.setsockopt	=	sock_no_setsockopt,
+
+	.release	=	af_alg_release,
+	.sendmsg	=	aead_sendmsg,
+	.sendpage	=	aead_sendpage,
+	.recvmsg	=	aead_recvmsg,
+	.poll		=	aead_poll,
+};
+
+/* af_alg_type bind hook: allocate the AEAD transform named @name. */
+static void *aead_bind(const char *name, u32 type, u32 mask)
+{
+	struct crypto_aead *tfm = crypto_alloc_aead(name, type, mask);
+
+	return tfm;
+}
+
+/* af_alg_type release hook: free the AEAD transform passed as @private. */
+static void aead_release(void *private)
+{
+	struct crypto_aead *tfm = private;
+
+	crypto_free_aead(tfm);
+}
+
+/* af_alg_type setauthsize hook: set the tag size of the AEAD transform. */
+static int aead_setauthsize(void *private, unsigned int authsize)
+{
+	struct crypto_aead *tfm = private;
+
+	return crypto_aead_setauthsize(tfm, authsize);
+}
+
+/* af_alg_type setkey hook: install @key in the AEAD transform. */
+static int aead_setkey(void *private, const u8 *key, unsigned int keylen)
+{
+	struct crypto_aead *tfm = private;
+
+	return crypto_aead_setkey(tfm, key, keylen);
+}
+
+/*
+ * Socket destructor (installed as sk->sk_destruct by aead_accept_parent()):
+ * releases the queued input pages, the IV buffer and the context itself.
+ */
+static void aead_sock_destruct(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct aead_ctx *ctx = ask->private;
+	unsigned int ivlen = crypto_aead_ivsize(
+				crypto_aead_reqtfm(&ctx->aead_req));
+
+	aead_put_sgl(sk);
+	/* the IV must be freed before the context that points to it */
+	sock_kzfree_s(sk, ctx->iv, ivlen);
+	sock_kfree_s(sk, ctx, ctx->len);
+	af_alg_release_parent(sk);
+}
+
+/*
+ * af_alg_type accept hook: allocate and initialise the per-socket context
+ * for a new operation socket. @private is the AEAD transform the request
+ * is bound to.
+ *
+ * Returns 0 on success or -ENOMEM if the context or the IV buffer cannot
+ * be allocated.
+ */
+static int aead_accept_parent(void *private, struct sock *sk)
+{
+	struct aead_ctx *ctx;
+	struct alg_sock *ask = alg_sk(sk);
+	/* the request is followed by crypto_aead_reqsize() extra bytes */
+	unsigned int len = sizeof(*ctx) + crypto_aead_reqsize(private);
+	unsigned int ivlen = crypto_aead_ivsize(private);
+
+	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	memset(ctx, 0, len);
+
+	ctx->iv = sock_kmalloc(sk, ivlen, GFP_KERNEL);
+	if (!ctx->iv) {
+		sock_kfree_s(sk, ctx, len);
+		return -ENOMEM;
+	}
+	memset(ctx->iv, 0, ivlen);
+
+	/* remembered for sock_kfree_s() in aead_sock_destruct() */
+	ctx->len = len;
+	ctx->used = 0;
+	ctx->more = 0;
+	ctx->merge = 0;
+	ctx->enc = 0;
+	ctx->tsgl.cur = 0;
+	ctx->aead_assoclen = 0;
+	af_alg_init_completion(&ctx->completion);
+	sg_init_table(ctx->tsgl.sg, ALG_MAX_PAGES);
+
+	ask->private = ctx;
+
+	aead_request_set_tfm(&ctx->aead_req, private);
+	aead_request_set_callback(&ctx->aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				  af_alg_complete, &ctx->completion);
+
+	sk->sk_destruct = aead_sock_destruct;
+
+	return 0;
+}
+
+/*
+ * AF_ALG type descriptor: user space selects this interface with the
+ * salg_type name "aead".
+ */
+static const struct af_alg_type algif_type_aead = {
+	.bind		=	aead_bind,
+	.release	=	aead_release,
+	.setkey		=	aead_setkey,
+	.setauthsize	=	aead_setauthsize,
+	.accept		=	aead_accept_parent,
+	.ops		=	&algif_aead_ops,
+	.name		=	"aead",
+	.owner		=	THIS_MODULE
+};
+
+/* Module init: register the "aead" type with the AF_ALG core. */
+static int __init algif_aead_init(void)
+{
+	int ret;
+
+	ret = af_alg_register_type(&algif_type_aead);
+
+	return ret;
+}
+
+/*
+ * Module exit: unregister the "aead" type; a failing unregistration is
+ * treated as a fatal bug.
+ */
+static void __exit algif_aead_exit(void)
+{
+	int ret;
+
+	ret = af_alg_unregister_type(&algif_type_aead);
+	BUG_ON(ret);
+}
+
+/* module entry / exit points and metadata */
+module_init(algif_aead_init);
+module_exit(algif_aead_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>");
+MODULE_DESCRIPTION("AEAD kernel crypto API user space interface");
-- 
2.1.0

^ permalink raw reply related

* [PATCH v13 2/2] crypto: AF_ALG: enable AEAD interface compilation
From: Stephan Mueller @ 2015-02-28 19:50 UTC (permalink / raw)
  To: 'Herbert Xu
  Cc: 'Quentin Gouchet', Daniel Borkmann, linux-api,
	linux-crypto, linux-kernel
In-Reply-To: <2180298.gqj58NYuqx@tachyon.chronox.de>

Enable compilation of the AEAD AF_ALG support and provide a Kconfig
option to compile the AEAD AF_ALG support.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
---
 crypto/Kconfig  | 9 +++++++++
 crypto/Makefile | 1 +
 2 files changed, 10 insertions(+)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 2ca8d15..f3d1c24 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1532,6 +1532,15 @@ config CRYPTO_USER_API_RNG
 	  This option enables the user-spaces interface for random
 	  number generator algorithms.
 
+config CRYPTO_USER_API_AEAD
+	tristate "User-space interface for AEAD cipher algorithms"
+	depends on NET
+	select CRYPTO_AEAD
+	select CRYPTO_USER_API
+	help
+	  This option enables the user-space interface for AEAD
+	  cipher algorithms.
+
 config CRYPTO_HASH_INFO
 	bool
 
diff --git a/crypto/Makefile b/crypto/Makefile
index ba19465..97b7d3a 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o
 obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
 obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
+obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
 
 #
 # generic algorithms and the async_tx api
-- 
2.1.0

^ permalink raw reply related

* Re: [PATCH] capabilities: Ambient capability set V2
From: Serge E. Hallyn @ 2015-03-01  4:44 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Serge Hallyn, Andy Lutomirski, Jonathan Corbet, Aaron Jones,
	linux-security-module-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	akpm-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r, Andrew G. Morgan,
	Mimi Zohar, Austin S Hemmelgarn, Markku Savela, Jarkko Sakkinen,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Michael Kerrisk
In-Reply-To: <alpine.DEB.2.11.1502261612370.8994-gkYfJU5Cukgdnm+yROfE0A@public.gmane.org>

On Thu, Feb 26, 2015 at 04:14:33PM -0600, Christoph Lameter wrote:
> 
> V1->V2:
>  - Fix up the processing of the caps bits after discussions
>    with Any and Serge. Make patch less intrusive.
> 
> Ambient caps are something like restricted root privileges.
> A process has a set of additional capabilities and those
> are inherited without have to set capabilites in other
> binaries involved. This allow the partial use of root
> like features in a controlled way. It is often useful
> to do this for user space device drivers or software that
> needs increased priviledges for networking or to control
> its own scheduling. Ambient caps allow one to avoid
> having to run these with full root priviledges.
> 
> Control over this feature is avaialable via a new
> prctl option called PR_CAP_AMBIENT. The second argument to prctl
> is a the capability number and the third the desired state.
> 0 for off. Otherwise on.
> 
> Ambient bits are enabled regardless of the inheritance
> mask of the target binary. They are only restricted
> by the bounding set.
> 
> History:
> 
> Linux capabilities have suffered from the problem that they are not
> inheritable like unregular process characteristics under Unix. This is
> behavior that is counter intuitive to the expected behavior of processes
> in Unix.
> 
> In particular there has been recently software that controls NICs from user
> space and provides IP stack like behavior also in user space (DPDK and RDMA
> kernel API based implementations). Those typically need either capabilities
> to allow raw network access or have to be run setsuid. There is scripting and
> LD_PREFLOAD etc involved, arbitrary binaries may be run from those scripts
> including those setting additional capabilites or requiring root access.
> 
> That does not go well with having file capabilities set that would enable
> the capabilities. Maybe it would work if one would setup capabilities on
> all executables but that would also defeat a secure design since these
> binaries may only need those caps for certain situations. Ok setting the
> inheritable flags on everything may also get one there (if there would not
> be the issues with LD_PRELOAD, debugging etc etc).
> 
> The easy solution is to allow some capabilities be inherited like setsuid
> is. We really prefer to use capabilities instead of setsuid (we want to
> limit what damage someone can do after all!). Therefore we have been
> running a patch like this in production for the last 6 years. At some
> point it becomes tedious to run your own custom kernel so we would like
> to have this functionality upstream.
> 
> See some of the earlier related discussions on the problems with capability
> inheritance:
> 
> 0. Recent surprise:
>                 https://lkml.org/lkml/2014/1/21/175
> 
> 1. Attempt to revise caps
>                 http://www.madore.org/~david/linux/newcaps/
> 
> 2. Problems of passing caps through exec
>                 http://unix.stackexchange.com/questions/128394/passing-capabilities-through-exec
> 
> 3. Problems of binding to privileged ports
>                 http://stackoverflow.com/questions/413807/is-there-a-way-for-non-root-processes-to-bind-to-privileged-ports-1024-on-l
> 
> 4. Reviving capabilities
>                 http://lwn.net/Articles/199004/
> 
> There does not seem to be an alternative on the horizon. Some involved
> in security development under Linux have even stated that they want to
> rip out the whole thing and replace it. Its been a couple of years now
> and we are still suffering from the capabilities mess. Let us just
> fix it. Others have already done implementations like this like Nokia
> for the N900.
> 
> 
> This patch does not change the default behavior but it allows to set up
> a list of capabilities via prctl that will enable regular
> unix inheritance only for the selected group of capabilities.
> 
> With that it is then possible to do something trivial like setting
> CAP_NET_RAW on an executable that can then allow that capability to
> be inherited by others.
> 
> Lets have a look at a coding example of a wrapper that enables
> a couple of capabilities:
> 
> ------------------------------ ambient_test.c
> /*
>  * Test program for the ambient capabilities
>  *
>  *
>  * Compile using:
>  *	gcc -o ambient_test ambient_test.o
>  *
>  * This program must have the following capabilities to run properly:
>  * CAP_SETPCAP, CAP_NET_RAW, CAP_NET_ADMIN, CAP_SYS_NICE
>  *
>  * A command to equip this with the right caps is:
>  *
>  *	setcap cap_setpcap,cap_net_raw,cap_net_admin,cap_sys_nice+eip ambient_test
>  *
>  * To get a shell with additional caps that can be inherited do:
>  *
>  * ./ambient_test /bin/bash
>  *
>  */
> 
> #include <stdlib.h>
> #include <stdio.h>
> #include <errno.h>
> #include <sys/prctl.h>
> #include <linux/capability.h>
> 
> /* Defintion to be updated in the user space include files */
> #define PR_CAP_AMBIENT 45
> 
> int main(int argc, char **argv)
> {
> 	int rc;
> 
> 	if (prctl(PR_CAP_AMBIENT, CAP_NET_RAW))
> 		perror("Cannot set CAP_NET_RAW");
> 
> 	if (prctl(PR_CAP_AMBIENT, CAP_NET_ADMIN))
> 		perror("Cannot set CAP_NET_ADMIN");
> 
> 	if (prctl(PR_CAP_AMBIENT, CAP_SYS_NICE))
> 		perror("Cannot set CAP_SYS_NICE");
> 

Your example program is not filling in pI though?

Ah, I see why.  In get_file_caps() you are still assigning

	fP = pA

if the file has no file capabilities.  So then you are actually
doing

	 pP' = (X & (fP | pA)) | (pI & (fI | pA))
rather than
	 pP' = (X & fP) | (pI & (fI | pA))

Other than that, the patch is looking good to me.  We should
consider emitting an audit record when a task fills in its
pA, and I do still wonder whether we should be requiring
CAP_SETFCAP (unsure how best to think of it).  But assuming the
fP = pA was not intended, I think this largely does the right
thing.

> 	printf("Ambient_test forking shell\n");
> 	if (execv(argv[1], argv + 1))
> 		perror("Cannot exec");
> 
> 	return 0;
> }
> -------------------------------- ambient_test.c
> 
> Allows the inheritance of CAP_SYS_NICE, CAP_NET_RAW and CAP_NET_ADMIN.
> With that device raw access is possible and also real time priorities
> can be set from user space. This is a frequently needed set of
> priviledged operations in HPC and HFT applications. User space
> processes need to be able to directly access devices as well as
> have full control over scheduling.
> 
> Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>
> 
> Index: linux/security/commoncap.c
> ===================================================================
> --- linux.orig/security/commoncap.c	2015-02-25 13:43:06.929973954 -0600
> +++ linux/security/commoncap.c	2015-02-26 16:10:02.347913397 -0600
> @@ -347,15 +347,17 @@ static inline int bprm_caps_from_vfs_cap
>  		*has_cap = true;
> 
>  	CAP_FOR_EACH_U32(i) {
> +		__u32 ambient = current_cred()->cap_ambient.cap[i];
>  		__u32 permitted = caps->permitted.cap[i];
>  		__u32 inheritable = caps->inheritable.cap[i];
> 
>  		/*
> -		 * pP' = (X & fP) | (pI & fI)
> +		 * pP' = (X & fP) | (pI & (fI | pA))
>  		 */
>  		new->cap_permitted.cap[i] =
>  			(new->cap_bset.cap[i] & permitted) |
> -			(new->cap_inheritable.cap[i] & inheritable);
> +			(new->cap_inheritable.cap[i] &
> +					(inheritable | ambient));
> 
>  		if (permitted & ~new->cap_permitted.cap[i])
>  			/* insufficient to execute correctly */
> @@ -453,8 +455,18 @@ static int get_file_caps(struct linux_bi
>  		if (rc == -EINVAL)
>  			printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
>  				__func__, rc, bprm->filename);
> -		else if (rc == -ENODATA)
> +		else if (rc == -ENODATA) {
>  			rc = 0;
> +			if (!cap_isclear(current_cred()->cap_ambient)) {
> +				/*
> +				 * The ambient caps are permitted for
> +				 * files that have no caps
> +				 */
> +				bprm->cred->cap_permitted =
> +					current_cred()->cap_ambient;
> +				*effective = true;
> +			}
> +		}
>  		goto out;
>  	}
> 
> @@ -549,9 +561,20 @@ skip:
>  	new->sgid = new->fsgid = new->egid;
> 
>  	if (effective)
> +		/*
> +		 * pE' = pP' & (fE | pA)
> +		 *
> +		 * fE is implicity all set if effective == true.
> +		 * Therefore the above reduces to
> +		 *
> +		 * pE' = pP'
> +		 */
>  		new->cap_effective = new->cap_permitted;
>  	else
>  		cap_clear(new->cap_effective);
> +
> +	/* pA' = pA */
> +	new->cap_ambient = old->cap_ambient;
>  	bprm->cap_effective = effective;
> 
>  	/*
> @@ -566,7 +589,7 @@ skip:
>  	 * Number 1 above might fail if you don't have a full bset, but I think
>  	 * that is interesting information to audit.
>  	 */
> -	if (!cap_isclear(new->cap_effective)) {
> +	if (!cap_issubset(new->cap_effective, new->cap_ambient)) {
>  		if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
>  		    !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) ||
>  		    issecure(SECURE_NOROOT)) {
> @@ -598,7 +621,7 @@ int cap_bprm_secureexec(struct linux_bin
>  	if (!uid_eq(cred->uid, root_uid)) {
>  		if (bprm->cap_effective)
>  			return 1;
> -		if (!cap_isclear(cred->cap_permitted))
> +		if (!cap_issubset(cred->cap_permitted, cred->cap_ambient))
>  			return 1;
>  	}
> 
> @@ -933,6 +956,23 @@ int cap_task_prctl(int option, unsigned
>  			new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
>  		return commit_creds(new);
> 
> +	case PR_CAP_AMBIENT:
> +		if (!ns_capable(current_user_ns(), CAP_SETPCAP))
> +			return -EPERM;
> +
> +		if (!cap_valid(arg2))
> +			return -EINVAL;
> +
> +		if (!ns_capable(current_user_ns(), arg2))
> +			return -EPERM;
> +
> +		new = prepare_creds();
> +		if (arg3 == 0)
> +			cap_lower(new->cap_ambient, arg2);
> +		else
> +			cap_raise(new->cap_ambient, arg2);
> +		return commit_creds(new);
> +
>  	default:
>  		/* No functionality available - continue with default */
>  		return -ENOSYS;
> Index: linux/include/linux/cred.h
> ===================================================================
> --- linux.orig/include/linux/cred.h	2015-02-25 13:43:06.929973954 -0600
> +++ linux/include/linux/cred.h	2015-02-25 13:43:06.925972078 -0600
> @@ -122,6 +122,7 @@ struct cred {
>  	kernel_cap_t	cap_permitted;	/* caps we're permitted */
>  	kernel_cap_t	cap_effective;	/* caps we can actually use */
>  	kernel_cap_t	cap_bset;	/* capability bounding set */
> +	kernel_cap_t	cap_ambient;	/* Ambient capability set */
>  #ifdef CONFIG_KEYS
>  	unsigned char	jit_keyring;	/* default keyring to attach requested
>  					 * keys to */
> Index: linux/include/uapi/linux/prctl.h
> ===================================================================
> --- linux.orig/include/uapi/linux/prctl.h	2015-02-25 13:43:06.929973954 -0600
> +++ linux/include/uapi/linux/prctl.h	2015-02-25 13:43:06.925972078 -0600
> @@ -185,4 +185,7 @@ struct prctl_mm_map {
>  #define PR_MPX_ENABLE_MANAGEMENT  43
>  #define PR_MPX_DISABLE_MANAGEMENT 44
> 
> +/* Control the ambient capability set */
> +#define PR_CAP_AMBIENT 45
> +
>  #endif /* _LINUX_PRCTL_H */
> Index: linux/fs/proc/array.c
> ===================================================================
> --- linux.orig/fs/proc/array.c	2015-02-25 13:43:06.929973954 -0600
> +++ linux/fs/proc/array.c	2015-02-25 13:43:06.925972078 -0600
> @@ -302,7 +302,8 @@ static void render_cap_t(struct seq_file
>  static inline void task_cap(struct seq_file *m, struct task_struct *p)
>  {
>  	const struct cred *cred;
> -	kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
> +	kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
> +			cap_bset, cap_ambient;
> 
>  	rcu_read_lock();
>  	cred = __task_cred(p);
> @@ -310,12 +311,14 @@ static inline void task_cap(struct seq_f
>  	cap_permitted	= cred->cap_permitted;
>  	cap_effective	= cred->cap_effective;
>  	cap_bset	= cred->cap_bset;
> +	cap_ambient	= cred->cap_ambient;
>  	rcu_read_unlock();
> 
>  	render_cap_t(m, "CapInh:\t", &cap_inheritable);
>  	render_cap_t(m, "CapPrm:\t", &cap_permitted);
>  	render_cap_t(m, "CapEff:\t", &cap_effective);
>  	render_cap_t(m, "CapBnd:\t", &cap_bset);
> +	render_cap_t(m, "CapAmb:\t", &cap_ambient);
>  }
> 
>  static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* [PATCH v5 tip 0/7] tracing: attach eBPF programs to kprobes
From: Alexei Starovoitov @ 2015-03-01 23:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

Peter, Steven,
I think this set addresses everything we've discussed.
Please review/ack. Thanks!

V4->V5:
- switched to ktime_get_mono_fast_ns() as suggested by Peter
- in libbpf.c fixed zero init of 'union bpf_attr' padding
- fresh rebase on tip/master

Hi All,

This is targeting 'tip' tree, since most of the changes are perf_event related.
There will be a small conflict between net-next and tip, since they both
add new bpf_prog_type (BPF_PROG_TYPE_SCHED_CLS and BPF_PROG_TYPE_KPROBE).

V3 discussion:
https://lkml.org/lkml/2015/2/9/738

V3->V4:
- since the boundary of stable ABI in bpf+tracepoints is not clear yet,
  I've dropped them for now.
- bpf+syscalls are ok from stable ABI point of view, but bpf+seccomp
  would want to do very similar analysis of syscalls, so I've dropped
  them as well to take time and define common bpf+syscalls and bpf+seccomp
  infra in the future.
- so only bpf+kprobes left. kprobes by definition is not a stable ABI,
  so bpf+kprobe is not stable ABI either. To stress on that point added
  kernel version attribute that user space must pass along with the program
  and kernel will reject programs when version code doesn't match.
  So bpf+kprobe is very similar to kernel modules, but unlike modules
  version check is not used for safety, but for enforcing 'non-ABI-ness'.
  (version check doesn't apply to bpf+sockets which are stable)

Patch 1 is in net-next and needs to be in tip too, since patch 2 depends on it.

Patch 2 actually adds bpf+kprobe infra:
programs receive 'struct pt_regs' on input and can walk data structures
using bpf_probe_read() helper which is a wrapper of probe_kernel_read()

Programs are attached to kprobe events via API:

prog_fd = bpf_prog_load(...);
struct perf_event_attr attr = {
  .type = PERF_TYPE_TRACEPOINT,
  .config = event_id, /* ID of just created kprobe event */
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

Patch 3 adds bpf_ktime_get_ns() helper function, so that bpf programs can
measure time delta between events to compute disk io latency, etc.

Patch 4 adds bpf_trace_printk() helper that is used to debug programs.
When bpf verifier sees that program is calling bpf_trace_printk() it inits
trace_printk buffers which emits nasty 'this is debug only' banner.
That's exactly what we want. bpf_trace_printk() is for debugging only.

Patch 5 sample code that shows how to use bpf_probe_read/bpf_trace_printk

Patch 6 sample code - combination of kfree_skb and sys_write tracing.

Patch 7 sample code that computes disk io latency and prints it as 'heatmap'

Interesting bit is that patch 6 has log2() function implemented in C
and patch 7 has another log2() function using different algorithm in C.
In the future if 'log2' usage becomes common, we can add it as in-kernel
helper function, but for now bpf programs can implement them on bpf side.

Another interesting bit from patch 7 is that it does approximation of
floating point log10(X)*10 using integer arithmetic, which demonstrates
the power of C->BPF vs traditional tracing language alternatives,
where one would need to introduce new helper functions to add functionality,
whereas bpf can just implement such things in C as part of the program.

Next step is to prototype TCP stack instrumentation (like web10g) using
bpf+kprobe, but without adding any new code to the tcp stack.
Though kprobes are slow compared to tracepoints, they are good enough
for prototyping and trace_marker/debug_tracepoint ideas can accelerate
them in the future.

Alexei Starovoitov (6):
  tracing: attach BPF programs to kprobes
  tracing: allow BPF programs to call bpf_ktime_get_ns()
  tracing: allow BPF programs to call bpf_trace_printk()
  samples: bpf: simple non-portable kprobe filter example
  samples: bpf: counting example for kfree_skb and write syscall
  samples: bpf: IO latency analysis (iosnoop/heatmap)

Daniel Borkmann (1):
  bpf: make internal bpf API independent of CONFIG_BPF_SYSCALL ifdefs

 include/linux/bpf.h             |   20 ++++-
 include/linux/ftrace_event.h    |   14 +++
 include/uapi/linux/bpf.h        |    5 ++
 include/uapi/linux/perf_event.h |    1 +
 kernel/bpf/syscall.c            |    7 +-
 kernel/events/core.c            |   59 +++++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |  178 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c     |   10 ++-
 samples/bpf/Makefile            |   12 +++
 samples/bpf/bpf_helpers.h       |    6 ++
 samples/bpf/bpf_load.c          |  112 ++++++++++++++++++++++--
 samples/bpf/bpf_load.h          |    3 +
 samples/bpf/libbpf.c            |   14 ++-
 samples/bpf/libbpf.h            |    5 +-
 samples/bpf/sock_example.c      |    2 +-
 samples/bpf/test_verifier.c     |    2 +-
 samples/bpf/tracex1_kern.c      |   50 +++++++++++
 samples/bpf/tracex1_user.c      |   25 ++++++
 samples/bpf/tracex2_kern.c      |   86 +++++++++++++++++++
 samples/bpf/tracex2_user.c      |   95 +++++++++++++++++++++
 samples/bpf/tracex3_kern.c      |   89 ++++++++++++++++++++
 samples/bpf/tracex3_user.c      |  150 +++++++++++++++++++++++++++++++++
 23 files changed, 930 insertions(+), 16 deletions(-)
 create mode 100644 kernel/trace/bpf_trace.c
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c

-- 
1.7.9.5

^ permalink raw reply

* [PATCH v5 tip 1/7] bpf: make internal bpf API independent of CONFIG_BPF_SYSCALL ifdefs
From: Alexei Starovoitov @ 2015-03-01 23:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1425252465-27527-1-git-send-email-ast@plumgrid.com>

From: Daniel Borkmann <daniel@iogearbox.net>

Socket filter code and other subsystems with upcoming eBPF support should
not need to deal with the fact that we have CONFIG_BPF_SYSCALL defined or
not.

Having the bpf syscall as a config option is a nice thing and I'd expect
it to stay that way for expert users (I presume one day the default setting
of it might change, though), but code making use of it should not care if
it's actually enabled or not.

Instead, hide this via header files and let the rest deal with it.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/linux/bpf.h |   20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbfceb756452..c2e21113ecc0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -113,8 +113,6 @@ struct bpf_prog_type_list {
 	enum bpf_prog_type type;
 };
 
-void bpf_register_prog_type(struct bpf_prog_type_list *tl);
-
 struct bpf_prog;
 
 struct bpf_prog_aux {
@@ -129,11 +127,25 @@ struct bpf_prog_aux {
 };
 
 #ifdef CONFIG_BPF_SYSCALL
+void bpf_register_prog_type(struct bpf_prog_type_list *tl);
+
 void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
 #else
-static inline void bpf_prog_put(struct bpf_prog *prog) {}
+static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+}
+
+static inline struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_prog_put(struct bpf_prog *prog)
+{
+}
 #endif
-struct bpf_prog *bpf_prog_get(u32 ufd);
+
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v5 tip 2/7] tracing: attach BPF programs to kprobes
From: Alexei Starovoitov @ 2015-03-01 23:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1425252465-27527-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

User interface:
struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT, .config = event_id, ...};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

prog_fd is a file descriptor associated with BPF program previously loaded.
event_id is an ID of created kprobe

close(event_fd) - automatically detaches BPF program from it

BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wrapper of probe_kernel_read() used to access any kernel
  data structures

BPF programs receive 'struct pt_regs *' as an input
('struct pt_regs' is architecture dependent)

Note, kprobes are _not_ a stable kernel ABI, so bpf programs attached to
kprobes must be recompiled for every kernel version and user must supply correct
LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call.

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 include/linux/ftrace_event.h    |   14 ++++++
 include/uapi/linux/bpf.h        |    3 ++
 include/uapi/linux/perf_event.h |    1 +
 kernel/bpf/syscall.c            |    7 ++-
 kernel/events/core.c            |   59 +++++++++++++++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |   99 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c     |   10 +++-
 8 files changed, 192 insertions(+), 2 deletions(-)
 create mode 100644 kernel/trace/bpf_trace.c

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c674ee8f7fca..0aa535bc9f05 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
 struct trace_buffer;
 struct tracer;
 struct dentry;
+struct bpf_prog;
 
 struct trace_print_flags {
 	unsigned long		mask;
@@ -252,6 +253,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_KPROBE_BIT,
 };
 
 /*
@@ -265,6 +267,7 @@ enum {
  *                     it is best to clear the buffers that used it).
  *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
+ *  KPROBE        - Event is a kprobe
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -274,6 +277,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 };
 
 struct ftrace_event_call {
@@ -303,6 +307,7 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
+	struct bpf_prog			*prog;
 
 	int	(*perf_perm)(struct ftrace_event_call *,
 			     struct perf_event *);
@@ -548,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	return 1;
+}
+#endif
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..4486d36d2e9e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_KPROBE,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -151,6 +152,7 @@ union bpf_attr {
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
+		__u32		kern_version;	/* checked when type=kprobe */
 	};
 } __attribute__((aligned(8)));
 
@@ -162,6 +164,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3c8b45de57ec..ad4dade2a502 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -382,6 +382,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..504c10b990ef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/license.h>
 #include <linux/filter.h>
+#include <linux/version.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD log_buf
+#define	BPF_PROG_LOAD_LAST_FIELD kern_version
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (attr->insn_cnt >= BPF_MAXINSNS)
 		return -EINVAL;
 
+	if (type == BPF_PROG_TYPE_KPROBE &&
+	    attr->kern_version != LINUX_VERSION_CODE)
+		return -EINVAL;
+
 	/* plain bpf_prog allocation */
 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 	if (!prog)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 89f0f16d55f9..9cf449bae28a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,8 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -3402,6 +3404,7 @@ errout:
 }
 
 static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -3411,6 +3414,7 @@ static void free_event_rcu(struct rcu_head *head)
 	if (event->ns)
 		put_pid_ns(event->ns);
 	perf_event_free_filter(event);
+	perf_event_free_bpf_prog(event);
 	kfree(event);
 }
 
@@ -3923,6 +3927,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 
 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 {
@@ -3976,6 +3981,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_SET_BPF:
+		return perf_event_set_bpf_prog(event, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -6436,6 +6444,49 @@ static void perf_event_free_filter(struct perf_event *event)
 	ftrace_profile_free_filter(event);
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	struct bpf_prog *prog;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	if (event->tp_event->prog)
+		return -EEXIST;
+
+	if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+		/* bpf programs can only be attached to kprobes */
+		return -EINVAL;
+
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) {
+		/* valid fd, but invalid bpf program type */
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	event->tp_event->prog = prog;
+
+	return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog *prog;
+
+	if (!event->tp_event)
+		return;
+
+	prog = event->tp_event->prog;
+	if (prog) {
+		event->tp_event->prog = NULL;
+		bpf_prog_put(prog);
+	}
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -6451,6 +6502,14 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 98f26588255e..c575a300103b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..bcce9b238dad
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,99 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include "trace.h"
+
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	unsigned int ret;
+
+	if (in_nmi()) /* not supported yet */
+		return 1;
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, ctx);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *dst = (void *) (long) r1;
+	int size = (int) r2;
+	void *unsafe_ptr = (void *) (long) r3;
+
+	return probe_kernel_read(dst, unsafe_ptr, size);
+}
+
+static struct bpf_func_proto kprobe_prog_funcs[] = {
+	[BPF_FUNC_probe_read] = {
+		.func = bpf_probe_read,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_PTR_TO_STACK,
+		.arg2_type = ARG_CONST_STACK_SIZE,
+		.arg3_type = ARG_ANYTHING,
+	},
+};
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
+		if (func_id < 0 || func_id >= ARRAY_SIZE(kprobe_prog_funcs))
+			return NULL;
+		return &kprobe_prog_funcs[func_id];
+	}
+}
+
+/* bpf+kprobe programs can access fields of 'struct pt_regs' */
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	/* check bounds */
+	if (off < 0 || off >= sizeof(struct pt_regs))
+		return false;
+
+	/* only read is allowed */
+	if (type != BPF_READ)
+		return false;
+
+	/* disallow misaligned access */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+static struct bpf_verifier_ops kprobe_prog_ops = {
+	.get_func_proto = kprobe_prog_func_proto,
+	.is_valid_access = kprobe_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+	.ops = &kprobe_prog_ops,
+	.type = BPF_PROG_TYPE_KPROBE,
+};
+
+static int __init register_kprobe_prog_ops(void)
+{
+	bpf_register_prog_type(&kprobe_tl);
+	return 0;
+}
+late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d73f565b4e06..dc3462507d7c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1134,11 +1134,15 @@ static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1286,7 +1294,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		kfree(call->print_fmt);
 		return -ENODEV;
 	}
-	call->flags = 0;
+	call->flags = TRACE_EVENT_FL_KPROBE;
 	call->class->reg = kprobe_register;
 	call->data = tk;
 	ret = trace_add_event_call(call);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v5 tip 3/7] tracing: allow BPF programs to call bpf_ktime_get_ns()
From: Alexei Starovoitov @ 2015-03-01 23:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1425252465-27527-1-git-send-email-ast@plumgrid.com>

bpf_ktime_get_ns() is used by programs to compute time delta between events
or as a timestamp

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4486d36d2e9e..101e509d1001 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -165,6 +165,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
+	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index bcce9b238dad..aa48b85cadb6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -36,6 +36,12 @@ static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return probe_kernel_read(dst, unsafe_ptr, size);
 }
 
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* NMI safe access to clock monotonic */
+	return ktime_get_mono_fast_ns();
+}
+
 static struct bpf_func_proto kprobe_prog_funcs[] = {
 	[BPF_FUNC_probe_read] = {
 		.func = bpf_probe_read,
@@ -45,6 +51,11 @@ static struct bpf_func_proto kprobe_prog_funcs[] = {
 		.arg2_type = ARG_CONST_STACK_SIZE,
 		.arg3_type = ARG_ANYTHING,
 	},
+	[BPF_FUNC_ktime_get_ns] = {
+		.func = bpf_ktime_get_ns,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+	},
 };
 
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v5 tip 4/7] tracing: allow BPF programs to call bpf_trace_printk()
From: Alexei Starovoitov @ 2015-03-01 23:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1425252465-27527-1-git-send-email-ast@plumgrid.com>

Debugging of BPF programs needs some form of printk from the program,
so let programs call limited trace_printk() with %d %u %x %p conversion
specifiers only (d/u/x may carry an 'l' or 'll' length modifier).

Similar to kernel modules, during program load verifier checks whether program
is calling bpf_trace_printk() and if so, kernel allocates trace_printk buffers
and emits big 'this is debug only' banner.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   68 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 101e509d1001..4095f3d9a716 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -166,6 +166,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
 	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
+	BPF_FUNC_trace_printk,    /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index aa48b85cadb6..62460c1dd652 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -42,6 +42,60 @@ static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return ktime_get_mono_fast_ns();
 }
 
+/* limited trace_printk()
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ */
+static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+{
+	char *fmt = (char *) (long) r1;
+	int fmt_cnt = 0;
+	bool mod_l[3] = {};
+	int i;
+
+	/* bpf_check() guarantees that fmt points to bpf program stack and
+	 * fmt_size bytes of it were initialized by bpf program
+	 */
+	if (fmt[fmt_size - 1] != 0)
+		return -EINVAL;
+
+	/* check format string for allowed specifiers */
+	for (i = 0; i < fmt_size; i++)
+		if (fmt[i] == '%') {
+			if (fmt_cnt >= 3)
+				return -EINVAL;
+			i++;
+			if (i >= fmt_size)
+				return -EINVAL;
+
+			if (fmt[i] == 'l') {
+				mod_l[fmt_cnt] = true;
+				i++;
+				if (i >= fmt_size)
+					return -EINVAL;
+			} else if (fmt[i] == 'p') {
+				mod_l[fmt_cnt] = true;
+				fmt_cnt++;
+				continue;
+			}
+
+			if (fmt[i] == 'l') {
+				mod_l[fmt_cnt] = true;
+				i++;
+				if (i >= fmt_size)
+					return -EINVAL;
+			}
+
+			if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+				return -EINVAL;
+			fmt_cnt++;
+		}
+
+	return __trace_printk(1/* fake ip will not be printed */, fmt,
+			      mod_l[0] ? r3 : (u32) r3,
+			      mod_l[1] ? r4 : (u32) r4,
+			      mod_l[2] ? r5 : (u32) r5);
+}
+
 static struct bpf_func_proto kprobe_prog_funcs[] = {
 	[BPF_FUNC_probe_read] = {
 		.func = bpf_probe_read,
@@ -56,6 +110,13 @@ static struct bpf_func_proto kprobe_prog_funcs[] = {
 		.gpl_only = true,
 		.ret_type = RET_INTEGER,
 	},
+	[BPF_FUNC_trace_printk] = {
+		.func = bpf_trace_printk,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_PTR_TO_STACK,
+		.arg2_type = ARG_CONST_STACK_SIZE,
+	},
 };
 
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
@@ -70,6 +131,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 	default:
 		if (func_id < 0 || func_id >= ARRAY_SIZE(kprobe_prog_funcs))
 			return NULL;
+
+		if (func_id == BPF_FUNC_trace_printk)
+			/* this program might be calling bpf_trace_printk,
+			 * so allocate per-cpu printk buffers
+			 */
+			trace_printk_init_buffers();
+
 		return &kprobe_prog_funcs[func_id];
 	}
 }
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v5 tip 5/7] samples: bpf: simple non-portable kprobe filter example
From: Alexei Starovoitov @ 2015-03-01 23:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1425252465-27527-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

tracex1_kern.c - C program compiled into BPF.
It attaches to kprobe:netif_receive_skb
When skb->dev->name == "lo", it prints sample debug message into trace_pipe
via bpf_trace_printk() helper function.

tracex1_user.c - corresponding user space component that:
- loads bpf program via bpf() syscall
- opens kprobes:netif_receive_skb event via perf_event_open() syscall
- attaches the program to event via ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
- prints from trace_pipe

Note, this bpf program is completely non-portable. It must be recompiled
with current kernel headers. kprobe is not a stable ABI and bpf+kprobe scripts
may stop working any time.

bpf verifier will detect that it's using bpf_trace_printk() and kernel will
print warning banner:
** trace_printk() being used. Allocating extra memory.  **
**                                                      **
** This means that this is a DEBUG kernel and it is     **
** unsafe for production use.                           **

bpf_trace_printk() should be used for debugging of bpf program only.

Usage:
$ sudo tracex1
            ping-19826 [000] d.s2 63103.382648: : skb ffff880466b1ca00 len 84
            ping-19826 [000] d.s2 63103.382684: : skb ffff880466b1d300 len 84

            ping-19826 [000] d.s2 63104.382533: : skb ffff880466b1ca00 len 84
            ping-19826 [000] d.s2 63104.382594: : skb ffff880466b1d300 len 84

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 samples/bpf/Makefile        |    4 ++
 samples/bpf/bpf_helpers.h   |    6 +++
 samples/bpf/bpf_load.c      |  112 ++++++++++++++++++++++++++++++++++++++++---
 samples/bpf/bpf_load.h      |    3 ++
 samples/bpf/libbpf.c        |   14 +++++-
 samples/bpf/libbpf.h        |    5 +-
 samples/bpf/sock_example.c  |    2 +-
 samples/bpf/test_verifier.c |    2 +-
 samples/bpf/tracex1_kern.c  |   50 +++++++++++++++++++
 samples/bpf/tracex1_user.c  |   25 ++++++++++
 10 files changed, 213 insertions(+), 10 deletions(-)
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..51f6f01e5a3a 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,23 +6,27 @@ hostprogs-y := test_verifier test_maps
 hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
+hostprogs-y += tracex1
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
+always += tracex1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..1c872bcf5a80 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,12 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
 	(void *) BPF_FUNC_map_update_elem;
 static int (*bpf_map_delete_elem)(void *map, void *key) =
 	(void *) BPF_FUNC_map_delete_elem;
+static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) =
+	(void *) BPF_FUNC_probe_read;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+	(void *) BPF_FUNC_ktime_get_ns;
+static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
+	(void *) BPF_FUNC_trace_printk;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..8b42adb0b099 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -8,29 +8,64 @@
 #include <unistd.h>
 #include <string.h>
 #include <stdbool.h>
+#include <stdlib.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
 #include "libbpf.h"
 #include "bpf_helpers.h"
 #include "bpf_load.h"
 
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
 static char license[128];
+static int kern_version;
 static bool processed_sec[128];
 int map_fd[MAX_MAPS];
 int prog_fd[MAX_PROGS];
+int event_fd[MAX_PROGS];
 int prog_cnt;
 
 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 {
-	int fd;
 	bool is_socket = strncmp(event, "socket", 6) == 0;
-
-	if (!is_socket)
-		/* tracing events tbd */
+	bool is_kprobe = strncmp(event, "events/kprobes/", 15) == 0;
+	enum bpf_prog_type prog_type;
+	char buf[256];
+	int fd, efd, err, id;
+	struct perf_event_attr attr = {};
+
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	if (is_socket) {
+		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	} else if (is_kprobe) {
+		prog_type = BPF_PROG_TYPE_KPROBE;
+	} else {
+		printf("Unknown event '%s'\n", event);
 		return -1;
+	}
+
+	if (is_kprobe) {
+		snprintf(buf, sizeof(buf),
+			 "echo 'p:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
+			 event + 15, event + 15);
+		err = system(buf);
+		if (err < 0) {
+			printf("failed to create kprobe '%s' error '%s'\n",
+			       event + 15, strerror(errno));
+			return -1;
+		}
+	}
 
-	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
-			   prog, size, license);
+	fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
 
 	if (fd < 0) {
 		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +74,40 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
+	if (is_socket)
+		return 0;
+
+	strcpy(buf, DEBUGFS);
+	strcat(buf, event);
+	strcat(buf, "/id");
+
+	efd = open(buf, O_RDONLY, 0);
+	if (efd < 0) {
+		printf("failed to open event %s\n", event);
+		return -1;
+	}
+
+	err = read(efd, buf, sizeof(buf));
+	if (err < 0 || err >= sizeof(buf)) {
+		printf("read from '%s' failed '%s'\n", event, strerror(errno));
+		return -1;
+	}
+
+	close(efd);
+
+	buf[err] = 0;
+	id = atoi(buf);
+	attr.config = id;
+
+	efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+	if (efd < 0) {
+		printf("event %d fd %d err %s\n", id, efd, strerror(errno));
+		return -1;
+	}
+	event_fd[prog_cnt - 1] = efd;
+	ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+	ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+
 	return 0;
 }
 
@@ -135,6 +204,9 @@ int load_bpf_file(char *path)
 	if (gelf_getehdr(elf, &ehdr) != &ehdr)
 		return 1;
 
+	/* clear all kprobes */
+	i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
+
 	/* scan over all elf sections to get license and map info */
 	for (i = 1; i < ehdr.e_shnum; i++) {
 
@@ -149,6 +221,14 @@ int load_bpf_file(char *path)
 		if (strcmp(shname, "license") == 0) {
 			processed_sec[i] = true;
 			memcpy(license, data->d_buf, data->d_size);
+		} else if (strcmp(shname, "version") == 0) {
+			processed_sec[i] = true;
+			if (data->d_size != sizeof(int)) {
+				printf("invalid size of version section %zd\n",
+				       data->d_size);
+				return 1;
+			}
+			memcpy(&kern_version, data->d_buf, sizeof(int));
 		} else if (strcmp(shname, "maps") == 0) {
 			processed_sec[i] = true;
 			if (load_maps(data->d_buf, data->d_size))
@@ -201,3 +281,23 @@ int load_bpf_file(char *path)
 	close(fd);
 	return 0;
 }
+
+/* Echo the ftrace trace_pipe to stdout forever; returns only if open fails. */
+void read_trace_pipe(void)
+{
+	int trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+
+	if (trace_fd < 0)
+		return;
+
+	while (1) {
+		static char buf[4096];
+		ssize_t sz;
+
+		sz = read(trace_fd, buf, sizeof(buf) - 1); /* room for NUL */
+		if (sz > 0) { /* 0 is EOF, -1 an error: nothing to print */
+			buf[sz] = 0;
+			puts(buf);
+		}
+	}
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..cbd7c2b532b9 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -6,6 +6,7 @@
 
 extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
+extern int event_fd[MAX_PROGS];
 
 /* parses elf file compiled by llvm .c->.o
  * . parses 'maps' section and creates maps via BPF syscall
@@ -21,4 +22,6 @@ extern int prog_fd[MAX_PROGS];
  */
 int load_bpf_file(char *path);
 
+void read_trace_pipe(void);
+
 #endif
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 46d50b7ddf79..7e1efa7e2ed7 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -81,7 +81,7 @@ char bpf_log_buf[LOG_BUF_SIZE];
 
 int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int prog_len,
-		  const char *license)
+		  const char *license, int kern_version)
 {
 	union bpf_attr attr = {
 		.prog_type = prog_type,
@@ -93,6 +93,11 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 		.log_level = 1,
 	};
 
+	/* assign one field outside of struct init to make sure any
+	 * padding is zero initialized
+	 */
+	attr.kern_version = kern_version;
+
 	bpf_log_buf[0] = 0;
 
 	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
@@ -121,3 +126,10 @@ int open_raw_sock(const char *name)
 
 	return sock;
 }
+
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags)
+{
+	return syscall(__NR_perf_event_open, attr, pid, cpu,
+		       group_fd, flags);
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 58c5fe1bdba1..ac7b09672b46 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -13,7 +13,7 @@ int bpf_get_next_key(int fd, void *key, void *next_key);
 
 int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int insn_len,
-		  const char *license);
+		  const char *license, int kern_version);
 
 #define LOG_BUF_SIZE 65536
 extern char bpf_log_buf[LOG_BUF_SIZE];
@@ -182,4 +182,7 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
 /* create RAW socket and bind to interface 'name' */
 int open_raw_sock(const char *name);
 
+struct perf_event_attr;
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags);
 #endif
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index c8ad0404416f..a0ce251c5390 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -56,7 +56,7 @@ static int test_sock(void)
 	};
 
 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog),
-				"GPL");
+				"GPL", 0);
 	if (prog_fd < 0) {
 		printf("failed to load prog '%s'\n", strerror(errno));
 		goto cleanup;
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index b96175e90363..740ce97cda5e 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -689,7 +689,7 @@ static int test(void)
 
 		prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog,
 					prog_len * sizeof(struct bpf_insn),
-					"GPL");
+					"GPL", 0);
 
 		if (tests[i].result == ACCEPT) {
 			if (prog_fd < 0) {
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..69ab7ed15510
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,50 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of arguments and their positions can change, etc.
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("events/kprobes/__netif_receive_skb_core")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	/* attaches to kprobe __netif_receive_skb_core,
+	 * looks for packets on loopback device and prints them
+	 */
+	char devname[IFNAMSIZ] = {};
+	struct net_device *dev;
+	struct sk_buff *skb;
+	int len;
+
+	/* non-portable! works for the given kernel only */
+	skb = (struct sk_buff *) ctx->di;
+
+	dev = _(skb->dev);
+
+	len = _(skb->len);
+
+	bpf_probe_read(devname, sizeof(devname), dev->name);
+
+	if (devname[0] == 'l' && devname[1] == 'o') {
+		char fmt[] = "skb %p len %d\n";
+		/* using bpf_trace_printk() for DEBUG ONLY */
+		bpf_trace_printk(fmt, sizeof(fmt), skb, len);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..31a48183beea
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+	FILE *f;
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	f = popen("taskset 1 ping -c5 localhost", "r");
+	(void) f;
+
+	read_trace_pipe();
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v5 tip 6/7] samples: bpf: counting example for kfree_skb and write syscall
From: Alexei Starovoitov @ 2015-03-01 23:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1425252465-27527-1-git-send-email-ast@plumgrid.com>

this example has two probes in one C file that attach to different kprobe events
and use two different maps.

1st probe is x64 specific equivalent of dropmon. It attaches to kfree_skb,
retrieves 'ip' address of kfree_skb() caller and counts number of packet drops
at that 'ip' address. User space prints 'location - count' map every second.

2nd probe attaches to kprobe:sys_write and computes a histogram of different
write sizes

Usage:
$ sudo tracex2
location 0xffffffff81695995 count 1
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 2
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 3
location 0xffffffff816d0da9 count 2

557145+0 records in
557145+0 records out
285258240 bytes (285 MB) copied, 1.02379 s, 279 MB/s
           syscall write() stats
     byte_size       : count     distribution
       1 -> 1        : 3        |                                      |
       2 -> 3        : 0        |                                      |
       4 -> 7        : 0        |                                      |
       8 -> 15       : 0        |                                      |
      16 -> 31       : 2        |                                      |
      32 -> 63       : 3        |                                      |
      64 -> 127      : 1        |                                      |
     128 -> 255      : 1        |                                      |
     256 -> 511      : 0        |                                      |
     512 -> 1023     : 1118968  |************************************* |

Ctrl-C at any time. Kernel will auto cleanup maps and programs

$ addr2line -ape ./bld_x64/vmlinux 0xffffffff81695995 0xffffffff816d0da9
0xffffffff81695995: ./bld_x64/../net/ipv4/icmp.c:1038
0xffffffff816d0da9: ./bld_x64/../net/unix/af_unix.c:1231

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/tracex2_kern.c |   86 +++++++++++++++++++++++++++++++++++++++
 samples/bpf/tracex2_user.c |   95 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 185 insertions(+)
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 51f6f01e5a3a..6dd272143733 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -7,6 +7,7 @@ hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
 hostprogs-y += tracex1
+hostprogs-y += tracex2
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -14,12 +15,14 @@ sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
+tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
 always += tracex1_kern.o
+always += tracex2_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -27,6 +30,7 @@ HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
+HOSTLOADLIBES_tracex2 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000000..a1ac1f99b665
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,86 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(long),
+	.max_entries = 1024,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("events/kprobes/kfree_skb")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long loc = 0;
+	long init_val = 1;
+	long *value;
+
+	/* x64 specific: read ip of kfree_skb caller.
+	 * non-portable version of __builtin_return_address(0)
+	 */
+	bpf_probe_read(&loc, sizeof(loc), (void *)ctx->sp);
+
+	value = bpf_map_lookup_elem(&my_map, &loc);
+	if (value)
+		*value += 1;
+	else
+		bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 64,
+};
+
+SEC("events/kprobes/sys_write")
+int bpf_prog3(struct pt_regs *ctx)
+{
+	long write_size = ctx->dx; /* arg3 */
+	long init_val = 1;
+	long *value;
+	u32 index = log2l(write_size);
+
+	value = bpf_map_lookup_elem(&my_hist_map, &index);
+	if (value)
+		__sync_fetch_and_add(value, 1);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000000..91b8d0896fbb
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX	64
+#define MAX_STARS	38
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+/* Print a log2 histogram of the counters stored in array map 'fd'. */
+static void print_hist(int fd)
+{
+	int key, i, max_ind = -1;
+	long value;
+	long data[MAX_INDEX] = {};
+	char starstr[MAX_STARS];
+	long max_value = 0;
+
+	for (key = 0; key < MAX_INDEX; key++) {
+		value = 0; /* a failed lookup must not leave garbage here */
+		bpf_lookup_elem(fd, &key, &value);
+		data[key] = value;
+		if (value && key > max_ind)
+			max_ind = key;
+		if (value > max_value)
+			max_value = value;
+	}
+
+	printf("           syscall write() stats\n");
+	printf("     byte_size       : count     distribution\n");
+	for (i = 1; i <= max_ind + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+}
+static void int_exit(int sig)
+{
+	print_hist(map_fd[1]);
+	exit(0);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	long key, next_key, value;
+	FILE *f;
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	signal(SIGINT, int_exit);
+
+	/* start 'ping' in the background to have some kfree_skb events */
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	/* start 'dd' in the background to have plenty of 'write' syscalls */
+	f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+	(void) f;
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; i < 5; i++) {
+		key = 0;
+		while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd[0], &next_key, &value);
+			printf("location 0x%lx count %ld\n", next_key, value);
+			key = next_key;
+		}
+		if (key)
+			printf("\n");
+		sleep(1);
+	}
+	print_hist(map_fd[1]);
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v5 tip 7/7] samples: bpf: IO latency analysis (iosnoop/heatmap)
From: Alexei Starovoitov @ 2015-03-01 23:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1425252465-27527-1-git-send-email-ast@plumgrid.com>

BPF C program attaches to blk_mq_start_request/blk_update_request kprobe events
to calculate IO latency.
For every completed block IO event it computes the time delta in nsec
and records in a histogram map: map[log10(delta)*10]++
User space reads this histogram map every 2 seconds and prints it as a 'heatmap'
using gray shades of text terminal. Black spaces have many events and white
spaces have very few events. Left most space is the smallest latency, right most
space is the largest latency in the range.

Usage:
$ sudo ./tracex3
and do 'sudo dd if=/dev/sda of=/dev/null' in other terminal.
Observe IO latencies and how different activity (like 'make kernel') affects it.

Similar experiments can be done for network transmit latencies, syscalls, etc

'-t' flag prints the heatmap using normal ascii characters:

$ sudo ./tracex3 -t
  heatmap of IO latency
  # - many events with this latency
    - few events
|1us      |10us     |100us    |1ms      |10ms     |100ms    |1s       |10s
                         *ooo. *O.#.                                    # 221
                      .  *#     .                                       # 125
                         ..   .o#*..                                    # 55
                    .  . .  .  .#O                                      # 37
                         .#                                             # 175
                               .#*.                                     # 37
                          #                                             # 199
              .              . *#*.                                     # 55
                               *#..*                                    # 42
                          #                                             # 266
                      ...***Oo#*OO**o#* .                               # 629
                          #                                             # 271
                              . .#o* o.*o*                              # 221
                        . . o* *#O..                                    # 50

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/tracex3_kern.c |   89 ++++++++++++++++++++++++++
 samples/bpf/tracex3_user.c |  150 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 243 insertions(+)
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 6dd272143733..dcd850546d52 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -8,6 +8,7 @@ hostprogs-y += sockex1
 hostprogs-y += sockex2
 hostprogs-y += tracex1
 hostprogs-y += tracex2
+hostprogs-y += tracex3
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -16,6 +17,7 @@ sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -23,6 +25,7 @@ always += sockex1_kern.o
 always += sockex2_kern.o
 always += tracex1_kern.o
 always += tracex2_kern.o
+always += tracex3_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -31,6 +34,7 @@ HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..64f1c0b01a30
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(u64),
+	.max_entries = 4096,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("events/kprobes/blk_mq_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	long rq = ctx->di;
+	u64 val = bpf_ktime_get_ns();
+
+	bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+	int i = -(n == 0);
+	S(32); S(16); S(8); S(4); S(2); S(1);
+	return i;
+#undef S
+}
+
+#define SLOTS 100
+
+struct bpf_map_def SEC("maps") lat_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u64),
+	.max_entries = SLOTS,
+};
+
+SEC("events/kprobes/blk_update_request")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long rq = ctx->di;
+	u64 *value, l, base;
+	u32 index;
+
+	value = bpf_map_lookup_elem(&my_map, &rq);
+	if (!value)
+		return 0;
+
+	u64 cur_time = bpf_ktime_get_ns();
+	u64 delta = cur_time - *value;
+
+	bpf_map_delete_elem(&my_map, &rq);
+
+	/* the lines below are computing index = log10(delta)*10
+	 * using integer arithmetic
+	 * index = 29 ~ 1 usec
+	 * index = 59 ~ 1 msec
+	 * index = 89 ~ 1 sec
+	 * index = 99 ~ 10sec or more
+	 * log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
+	 */
+	l = log2l(delta);
+	base = 1ll << l;
+	index = (l * 64 + (delta - base) * 64 / base) * 3 / 64;
+
+	if (index >= SLOTS)
+		index = SLOTS - 1;
+
+	value = bpf_map_lookup_elem(&lat_map, &index);
+	if (value)
+		__sync_fetch_and_add((long *)value, 1);
+
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..0aaa933ab938
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define SLOTS 100
+
+static void clear_stats(int fd)
+{
+	__u32 key;
+	__u64 value = 0;
+
+	for (key = 0; key < SLOTS; key++)
+		bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+	"\033[48;5;255m",
+	"\033[48;5;252m",
+	"\033[48;5;250m",
+	"\033[48;5;248m",
+	"\033[48;5;246m",
+	"\033[48;5;244m",
+	"\033[48;5;242m",
+	"\033[48;5;240m",
+	"\033[48;5;238m",
+	"\033[48;5;236m",
+	"\033[48;5;234m",
+	"\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+const char *sym[] = {
+	" ",
+	" ",
+	".",
+	".",
+	"*",
+	"*",
+	"o",
+	"o",
+	"O",
+	"O",
+	"#",
+	"#",
+};
+
+bool full_range = false;
+bool text_only = false;
+
+static void print_banner(void)
+{
+	if (full_range)
+		printf("|1ns     |10ns     |100ns    |1us      |10us     |100us"
+		       "    |1ms      |10ms     |100ms    |1s       |10s\n");
+	else
+		printf("|1us      |10us     |100us    |1ms      |10ms     "
+		       "|100ms    |1s       |10s\n");
+}
+
+static void print_hist(int fd)
+{
+	__u32 key;
+	__u64 value;
+	__u64 cnt[SLOTS];
+	__u64 max_cnt = 0;
+	__u64 total_events = 0;
+
+	for (key = 0; key < SLOTS; key++) {
+		value = 0;
+		bpf_lookup_elem(fd, &key, &value);
+		cnt[key] = value;
+		total_events += value;
+		if (value > max_cnt)
+			max_cnt = value;
+	}
+	clear_stats(fd);
+	for (key = full_range ? 0 : 29; key < SLOTS; key++) {
+		int c = num_colors * cnt[key] / (max_cnt + 1);
+
+		if (text_only)
+			printf("%s", sym[c]);
+		else
+			printf("%s %s", color[c], nocolor);
+	}
+	printf(" # %lld\n", total_events);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 1; i < ac; i++) {
+		if (strcmp(argv[i], "-a") == 0) {
+			full_range = true;
+		} else if (strcmp(argv[i], "-t") == 0) {
+			text_only = true;
+		} else if (strcmp(argv[i], "-h") == 0) {
+			printf("Usage:\n"
+			       "  -a display wider latency range\n"
+			       "  -t text only\n");
+			return 1;
+		}
+	}
+
+	printf("  heatmap of IO latency\n");
+	if (text_only)
+		printf("  %s", sym[num_colors - 1]);
+	else
+		printf("  %s %s", color[num_colors - 1], nocolor);
+	printf(" - many events with this latency\n");
+
+	if (text_only)
+		printf("  %s", sym[0]);
+	else
+		printf("  %s %s", color[0], nocolor);
+	printf(" - few events\n");
+
+	for (i = 0; ; i++) {
+		if (i % 20 == 0)
+			print_banner();
+		print_hist(map_fd[1]);
+		sleep(2);
+	}
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH] capabilities: Ambient capability set V2
From: Serge E. Hallyn @ 2015-03-01 23:33 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Serge Hallyn, Andy Lutomirski, Jonathan Corbet, Aaron Jones,
	linux-security-module-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	akpm-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r, Andrew G. Morgan,
	Mimi Zohar, Austin S Hemmelgarn, Markku Savela, Jarkko Sakkinen,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Michael Kerrisk
In-Reply-To: <alpine.DEB.2.11.1502261612370.8994-gkYfJU5Cukgdnm+yROfE0A@public.gmane.org>

On Thu, Feb 26, 2015 at 04:14:33PM -0600, Christoph Lameter wrote:
> 
> V1->V2:
>  - Fix up the processing of the caps bits after discussions
>    with Andy and Serge. Make patch less intrusive.
> 
> Ambient caps are something like restricted root privileges.
> A process has a set of additional capabilities and those
> are inherited without having to set capabilities in other
> binaries involved. This allows the partial use of root
> like features in a controlled way. It is often useful
> to do this for user space device drivers or software that
> needs increased privileges for networking or to control
> its own scheduling. Ambient caps allow one to avoid
> having to run these with full root privileges.
> 
> Control over this feature is available via a new
> prctl option called PR_CAP_AMBIENT. The second argument to prctl
> is the capability number and the third the desired state.
> 0 for off. Otherwise on.
> 
> Ambient bits are enabled regardless of the inheritance
> mask of the target binary. They are only restricted
> by the bounding set.
> 
> History:
> 
> Linux capabilities have suffered from the problem that they are not
> inheritable like regular process characteristics under Unix. This is
> behavior that is counter intuitive to the expected behavior of processes
> in Unix.
> 
> In particular there has been recently software that controls NICs from user
> space and provides IP stack like behavior also in user space (DPDK and RDMA
> kernel API based implementations). Those typically need either capabilities
> to allow raw network access or have to be run setuid. There is scripting and
> LD_PRELOAD etc involved, arbitrary binaries may be run from those scripts
> including those setting additional capabilities or requiring root access.
> 
> That does not go well with having file capabilities set that would enable
> the capabilities. Maybe it would work if one would setup capabilities on
> all executables but that would also defeat a secure design since these
> binaries may only need those caps for certain situations. Ok setting the
> inheritable flags on everything may also get one there (if there would not
> be the issues with LD_PRELOAD, debugging etc etc).
> 
> The easy solution is to allow some capabilities be inherited like setuid
> is. We really prefer to use capabilities instead of setuid (we want to
> limit what damage someone can do after all!). Therefore we have been
> running a patch like this in production for the last 6 years. At some
> point it becomes tedious to run your own custom kernel so we would like
> to have this functionality upstream.
> 
> See some of the earlier related discussions on the problems with capability
> inheritance:
> 
> 0. Recent surprise:
>                 https://lkml.org/lkml/2014/1/21/175
> 
> 1. Attempt to revise caps
>                 http://www.madore.org/~david/linux/newcaps/
> 
> 2. Problems of passing caps through exec
>                 http://unix.stackexchange.com/questions/128394/passing-capabilities-through-exec
> 
> 3. Problems of binding to privileged ports
>                 http://stackoverflow.com/questions/413807/is-there-a-way-for-non-root-processes-to-bind-to-privileged-ports-1024-on-l
> 
> 4. Reviving capabilities
>                 http://lwn.net/Articles/199004/
> 
> There does not seem to be an alternative on the horizon. Some involved
> in security development under Linux have even stated that they want to
> rip out the whole thing and replace it. It's been a couple of years now
> and we are still suffering from the capabilities mess. Let us just
> fix it. Others have already done implementations like this like Nokia
> for the N900.
> 
> 
> This patch does not change the default behavior but it allows to set up
> a list of capabilities via prctl that will enable regular
> unix inheritance only for the selected group of capabilities.
> 
> With that it is then possible to do something trivial like setting
> CAP_NET_RAW on an executable that can then allow that capability to
> be inherited by others.
> 
> Lets have a look at a coding example of a wrapper that enables
> a couple of capabilities:
> 
> ------------------------------ ambient_test.c
> /*
>  * Test program for the ambient capabilities
>  *
>  *
>  * Compile using:
>  *	gcc -o ambient_test ambient_test.o
>  *
>  * This program must have the following capabilities to run properly:
>  * CAP_SETPCAP, CAP_NET_RAW, CAP_NET_ADMIN, CAP_SYS_NICE
>  *
>  * A command to equip this with the right caps is:
>  *
>  *	setcap cap_setpcap,cap_net_raw,cap_net_admin,cap_sys_nice+eip ambient_test
>  *
>  * To get a shell with additional caps that can be inherited do:
>  *
>  * ./ambient_test /bin/bash
>  *
>  */
> 
> #include <stdlib.h>
> #include <stdio.h>
> #include <errno.h>
> #include <sys/prctl.h>
> #include <linux/capability.h>
> 
> /* Defintion to be updated in the user space include files */
> #define PR_CAP_AMBIENT 45
> 
> int main(int argc, char **argv)
> {
> 	int rc;
> 
> 	if (prctl(PR_CAP_AMBIENT, CAP_NET_RAW))
> 		perror("Cannot set CAP_NET_RAW");
> 
> 	if (prctl(PR_CAP_AMBIENT, CAP_NET_ADMIN))
> 		perror("Cannot set CAP_NET_ADMIN");
> 
> 	if (prctl(PR_CAP_AMBIENT, CAP_SYS_NICE))
> 		perror("Cannot set CAP_SYS_NICE");
> 
> 	printf("Ambient_test forking shell\n");
> 	if (execv(argv[1], argv + 1))
> 		perror("Cannot exec");
> 
> 	return 0;
> }
> -------------------------------- ambient_test.c
> 
> Allows the inheritance of CAP_SYS_NICE, CAP_NET_RAW and CAP_NET_ADMIN.
> With that device raw access is possible and also real time priorities
> can be set from user space. This is a frequently needed set of
> privileged operations in HPC and HFT applications. User space
> processes need to be able to directly access devices as well as
> have full control over scheduling.
> 
> Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>
> 
> Index: linux/security/commoncap.c
> ===================================================================
> --- linux.orig/security/commoncap.c	2015-02-25 13:43:06.929973954 -0600
> +++ linux/security/commoncap.c	2015-02-26 16:10:02.347913397 -0600
> @@ -347,15 +347,17 @@ static inline int bprm_caps_from_vfs_cap
>  		*has_cap = true;
> 
>  	CAP_FOR_EACH_U32(i) {
> +		__u32 ambient = current_cred()->cap_ambient.cap[i];
>  		__u32 permitted = caps->permitted.cap[i];
>  		__u32 inheritable = caps->inheritable.cap[i];
> 
>  		/*
> -		 * pP' = (X & fP) | (pI & fI)
> +		 * pP' = (X & fP) | (pI & (fI | pA))
>  		 */
>  		new->cap_permitted.cap[i] =
>  			(new->cap_bset.cap[i] & permitted) |
> -			(new->cap_inheritable.cap[i] & inheritable);
> +			(new->cap_inheritable.cap[i] &
> +					(inheritable | ambient));

So I'd say drop this change ^

> 
>  		if (permitted & ~new->cap_permitted.cap[i])
>  			/* insufficient to execute correctly */
> @@ -453,8 +455,18 @@ static int get_file_caps(struct linux_bi
>  		if (rc == -EINVAL)
>  			printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
>  				__func__, rc, bprm->filename);
> -		else if (rc == -ENODATA)
> +		else if (rc == -ENODATA) {
>  			rc = 0;
> +			if (!cap_isclear(current_cred()->cap_ambient)) {
> +				/*
> +				 * The ambient caps are permitted for
> +				 * files that have no caps
> +				 */
> +				bprm->cred->cap_permitted =
> +					current_cred()->cap_ambient;

and here set vcaps inheritable to current_cred()->ambient.

> +				*effective = true;
> +			}
> +		}
>  		goto out;
>  	}
> 
> @@ -549,9 +561,20 @@ skip:
>  	new->sgid = new->fsgid = new->egid;
> 
>  	if (effective)
> +		/*
> +		 * pE' = pP' & (fE | pA)
> +		 *
> +		 * fE is implicity all set if effective == true.
> +		 * Therefore the above reduces to
> +		 *
> +		 * pE' = pP'
> +		 */
>  		new->cap_effective = new->cap_permitted;
>  	else
>  		cap_clear(new->cap_effective);
> +
> +	/* pA' = pA */
> +	new->cap_ambient = old->cap_ambient;
>  	bprm->cap_effective = effective;
> 
>  	/*
> @@ -566,7 +589,7 @@ skip:
>  	 * Number 1 above might fail if you don't have a full bset, but I think
>  	 * that is interesting information to audit.
>  	 */
> -	if (!cap_isclear(new->cap_effective)) {
> +	if (!cap_issubset(new->cap_effective, new->cap_ambient)) {
>  		if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
>  		    !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) ||
>  		    issecure(SECURE_NOROOT)) {
> @@ -598,7 +621,7 @@ int cap_bprm_secureexec(struct linux_bin
>  	if (!uid_eq(cred->uid, root_uid)) {
>  		if (bprm->cap_effective)
>  			return 1;
> -		if (!cap_isclear(cred->cap_permitted))
> +		if (!cap_issubset(cred->cap_permitted, cred->cap_ambient))
>  			return 1;
>  	}
> 
> @@ -933,6 +956,23 @@ int cap_task_prctl(int option, unsigned
>  			new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
>  		return commit_creds(new);
> 
> +	case PR_CAP_AMBIENT:
> +		if (!ns_capable(current_user_ns(), CAP_SETPCAP))
> +			return -EPERM;
> +
> +		if (!cap_valid(arg2))
> +			return -EINVAL;
> +
> +		if (!ns_capable(current_user_ns(), arg2))
> +			return -EPERM;
> +
> +		new = prepare_creds();
> +		if (arg3 == 0)
> +			cap_lower(new->cap_ambient, arg2);
> +		else
> +			cap_raise(new->cap_ambient, arg2);
> +		return commit_creds(new);
> +
>  	default:
>  		/* No functionality available - continue with default */
>  		return -ENOSYS;
> Index: linux/include/linux/cred.h
> ===================================================================
> --- linux.orig/include/linux/cred.h	2015-02-25 13:43:06.929973954 -0600
> +++ linux/include/linux/cred.h	2015-02-25 13:43:06.925972078 -0600
> @@ -122,6 +122,7 @@ struct cred {
>  	kernel_cap_t	cap_permitted;	/* caps we're permitted */
>  	kernel_cap_t	cap_effective;	/* caps we can actually use */
>  	kernel_cap_t	cap_bset;	/* capability bounding set */
> +	kernel_cap_t	cap_ambient;	/* Ambient capability set */
>  #ifdef CONFIG_KEYS
>  	unsigned char	jit_keyring;	/* default keyring to attach requested
>  					 * keys to */
> Index: linux/include/uapi/linux/prctl.h
> ===================================================================
> --- linux.orig/include/uapi/linux/prctl.h	2015-02-25 13:43:06.929973954 -0600
> +++ linux/include/uapi/linux/prctl.h	2015-02-25 13:43:06.925972078 -0600
> @@ -185,4 +185,7 @@ struct prctl_mm_map {
>  #define PR_MPX_ENABLE_MANAGEMENT  43
>  #define PR_MPX_DISABLE_MANAGEMENT 44
> 
> +/* Control the ambient capability set */
> +#define PR_CAP_AMBIENT 45
> +
>  #endif /* _LINUX_PRCTL_H */
> Index: linux/fs/proc/array.c
> ===================================================================
> --- linux.orig/fs/proc/array.c	2015-02-25 13:43:06.929973954 -0600
> +++ linux/fs/proc/array.c	2015-02-25 13:43:06.925972078 -0600
> @@ -302,7 +302,8 @@ static void render_cap_t(struct seq_file
>  static inline void task_cap(struct seq_file *m, struct task_struct *p)
>  {
>  	const struct cred *cred;
> -	kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
> +	kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
> +			cap_bset, cap_ambient;
> 
>  	rcu_read_lock();
>  	cred = __task_cred(p);
> @@ -310,12 +311,14 @@ static inline void task_cap(struct seq_f
>  	cap_permitted	= cred->cap_permitted;
>  	cap_effective	= cred->cap_effective;
>  	cap_bset	= cred->cap_bset;
> +	cap_ambient	= cred->cap_ambient;
>  	rcu_read_unlock();
> 
>  	render_cap_t(m, "CapInh:\t", &cap_inheritable);
>  	render_cap_t(m, "CapPrm:\t", &cap_permitted);
>  	render_cap_t(m, "CapEff:\t", &cap_effective);
>  	render_cap_t(m, "CapBnd:\t", &cap_bset);
> +	render_cap_t(m, "CapAmb:\t", &cap_ambient);
>  }
> 
>  static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* Confidential Letter
From: Mr. Juan Sebastian Morato @ 2015-03-02  0:34 UTC (permalink / raw)
  To: Recipients

Dear Friend, 

I am Mr. Juan Sebastian Morato, the Auditor General of Unicaja Bank Madrid. In the course of my auditing, I discovered a floating fund in an account, which was opened in 1990 at Cam Bank before it was bought over by Unicaja Group which I am the auditor belonging to a dead foreigner Mr. Kenny who died in 2004. Every effort made to track any member of his family or next of kin has since failed; hence I got in contact with you to stand as his next of kin since you bear the same last name. He died leaving no heir or a will.

My intention is to transfer this sum of 5.5M in the aforementioned account to a safe account. I am therefore proposing that you quietly partner with me and provide an account or set up a new one that will serve the purpose of receiving this fund. For your assistance in this venture, I am ready to part with a good percentage of the entire funds. After going through the deceased person's records and files, I discovered that:

(1) No one has operated this account since 2004
(2) He died without an heir; hence the money has been floating.
(3) No other person knows about this account and there was no known beneficiary.

If I do not remit this money urgently, it would be forfeited and subsequently converted to company's funds, which will benefit only the directors of my firm. This money can be approved to you legally as with all the necessary documentary approvals in your name. However, you would be required to show some proof of claim, which I will provide you with and also guide you on how to make your applications.

Please do give me a reply on my private e-mail juan.morato1-MWBdxC+W4QGHXe+LvDLADg@public.gmane.org or fax 00 34 917 692 656 so that I can send you detailed information on the modalities of my proposition. I completely trust you to keep this proposition absolutely confidential. Kindly forward your telephone number where I can reach you easily. I look forward to your prompt response.

Best Regards,
Mr. Juan Sebastian Morato
Fax: 00 34 917 692 656

^ permalink raw reply

* RE: [PATCH RFC 0/3] Drivers: hv: utils: re-implement the kernel/userspace communication layer
From: KY Srinivasan @ 2015-03-02  2:11 UTC (permalink / raw)
  To: Vitaly Kuznetsov, devel@linuxdriverproject.org
  Cc: Haiyang Zhang, linux-kernel@vger.kernel.org, Dexuan Cui,
	Radim Krčmář, Greg Kroah-Hartman,
	linux-api@vger.kernel.org
In-Reply-To: <1425053665-635-1-git-send-email-vkuznets@redhat.com>



> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:vkuznets@redhat.com]
> Sent: Friday, February 27, 2015 8:14 AM
> To: KY Srinivasan; devel@linuxdriverproject.org
> Cc: Haiyang Zhang; linux-kernel@vger.kernel.org; Dexuan Cui; Radim Krčmář;
> Greg Kroah-Hartman; linux-api@vger.kernel.org
> Subject: [PATCH RFC 0/3] Drivers: hv: utils: re-implement the
> kernel/userspace communication layer
> 
> This series converts kvp/vss daemons to use misc char devices instead of
> netlink for userspace/kernel communication and then updates fcopy to be
> consistent with kvp/vss.
> 
> Userspace/kernel communication via netlink has a number of issues:
> - It is hard for userspace to figure out if the kernel part was loaded or not
>   and this fact can change as there is a way to enable/disable the service from
>   host side. Racy daemon startup is also a problem.
> - When the userspace daemon restarts/dies kernel part doesn't receive a
>   notification.
> - Netlink communication is not stable under heavy load.
> - ...
> 
> RFC: I'm a bit puzzled on how to split commits 1 and 2 avoiding breakages.
> Commit 3 can definitely be split, however, it is consistent with commits 1 and
> 2 at this moment and I'm not sure such split will simplify the review.
> 
> Vitaly Kuznetsov (3):
>   Drivers: hv: kvp: convert userspace/kernel communication to using char
>     device
>   Drivers: hv: vss: convert userspace/kernel communication to using char
>     device
>   Drivers: hv: fcopy: make it consistent with vss/kvp

Vitaly,

Thank you for working on this. Before I give you detailed comments on your
patches, I wanted to understand if the cost of maintaining compatibility was
carefully considered. As a first step we could look at cleanly abstracting the 
transport (between user level and the kernel) out of the kernel driver code 
as well as the new daemon
code. What are your thoughts on this? Version negotiation is obviously key to
maintaining compatibility. One of the options we can explore is to continue to
use netlink for version negotiation and for appropriate daemon versions, we could use
the char device mechanism for transporting the payload.

I like the new state machine you have defined and this is orthogonal to the transport
options we have. You have sought feedback on how we can split up these changes into
smaller patches. This is how I would proceed here:

Patch(es) to clean up  the current code: 
	Patch(es) to clean up the state machine.
	Patch(es) to isolate the kernel/user transport
Patch(es) to implement the new transport
 
Regards,

K. Y
> 
>  drivers/hv/hv_fcopy.c       | 395 +++++++++++++++++++++++++---------------
> ---
>  drivers/hv/hv_kvp.c         | 396 +++++++++++++++++++++++++++-------------
> ----
>  drivers/hv/hv_snapshot.c    | 335 +++++++++++++++++++++++++++---------
> -
>  include/uapi/linux/hyperv.h |  10 ++
>  tools/hv/hv_fcopy_daemon.c  |  48 ++++--
>  tools/hv/hv_kvp_daemon.c    | 187 ++++-----------------
>  tools/hv/hv_vss_daemon.c    | 141 +++-------------
>  7 files changed, 824 insertions(+), 688 deletions(-)
> 
> --
> 1.9.3

^ permalink raw reply

* [PATCH 0/9] N900 Modem Speech Support
From: Sebastian Reichel @ 2015-03-02  4:38 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api

Hi,

This patchset contains the missing speech data support for the
Nokia N900 modem.

Userland access goes via /dev/cmt_speech. The API is implemented in
libcmtspeechdata, which is used by ofono and the freesmartphone.org project.
Apart from that the device is also used by the phone binaries distributed
with Maemo. So while this is a new userland ABI for the mainline kernel it
has been tested in the wild for some years.

Simple Testing of the API can be done by checking out libcmtspeechdata [0],
building the test tool and executing it. The tool will loop back audio data
received from the caller.

I have prepared a kernel branch, which includes these changes at [1].

[0] https://lkml.org/lkml/2015/2/11/526
[1] git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git branch/cmt-speech

-- Sebastian

Kai Vehmanen (3):
  HSI: cmt_speech: Add cmt-speech driver
  HSI: cmt_speech: Avoid GFP_ATOMIC in cs_char_open
  HSI: cmt_speech: Return error if HSI port not configured

Sebastian Reichel (6):
  HSI: cmt_speech: Fix build for 4.0 kernel
  HSI: cmt_speech: Cleanup initialisation
  HSI: cmt_speech: Rename driver to cmt-speech
  HSI: cmt_speech: Move cs-protocol.h to include/uapi/linux/hsi
  HSI: cmt_speech: Remove hardcoded channel numbers
  HSI: nokia-modem: Add cmt_speech support

 drivers/hsi/clients/Kconfig          |   11 +-
 drivers/hsi/clients/Makefile         |    1 +
 drivers/hsi/clients/cmt_speech.c     | 1451 ++++++++++++++++++++++++++++++++++
 drivers/hsi/clients/nokia-modem.c    |   31 +-
 include/uapi/linux/hsi/Kbuild        |    2 +-
 include/uapi/linux/hsi/cs-protocol.h |  113 +++
 6 files changed, 1606 insertions(+), 3 deletions(-)
 create mode 100644 drivers/hsi/clients/cmt_speech.c
 create mode 100644 include/uapi/linux/hsi/cs-protocol.h

-- 
2.1.4

^ permalink raw reply

* [PATCH 1/9] HSI: cmt_speech: Add cmt-speech driver
From: Sebastian Reichel @ 2015-03-02  4:38 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api, Kai Vehmanen, Carlos Chinea, Joni Lapilainen
In-Reply-To: <1425271139-24715-1-git-send-email-sre@kernel.org>

From: Kai Vehmanen <kai.vehmanen@nokia.com>

Introduces the cmt-speech driver, which implements
a character device interface for transferring speech
data frames over HSI/SSI.

The driver is used to exchange voice/speech data between
the Nokia N900/N950/N9's modem and its cpu.

Signed-off-by: Kai Vehmanen <kai.vehmanen@nokia.com>
Signed-off-by: Carlos Chinea <carlos.chinea@nokia.com>
Signed-off-by: Joni Lapilainen <joni.lapilainen@gmail.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/hsi/clients/cmt_speech.c | 1426 ++++++++++++++++++++++++++++++++++++++
 include/linux/cs-protocol.h      |  116 ++++
 2 files changed, 1542 insertions(+)
 create mode 100644 drivers/hsi/clients/cmt_speech.c
 create mode 100644 include/linux/cs-protocol.h

diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
new file mode 100644
index 0000000..7c0f711
--- /dev/null
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -0,0 +1,1426 @@
+/*
+ * cmt_speech.c - HSI CMT speech driver
+ *
+ * Copyright (C) 2008,2009,2010 Nokia Corporation. All rights reserved.
+ *
+ * Contact: Kai Vehmanen <kai.vehmanen@nokia.com>
+ * Original author: Peter Ujfalusi <peter.ujfalusi@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/ioctl.h>
+#include <linux/uaccess.h>
+#include <linux/pm_qos_params.h>
+#include <linux/hsi/hsi.h>
+#include <linux/hsi/ssip_slave.h>
+#include <linux/cs-protocol.h>
+
+#define CS_MMAP_SIZE	PAGE_SIZE
+#define DRIVER_NAME	"cmt_speech"
+
+struct char_queue {
+	struct list_head	list;
+	u32			msg;
+};
+
+struct cs_char {
+	unsigned int		opened;
+	struct hsi_client	*cl;
+	struct cs_hsi_iface	*hi;
+	struct list_head	chardev_queue;
+	struct list_head	dataind_queue;
+	int			dataind_pending;
+	/* mmap things */
+	unsigned long		mmap_base;
+	unsigned long		mmap_size;
+	spinlock_t		lock;
+	struct fasync_struct	*async_queue;
+	wait_queue_head_t	wait;
+};
+
+#define SSI_CHANNEL_STATE_READING	1
+#define SSI_CHANNEL_STATE_WRITING	(1 << 1)
+#define SSI_CHANNEL_STATE_POLL		(1 << 2)
+#define SSI_CHANNEL_STATE_ERROR		(1 << 3)
+
+#define CONTROL_HSI_CH			1
+#define DATA_HSI_CH			2
+
+#define TARGET_MASK			0xf000000
+#define TARGET_REMOTE			(1 << CS_DOMAIN_SHIFT)
+#define TARGET_LOCAL			0
+
+/* Number of pre-allocated commands buffers */
+#define CS_MAX_CMDS		        4
+
+/*
+ * During data transfers, transactions must be handled
+ * within 20ms (fixed value in cmtspeech HSI protocol)
+ */
+#define CS_QOS_LATENCY_FOR_DATA_USEC	20000
+
+/* Timeout to wait for pending HSI transfers to complete */
+#define CS_HSI_TRANSFER_TIMEOUT_MS      500
+
+
+#define RX_PTR_BOUNDARY_SHIFT		8
+#define RX_PTR_MAX_SHIFT		(RX_PTR_BOUNDARY_SHIFT + \
+						CS_MAX_BUFFERS_SHIFT)
+struct cs_hsi_iface {
+	struct hsi_client		*cl;
+	struct hsi_client		*master;
+
+	unsigned int			iface_state;
+	unsigned int			wakeline_state;
+	unsigned int			control_state;
+	unsigned int			data_state;
+
+	/* state exposed to application */
+	struct cs_mmap_config_block	*mmap_cfg;
+
+	unsigned long			mmap_base;
+	unsigned long			mmap_size;
+
+	unsigned int			rx_slot;
+	unsigned int			tx_slot;
+
+	/* note: for security reasons, we do not trust the contents of
+	 * mmap_cfg, but instead duplicate the variables here */
+	unsigned int			buf_size;
+	unsigned int			rx_bufs;
+	unsigned int			tx_bufs;
+	unsigned int			rx_ptr_boundary;
+	unsigned int			rx_offsets[CS_MAX_BUFFERS];
+	unsigned int			tx_offsets[CS_MAX_BUFFERS];
+	/* size of aligned memory blocks */
+	unsigned int			slot_size;
+	unsigned int			flags;
+
+	struct list_head		cmdqueue;
+
+	struct hsi_msg			*data_rx_msg;
+	struct hsi_msg			*data_tx_msg;
+	wait_queue_head_t		datawait;
+
+	struct pm_qos_request_list      pm_qos_req;
+
+	spinlock_t			lock;
+};
+
+static struct cs_char cs_char_data;
+
+static void cs_hsi_read_on_control(struct cs_hsi_iface *hi);
+static void cs_hsi_read_on_data(struct cs_hsi_iface *hi);
+
+static inline void rx_ptr_shift_too_big(void)
+{
+	BUILD_BUG_ON((1LLU << RX_PTR_MAX_SHIFT) > UINT_MAX);
+}
+
+static void cs_notify(u32 message, struct list_head *head)
+{
+	struct char_queue *entry;
+
+	spin_lock(&cs_char_data.lock);
+
+	if (!cs_char_data.opened) {
+		spin_unlock(&cs_char_data.lock);
+		goto out;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry) {
+		dev_err(&cs_char_data.cl->device,
+			"Can't allocate new entry for the queue.\n");
+		spin_unlock(&cs_char_data.lock);
+		goto out;
+	}
+
+	entry->msg = message;
+	list_add_tail(&entry->list, head);
+
+	spin_unlock(&cs_char_data.lock);
+
+	wake_up_interruptible(&cs_char_data.wait);
+	kill_fasync(&cs_char_data.async_queue, SIGIO, POLL_IN);
+
+out:
+	return;
+}
+
+static u32 cs_pop_entry(struct list_head *head)
+{
+	struct char_queue *entry;
+	u32 data;
+
+	entry = list_entry(head->next, struct char_queue, list);
+	data = entry->msg;
+	list_del(&entry->list);
+	kfree(entry);
+
+	return data;
+}
+
+static void cs_notify_control(u32 message)
+{
+	cs_notify(message, &cs_char_data.chardev_queue);
+}
+
+static void cs_notify_data(u32 message, int maxlength)
+{
+	cs_notify(message, &cs_char_data.dataind_queue);
+
+	spin_lock(&cs_char_data.lock);
+	++cs_char_data.dataind_pending;
+	while (cs_char_data.dataind_pending > maxlength &&
+				!list_empty(&cs_char_data.dataind_queue)) {
+		dev_dbg(&cs_char_data.cl->device, "data notification "
+		"queue overrun (%u entries)\n", cs_char_data.dataind_pending);
+
+		cs_pop_entry(&cs_char_data.dataind_queue);
+		--cs_char_data.dataind_pending;
+	}
+	spin_unlock(&cs_char_data.lock);
+}
+
+static inline void cs_set_cmd(struct hsi_msg *msg, u32 cmd)
+{
+	u32 *data;
+
+	data = sg_virt(msg->sgt.sgl);
+	*data = cmd;
+}
+
+static inline u32 cs_get_cmd(struct hsi_msg *msg)
+{
+	u32 *data;
+
+	data = sg_virt(msg->sgt.sgl);
+
+	return *data;
+}
+
+static void cs_release_cmd(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+
+	list_add_tail(&msg->link, &hi->cmdqueue);
+}
+
+static void cs_cmd_destructor(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+
+	spin_lock(&hi->lock);
+
+	dev_dbg(&cs_char_data.cl->device, "control cmd destructor\n");
+
+	if (hi->iface_state != CS_STATE_CLOSED)
+		dev_err(&hi->cl->device, "Cmd flushed while driver active\n");
+
+	if (msg->ttype == HSI_MSG_READ)
+		hi->control_state &=
+			~(SSI_CHANNEL_STATE_POLL | SSI_CHANNEL_STATE_READING);
+	else if (msg->ttype == HSI_MSG_WRITE &&
+			hi->control_state & SSI_CHANNEL_STATE_WRITING)
+		hi->control_state &= ~SSI_CHANNEL_STATE_WRITING;
+
+	cs_release_cmd(msg);
+
+	spin_unlock(&hi->lock);
+}
+
+static struct hsi_msg *cs_claim_cmd(struct cs_hsi_iface* ssi)
+{
+	struct hsi_msg *msg;
+
+	BUG_ON(list_empty(&ssi->cmdqueue));
+
+	msg = list_first_entry(&ssi->cmdqueue, struct hsi_msg, link);
+	list_del(&msg->link);
+	msg->destructor = cs_cmd_destructor;
+
+	return msg;
+}
+
+static void cs_free_cmds(struct cs_hsi_iface *ssi)
+{
+	struct hsi_msg *msg, *tmp;
+
+	list_for_each_entry_safe(msg, tmp, &ssi->cmdqueue, link) {
+		list_del(&msg->link);
+		msg->destructor = NULL;
+		kfree(sg_virt(msg->sgt.sgl));
+		hsi_free_msg(msg);
+	}
+}
+
+static int cs_alloc_cmds(struct cs_hsi_iface *hi)
+{
+	struct hsi_msg *msg;
+	u32 *buf;
+	unsigned int i;
+
+	INIT_LIST_HEAD(&hi->cmdqueue);
+
+	for (i = 0; i < CS_MAX_CMDS; i++) {
+		msg = hsi_alloc_msg(1, GFP_ATOMIC);
+		if (!msg)
+			goto out;
+		buf = kmalloc(sizeof(*buf), GFP_ATOMIC);
+		if (!buf) {
+			hsi_free_msg(msg);
+			goto out;
+		}
+		sg_init_one(msg->sgt.sgl, buf, sizeof(*buf));
+		msg->channel = CONTROL_HSI_CH;
+		msg->context = hi;
+		list_add_tail(&msg->link, &hi->cmdqueue);
+	}
+
+	return 0;
+
+out:
+	cs_free_cmds(hi);
+	return -ENOMEM;
+}
+
+static void cs_hsi_data_destructor(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+	const char *dir = (msg->ttype == HSI_MSG_READ) ? "TX" : "RX";
+
+	dev_dbg(&cs_char_data.cl->device, "Freeing data %s message\n", dir);
+
+	spin_lock(&hi->lock);
+	if (hi->iface_state != CS_STATE_CLOSED)
+		dev_err(&cs_char_data.cl->device,
+				"Data %s flush while device active\n", dir);
+	if (msg->ttype == HSI_MSG_READ)
+		hi->data_state &=
+			~(SSI_CHANNEL_STATE_POLL | SSI_CHANNEL_STATE_READING);
+	else
+		hi->data_state &= ~SSI_CHANNEL_STATE_WRITING;
+
+	msg->status = HSI_STATUS_COMPLETED;
+	if (unlikely(waitqueue_active(&hi->datawait)))
+		wake_up_interruptible(&hi->datawait);
+
+	spin_unlock(&hi->lock);
+}
+
+static int cs_hsi_alloc_data(struct cs_hsi_iface *hi)
+{
+	struct hsi_msg *txmsg, *rxmsg;
+	int res = 0;
+
+	rxmsg = hsi_alloc_msg(1, GFP_KERNEL);
+	if (!rxmsg) {
+		res = -ENOMEM;
+		goto out1;
+	}
+	rxmsg->channel = DATA_HSI_CH;
+	rxmsg->destructor = cs_hsi_data_destructor;
+	rxmsg->context = hi;
+
+	txmsg = hsi_alloc_msg(1, GFP_KERNEL);
+	if (!txmsg) {
+		res = -ENOMEM;
+		goto out2;
+	}
+	txmsg->channel = DATA_HSI_CH;
+	txmsg->destructor = cs_hsi_data_destructor;
+	txmsg->context = hi;
+
+	hi->data_rx_msg = rxmsg;
+	hi->data_tx_msg = txmsg;
+
+	return 0;
+
+out2:
+	hsi_free_msg(rxmsg);
+out1:
+	return res;
+}
+
+static void cs_hsi_free_data_msg(struct hsi_msg *msg)
+{
+	WARN_ON(msg->status != HSI_STATUS_COMPLETED &&
+					msg->status != HSI_STATUS_ERROR);
+	hsi_free_msg(msg);
+}
+
+static void cs_hsi_free_data(struct cs_hsi_iface *hi)
+{
+	cs_hsi_free_data_msg(hi->data_rx_msg);
+	cs_hsi_free_data_msg(hi->data_tx_msg);
+}
+
+static inline void __cs_hsi_error_pre(struct cs_hsi_iface *hi,
+					struct hsi_msg *msg, const char *info,
+					unsigned int *state)
+{
+	spin_lock(&hi->lock);
+	dev_err(&hi->cl->device, "HSI %s error, msg %d, state %u\n",
+		info, msg->status, *state);
+}
+
+static inline void __cs_hsi_error_post(struct cs_hsi_iface *hi)
+{
+	spin_unlock(&hi->lock);
+}
+
+static inline void __cs_hsi_error_read_bits(unsigned int *state)
+{
+	*state |= SSI_CHANNEL_STATE_ERROR;
+	*state &= ~(SSI_CHANNEL_STATE_READING | SSI_CHANNEL_STATE_POLL);
+}
+
+static inline void __cs_hsi_error_write_bits(unsigned int *state)
+{
+	*state |= SSI_CHANNEL_STATE_ERROR;
+	*state &= ~SSI_CHANNEL_STATE_WRITING;
+}
+
+static void cs_hsi_control_read_error(struct cs_hsi_iface *hi,
+							struct hsi_msg *msg)
+{
+	__cs_hsi_error_pre(hi, msg, "control read", &hi->control_state);
+	cs_release_cmd(msg);
+	__cs_hsi_error_read_bits(&hi->control_state);
+	__cs_hsi_error_post(hi);
+}
+
+static void cs_hsi_control_write_error(struct cs_hsi_iface *hi,
+							struct hsi_msg *msg)
+{
+	__cs_hsi_error_pre(hi, msg, "control write", &hi->control_state);
+	cs_release_cmd(msg);
+	__cs_hsi_error_write_bits(&hi->control_state);
+	__cs_hsi_error_post(hi);
+
+}
+
+static void cs_hsi_data_read_error(struct cs_hsi_iface *hi, struct hsi_msg *msg)
+{
+	__cs_hsi_error_pre(hi, msg, "data read", &hi->data_state);
+	__cs_hsi_error_read_bits(&hi->data_state);
+	__cs_hsi_error_post(hi);
+}
+
+static void cs_hsi_data_write_error(struct cs_hsi_iface *hi,
+							struct hsi_msg *msg)
+{
+	__cs_hsi_error_pre(hi, msg, "data write", &hi->data_state);
+	__cs_hsi_error_write_bits(&hi->data_state);
+	__cs_hsi_error_post(hi);
+}
+
+static void cs_hsi_read_on_control_complete(struct hsi_msg *msg)
+{
+	u32 cmd = cs_get_cmd(msg);
+	struct cs_hsi_iface *hi = msg->context;
+
+	spin_lock(&hi->lock);
+	hi->control_state &= ~SSI_CHANNEL_STATE_READING;
+	if (msg->status == HSI_STATUS_ERROR) {
+		dev_err(&hi->cl->device, "Control RX error detected\n");
+		cs_hsi_control_read_error(hi, msg);
+		spin_unlock(&hi->lock);
+		goto out;
+	}
+	dev_dbg(&hi->cl->device, "Read on control: %08X\n", cmd);
+	cs_release_cmd(msg);
+	if (hi->flags & CS_FEAT_TSTAMP_RX_CTRL) {
+		struct timespec *tstamp =
+			&hi->mmap_cfg->tstamp_rx_ctrl;
+		do_posix_clock_monotonic_gettime(tstamp);
+	}
+	spin_unlock(&hi->lock);
+
+	cs_notify_control(cmd);
+
+out:
+	cs_hsi_read_on_control(hi);
+}
+
+static void cs_hsi_peek_on_control_complete(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+	int ret;
+
+	if (msg->status == HSI_STATUS_ERROR) {
+		dev_err(&hi->cl->device, "Control peek RX error detected\n");
+		cs_hsi_control_read_error(hi, msg);
+		return;
+	}
+
+	WARN_ON(!(hi->control_state & SSI_CHANNEL_STATE_READING));
+
+	dev_dbg(&hi->cl->device, "Peek on control complete, reading\n");
+	msg->sgt.nents = 1;
+	msg->complete = cs_hsi_read_on_control_complete;
+	ret = hsi_async_read(hi->cl, msg);
+	if (ret)
+		cs_hsi_control_read_error(hi, msg);
+}
+
+static void cs_hsi_read_on_control(struct cs_hsi_iface *hi)
+{
+	struct hsi_msg *msg;
+	int ret;
+
+	spin_lock(&hi->lock);
+	if (hi->control_state & SSI_CHANNEL_STATE_READING) {
+		dev_err(&hi->cl->device, "Control read already pending (%d)\n",
+			hi->control_state);
+		spin_unlock(&hi->lock);
+		return;
+	}
+	if (hi->control_state & SSI_CHANNEL_STATE_ERROR) {
+		dev_err(&hi->cl->device, "Control read error (%d)\n",
+			hi->control_state);
+		spin_unlock(&hi->lock);
+		return;
+	}
+	hi->control_state |= SSI_CHANNEL_STATE_READING;
+	dev_dbg(&hi->cl->device, "Issuing RX on control\n");
+	msg = cs_claim_cmd(hi);
+	spin_unlock(&hi->lock);
+
+	msg->sgt.nents = 0;
+	msg->complete = cs_hsi_peek_on_control_complete;
+	ret = hsi_async_read(hi->cl, msg);
+	if (ret)
+		cs_hsi_control_read_error(hi, msg);
+}
+
+static void cs_hsi_write_on_control_complete(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+	if (msg->status == HSI_STATUS_COMPLETED) {
+		spin_lock(&hi->lock);
+		hi->control_state &= ~SSI_CHANNEL_STATE_WRITING;
+		cs_release_cmd(msg);
+		spin_unlock(&hi->lock);
+	} else if (msg->status == HSI_STATUS_ERROR) {
+		cs_hsi_control_write_error(hi, msg);
+	} else {
+		dev_err(&hi->cl->device,
+			"unexpected status in control write callback %d\n",
+			msg->status);
+	}
+}
+
+static int cs_hsi_write_on_control(struct cs_hsi_iface *hi, u32 message)
+{
+	struct hsi_msg *msg;
+	int ret;
+
+	spin_lock(&hi->lock);
+	if (hi->control_state & SSI_CHANNEL_STATE_ERROR) {
+		spin_unlock(&hi->lock);
+		return -EIO;
+	}
+	if (hi->control_state & SSI_CHANNEL_STATE_WRITING) {
+		dev_err(&hi->cl->device,
+			"Write still pending on control channel.\n");
+		spin_unlock(&hi->lock);
+		return -EBUSY;
+	}
+	hi->control_state |= SSI_CHANNEL_STATE_WRITING;
+	msg = cs_claim_cmd(hi);
+	spin_unlock(&hi->lock);
+
+	cs_set_cmd(msg, message);
+	msg->sgt.nents = 1;
+	msg->complete = cs_hsi_write_on_control_complete;
+	dev_dbg(&hi->cl->device,
+		"Sending control message %08X\n", message);
+	ret = hsi_async_write(hi->cl, msg);
+	if (ret) {
+		dev_err(&hi->cl->device,
+			"async_write failed with %d\n", ret);
+		cs_hsi_control_write_error(hi, msg);
+	}
+
+	/*
+	 * Make sure control read is always pending when issuing
+	 * new control writes. This is needed as the controller
+	 * may flush our messages if e.g. the peer device reboots
+	 * unexpectedly (and we cannot directly resubmit a new read from
+	 * the message destructor; see cs_cmd_destructor()).
+	 */
+	if (!(hi->control_state & SSI_CHANNEL_STATE_READING)) {
+		dev_err(&hi->cl->device, "Restarting control reads\n");
+		cs_hsi_read_on_control(hi);
+	}
+
+	return 0;
+}
+
+static void cs_hsi_read_on_data_complete(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+	u32 payload;
+
+	if (unlikely(msg->status == HSI_STATUS_ERROR)) {
+		cs_hsi_data_read_error(hi, msg);
+		return;
+	}
+
+	spin_lock(&hi->lock);
+	WARN_ON(!(hi->data_state & SSI_CHANNEL_STATE_READING));
+	hi->data_state &= ~SSI_CHANNEL_STATE_READING;
+	payload = CS_RX_DATA_RECEIVED;
+	payload |= hi->rx_slot;
+	hi->rx_slot++;
+	hi->rx_slot %= hi->rx_ptr_boundary;
+	/* expose current rx ptr in mmap area */
+	hi->mmap_cfg->rx_ptr = hi->rx_slot;
+	if (unlikely(waitqueue_active(&hi->datawait)))
+		wake_up_interruptible(&hi->datawait);
+	spin_unlock(&hi->lock);
+
+	cs_notify_data(payload, hi->rx_bufs);
+	cs_hsi_read_on_data(hi);
+}
+
+static void cs_hsi_peek_on_data_complete(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+	u32 *address;
+	int ret;
+
+	if (unlikely(msg->status == HSI_STATUS_ERROR)) {
+		cs_hsi_data_read_error(hi, msg);
+		return;
+	}
+	if (unlikely(hi->iface_state != CS_STATE_CONFIGURED)) {
+		dev_err(&hi->cl->device, "Data received in invalid state\n");
+		cs_hsi_data_read_error(hi, msg);
+		return;
+	}
+
+	spin_lock(&hi->lock);
+	WARN_ON(!(hi->data_state & SSI_CHANNEL_STATE_POLL));
+	hi->data_state &= ~SSI_CHANNEL_STATE_POLL;
+	hi->data_state |= SSI_CHANNEL_STATE_READING;
+	spin_unlock(&hi->lock);
+
+	address = (u32 *)(hi->mmap_base +
+				hi->rx_offsets[hi->rx_slot % hi->rx_bufs]);
+	sg_init_one(msg->sgt.sgl, address, hi->buf_size);
+	msg->sgt.nents = 1;
+	msg->complete = cs_hsi_read_on_data_complete;
+	ret = hsi_async_read(hi->cl, msg);
+	if (ret)
+		cs_hsi_data_read_error(hi, msg);
+}
+
+/*
+ * Tell whether a read or write transfer is in flight on a channel.
+ *
+ * Returns 1 if either the WRITING or the READING flag is set in @state
+ * and 0 otherwise; a channel that is only in SSI_CHANNEL_STATE_POLL is
+ * therefore reported as not active.
+ */
+static inline int cs_state_xfer_active(unsigned int state)
+{
+	const unsigned int xfer_flags = SSI_CHANNEL_STATE_WRITING |
+					SSI_CHANNEL_STATE_READING;
+
+	return (state & xfer_flags) != 0;
+}
+
+/*
+ * Tell whether a channel has no pending reads or writes.
+ *
+ * Returns 1 when no flag other than SSI_CHANNEL_STATE_ERROR is set in
+ * @state, 0 otherwise.
+ */
+static inline int cs_state_idle(unsigned int state)
+{
+	const unsigned int busy_flags = state & ~SSI_CHANNEL_STATE_ERROR;
+
+	return busy_flags == 0;
+}
+
+/*
+ * Arm reception on the data channel.
+ *
+ * Does nothing if a read is already pending (READING or POLL set).
+ * Otherwise marks the channel as POLL and submits a zero-length read
+ * whose completion handler, cs_hsi_peek_on_data_complete(), issues the
+ * actual payload read.  A failed submission is passed to
+ * cs_hsi_data_read_error().
+ */
+static void cs_hsi_read_on_data(struct cs_hsi_iface *hi)
+{
+	struct hsi_msg *rxmsg;
+	int ret;
+
+	spin_lock(&hi->lock);
+	if (hi->data_state &
+		(SSI_CHANNEL_STATE_READING | SSI_CHANNEL_STATE_POLL)) {
+		dev_dbg(&hi->cl->device, "Data read already pending (%u)\n",
+			hi->data_state);
+		spin_unlock(&hi->lock);
+		return;
+	}
+	hi->data_state |= SSI_CHANNEL_STATE_POLL;
+	spin_unlock(&hi->lock);
+
+	rxmsg = hi->data_rx_msg;
+	/* zero-length transfer: length 0 and no scatterlist entries */
+	sg_init_one(rxmsg->sgt.sgl, (void *)hi->mmap_base, 0);
+	rxmsg->sgt.nents = 0;
+	rxmsg->complete = cs_hsi_peek_on_data_complete;
+
+	ret = hsi_async_read(hi->cl, rxmsg);
+	if (ret)
+		cs_hsi_data_read_error(hi, rxmsg);
+}
+
+/*
+ * Completion callback for a TX data write.
+ *
+ * Any status other than HSI_STATUS_COMPLETED is routed to
+ * cs_hsi_data_write_error().  On success the WRITING flag is dropped
+ * under hi->lock and a sleeper on hi->datawait, if any, is woken.
+ */
+static void cs_hsi_write_on_data_complete(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+
+	if (msg->status != HSI_STATUS_COMPLETED) {
+		cs_hsi_data_write_error(hi, msg);
+		return;
+	}
+
+	spin_lock(&hi->lock);
+	hi->data_state &= ~SSI_CHANNEL_STATE_WRITING;
+	if (unlikely(waitqueue_active(&hi->datawait)))
+		wake_up_interruptible(&hi->datawait);
+	spin_unlock(&hi->lock);
+}
+
+/*
+ * Start an asynchronous write of TX buffer @slot on the data channel.
+ *
+ * @hi:   interface state
+ * @slot: index of the TX buffer to send; it originates from the parameter
+ *        bits of a command written by user space, so it is validated
+ *        against the configured number of TX buffers before use.
+ *
+ * Returns 0 once the write has been queued, -EINVAL if the interface is
+ * not in CS_STATE_CONFIGURED or @slot is out of range, -EIO if the data
+ * channel is in error state, -EBUSY if a write is already pending, or the
+ * error returned by hsi_async_write().
+ */
+static int cs_hsi_write_on_data(struct cs_hsi_iface *hi, unsigned int slot)
+{
+	u32 *address;
+	struct hsi_msg *txmsg;
+	int ret;
+
+	spin_lock(&hi->lock);
+	if (hi->iface_state != CS_STATE_CONFIGURED) {
+		dev_err(&hi->cl->device, "Not configured, aborting\n");
+		ret = -EINVAL;
+		goto error;
+	}
+	/* never index tx_offsets[] with an unchecked user-supplied slot */
+	if (slot >= hi->tx_bufs) {
+		dev_err(&hi->cl->device, "Invalid TX slot %u, aborting\n",
+			slot);
+		ret = -EINVAL;
+		goto error;
+	}
+	if (hi->data_state & SSI_CHANNEL_STATE_ERROR) {
+		dev_err(&hi->cl->device, "HSI error, aborting\n");
+		ret = -EIO;
+		goto error;
+	}
+	if (hi->data_state & SSI_CHANNEL_STATE_WRITING) {
+		dev_err(&hi->cl->device, "Write pending on data channel.\n");
+		ret = -EBUSY;
+		goto error;
+	}
+	hi->data_state |= SSI_CHANNEL_STATE_WRITING;
+	spin_unlock(&hi->lock);
+
+	hi->tx_slot = slot;
+	address = (u32 *)(hi->mmap_base + hi->tx_offsets[hi->tx_slot]);
+	txmsg = hi->data_tx_msg;
+	sg_init_one(txmsg->sgt.sgl, address, hi->buf_size);
+	txmsg->complete = cs_hsi_write_on_data_complete;
+	ret = hsi_async_write(hi->cl, txmsg);
+	if (ret)
+		cs_hsi_data_write_error(hi, txmsg);
+
+	return ret;
+
+error:
+	spin_unlock(&hi->lock);
+	/* only the channel-error case is escalated to the write error path */
+	if (ret == -EIO)
+		cs_hsi_data_write_error(hi, hi->data_tx_msg);
+
+	return ret;
+}
+
+/* Return the current interface state (hi->iface_state), read without locking. */
+static unsigned int cs_hsi_get_state(struct cs_hsi_iface *hi)
+{
+	return hi->iface_state;
+}
+
+/*
+ * Dispatch a 32-bit command word written by user space.
+ *
+ * Commands whose target bits equal TARGET_REMOTE are sent on the control
+ * channel.  For TARGET_LOCAL only CS_TX_DATA_READY is accepted; its
+ * parameter bits select the TX slot to write on the data channel.
+ * Anything else yields -EINVAL.  Bottom halves are disabled around the
+ * dispatch.
+ */
+static int cs_hsi_command(struct cs_hsi_iface *hi, u32 cmd)
+{
+	const u32 target = cmd & TARGET_MASK;
+	int status;
+
+	local_bh_disable();
+	if (target == TARGET_REMOTE)
+		status = cs_hsi_write_on_control(hi, cmd);
+	else if (target == TARGET_LOCAL &&
+		 (cmd & CS_CMD_MASK) == CS_TX_DATA_READY)
+		status = cs_hsi_write_on_data(hi, cmd & CS_PARAM_MASK);
+	else
+		status = -EINVAL;
+	local_bh_enable();
+
+	return status;
+}
+
+/*
+ * Record the requested wake line state and, only if it differs from the
+ * stored one, start (non-zero @new_state) or stop (zero) transmission on
+ * the master client via ssip_slave_start_tx()/ssip_slave_stop_tx().
+ * The state is updated under hi->lock; the ssip call is made after the
+ * lock has been dropped.
+ */
+static void cs_hsi_set_wakeline(struct cs_hsi_iface *hi,
+				unsigned int new_state)
+{
+	int change = 0;
+
+	spin_lock_bh(&hi->lock);
+	if (hi->wakeline_state != new_state) {
+		hi->wakeline_state = new_state;
+		change = 1;
+		dev_dbg(&hi->cl->device, "setting wake line to %d (%p)\n",
+			new_state, hi->cl);
+	}
+	spin_unlock_bh(&hi->lock);
+
+	if (change) {
+		if (new_state)
+			ssip_slave_start_tx(hi->master);
+		else
+			ssip_slave_stop_tx(hi->master);
+	}
+
+	dev_dbg(&hi->cl->device, "wake line set to %d (%p)\n",
+		new_state, hi->cl);
+}
+
+/*
+ * Store the RX/TX buffer counts in the interface state and in the mmap
+ * config block, and derive the wrap-around boundary of the rx pointer.
+ */
+static void set_buffer_sizes(struct cs_hsi_iface *hi, int rx_bufs, int tx_bufs)
+{
+	hi->rx_bufs = rx_bufs;
+	hi->tx_bufs = tx_bufs;
+	hi->mmap_cfg->rx_bufs = rx_bufs;
+	hi->mmap_cfg->tx_bufs = tx_bufs;
+
+	if (hi->flags & CS_FEAT_ROLLING_RX_COUNTER) {
+		/*
+		 * For more robust overrun detection, let the rx
+		 * pointer run in range 0..'boundary-1'. Boundary
+		 * is a multiple of rx_bufs, and limited in max size
+		 * by RX_PTR_BOUNDARY_SHIFT to allow for fast ptr-diff
+		 * calculation.
+		 */
+		hi->rx_ptr_boundary = (rx_bufs << RX_PTR_BOUNDARY_SHIFT);
+		hi->mmap_cfg->rx_ptr_boundary = hi->rx_ptr_boundary;
+	} else {
+		/*
+		 * Without the rolling counter the rx pointer wraps at
+		 * rx_bufs; mmap_cfg->rx_ptr_boundary is not written here.
+		 */
+		hi->rx_ptr_boundary = hi->rx_bufs;
+	}
+}
+
+/*
+ * Validate a buffer configuration requested by user space.
+ *
+ * Returns 0 if the configuration fits, -EINVAL if more than
+ * CS_MAX_BUFFERS RX or TX buffers are requested, and -ENOBUFS if the
+ * cache-aligned buffers plus the cache-aligned config block do not fit
+ * into the mmap area.
+ */
+static int check_buf_params(struct cs_hsi_iface *hi,
+					const struct cs_buffer_config *buf_cfg)
+{
+	size_t ctrl_size_aligned = L1_CACHE_ALIGN(sizeof(*hi->mmap_cfg));
+	size_t buf_size_aligned;
+
+	if (buf_cfg->rx_bufs > CS_MAX_BUFFERS ||
+					buf_cfg->tx_bufs > CS_MAX_BUFFERS)
+		return -EINVAL;
+
+	/*
+	 * buf_size is an unchecked 32-bit value from user space.  Reject
+	 * anything that cannot fit on its own first: otherwise the
+	 * alignment round-up or the multiplication below could wrap around
+	 * and make an oversized request look small enough.  After this
+	 * check the product is bounded by roughly mmap_size times twice
+	 * CS_MAX_BUFFERS.
+	 */
+	if (buf_cfg->buf_size >= hi->mmap_size)
+		goto no_space;
+
+	buf_size_aligned = (size_t)L1_CACHE_ALIGN(buf_cfg->buf_size) *
+					(buf_cfg->rx_bufs + buf_cfg->tx_bufs);
+	if ((buf_size_aligned + ctrl_size_aligned) >= hi->mmap_size)
+		goto no_space;
+
+	return 0;
+
+no_space:
+	dev_err(&hi->cl->device, "No space for the requested buffer "
+		"configuration\n");
+	return -ENOBUFS;
+}
+
+/**
+ * Block until pending data transfers have completed.
+ */
+/*
+ * Sleeps on hi->datawait until neither a read nor a write is active on
+ * the data channel (see cs_state_xfer_active()).
+ *
+ * Returns 0 when the channel is quiet, -ERESTARTSYS if a signal is
+ * pending, or -EIO if one wait of CS_HSI_TRANSFER_TIMEOUT_MS expires.
+ */
+static int cs_hsi_data_sync(struct cs_hsi_iface *hi)
+{
+	int r = 0;
+
+	spin_lock_bh(&hi->lock);
+
+	if (!cs_state_xfer_active(hi->data_state)) {
+		dev_dbg(&hi->cl->device, "hsi_data_sync break, idle\n");
+		goto out;
+	}
+
+	for (;;) {
+		int s;
+		DEFINE_WAIT(wait);
+		if (!cs_state_xfer_active(hi->data_state))
+			goto out;
+		if (signal_pending(current)) {
+			r = -ERESTARTSYS;
+			goto out;
+		}
+		/**
+		 * prepare_to_wait must be called with hi->lock held
+		 * so that callbacks can check for waitqueue_active()
+		 */
+		prepare_to_wait(&hi->datawait, &wait, TASK_INTERRUPTIBLE);
+		spin_unlock_bh(&hi->lock);
+		/* s == 0 means the full timeout elapsed without a wake-up */
+		s = schedule_timeout(
+			msecs_to_jiffies(CS_HSI_TRANSFER_TIMEOUT_MS));
+		spin_lock_bh(&hi->lock);
+		finish_wait(&hi->datawait, &wait);
+		if (!s) {
+			dev_dbg(&hi->cl->device,
+				"hsi_data_sync timeout after %d ms\n",
+				CS_HSI_TRANSFER_TIMEOUT_MS);
+			r = -EIO;
+			goto out;
+		}
+	}
+
+out:
+	spin_unlock_bh(&hi->lock);
+	dev_dbg(&hi->cl->device, "hsi_data_sync done with res %d\n", r);
+
+	return r;
+}
+
+/*
+ * Lay out the RX and TX buffers in the mmap area and mark the interface
+ * as configured.
+ *
+ * The buffers follow the cache-aligned config block: first all RX slots,
+ * then all TX slots, each slot being hi->buf_size rounded up to the L1
+ * cache line size.  The offsets are stored both in the interface state
+ * and in the mmap config block.  hi->buf_size must be non-zero.
+ */
+static void cs_hsi_data_enable(struct cs_hsi_iface *hi,
+					struct cs_buffer_config *buf_cfg)
+{
+	unsigned int data_start, i;
+
+	BUG_ON(hi->buf_size == 0);
+
+	set_buffer_sizes(hi, buf_cfg->rx_bufs, buf_cfg->tx_bufs);
+
+	hi->slot_size = L1_CACHE_ALIGN(hi->buf_size);
+	dev_dbg(&hi->cl->device,
+			"setting slot size to %u, buf size %u, align %u\n",
+			hi->slot_size, hi->buf_size, L1_CACHE_BYTES);
+
+	data_start = L1_CACHE_ALIGN(sizeof(*hi->mmap_cfg));
+	/* sizeof yields size_t, which needs %zu rather than %u */
+	dev_dbg(&hi->cl->device,
+			"setting data start at %u, cfg block %zu, align %u\n",
+			data_start, sizeof(*hi->mmap_cfg), L1_CACHE_BYTES);
+
+	for (i = 0; i < hi->mmap_cfg->rx_bufs; i++) {
+		hi->rx_offsets[i] = data_start + i * hi->slot_size;
+		hi->mmap_cfg->rx_offsets[i] = hi->rx_offsets[i];
+		dev_dbg(&hi->cl->device, "DL buf #%u at %u\n",
+					i, hi->rx_offsets[i]);
+	}
+	for (i = 0; i < hi->mmap_cfg->tx_bufs; i++) {
+		/* TX slots start right after the last RX slot */
+		hi->tx_offsets[i] = data_start +
+			(i + hi->mmap_cfg->rx_bufs) * hi->slot_size;
+		hi->mmap_cfg->tx_offsets[i] = hi->tx_offsets[i];
+		dev_dbg(&hi->cl->device, "UL buf #%u at %u\n",
+					i, hi->tx_offsets[i]);
+	}
+
+	hi->iface_state = CS_STATE_CONFIGURED;
+}
+
+/*
+ * Drop the interface back to CS_STATE_OPENED, but only if it was in
+ * CS_STATE_CONFIGURED before; any other previous state is left alone.
+ */
+static void cs_hsi_data_disable(struct cs_hsi_iface *hi, int old_state)
+{
+	if (old_state != CS_STATE_CONFIGURED)
+		return;
+
+	dev_dbg(&hi->cl->device,
+		"closing data channel with slot size 0\n");
+	hi->iface_state = CS_STATE_OPENED;
+}
+
+/*
+ * Apply a new buffer configuration (CS_CONFIG_BUFS ioctl).
+ *
+ * Temporarily leaves CS_STATE_CONFIGURED, waits for active data
+ * transfers to finish, validates the request, and then either enables
+ * the data path (non-zero buf_size) or disables it (zero buf_size).
+ * When the interface state changed, the PM QoS request is added or
+ * removed accordingly and, on enable, data reception is armed.
+ *
+ * Returns 0 on success, or a negative error from cs_hsi_data_sync() or
+ * check_buf_params().
+ */
+static int cs_hsi_buf_config(struct cs_hsi_iface *hi,
+					struct cs_buffer_config *buf_cfg)
+{
+	int r = 0;
+	/* NOTE(review): sampled before hi->lock is taken; confirm this is benign */
+	unsigned int old_state = hi->iface_state;
+
+	spin_lock_bh(&hi->lock);
+	/* Prevent new transactions during buffer reconfig */
+	if (old_state == CS_STATE_CONFIGURED)
+		hi->iface_state = CS_STATE_OPENED;
+	spin_unlock_bh(&hi->lock);
+
+	/*
+	 * make sure that no non-zero data reads are ongoing before
+	 * proceeding to change the buffer layout
+	 */
+	r = cs_hsi_data_sync(hi);
+	if (r < 0)
+		return r;
+
+	WARN_ON(cs_state_xfer_active(hi->data_state));
+
+	spin_lock_bh(&hi->lock);
+	r = check_buf_params(hi, buf_cfg);
+	if (r < 0)
+		goto error;
+
+	hi->buf_size = buf_cfg->buf_size;
+	hi->mmap_cfg->buf_size = hi->buf_size;
+	hi->flags = buf_cfg->flags;
+
+	hi->rx_slot = 0;
+	hi->tx_slot = 0;
+	hi->slot_size = 0;
+
+	if (hi->buf_size)
+		cs_hsi_data_enable(hi, buf_cfg);
+	else
+		cs_hsi_data_disable(hi, old_state);
+
+	spin_unlock_bh(&hi->lock);
+
+	if (old_state != hi->iface_state) {
+		if (hi->iface_state == CS_STATE_CONFIGURED) {
+			pm_qos_add_request(&hi->pm_qos_req,
+				PM_QOS_CPU_DMA_LATENCY,
+				CS_QOS_LATENCY_FOR_DATA_USEC);
+			local_bh_disable();
+			cs_hsi_read_on_data(hi);
+			local_bh_enable();
+		} else if (old_state == CS_STATE_CONFIGURED) {
+			pm_qos_remove_request(&hi->pm_qos_req);
+		}
+	}
+	return r;
+
+error:
+	spin_unlock_bh(&hi->lock);
+	return r;
+}
+
+/*
+ * Allocate and bring up the HSI interface state for an opened device.
+ *
+ * @hi:        out parameter, set to the new interface on success only
+ * @cl:        HSI client to bind to
+ * @mmap_base: kernel address of the shared page; its config block is
+ *             zeroed here
+ * @mmap_size: size of the shared area in bytes
+ *
+ * Allocates the command and data messages, claims the HSI port, obtains
+ * the master client and arms the first control read.
+ *
+ * Returns 0 on success or a negative errno; on failure everything
+ * acquired so far is released and *hi is left untouched.
+ */
+static int cs_hsi_start(struct cs_hsi_iface **hi, struct hsi_client *cl,
+			unsigned long mmap_base, unsigned long mmap_size)
+{
+	int err = 0;
+	struct cs_hsi_iface *hsi_if = kzalloc(sizeof(*hsi_if), GFP_KERNEL);
+
+	dev_dbg(&cl->device, "cs_hsi_start\n");
+
+	if (!hsi_if) {
+		err = -ENOMEM;
+		goto leave0;
+	}
+	spin_lock_init(&hsi_if->lock);
+	hsi_if->cl = cl;
+	hsi_if->iface_state = CS_STATE_CLOSED;
+	hsi_if->mmap_cfg = (struct cs_mmap_config_block *)mmap_base;
+	hsi_if->mmap_base = mmap_base;
+	hsi_if->mmap_size = mmap_size;
+	memset(hsi_if->mmap_cfg, 0, sizeof(*hsi_if->mmap_cfg));
+	init_waitqueue_head(&hsi_if->datawait);
+	err = cs_alloc_cmds(hsi_if);
+	if (err < 0) {
+		dev_err(&cl->device, "Unable to alloc HSI messages\n");
+		goto leave1;
+	}
+	err = cs_hsi_alloc_data(hsi_if);
+	if (err < 0) {
+		dev_err(&cl->device, "Unable to alloc HSI messages for data\n");
+		goto leave2;
+	}
+	err = hsi_claim_port(cl, 1);
+	if (err < 0) {
+		dev_err(&cl->device,
+				"Could not open, HSI port already claimed\n");
+		goto leave3;
+	}
+	hsi_if->master = ssip_slave_get_master(cl);
+	if (IS_ERR(hsi_if->master)) {
+		/*
+		 * err still holds the 0 from hsi_claim_port(); without
+		 * this the caller would see success although hsi_if has
+		 * been freed and *hi was never set.
+		 */
+		err = PTR_ERR(hsi_if->master);
+		dev_err(&cl->device, "Could not get HSI master client\n");
+		goto leave4;
+	}
+	hsi_if->iface_state = CS_STATE_OPENED;
+	local_bh_disable();
+	cs_hsi_read_on_control(hsi_if);
+	local_bh_enable();
+
+	dev_dbg(&cl->device, "cs_hsi_start...done\n");
+
+	BUG_ON(!hi);
+	*hi = hsi_if;
+
+	return 0;
+
+	/* unwind in reverse order of acquisition */
+leave4:
+	hsi_release_port(cl);
+leave3:
+	cs_hsi_free_data(hsi_if);
+leave2:
+	cs_free_cmds(hsi_if);
+leave1:
+	kfree(hsi_if);
+leave0:
+	dev_dbg(&cl->device, "cs_hsi_start...done/error\n\n");
+
+	return err;
+}
+
+/*
+ * Tear down an interface created by cs_hsi_start(): lower the wake line,
+ * drop the master client, release the HSI port, remove an active PM QoS
+ * request, free the data and command messages and finally free @hi.
+ * @hi must not be used after this returns.
+ */
+static void cs_hsi_stop(struct cs_hsi_iface *hi)
+{
+	dev_dbg(&hi->cl->device, "cs_hsi_stop\n");
+	cs_hsi_set_wakeline(hi, 0);
+	ssip_slave_put_master(hi->master);
+
+	/* hsi_release_port() needs to be called with CS_STATE_CLOSED */
+	hi->iface_state = CS_STATE_CLOSED;
+	hsi_release_port(hi->cl);
+
+	/*
+	 * hsi_release_port() should flush out all the pending
+	 * messages, so cs_state_idle() should be true for both
+	 * control and data channels.
+	 */
+	WARN_ON(!cs_state_idle(hi->control_state));
+	WARN_ON(!cs_state_idle(hi->data_state));
+
+	if (pm_qos_request_active(&hi->pm_qos_req))
+		pm_qos_remove_request(&hi->pm_qos_req);
+
+	spin_lock_bh(&hi->lock);
+	cs_hsi_free_data(hi);
+	cs_free_cmds(hi);
+	spin_unlock_bh(&hi->lock);
+	kfree(hi);
+}
+
+/*
+ * Page fault handler for the device mapping: always resolves the fault
+ * to the single shared page at csdata->mmap_base, taking a reference on
+ * it before handing it to the fault machinery.
+ */
+static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct cs_char *csdata = vma->vm_private_data;
+	struct page *page;
+
+	page = virt_to_page(csdata->mmap_base);
+	get_page(page);
+	vmf->page = page;
+
+	return 0;
+}
+
+/* VMA operations installed by cs_char_mmap(); only the fault hook is set. */
+static struct vm_operations_struct cs_char_vm_ops = {
+	.fault	= cs_char_vma_fault,
+};
+
+/*
+ * fasync file operation: (un)register @file on the device's async
+ * notification queue.  Any failure of fasync_helper() is reported as
+ * -EIO; every non-negative result is reported as 0.
+ */
+static int cs_char_fasync(int fd, struct file *file, int on)
+{
+	struct cs_char *csdata = file->private_data;
+	int rc = fasync_helper(fd, file, on, &csdata->async_queue);
+
+	return (rc < 0) ? -EIO : 0;
+}
+
+/*
+ * poll file operation: the device is readable (POLLIN | POLLRDNORM)
+ * whenever the control queue or the data indication queue holds at
+ * least one entry; otherwise no event bits are reported.
+ */
+static unsigned int cs_char_poll(struct file *file, poll_table *wait)
+{
+	struct cs_char *csdata = file->private_data;
+	unsigned int mask = 0;
+	int have_event;
+
+	poll_wait(file, &cs_char_data.wait, wait);
+
+	spin_lock_bh(&csdata->lock);
+	have_event = !list_empty(&csdata->chardev_queue) ||
+		     !list_empty(&csdata->dataind_queue);
+	spin_unlock_bh(&csdata->lock);
+
+	if (have_event)
+		mask = POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * read file operation: deliver one 32-bit event word to user space.
+ *
+ * Entries on the control queue are served before entries on the data
+ * indication queue.  If both are empty the call fails with -EAGAIN for
+ * O_NONBLOCK files, returns -ERESTARTSYS if a signal is pending, and
+ * otherwise sleeps on csdata->wait and retries.
+ *
+ * Returns sizeof(u32) on success, -EINVAL if @count is smaller than
+ * that, or the error from put_user().
+ */
+static ssize_t cs_char_read(struct file *file, char __user *buf, size_t count,
+								loff_t *unused)
+{
+	struct cs_char *csdata = file->private_data;
+	u32 data;
+	ssize_t retval;
+
+	if (count < sizeof(data))
+		return -EINVAL;
+
+	for ( ; ; ) {
+		DEFINE_WAIT(wait);
+
+		spin_lock_bh(&csdata->lock);
+		if (!list_empty(&csdata->chardev_queue)) {
+			data = cs_pop_entry(&csdata->chardev_queue);
+		} else if (!list_empty(&csdata->dataind_queue)) {
+			data = cs_pop_entry(&csdata->dataind_queue);
+			--csdata->dataind_pending;
+
+		} else {
+			data = 0;
+		}
+		spin_unlock_bh(&csdata->lock);
+
+		/* a zero word doubles as the "nothing queued" marker */
+		if (data)
+			break;
+		if (file->f_flags & O_NONBLOCK) {
+			retval = -EAGAIN;
+			goto out;
+		} else if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			goto out;
+		}
+		/*
+		 * NOTE(review): the queues are checked before the task is
+		 * put on the wait queue; confirm a wake-up arriving in
+		 * between cannot be missed.
+		 */
+		prepare_to_wait_exclusive(&csdata->wait, &wait,
+						TASK_INTERRUPTIBLE);
+		schedule();
+		finish_wait(&csdata->wait, &wait);
+	}
+
+	retval = put_user(data, (u32 __user *)buf);
+	if (!retval)
+		retval = sizeof(data);
+
+out:
+	return retval;
+}
+
+/*
+ * write file operation: take one 32-bit command word from user space
+ * and hand it to cs_hsi_command().
+ *
+ * Returns @count on success, -EINVAL if @count is smaller than a
+ * command word, -EFAULT if the word cannot be read from @buf, or the
+ * negative error returned by cs_hsi_command().
+ */
+static ssize_t cs_char_write(struct file *file, const char __user *buf,
+						size_t count, loff_t *unused)
+{
+	struct cs_char *csdata = file->private_data;
+	u32 data;
+	int err;
+
+	if (count < sizeof(data))
+		return -EINVAL;
+
+	/* bail out before 'data' is used: it is uninitialised on a fault */
+	if (get_user(data, (u32 __user *)buf))
+		return -EFAULT;
+
+	err = cs_hsi_command(csdata->hi, data);
+	if (err < 0)
+		return err;
+
+	return count;
+}
+
+/*
+ * ioctl file operation.
+ *
+ * CS_GET_STATE       copies the interface state to user space
+ * CS_SET_WAKELINE    reads an unsigned int and sets the wake line
+ * CS_GET_IF_VERSION  copies CS_IF_VERSION to user space
+ * CS_CONFIG_BUFS     reads a struct cs_buffer_config and applies it
+ *
+ * Returns 0 on success, -EFAULT if a user copy fails, -ENOTTY for an
+ * unknown command, or the result of cs_hsi_buf_config().
+ */
+static long cs_char_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	struct cs_char *csdata = file->private_data;
+	int r = 0;
+
+	switch (cmd) {
+	case CS_GET_STATE: {
+		unsigned int state;
+
+		state = cs_hsi_get_state(csdata->hi);
+		if (copy_to_user((void __user *)arg, &state, sizeof(state)))
+			r = -EFAULT;
+	}
+		break;
+	case CS_SET_WAKELINE: {
+		unsigned int state;
+
+		if (copy_from_user(&state, (void __user *)arg, sizeof(state)))
+			r = -EFAULT;
+		else
+			cs_hsi_set_wakeline(csdata->hi, state);
+	}
+		break;
+	case CS_GET_IF_VERSION: {
+		unsigned int ifver = CS_IF_VERSION;
+
+		if (copy_to_user((void __user *)arg, &ifver, sizeof(ifver)))
+			r = -EFAULT;
+		break;
+	}
+	case CS_CONFIG_BUFS: {
+		struct cs_buffer_config buf_cfg;
+
+		if (copy_from_user(&buf_cfg, (void __user *)arg,
+							sizeof(buf_cfg)))
+			r = -EFAULT;
+		else
+			r = cs_hsi_buf_config(csdata->hi, &buf_cfg);
+		break;
+	}
+	default:
+		r = -ENOTTY;
+		break;
+	}
+
+	return r;
+}
+
+/*
+ * mmap file operation: accepts only mappings that are exactly one page
+ * long (-EINVAL otherwise) and installs cs_char_vm_ops so that faults
+ * are served by cs_char_vma_fault().
+ */
+static int cs_char_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+
+	if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) != 1)
+		return -EINVAL;
+
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &cs_char_vm_ops;
+	vma->vm_private_data = file->private_data;
+
+	return 0;
+}
+
+/*
+ * open file operation: the device is single-open.  Allocates the zeroed
+ * shared page, marks the device as opened and starts the HSI interface.
+ *
+ * Returns 0 on success, -EBUSY if already opened, -ENOMEM if the page
+ * cannot be allocated, or the error from cs_hsi_start().
+ *
+ * NOTE(review): when cs_hsi_start() fails the page is freed but
+ * cs_char_data.opened stays 1 and mmap_base keeps the stale address.
+ */
+static int cs_char_open(struct inode *unused, struct file *file)
+{
+	int ret = 0;
+
+	spin_lock_bh(&cs_char_data.lock);
+	if (cs_char_data.opened) {
+		ret = -EBUSY;
+		spin_unlock_bh(&cs_char_data.lock);
+		goto out;
+	}
+	cs_char_data.mmap_base = get_zeroed_page(GFP_ATOMIC);
+	if (!cs_char_data.mmap_base) {
+		dev_err(&cs_char_data.cl->device,
+					"Shared memory allocation failed.\n");
+		ret = -ENOMEM;
+		spin_unlock_bh(&cs_char_data.lock);
+		goto out;
+	}
+	cs_char_data.mmap_size = CS_MMAP_SIZE;
+	cs_char_data.dataind_pending = 0;
+	cs_char_data.opened = 1;
+	file->private_data = &cs_char_data;
+	spin_unlock_bh(&cs_char_data.lock);
+
+	BUG_ON(cs_char_data.hi);
+
+	ret = cs_hsi_start(&cs_char_data.hi, cs_char_data.cl,
+				cs_char_data.mmap_base, cs_char_data.mmap_size);
+	if (ret) {
+		dev_err(&cs_char_data.cl->device, "Unable to initialize HSI\n");
+		free_page(cs_char_data.mmap_base);
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Unlink and free every struct char_queue entry on @head, leaving the
+ * list empty.  An already empty list is left untouched.
+ */
+static void cs_free_char_queue(struct list_head *head)
+{
+	struct char_queue *item, *tmp;
+
+	/* the _safe variant is needed because each item is freed in the loop */
+	list_for_each_entry_safe(item, tmp, head, list) {
+		list_del(&item->list);
+		kfree(item);
+	}
+}
+
+/*
+ * release file operation: stops the HSI interface, then, under
+ * csdata->lock, frees the shared page, discards both event queues and
+ * marks the device as closed so it can be opened again.
+ */
+static int cs_char_release(struct inode *unused, struct file *file)
+{
+	struct cs_char *csdata = file->private_data;
+
+	cs_hsi_stop(csdata->hi);
+	spin_lock_bh(&csdata->lock);
+	csdata->hi = NULL;
+	free_page(csdata->mmap_base);
+	cs_free_char_queue(&csdata->chardev_queue);
+	cs_free_char_queue(&csdata->dataind_queue);
+	csdata->opened = 0;
+	spin_unlock_bh(&csdata->lock);
+
+	return 0;
+}
+
+/* File operations of the character device registered via cs_char_miscdev. */
+static const struct file_operations cs_char_fops = {
+	.owner		= THIS_MODULE,
+	.read		= cs_char_read,
+	.write		= cs_char_write,
+	.poll		= cs_char_poll,
+	.unlocked_ioctl	= cs_char_ioctl,
+	.mmap		= cs_char_mmap,
+	.open		= cs_char_open,
+	.release	= cs_char_release,
+	.fasync		= cs_char_fasync,
+};
+
+/* Misc device with a dynamically assigned minor, registered in probe. */
+static struct miscdevice cs_char_miscdev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= DRIVER_NAME,
+	.fops	= &cs_char_fops
+};
+
+/*
+ * Driver probe: initialise the global cs_char_data state (wait queue,
+ * lock, event queues, client pointer) and register the misc device.
+ * Returns 0 or the error from misc_register().
+ */
+static int __init cs_hsi_client_probe(struct device *dev)
+{
+	int err = 0;
+	struct hsi_client *cl = to_hsi_client(dev);
+
+	dev_dbg(dev, "hsi_client_probe\n");
+	init_waitqueue_head(&cs_char_data.wait);
+	spin_lock_init(&cs_char_data.lock);
+	cs_char_data.opened = 0;
+	cs_char_data.cl = cl;
+	cs_char_data.hi = NULL;
+	INIT_LIST_HEAD(&cs_char_data.chardev_queue);
+	INIT_LIST_HEAD(&cs_char_data.dataind_queue);
+
+	err = misc_register(&cs_char_miscdev);
+	if (err)
+		dev_err(dev, "Failed to register\n");
+
+	return err;
+}
+
+/*
+ * Driver remove: deregister the misc device and, if an interface is
+ * still active, detach it from cs_char_data under the lock and stop it.
+ */
+static int __exit cs_hsi_client_remove(struct device *dev)
+{
+	struct cs_hsi_iface *hi;
+
+	dev_dbg(dev, "hsi_client_remove\n");
+	misc_deregister(&cs_char_miscdev);
+	spin_lock_bh(&cs_char_data.lock);
+	hi = cs_char_data.hi;
+	cs_char_data.hi = NULL;
+	spin_unlock_bh(&cs_char_data.lock);
+	/* cs_hsi_stop() is called outside the lock */
+	if (hi)
+		cs_hsi_stop(hi);
+
+	return 0;
+}
+
+/* HSI client driver description registered by cs_char_init(). */
+static struct hsi_client_driver cs_hsi_driver = {
+	.driver = {
+		.name	= DRIVER_NAME,
+		.owner	= THIS_MODULE,
+		.probe	= cs_hsi_client_probe,
+		.remove	= cs_hsi_client_remove,
+	},
+};
+
+/* Module entry point: register the HSI client driver, logging any failure. */
+static int __init cs_char_init(void)
+{
+	int err = 0;
+
+	err = hsi_register_client_driver(&cs_hsi_driver);
+	if (err)
+		pr_err(DRIVER_NAME ": Error when registering driver %d\n", err);
+
+	return err;
+}
+module_init(cs_char_init);
+
+/* Module exit point: unregister the HSI client driver. */
+static void __exit cs_char_exit(void)
+{
+	hsi_unregister_client_driver(&cs_hsi_driver);
+}
+module_exit(cs_char_exit);
+
+MODULE_ALIAS("hsi:cmt_speech");
+MODULE_AUTHOR("Kai Vehmanen <kai.vehmanen@nokia.com>");
+MODULE_AUTHOR("Peter Ujfalusi <peter.ujfalusi@nokia.com>");
+MODULE_DESCRIPTION("CMT speech driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/cs-protocol.h b/include/linux/cs-protocol.h
new file mode 100644
index 0000000..0d3d584
--- /dev/null
+++ b/include/linux/cs-protocol.h
@@ -0,0 +1,116 @@
+/*
+ * include/linux/cs-protocol.h - cmt_speech interface definitions
+ *
+ * Implemented by:
+ * - drivers/misc/cmt-speech/
+ *
+ * Copyright (C) 2008,2009,2010 Nokia Corporation. All rights reserved.
+ *
+ * Contact: Kai Vehmanen <kai.vehmanen@nokia.com>
+ * Original author: Peter Ujfalusi <peter.ujfalusi@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#ifndef _CS_PROTOCOL_H
+#define _CS_PROTOCOL_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/* chardev parameters */
+#define CS_DEV_FILE_NAME		"/dev/cmt_speech"
+
+/* user-space API versioning */
+#define CS_IF_VERSION			2
+
+/* APE kernel <-> user space messages */
+#define CS_CMD_SHIFT			28
+#define CS_DOMAIN_SHIFT			24
+
+#define CS_CMD_MASK			0xff000000
+#define CS_PARAM_MASK			0xffffff
+
+#define CS_CMD(id, dom) \
+	(((id) << CS_CMD_SHIFT) | ((dom) << CS_DOMAIN_SHIFT))
+
+#define CS_ERROR			CS_CMD(1, 0)
+#define CS_RX_DATA_RECEIVED		CS_CMD(2, 0)
+#define CS_TX_DATA_READY		CS_CMD(3, 0)
+#define CS_TX_DATA_SENT			CS_CMD(4, 0)
+
+/* params to CS_ERROR indication */
+#define CS_ERR_PEER_RESET		0
+
+/* ioctl interface */
+
+/* parameters to CS_CONFIG_BUFS ioctl */
+#define CS_FEAT_TSTAMP_RX_CTRL		(1 << 0)
+#define CS_FEAT_ROLLING_RX_COUNTER	(2 << 0)
+
+/* parameters to CS_GET_STATE ioctl */
+#define CS_STATE_CLOSED			0
+#define CS_STATE_OPENED			1 /* resource allocated */
+#define CS_STATE_CONFIGURED		2 /* data path active */
+
+/* maximum number of TX/RX buffers */
+#define CS_MAX_BUFFERS_SHIFT		4
+#define CS_MAX_BUFFERS			(1 << CS_MAX_BUFFERS_SHIFT)
+
+/* Parameters for setting up the data buffers */
+/* Argument of the CS_CONFIG_BUFS ioctl. */
+struct cs_buffer_config {
+	__u32 rx_bufs;	/* number of RX buffer slots */
+	__u32 tx_bufs;	/* number of TX buffer slots */
+	__u32 buf_size;	/* bytes */
+	__u32 flags;	/* see CS_FEAT_* */
+	__u32 reserved[4];
+};
+
+/*
+ * Struct describing the layout and contents of the driver mmap area.
+ * This information is meant as read-only information for the application.
+ */
+struct cs_mmap_config_block {
+	__u32 reserved1;
+	__u32 buf_size;		/* 0=disabled, otherwise the transfer size */
+	__u32 rx_bufs;		/* # of RX buffers */
+	__u32 tx_bufs;		/* # of TX buffers */
+	__u32 reserved2;
+	/* array of offsets within the mmap area for each RX and TX buffer */
+	__u32 rx_offsets[CS_MAX_BUFFERS];
+	__u32 tx_offsets[CS_MAX_BUFFERS];
+	/* RX slot counter, updated by the driver after each received frame */
+	__u32 rx_ptr;
+	/* value at which rx_ptr wraps to 0 (CS_FEAT_ROLLING_RX_COUNTER) */
+	__u32 rx_ptr_boundary;
+	__u32 reserved3[2];
+	/*
+	 * if enabled with CS_FEAT_TSTAMP_RX_CTRL, monotonic
+	 * timestamp taken when the last control command was received
+	 */
+	struct timespec tstamp_rx_ctrl;
+};
+
+#define CS_IO_MAGIC		'C'
+
+#define CS_IOW(num, dtype)	_IOW(CS_IO_MAGIC, num, dtype)
+#define CS_IOR(num, dtype)	_IOR(CS_IO_MAGIC, num, dtype)
+#define CS_IOWR(num, dtype)	_IOWR(CS_IO_MAGIC, num, dtype)
+#define CS_IO(num)		_IO(CS_IO_MAGIC, num)
+
+#define CS_GET_STATE		CS_IOR(21, unsigned int)
+#define CS_SET_WAKELINE		CS_IOW(23, unsigned int)
+#define CS_GET_IF_VERSION	CS_IOR(30, unsigned int)
+#define CS_CONFIG_BUFS		CS_IOW(31, struct cs_buffer_config)
+
+#endif /* _CS_PROTOCOL_H */
-- 
2.1.4

^ permalink raw reply related

* [PATCH 2/9] HSI: cmt_speech: Avoid GFP_ATOMIC in cs_char_open
From: Sebastian Reichel @ 2015-03-02  4:38 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api, Kai Vehmanen, Joni Lapilainen
In-Reply-To: <1425271139-24715-1-git-send-email-sre@kernel.org>

From: Kai Vehmanen <kai.vehmanen@nokia.com>

Also fixes a bug in updating 'opened' state in case cs_hsi_start()
fails when opening the char device.

Signed-off-by: Kai Vehmanen <kai.vehmanen@nokia.com>
Signed-off-by: Joni Lapilainen <joni.lapilainen@gmail.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/hsi/clients/cmt_speech.c | 43 +++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index 7c0f711..389eafb 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -1271,38 +1271,45 @@ static int cs_char_mmap(struct file *file, struct vm_area_struct *vma)
 static int cs_char_open(struct inode *unused, struct file *file)
 {
 	int ret = 0;
+	unsigned long p;
 
 	spin_lock_bh(&cs_char_data.lock);
 	if (cs_char_data.opened) {
 		ret = -EBUSY;
 		spin_unlock_bh(&cs_char_data.lock);
-		goto out;
-	}
-	cs_char_data.mmap_base = get_zeroed_page(GFP_ATOMIC);
-	if (!cs_char_data.mmap_base) {
-		dev_err(&cs_char_data.cl->device,
-					"Shared memory allocation failed.\n");
-		ret = -ENOMEM;
-		spin_unlock_bh(&cs_char_data.lock);
-		goto out;
+		goto out1;
 	}
-	cs_char_data.mmap_size = CS_MMAP_SIZE;
-	cs_char_data.dataind_pending = 0;
 	cs_char_data.opened = 1;
-	file->private_data = &cs_char_data;
+	cs_char_data.dataind_pending = 0;
 	spin_unlock_bh(&cs_char_data.lock);
 
-	BUG_ON(cs_char_data.hi);
+	p = get_zeroed_page(GFP_KERNEL);
+	if (!p) {
+		ret = -ENOMEM;
+		goto out2;
+	}
 
-	ret = cs_hsi_start(&cs_char_data.hi, cs_char_data.cl,
-				cs_char_data.mmap_base, cs_char_data.mmap_size);
+	ret = cs_hsi_start(&cs_char_data.hi, cs_char_data.cl, p, CS_MMAP_SIZE);
 	if (ret) {
 		dev_err(&cs_char_data.cl->device, "Unable to initialize HSI\n");
-		free_page(cs_char_data.mmap_base);
-		goto out;
+		goto out3;
 	}
 
-out:
+	/* these are only used in release so lock not needed */
+	cs_char_data.mmap_base = p;
+	cs_char_data.mmap_size = CS_MMAP_SIZE;
+
+	file->private_data = &cs_char_data;
+
+	return 0;
+
+out3:
+	free_page(p);
+out2:
+	spin_lock_bh(&cs_char_data.lock);
+	cs_char_data.opened = 0;
+	spin_unlock_bh(&cs_char_data.lock);
+out1:
 	return ret;
 }
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH 3/9] HSI: cmt_speech: Return error if HSI port not configured
From: Sebastian Reichel @ 2015-03-02  4:38 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api, Kai Vehmanen, Joni Lapilainen
In-Reply-To: <1425271139-24715-1-git-send-email-sre@kernel.org>

From: Kai Vehmanen <kai.vehmanen@nokia.com>

If HSI port is not configured by ssi_protocol, return an error from
char device open.

Signed-off-by: Kai Vehmanen <kai.vehmanen@nokia.com>
Acked-by: Carlos Chinea <carlos.chinea@nokia.com>
Signed-off-by: Joni Lapilainen <joni.lapilainen@gmail.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/hsi/clients/cmt_speech.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index 389eafb..56846c9 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -1037,6 +1037,13 @@ static int cs_hsi_start(struct cs_hsi_iface **hi, struct hsi_client *cl,
 		dev_err(&cl->device, "Could not get HSI master client\n");
 		goto leave4;
 	}
+	if (!ssip_slave_running(hsi_if->master)) {
+		err = -ENODEV;
+		dev_err(&cl->device,
+				"HSI port not initialized\n");
+		goto leave4;
+	}
+
 	hsi_if->iface_state = CS_STATE_OPENED;
 	local_bh_disable();
 	cs_hsi_read_on_control(hsi_if);
-- 
2.1.4

^ permalink raw reply related

* [PATCH 4/9] HSI: cmt_speech: Fix build for 4.0 kernel
From: Sebastian Reichel @ 2015-03-02  4:38 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api
In-Reply-To: <1425271139-24715-1-git-send-email-sre@kernel.org>

Fix building of the old out-of-tree code.

Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/hsi/clients/cmt_speech.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index 56846c9..6d852ea 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -34,9 +34,9 @@
 #include <linux/sched.h>
 #include <linux/ioctl.h>
 #include <linux/uaccess.h>
-#include <linux/pm_qos_params.h>
+#include <linux/pm_qos.h>
 #include <linux/hsi/hsi.h>
-#include <linux/hsi/ssip_slave.h>
+#include <linux/hsi/ssi_protocol.h>
 #include <linux/cs-protocol.h>
 
 #define CS_MMAP_SIZE	PAGE_SIZE
@@ -126,7 +126,7 @@ struct cs_hsi_iface {
 	struct hsi_msg			*data_tx_msg;
 	wait_queue_head_t		datawait;
 
-	struct pm_qos_request_list      pm_qos_req;
+	struct pm_qos_request           pm_qos_req;
 
 	spinlock_t			lock;
 };
@@ -1268,7 +1268,7 @@ static int cs_char_mmap(struct file *file, struct vm_area_struct *vma)
 	if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) != 1)
 		return -EINVAL;
 
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_IO | VM_DONTDUMP | VM_DONTEXPAND;
 	vma->vm_ops = &cs_char_vm_ops;
 	vma->vm_private_data = file->private_data;
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH 5/9] HSI: cmt_speech: Cleanup initialisation
From: Sebastian Reichel @ 2015-03-02  4:38 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api
In-Reply-To: <1425271139-24715-1-git-send-email-sre@kernel.org>

Cleanup initialisation process, so that it's similar to
the style used in ssi_protocol driver.

Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/hsi/clients/cmt_speech.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index 6d852ea..8d9860b 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -1369,7 +1369,7 @@ static struct miscdevice cs_char_miscdev = {
 	.fops	= &cs_char_fops
 };
 
-static int __init cs_hsi_client_probe(struct device *dev)
+static int cs_hsi_client_probe(struct device *dev)
 {
 	int err = 0;
 	struct hsi_client *cl = to_hsi_client(dev);
@@ -1385,12 +1385,12 @@ static int __init cs_hsi_client_probe(struct device *dev)
 
 	err = misc_register(&cs_char_miscdev);
 	if (err)
-		dev_err(dev, "Failed to register\n");
+		dev_err(dev, "Failed to register: %d\n", err);
 
 	return err;
 }
 
-static int __exit cs_hsi_client_remove(struct device *dev)
+static int cs_hsi_client_remove(struct device *dev)
 {
 	struct cs_hsi_iface *hi;
 
@@ -1417,19 +1417,15 @@ static struct hsi_client_driver cs_hsi_driver = {
 
 static int __init cs_char_init(void)
 {
-	int err = 0;
-
-	err = hsi_register_client_driver(&cs_hsi_driver);
-	if (err)
-		pr_err(DRIVER_NAME ": Error when registering driver %d\n", err);
-
-	return err;
+	pr_info("CMT speech driver added\n");
+	return hsi_register_client_driver(&cs_hsi_driver);
 }
 module_init(cs_char_init);
 
 static void __exit cs_char_exit(void)
 {
 	hsi_unregister_client_driver(&cs_hsi_driver);
+	pr_info("CMT speech driver removed\n");
 }
 module_exit(cs_char_exit);
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH 6/9] HSI: cmt_speech: Rename driver to cmt-speech
From: Sebastian Reichel @ 2015-03-02  4:38 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api
In-Reply-To: <1425271139-24715-1-git-send-email-sre@kernel.org>

Rename driver and platform alias to cmt-speech, so that
it's consistent with the ssi-protocol driver.

Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/hsi/clients/cmt_speech.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index 8d9860b..52001ed 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -40,7 +40,6 @@
 #include <linux/cs-protocol.h>
 
 #define CS_MMAP_SIZE	PAGE_SIZE
-#define DRIVER_NAME	"cmt_speech"
 
 struct char_queue {
 	struct list_head	list;
@@ -1365,7 +1364,7 @@ static const struct file_operations cs_char_fops = {
 
 static struct miscdevice cs_char_miscdev = {
 	.minor	= MISC_DYNAMIC_MINOR,
-	.name	= DRIVER_NAME,
+	.name	= "cmt_speech",
 	.fops	= &cs_char_fops
 };
 
@@ -1408,7 +1407,7 @@ static int cs_hsi_client_remove(struct device *dev)
 
 static struct hsi_client_driver cs_hsi_driver = {
 	.driver = {
-		.name	= DRIVER_NAME,
+		.name	= "cmt-speech",
 		.owner	= THIS_MODULE,
 		.probe	= cs_hsi_client_probe,
 		.remove	= cs_hsi_client_remove,
@@ -1429,7 +1428,7 @@ static void __exit cs_char_exit(void)
 }
 module_exit(cs_char_exit);
 
-MODULE_ALIAS("hsi:cmt_speech");
+MODULE_ALIAS("hsi:cmt-speech");
 MODULE_AUTHOR("Kai Vehmanen <kai.vehmanen@nokia.com>");
 MODULE_AUTHOR("Peter Ujfalusi <peter.ujfalusi@nokia.com>");
 MODULE_DESCRIPTION("CMT speech driver");
-- 
2.1.4

^ permalink raw reply related

* [PATCH 7/9] HSI: cmt_speech: Move cs-protocol.h to include/uapi/linux/hsi
From: Sebastian Reichel @ 2015-03-02  4:38 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api
In-Reply-To: <1425271139-24715-1-git-send-email-sre@kernel.org>

Move cs-protocol.h to include/uapi/linux/hsi, since it
describes a userspace API.

Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/hsi/clients/cmt_speech.c                | 2 +-
 include/uapi/linux/hsi/Kbuild                   | 2 +-
 include/{linux => uapi/linux/hsi}/cs-protocol.h | 5 +----
 3 files changed, 3 insertions(+), 6 deletions(-)
 rename include/{linux => uapi/linux/hsi}/cs-protocol.h (96%)

diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index 52001ed..69dc37f 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -37,7 +37,7 @@
 #include <linux/pm_qos.h>
 #include <linux/hsi/hsi.h>
 #include <linux/hsi/ssi_protocol.h>
-#include <linux/cs-protocol.h>
+#include <linux/hsi/cs-protocol.h>
 
 #define CS_MMAP_SIZE	PAGE_SIZE
 
diff --git a/include/uapi/linux/hsi/Kbuild b/include/uapi/linux/hsi/Kbuild
index 30ab3cd..a16a005 100644
--- a/include/uapi/linux/hsi/Kbuild
+++ b/include/uapi/linux/hsi/Kbuild
@@ -1,2 +1,2 @@
 # UAPI Header export list
-header-y += hsi_char.h
+header-y += hsi_char.h cs-protocol.h
diff --git a/include/linux/cs-protocol.h b/include/uapi/linux/hsi/cs-protocol.h
similarity index 96%
rename from include/linux/cs-protocol.h
rename to include/uapi/linux/hsi/cs-protocol.h
index 0d3d584..4957bba 100644
--- a/include/linux/cs-protocol.h
+++ b/include/uapi/linux/hsi/cs-protocol.h
@@ -1,8 +1,5 @@
 /*
- * include/linux/cs-protocol.h - cmt_speech interface definitions
- *
- * Implemented by:
- * - drivers/misc/cmt-speech/
+ * cmt-speech interface definitions
  *
  * Copyright (C) 2008,2009,2010 Nokia Corporation. All rights reserved.
  *
-- 
2.1.4

^ permalink raw reply related

* [PATCH 8/9] HSI: cmt_speech: Remove hardcoded channel numbers
From: Sebastian Reichel @ 2015-03-02  4:38 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api
In-Reply-To: <1425271139-24715-1-git-send-email-sre@kernel.org>

cmt-speech channel numbers should be coming from Device Tree
instead of being hardcoded.

Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/hsi/clients/cmt_speech.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index 69dc37f..a017292 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -59,6 +59,9 @@ struct cs_char {
 	spinlock_t		lock;
 	struct fasync_struct	*async_queue;
 	wait_queue_head_t	wait;
+	/* hsi channel ids */
+	int                     channel_id_cmd;
+	int                     channel_id_data;
 };
 
 #define SSI_CHANNEL_STATE_READING	1
@@ -66,9 +69,6 @@ struct cs_char {
 #define SSI_CHANNEL_STATE_POLL		(1 << 2)
 #define SSI_CHANNEL_STATE_ERROR		(1 << 3)
 
-#define CONTROL_HSI_CH			1
-#define DATA_HSI_CH			2
-
 #define TARGET_MASK			0xf000000
 #define TARGET_REMOTE			(1 << CS_DOMAIN_SHIFT)
 #define TARGET_LOCAL			0
@@ -296,7 +296,7 @@ static int cs_alloc_cmds(struct cs_hsi_iface *hi)
 			goto out;
 		}
 		sg_init_one(msg->sgt.sgl, buf, sizeof(*buf));
-		msg->channel = CONTROL_HSI_CH;
+		msg->channel = cs_char_data.channel_id_cmd;
 		msg->context = hi;
 		list_add_tail(&msg->link, &hi->cmdqueue);
 	}
@@ -342,7 +342,7 @@ static int cs_hsi_alloc_data(struct cs_hsi_iface *hi)
 		res = -ENOMEM;
 		goto out1;
 	}
-	rxmsg->channel = DATA_HSI_CH;
+	rxmsg->channel = cs_char_data.channel_id_data;
 	rxmsg->destructor = cs_hsi_data_destructor;
 	rxmsg->context = hi;
 
@@ -351,7 +351,7 @@ static int cs_hsi_alloc_data(struct cs_hsi_iface *hi)
 		res = -ENOMEM;
 		goto out2;
 	}
-	txmsg->channel = DATA_HSI_CH;
+	txmsg->channel = cs_char_data.channel_id_data;
 	txmsg->destructor = cs_hsi_data_destructor;
 	txmsg->context = hi;
 
@@ -1382,6 +1382,22 @@ static int cs_hsi_client_probe(struct device *dev)
 	INIT_LIST_HEAD(&cs_char_data.chardev_queue);
 	INIT_LIST_HEAD(&cs_char_data.dataind_queue);
 
+	cs_char_data.channel_id_cmd = hsi_get_channel_id_by_name(cl,
+		"speech-control");
+	if (cs_char_data.channel_id_cmd < 0) {
+		err = cs_char_data.channel_id_cmd;
+		dev_err(dev, "Could not get cmd channel (%d)\n", err);
+		return err;
+	}
+
+	cs_char_data.channel_id_data = hsi_get_channel_id_by_name(cl,
+		"speech-data");
+	if (cs_char_data.channel_id_data < 0) {
+		err = cs_char_data.channel_id_data;
+		dev_err(dev, "Could not get data channel (%d)\n", err);
+		return err;
+	}
+
 	err = misc_register(&cs_char_miscdev);
 	if (err)
 		dev_err(dev, "Failed to register: %d\n", err);
-- 
2.1.4

^ permalink raw reply related

* [PATCH 9/9] HSI: nokia-modem: Add cmt_speech support
From: Sebastian Reichel @ 2015-03-02  4:38 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api
In-Reply-To: <1425271139-24715-1-git-send-email-sre@kernel.org>

This adds cmt_speech support to the nokia-modem driver
and adds Kconfig entries for cmt_speech, so that it can
be built.

Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/hsi/clients/Kconfig       | 11 ++++++++++-
 drivers/hsi/clients/Makefile      |  1 +
 drivers/hsi/clients/nokia-modem.c | 31 ++++++++++++++++++++++++++++++-
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/drivers/hsi/clients/Kconfig b/drivers/hsi/clients/Kconfig
index bc60dec..77ee7bc 100644
--- a/drivers/hsi/clients/Kconfig
+++ b/drivers/hsi/clients/Kconfig
@@ -6,13 +6,22 @@ comment "HSI clients"
 
 config NOKIA_MODEM
 	tristate "Nokia Modem"
-	depends on HSI && SSI_PROTOCOL
+	depends on HSI && SSI_PROTOCOL && CMT_SPEECH
 	help
 	Say Y here if you want to add support for the modem on Nokia
 	N900 (Nokia RX-51) hardware.
 
 	If unsure, say N.
 
+config CMT_SPEECH
+	tristate "CMT speech"
+	depends on HSI && PHONET && OMAP_SSI
+	help
+	If you say Y here, you will enable the CMT speech protocol used
+	by Nokia modems.
+
+	If unsure, say N.
+
 config SSI_PROTOCOL
 	tristate "SSI protocol"
 	depends on HSI && PHONET && OMAP_SSI
diff --git a/drivers/hsi/clients/Makefile b/drivers/hsi/clients/Makefile
index 4d5bc0e..2607232 100644
--- a/drivers/hsi/clients/Makefile
+++ b/drivers/hsi/clients/Makefile
@@ -4,4 +4,5 @@
 
 obj-$(CONFIG_NOKIA_MODEM)	+= nokia-modem.o
 obj-$(CONFIG_SSI_PROTOCOL)	+= ssi_protocol.o
+obj-$(CONFIG_CMT_SPEECH)	+= cmt_speech.o
 obj-$(CONFIG_HSI_CHAR)		+= hsi_char.o
diff --git a/drivers/hsi/clients/nokia-modem.c b/drivers/hsi/clients/nokia-modem.c
index 9be4867..ef6ebda 100644
--- a/drivers/hsi/clients/nokia-modem.c
+++ b/drivers/hsi/clients/nokia-modem.c
@@ -46,6 +46,7 @@ struct nokia_modem_device {
 	struct nokia_modem_gpio	*gpios;
 	int			gpio_amount;
 	struct hsi_client	*ssi_protocol;
+	struct hsi_client	*cmt_speech;
 };
 
 static void do_nokia_modem_rst_ind_tasklet(unsigned long data)
@@ -149,6 +150,7 @@ static int nokia_modem_probe(struct device *dev)
 	struct hsi_port *port = hsi_get_port(cl);
 	int irq, pflags, err;
 	struct hsi_board_info ssip;
+	struct hsi_board_info cmtspeech;
 
 	np = dev->of_node;
 	if (!np) {
@@ -214,12 +216,34 @@ static int nokia_modem_probe(struct device *dev)
 		goto error3;
 	}
 
-	/* TODO: register cmt-speech hsi client */
+	cmtspeech.name = "cmt-speech";
+	cmtspeech.tx_cfg = cl->tx_cfg;
+	cmtspeech.rx_cfg = cl->rx_cfg;
+	cmtspeech.platform_data = NULL;
+	cmtspeech.archdata = NULL;
+
+	modem->cmt_speech = hsi_new_client(port, &cmtspeech);
+	if (!modem->cmt_speech) {
+		dev_err(dev, "Could not register cmt-speech device\n");
+		goto error3;
+	}
+
+	err = device_attach(&modem->cmt_speech->device);
+	if (err == 0) {
+		dev_err(dev, "Missing cmt-speech driver\n");
+		err = -EPROBE_DEFER;
+		goto error4;
+	} else if (err < 0) {
+		dev_err(dev, "Could not load cmt-speech driver (%d)\n", err);
+		goto error4;
+	}
 
 	dev_info(dev, "Registered Nokia HSI modem\n");
 
 	return 0;
 
+error4:
+	hsi_remove_client(&modem->cmt_speech->device, NULL);
 error3:
 	hsi_remove_client(&modem->ssi_protocol->device, NULL);
 error2:
@@ -238,6 +262,11 @@ static int nokia_modem_remove(struct device *dev)
 	if (!modem)
 		return 0;
 
+	if (modem->cmt_speech) {
+		hsi_remove_client(&modem->cmt_speech->device, NULL);
+		modem->cmt_speech = NULL;
+	}
+
 	if (modem->ssi_protocol) {
 		hsi_remove_client(&modem->ssi_protocol->device, NULL);
 		modem->ssi_protocol = NULL;
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH v3 0/3] epoll: introduce round robin wakeup mode
From: Jason Baron @ 2015-03-02  5:04 UTC (permalink / raw)
  To: Jonathan Corbet, Andrew Morton
  Cc: Ingo Molnar, peterz-wEGCiKHe2LqWVfeAwA7xHQ,
	mingo-H+wXaHxf7aLQT0dZR+AlfA,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn, normalperson-rMlxZR9MS24,
	davidel-AhlLAIvw+VEjIGhXcJzhZg,
	mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w, luto-kltTT9wpgjJwATOyAt5JVQ,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Linus Torvalds, Alexander Viro
In-Reply-To: <20150227143147.07785626-T1hC0tSOHrs@public.gmane.org>

On 02/27/2015 04:31 PM, Jonathan Corbet wrote:
> On Fri, 27 Feb 2015 13:10:34 -0800
> Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> wrote:
>
>> I don't really understand the need for rotation/round-robin.  We can
>> solve the thundering herd via exclusive wakeups, but what is the point
>> in choosing to wake the task which has been sleeping for the longest
>> time?  Why is that better than waking the task which has been sleeping
>> for the *least* time?  That's probably faster as that task's data is
>> more likely to still be in cache.
> So here's my chance to show the world what a fool I am (again)...  If I
> understand this at all, a task woken from epoll_wait() remains on the wait
> queues while it is off doing other stuff.  If you're doing exclusive
> wakeups, the task at the head of the queue will get all of them, since it
> never gets removed from the queue.  So you don't spread your load around,
> and, indeed, you may "wake" a process that is busy doing something else and
> can't deal with the event now anyway.  You need some way to shuffle up the
> wait queue, and round-robin is probably as good as any.
>
> (The alternative would be to take the process off the queue until it calls
> epoll_wait() again, but that runs counter to what epoll is all about).
>
> At least, that was my impression when I took a look at this stuff.
>
> jon

So tasks do not remain on wait queues when they are not in
epoll_wait(). That is, tasks are added to the associated epoll
fd wait queue at the beginning of epoll_wait(), and removed
from the associated epoll fd wait queue when epoll_wait()
returns.

One can think about the problem, perhaps, as assigning fd
events - POLLIN, POLLOUT, etc., to a set of tasks. And this
discussion is about how to do the assignment in certain
cases. Namely, one could start by partitioning the set of fds
into unique sets and then assigning them
(via EPOLL_CTL_ADD) to different epoll fds. Then, if there
is, say, a single task blocking on each epoll fd (via epoll_wait())
then each task can work nicely on its own set of events
without needing to necessarily co-ordinate with the other
tasks.

Now, this all works fine until we have an 'fd' or event source
that we wish to share among all the tasks. We might
want to share it because it generates events or work that
would potentially overwhelm a single task.

So in this shared event source case, where we have added
the fd to all of the epoll fds, we currently do a wake all.
This series attempts to change that behavior (with an
optional flag to epoll_create1()), into a round robin wakeup
(both to avoid excessive wakeups and to more evenly distribute
the wakeups). Note also that it will continue to wake up tasks,
as long as it doesn't find any in epoll_wait(). Thus, it still can
potentially wake up all if nobody is in epoll_wait().

Now, we could try and distribute the fd events among tasks
all waiting on a single epoll fd (meaning we have a single
event queue). But, we have already partitioned most of the
events, why combine them back into a single queue?

Thanks,

-Jason

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox