Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCHv3 1/2] HSI: cmt_speech: Add cmt-speech driver
From: Sebastian Reichel @ 2015-03-21 19:09 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Kai Vehmanen
In-Reply-To: <1426964957-5023-1-git-send-email-sre-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

From: Kai Vehmanen <kai.vehmanen-xNZwKgViW5gAvxtiuMwx3w@public.gmane.org>

Introduces the cmt-speech driver, which implements
a character device interface for transferring speech
data frames over HSI/SSI.

The driver is used to exchange voice/speech data between
the Nokia N900/N950/N9's modem and its cpu.

Signed-off-by: Kai Vehmanen <kai.vehmanen-xNZwKgViW5gAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Carlos Chinea <carlos.chinea-xNZwKgViW5gAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Joni Lapilainen <joni.lapilainen-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

Since the original driver has been written for 2.6.28 some
build fixes and general cleanups have been added by me:

 * fix build for 4.0 kernel
 * replace GFP_ATOMIC with GFP_KERNEL in cs_alloc_cmds()
 * add sanity check for CS_SET_WAKELINE ioctl
 * cleanup driver initialisation
 * rename driver to cmt-speech to be consistent with
   ssi-protocol driver
 * move cs-protocol.h to include/uapi/linux/hsi, since
   it describes a userspace API
 * replace hardcoded channels numbers with values provided
   via the HSI framework (e.g. coming from DT)

Signed-off-by: Sebastian Reichel <sre-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 drivers/hsi/clients/Kconfig          |   10 +
 drivers/hsi/clients/Makefile         |    1 +
 drivers/hsi/clients/cmt_speech.c     | 1456 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/hsi/Kbuild        |    2 +-
 include/uapi/linux/hsi/cs-protocol.h |  113 +++
 5 files changed, 1581 insertions(+), 1 deletion(-)
 create mode 100644 drivers/hsi/clients/cmt_speech.c
 create mode 100644 include/uapi/linux/hsi/cs-protocol.h

diff --git a/drivers/hsi/clients/Kconfig b/drivers/hsi/clients/Kconfig
index bc60dec..86c8495 100644
--- a/drivers/hsi/clients/Kconfig
+++ b/drivers/hsi/clients/Kconfig
@@ -13,6 +13,16 @@ config NOKIA_MODEM
 
 	If unsure, say N.
 
+config CMT_SPEECH
+	tristate "CMT speech"
+	depends on HSI && SSI_PROTOCOL
+	help
+	If you say Y here, you will enable the CMT speech protocol used
+	by Nokia modems. If you say M the protocol will be available as
+	module named cmt_speech.
+
+	If unsure, say N.
+
 config SSI_PROTOCOL
 	tristate "SSI protocol"
 	depends on HSI && PHONET && OMAP_SSI
diff --git a/drivers/hsi/clients/Makefile b/drivers/hsi/clients/Makefile
index 4d5bc0e..2607232 100644
--- a/drivers/hsi/clients/Makefile
+++ b/drivers/hsi/clients/Makefile
@@ -4,4 +4,5 @@
 
 obj-$(CONFIG_NOKIA_MODEM)	+= nokia-modem.o
 obj-$(CONFIG_SSI_PROTOCOL)	+= ssi_protocol.o
+obj-$(CONFIG_CMT_SPEECH)	+= cmt_speech.o
 obj-$(CONFIG_HSI_CHAR)		+= hsi_char.o
diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
new file mode 100644
index 0000000..e9560ef
--- /dev/null
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -0,0 +1,1456 @@
+/*
+ * cmt_speech.c - HSI CMT speech driver
+ *
+ * Copyright (C) 2008,2009,2010 Nokia Corporation. All rights reserved.
+ *
+ * Contact: Kai Vehmanen <kai.vehmanen-xNZwKgViW5gAvxtiuMwx3w@public.gmane.org>
+ * Original author: Peter Ujfalusi <peter.ujfalusi-xNZwKgViW5gAvxtiuMwx3w@public.gmane.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/ioctl.h>
+#include <linux/uaccess.h>
+#include <linux/pm_qos.h>
+#include <linux/hsi/hsi.h>
+#include <linux/hsi/ssi_protocol.h>
+#include <linux/hsi/cs-protocol.h>
+
+#define CS_MMAP_SIZE	PAGE_SIZE
+
+struct char_queue {
+	struct list_head	list;
+	u32			msg;
+};
+
+struct cs_char {
+	unsigned int		opened;
+	struct hsi_client	*cl;
+	struct cs_hsi_iface	*hi;
+	struct list_head	chardev_queue;
+	struct list_head	dataind_queue;
+	int			dataind_pending;
+	/* mmap things */
+	unsigned long		mmap_base;
+	unsigned long		mmap_size;
+	spinlock_t		lock;
+	struct fasync_struct	*async_queue;
+	wait_queue_head_t	wait;
+	/* hsi channel ids */
+	int                     channel_id_cmd;
+	int                     channel_id_data;
+};
+
+#define SSI_CHANNEL_STATE_READING	1
+#define SSI_CHANNEL_STATE_WRITING	(1 << 1)
+#define SSI_CHANNEL_STATE_POLL		(1 << 2)
+#define SSI_CHANNEL_STATE_ERROR		(1 << 3)
+
+#define TARGET_MASK			0xf000000
+#define TARGET_REMOTE			(1 << CS_DOMAIN_SHIFT)
+#define TARGET_LOCAL			0
+
+/* Number of pre-allocated commands buffers */
+#define CS_MAX_CMDS		        4
+
+/*
+ * During data transfers, transactions must be handled
+ * within 20ms (fixed value in cmtspeech HSI protocol)
+ */
+#define CS_QOS_LATENCY_FOR_DATA_USEC	20000
+
+/* Timeout to wait for pending HSI transfers to complete */
+#define CS_HSI_TRANSFER_TIMEOUT_MS      500
+
+
+#define RX_PTR_BOUNDARY_SHIFT		8
+#define RX_PTR_MAX_SHIFT		(RX_PTR_BOUNDARY_SHIFT + \
+						CS_MAX_BUFFERS_SHIFT)
+struct cs_hsi_iface {
+	struct hsi_client		*cl;
+	struct hsi_client		*master;
+
+	unsigned int			iface_state;
+	unsigned int			wakeline_state;
+	unsigned int			control_state;
+	unsigned int			data_state;
+
+	/* state exposed to application */
+	struct cs_mmap_config_block	*mmap_cfg;
+
+	unsigned long			mmap_base;
+	unsigned long			mmap_size;
+
+	unsigned int			rx_slot;
+	unsigned int			tx_slot;
+
+	/* note: for security reasons, we do not trust the contents of
+	 * mmap_cfg, but instead duplicate the variables here */
+	unsigned int			buf_size;
+	unsigned int			rx_bufs;
+	unsigned int			tx_bufs;
+	unsigned int			rx_ptr_boundary;
+	unsigned int			rx_offsets[CS_MAX_BUFFERS];
+	unsigned int			tx_offsets[CS_MAX_BUFFERS];
+
+	/* size of aligned memory blocks */
+	unsigned int			slot_size;
+	unsigned int			flags;
+
+	struct list_head		cmdqueue;
+
+	struct hsi_msg			*data_rx_msg;
+	struct hsi_msg			*data_tx_msg;
+	wait_queue_head_t		datawait;
+
+	struct pm_qos_request           pm_qos_req;
+
+	spinlock_t			lock;
+};
+
+static struct cs_char cs_char_data;
+
+static void cs_hsi_read_on_control(struct cs_hsi_iface *hi);
+static void cs_hsi_read_on_data(struct cs_hsi_iface *hi);
+
+static inline void rx_ptr_shift_too_big(void)
+{
+	BUILD_BUG_ON((1LLU << RX_PTR_MAX_SHIFT) > UINT_MAX);
+}
+
+static void cs_notify(u32 message, struct list_head *head)
+{
+	struct char_queue *entry;
+
+	spin_lock(&cs_char_data.lock);
+
+	if (!cs_char_data.opened) {
+		spin_unlock(&cs_char_data.lock);
+		goto out;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry) {
+		dev_err(&cs_char_data.cl->device,
+			"Can't allocate new entry for the queue.\n");
+		spin_unlock(&cs_char_data.lock);
+		goto out;
+	}
+
+	entry->msg = message;
+	list_add_tail(&entry->list, head);
+
+	spin_unlock(&cs_char_data.lock);
+
+	wake_up_interruptible(&cs_char_data.wait);
+	kill_fasync(&cs_char_data.async_queue, SIGIO, POLL_IN);
+
+out:
+	return;
+}
+
+static u32 cs_pop_entry(struct list_head *head)
+{
+	struct char_queue *entry;
+	u32 data;
+
+	entry = list_entry(head->next, struct char_queue, list);
+	data = entry->msg;
+	list_del(&entry->list);
+	kfree(entry);
+
+	return data;
+}
+
+static void cs_notify_control(u32 message)
+{
+	cs_notify(message, &cs_char_data.chardev_queue);
+}
+
+static void cs_notify_data(u32 message, int maxlength)
+{
+	cs_notify(message, &cs_char_data.dataind_queue);
+
+	spin_lock(&cs_char_data.lock);
+	cs_char_data.dataind_pending++;
+	while (cs_char_data.dataind_pending > maxlength &&
+				!list_empty(&cs_char_data.dataind_queue)) {
+		dev_dbg(&cs_char_data.cl->device, "data notification "
+		"queue overrun (%u entries)\n", cs_char_data.dataind_pending);
+
+		cs_pop_entry(&cs_char_data.dataind_queue);
+		cs_char_data.dataind_pending--;
+	}
+	spin_unlock(&cs_char_data.lock);
+}
+
+static inline void cs_set_cmd(struct hsi_msg *msg, u32 cmd)
+{
+	u32 *data = sg_virt(msg->sgt.sgl);
+	*data = cmd;
+}
+
+static inline u32 cs_get_cmd(struct hsi_msg *msg)
+{
+	u32 *data = sg_virt(msg->sgt.sgl);
+	return *data;
+}
+
+static void cs_release_cmd(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+
+	list_add_tail(&msg->link, &hi->cmdqueue);
+}
+
+static void cs_cmd_destructor(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+
+	spin_lock(&hi->lock);
+
+	dev_dbg(&cs_char_data.cl->device, "control cmd destructor\n");
+
+	if (hi->iface_state != CS_STATE_CLOSED)
+		dev_err(&hi->cl->device, "Cmd flushed while driver active\n");
+
+	if (msg->ttype == HSI_MSG_READ)
+		hi->control_state &=
+			~(SSI_CHANNEL_STATE_POLL | SSI_CHANNEL_STATE_READING);
+	else if (msg->ttype == HSI_MSG_WRITE &&
+			hi->control_state & SSI_CHANNEL_STATE_WRITING)
+		hi->control_state &= ~SSI_CHANNEL_STATE_WRITING;
+
+	cs_release_cmd(msg);
+
+	spin_unlock(&hi->lock);
+}
+
+static struct hsi_msg *cs_claim_cmd(struct cs_hsi_iface* ssi)
+{
+	struct hsi_msg *msg;
+
+	BUG_ON(list_empty(&ssi->cmdqueue));
+
+	msg = list_first_entry(&ssi->cmdqueue, struct hsi_msg, link);
+	list_del(&msg->link);
+	msg->destructor = cs_cmd_destructor;
+
+	return msg;
+}
+
+static void cs_free_cmds(struct cs_hsi_iface *ssi)
+{
+	struct hsi_msg *msg, *tmp;
+
+	list_for_each_entry_safe(msg, tmp, &ssi->cmdqueue, link) {
+		list_del(&msg->link);
+		msg->destructor = NULL;
+		kfree(sg_virt(msg->sgt.sgl));
+		hsi_free_msg(msg);
+	}
+}
+
+static int cs_alloc_cmds(struct cs_hsi_iface *hi)
+{
+	struct hsi_msg *msg;
+	u32 *buf;
+	unsigned int i;
+
+	INIT_LIST_HEAD(&hi->cmdqueue);
+
+	for (i = 0; i < CS_MAX_CMDS; i++) {
+		msg = hsi_alloc_msg(1, GFP_KERNEL);
+		if (!msg)
+			goto out;
+		buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+		if (!buf) {
+			hsi_free_msg(msg);
+			goto out;
+		}
+		sg_init_one(msg->sgt.sgl, buf, sizeof(*buf));
+		msg->channel = cs_char_data.channel_id_cmd;
+		msg->context = hi;
+		list_add_tail(&msg->link, &hi->cmdqueue);
+	}
+
+	return 0;
+
+out:
+	cs_free_cmds(hi);
+	return -ENOMEM;
+}
+
+static void cs_hsi_data_destructor(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+	const char *dir = (msg->ttype == HSI_MSG_READ) ? "TX" : "RX";
+
+	dev_dbg(&cs_char_data.cl->device, "Freeing data %s message\n", dir);
+
+	spin_lock(&hi->lock);
+	if (hi->iface_state != CS_STATE_CLOSED)
+		dev_err(&cs_char_data.cl->device,
+				"Data %s flush while device active\n", dir);
+	if (msg->ttype == HSI_MSG_READ)
+		hi->data_state &=
+			~(SSI_CHANNEL_STATE_POLL | SSI_CHANNEL_STATE_READING);
+	else
+		hi->data_state &= ~SSI_CHANNEL_STATE_WRITING;
+
+	msg->status = HSI_STATUS_COMPLETED;
+	if (unlikely(waitqueue_active(&hi->datawait)))
+		wake_up_interruptible(&hi->datawait);
+
+	spin_unlock(&hi->lock);
+}
+
+static int cs_hsi_alloc_data(struct cs_hsi_iface *hi)
+{
+	struct hsi_msg *txmsg, *rxmsg;
+	int res = 0;
+
+	rxmsg = hsi_alloc_msg(1, GFP_KERNEL);
+	if (!rxmsg) {
+		res = -ENOMEM;
+		goto out1;
+	}
+	rxmsg->channel = cs_char_data.channel_id_data;
+	rxmsg->destructor = cs_hsi_data_destructor;
+	rxmsg->context = hi;
+
+	txmsg = hsi_alloc_msg(1, GFP_KERNEL);
+	if (!txmsg) {
+		res = -ENOMEM;
+		goto out2;
+	}
+	txmsg->channel = cs_char_data.channel_id_data;
+	txmsg->destructor = cs_hsi_data_destructor;
+	txmsg->context = hi;
+
+	hi->data_rx_msg = rxmsg;
+	hi->data_tx_msg = txmsg;
+
+	return 0;
+
+out2:
+	hsi_free_msg(rxmsg);
+out1:
+	return res;
+}
+
+static void cs_hsi_free_data_msg(struct hsi_msg *msg)
+{
+	WARN_ON(msg->status != HSI_STATUS_COMPLETED &&
+					msg->status != HSI_STATUS_ERROR);
+	hsi_free_msg(msg);
+}
+
+static void cs_hsi_free_data(struct cs_hsi_iface *hi)
+{
+	cs_hsi_free_data_msg(hi->data_rx_msg);
+	cs_hsi_free_data_msg(hi->data_tx_msg);
+}
+
+static inline void __cs_hsi_error_pre(struct cs_hsi_iface *hi,
+					struct hsi_msg *msg, const char *info,
+					unsigned int *state)
+{
+	spin_lock(&hi->lock);
+	dev_err(&hi->cl->device, "HSI %s error, msg %d, state %u\n",
+		info, msg->status, *state);
+}
+
+static inline void __cs_hsi_error_post(struct cs_hsi_iface *hi)
+{
+	spin_unlock(&hi->lock);
+}
+
+static inline void __cs_hsi_error_read_bits(unsigned int *state)
+{
+	*state |= SSI_CHANNEL_STATE_ERROR;
+	*state &= ~(SSI_CHANNEL_STATE_READING | SSI_CHANNEL_STATE_POLL);
+}
+
+static inline void __cs_hsi_error_write_bits(unsigned int *state)
+{
+	*state |= SSI_CHANNEL_STATE_ERROR;
+	*state &= ~SSI_CHANNEL_STATE_WRITING;
+}
+
+static void cs_hsi_control_read_error(struct cs_hsi_iface *hi,
+							struct hsi_msg *msg)
+{
+	__cs_hsi_error_pre(hi, msg, "control read", &hi->control_state);
+	cs_release_cmd(msg);
+	__cs_hsi_error_read_bits(&hi->control_state);
+	__cs_hsi_error_post(hi);
+}
+
+static void cs_hsi_control_write_error(struct cs_hsi_iface *hi,
+							struct hsi_msg *msg)
+{
+	__cs_hsi_error_pre(hi, msg, "control write", &hi->control_state);
+	cs_release_cmd(msg);
+	__cs_hsi_error_write_bits(&hi->control_state);
+	__cs_hsi_error_post(hi);
+
+}
+
+static void cs_hsi_data_read_error(struct cs_hsi_iface *hi, struct hsi_msg *msg)
+{
+	__cs_hsi_error_pre(hi, msg, "data read", &hi->data_state);
+	__cs_hsi_error_read_bits(&hi->data_state);
+	__cs_hsi_error_post(hi);
+}
+
+static void cs_hsi_data_write_error(struct cs_hsi_iface *hi,
+							struct hsi_msg *msg)
+{
+	__cs_hsi_error_pre(hi, msg, "data write", &hi->data_state);
+	__cs_hsi_error_write_bits(&hi->data_state);
+	__cs_hsi_error_post(hi);
+}
+
+static void cs_hsi_read_on_control_complete(struct hsi_msg *msg)
+{
+	u32 cmd = cs_get_cmd(msg);
+	struct cs_hsi_iface *hi = msg->context;
+
+	spin_lock(&hi->lock);
+	hi->control_state &= ~SSI_CHANNEL_STATE_READING;
+	if (msg->status == HSI_STATUS_ERROR) {
+		dev_err(&hi->cl->device, "Control RX error detected\n");
+		cs_hsi_control_read_error(hi, msg);
+		spin_unlock(&hi->lock);
+		goto out;
+	}
+	dev_dbg(&hi->cl->device, "Read on control: %08X\n", cmd);
+	cs_release_cmd(msg);
+	if (hi->flags & CS_FEAT_TSTAMP_RX_CTRL) {
+		struct timespec *tstamp =
+			&hi->mmap_cfg->tstamp_rx_ctrl;
+		do_posix_clock_monotonic_gettime(tstamp);
+	}
+	spin_unlock(&hi->lock);
+
+	cs_notify_control(cmd);
+
+out:
+	cs_hsi_read_on_control(hi);
+}
+
+static void cs_hsi_peek_on_control_complete(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+	int ret;
+
+	if (msg->status == HSI_STATUS_ERROR) {
+		dev_err(&hi->cl->device, "Control peek RX error detected\n");
+		cs_hsi_control_read_error(hi, msg);
+		return;
+	}
+
+	WARN_ON(!(hi->control_state & SSI_CHANNEL_STATE_READING));
+
+	dev_dbg(&hi->cl->device, "Peek on control complete, reading\n");
+	msg->sgt.nents = 1;
+	msg->complete = cs_hsi_read_on_control_complete;
+	ret = hsi_async_read(hi->cl, msg);
+	if (ret)
+		cs_hsi_control_read_error(hi, msg);
+}
+
+static void cs_hsi_read_on_control(struct cs_hsi_iface *hi)
+{
+	struct hsi_msg *msg;
+	int ret;
+
+	spin_lock(&hi->lock);
+	if (hi->control_state & SSI_CHANNEL_STATE_READING) {
+		dev_err(&hi->cl->device, "Control read already pending (%d)\n",
+			hi->control_state);
+		spin_unlock(&hi->lock);
+		return;
+	}
+	if (hi->control_state & SSI_CHANNEL_STATE_ERROR) {
+		dev_err(&hi->cl->device, "Control read error (%d)\n",
+			hi->control_state);
+		spin_unlock(&hi->lock);
+		return;
+	}
+	hi->control_state |= SSI_CHANNEL_STATE_READING;
+	dev_dbg(&hi->cl->device, "Issuing RX on control\n");
+	msg = cs_claim_cmd(hi);
+	spin_unlock(&hi->lock);
+
+	msg->sgt.nents = 0;
+	msg->complete = cs_hsi_peek_on_control_complete;
+	ret = hsi_async_read(hi->cl, msg);
+	if (ret)
+		cs_hsi_control_read_error(hi, msg);
+}
+
+static void cs_hsi_write_on_control_complete(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+	if (msg->status == HSI_STATUS_COMPLETED) {
+		spin_lock(&hi->lock);
+		hi->control_state &= ~SSI_CHANNEL_STATE_WRITING;
+		cs_release_cmd(msg);
+		spin_unlock(&hi->lock);
+	} else if (msg->status == HSI_STATUS_ERROR) {
+		cs_hsi_control_write_error(hi, msg);
+	} else {
+		dev_err(&hi->cl->device,
+			"unexpected status in control write callback %d\n",
+			msg->status);
+	}
+}
+
+static int cs_hsi_write_on_control(struct cs_hsi_iface *hi, u32 message)
+{
+	struct hsi_msg *msg;
+	int ret;
+
+	spin_lock(&hi->lock);
+	if (hi->control_state & SSI_CHANNEL_STATE_ERROR) {
+		spin_unlock(&hi->lock);
+		return -EIO;
+	}
+	if (hi->control_state & SSI_CHANNEL_STATE_WRITING) {
+		dev_err(&hi->cl->device,
+			"Write still pending on control channel.\n");
+		spin_unlock(&hi->lock);
+		return -EBUSY;
+	}
+	hi->control_state |= SSI_CHANNEL_STATE_WRITING;
+	msg = cs_claim_cmd(hi);
+	spin_unlock(&hi->lock);
+
+	cs_set_cmd(msg, message);
+	msg->sgt.nents = 1;
+	msg->complete = cs_hsi_write_on_control_complete;
+	dev_dbg(&hi->cl->device,
+		"Sending control message %08X\n", message);
+	ret = hsi_async_write(hi->cl, msg);
+	if (ret) {
+		dev_err(&hi->cl->device,
+			"async_write failed with %d\n", ret);
+		cs_hsi_control_write_error(hi, msg);
+	}
+
+	/*
+	 * Make sure control read is always pending when issuing
+	 * new control writes. This is needed as the controller
+	 * may flush our messages if e.g. the peer device reboots
+	 * unexpectedly (and we cannot directly resubmit a new read from
+	 * the message destructor; see cs_cmd_destructor()).
+	 */
+	if (!(hi->control_state & SSI_CHANNEL_STATE_READING)) {
+		dev_err(&hi->cl->device, "Restarting control reads\n");
+		cs_hsi_read_on_control(hi);
+	}
+
+	return 0;
+}
+
+static void cs_hsi_read_on_data_complete(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+	u32 payload;
+
+	if (unlikely(msg->status == HSI_STATUS_ERROR)) {
+		cs_hsi_data_read_error(hi, msg);
+		return;
+	}
+
+	spin_lock(&hi->lock);
+	WARN_ON(!(hi->data_state & SSI_CHANNEL_STATE_READING));
+	hi->data_state &= ~SSI_CHANNEL_STATE_READING;
+	payload = CS_RX_DATA_RECEIVED;
+	payload |= hi->rx_slot;
+	hi->rx_slot++;
+	hi->rx_slot %= hi->rx_ptr_boundary;
+	/* expose current rx ptr in mmap area */
+	hi->mmap_cfg->rx_ptr = hi->rx_slot;
+	if (unlikely(waitqueue_active(&hi->datawait)))
+		wake_up_interruptible(&hi->datawait);
+	spin_unlock(&hi->lock);
+
+	cs_notify_data(payload, hi->rx_bufs);
+	cs_hsi_read_on_data(hi);
+}
+
+static void cs_hsi_peek_on_data_complete(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+	u32 *address;
+	int ret;
+
+	if (unlikely(msg->status == HSI_STATUS_ERROR)) {
+		cs_hsi_data_read_error(hi, msg);
+		return;
+	}
+	if (unlikely(hi->iface_state != CS_STATE_CONFIGURED)) {
+		dev_err(&hi->cl->device, "Data received in invalid state\n");
+		cs_hsi_data_read_error(hi, msg);
+		return;
+	}
+
+	spin_lock(&hi->lock);
+	WARN_ON(!(hi->data_state & SSI_CHANNEL_STATE_POLL));
+	hi->data_state &= ~SSI_CHANNEL_STATE_POLL;
+	hi->data_state |= SSI_CHANNEL_STATE_READING;
+	spin_unlock(&hi->lock);
+
+	address = (u32 *)(hi->mmap_base +
+				hi->rx_offsets[hi->rx_slot % hi->rx_bufs]);
+	sg_init_one(msg->sgt.sgl, address, hi->buf_size);
+	msg->sgt.nents = 1;
+	msg->complete = cs_hsi_read_on_data_complete;
+	ret = hsi_async_read(hi->cl, msg);
+	if (ret)
+		cs_hsi_data_read_error(hi, msg);
+}
+
+/*
+ * Read/write transaction is ongoing. Returns false if in
+ * SSI_CHANNEL_STATE_POLL state.
+ */
+static inline int cs_state_xfer_active(unsigned int state)
+{
+	return (state & SSI_CHANNEL_STATE_WRITING) ||
+		(state & SSI_CHANNEL_STATE_READING);
+}
+
+/*
+ * No pending read/writes
+ */
+static inline int cs_state_idle(unsigned int state)
+{
+	return !(state & ~SSI_CHANNEL_STATE_ERROR);
+}
+
+static void cs_hsi_read_on_data(struct cs_hsi_iface *hi)
+{
+	struct hsi_msg *rxmsg;
+	int ret;
+
+	spin_lock(&hi->lock);
+	if (hi->data_state &
+		(SSI_CHANNEL_STATE_READING | SSI_CHANNEL_STATE_POLL)) {
+		dev_dbg(&hi->cl->device, "Data read already pending (%u)\n",
+			hi->data_state);
+		spin_unlock(&hi->lock);
+		return;
+	}
+	hi->data_state |= SSI_CHANNEL_STATE_POLL;
+	spin_unlock(&hi->lock);
+
+	rxmsg = hi->data_rx_msg;
+	sg_init_one(rxmsg->sgt.sgl, (void *)hi->mmap_base, 0);
+	rxmsg->sgt.nents = 0;
+	rxmsg->complete = cs_hsi_peek_on_data_complete;
+
+	ret = hsi_async_read(hi->cl, rxmsg);
+	if (ret)
+		cs_hsi_data_read_error(hi, rxmsg);
+}
+
+static void cs_hsi_write_on_data_complete(struct hsi_msg *msg)
+{
+	struct cs_hsi_iface *hi = msg->context;
+
+	if (msg->status == HSI_STATUS_COMPLETED) {
+		spin_lock(&hi->lock);
+		hi->data_state &= ~SSI_CHANNEL_STATE_WRITING;
+		if (unlikely(waitqueue_active(&hi->datawait)))
+			wake_up_interruptible(&hi->datawait);
+		spin_unlock(&hi->lock);
+	} else {
+		cs_hsi_data_write_error(hi, msg);
+	}
+}
+
+static int cs_hsi_write_on_data(struct cs_hsi_iface *hi, unsigned int slot)
+{
+	u32 *address;
+	struct hsi_msg *txmsg;
+	int ret;
+
+	spin_lock(&hi->lock);
+	if (hi->iface_state != CS_STATE_CONFIGURED) {
+		dev_err(&hi->cl->device, "Not configured, aborting\n");
+		ret = -EINVAL;
+		goto error;
+	}
+	if (hi->data_state & SSI_CHANNEL_STATE_ERROR) {
+		dev_err(&hi->cl->device, "HSI error, aborting\n");
+		ret = -EIO;
+		goto error;
+	}
+	if (hi->data_state & SSI_CHANNEL_STATE_WRITING) {
+		dev_err(&hi->cl->device, "Write pending on data channel.\n");
+		ret = -EBUSY;
+		goto error;
+	}
+	hi->data_state |= SSI_CHANNEL_STATE_WRITING;
+	spin_unlock(&hi->lock);
+
+	hi->tx_slot = slot;
+	address = (u32 *)(hi->mmap_base + hi->tx_offsets[hi->tx_slot]);
+	txmsg = hi->data_tx_msg;
+	sg_init_one(txmsg->sgt.sgl, address, hi->buf_size);
+	txmsg->complete = cs_hsi_write_on_data_complete;
+	ret = hsi_async_write(hi->cl, txmsg);
+	if (ret)
+		cs_hsi_data_write_error(hi, txmsg);
+
+	return ret;
+
+error:
+	spin_unlock(&hi->lock);
+	if (ret == -EIO)
+		cs_hsi_data_write_error(hi, hi->data_tx_msg);
+
+	return ret;
+}
+
+static unsigned int cs_hsi_get_state(struct cs_hsi_iface *hi)
+{
+	return hi->iface_state;
+}
+
+static int cs_hsi_command(struct cs_hsi_iface *hi, u32 cmd)
+{
+	int ret = 0;
+
+	local_bh_disable();
+	switch (cmd & TARGET_MASK) {
+	case TARGET_REMOTE:
+		ret = cs_hsi_write_on_control(hi, cmd);
+		break;
+	case TARGET_LOCAL:
+		if ((cmd & CS_CMD_MASK) == CS_TX_DATA_READY)
+			ret = cs_hsi_write_on_data(hi, cmd & CS_PARAM_MASK);
+		else
+			ret = -EINVAL;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	local_bh_enable();
+
+	return ret;
+}
+
+static void cs_hsi_set_wakeline(struct cs_hsi_iface *hi, bool new_state)
+{
+	int change = 0;
+
+	spin_lock_bh(&hi->lock);
+	if (hi->wakeline_state != new_state) {
+		hi->wakeline_state = new_state;
+		change = 1;
+		dev_dbg(&hi->cl->device, "setting wake line to %d (%p)\n",
+			new_state, hi->cl);
+	}
+	spin_unlock_bh(&hi->lock);
+
+	if (change) {
+		if (new_state)
+			ssip_slave_start_tx(hi->master);
+		else
+			ssip_slave_stop_tx(hi->master);
+	}
+
+	dev_dbg(&hi->cl->device, "wake line set to %d (%p)\n",
+		new_state, hi->cl);
+}
+
+static void set_buffer_sizes(struct cs_hsi_iface *hi, int rx_bufs, int tx_bufs)
+{
+	hi->rx_bufs = rx_bufs;
+	hi->tx_bufs = tx_bufs;
+	hi->mmap_cfg->rx_bufs = rx_bufs;
+	hi->mmap_cfg->tx_bufs = tx_bufs;
+
+	if (hi->flags & CS_FEAT_ROLLING_RX_COUNTER) {
+		/*
+		 * For more robust overrun detection, let the rx
+		 * pointer run in range 0..'boundary-1'. Boundary
+		 * is a multiple of rx_bufs, and limited in max size
+		 * by RX_PTR_MAX_SHIFT to allow for fast ptr-diff
+		 * calculation.
+		 */
+		hi->rx_ptr_boundary = (rx_bufs << RX_PTR_BOUNDARY_SHIFT);
+		hi->mmap_cfg->rx_ptr_boundary = hi->rx_ptr_boundary;
+	} else {
+		hi->rx_ptr_boundary = hi->rx_bufs;
+	}
+}
+
+static int check_buf_params(struct cs_hsi_iface *hi,
+					const struct cs_buffer_config *buf_cfg)
+{
+	size_t buf_size_aligned = L1_CACHE_ALIGN(buf_cfg->buf_size) *
+					(buf_cfg->rx_bufs + buf_cfg->tx_bufs);
+	size_t ctrl_size_aligned = L1_CACHE_ALIGN(sizeof(*hi->mmap_cfg));
+	int r = 0;
+
+	if (buf_cfg->rx_bufs > CS_MAX_BUFFERS ||
+					buf_cfg->tx_bufs > CS_MAX_BUFFERS) {
+		r = -EINVAL;
+	} else if ((buf_size_aligned + ctrl_size_aligned) >= hi->mmap_size) {
+		dev_err(&hi->cl->device, "No space for the requested buffer "
+			"configuration\n");
+		r = -ENOBUFS;
+	}
+
+	return r;
+}
+
+/**
+ * Block until pending data transfers have completed.
+ */
+static int cs_hsi_data_sync(struct cs_hsi_iface *hi)
+{
+	int r = 0;
+
+	spin_lock_bh(&hi->lock);
+
+	if (!cs_state_xfer_active(hi->data_state)) {
+		dev_dbg(&hi->cl->device, "hsi_data_sync break, idle\n");
+		goto out;
+	}
+
+	for (;;) {
+		int s;
+		DEFINE_WAIT(wait);
+		if (!cs_state_xfer_active(hi->data_state))
+			goto out;
+		if (signal_pending(current)) {
+			r = -ERESTARTSYS;
+			goto out;
+		}
+		/**
+		 * prepare_to_wait must be called with hi->lock held
+		 * so that callbacks can check for waitqueue_active()
+		 */
+		prepare_to_wait(&hi->datawait, &wait, TASK_INTERRUPTIBLE);
+		spin_unlock_bh(&hi->lock);
+		s = schedule_timeout(
+			msecs_to_jiffies(CS_HSI_TRANSFER_TIMEOUT_MS));
+		spin_lock_bh(&hi->lock);
+		finish_wait(&hi->datawait, &wait);
+		if (!s) {
+			dev_dbg(&hi->cl->device,
+				"hsi_data_sync timeout after %d ms\n",
+				CS_HSI_TRANSFER_TIMEOUT_MS);
+			r = -EIO;
+			goto out;
+		}
+	}
+
+out:
+	spin_unlock_bh(&hi->lock);
+	dev_dbg(&hi->cl->device, "hsi_data_sync done with res %d\n", r);
+
+	return r;
+}
+
+static void cs_hsi_data_enable(struct cs_hsi_iface *hi,
+					struct cs_buffer_config *buf_cfg)
+{
+	unsigned int data_start, i;
+
+	BUG_ON(hi->buf_size == 0);
+
+	set_buffer_sizes(hi, buf_cfg->rx_bufs, buf_cfg->tx_bufs);
+
+	hi->slot_size = L1_CACHE_ALIGN(hi->buf_size);
+	dev_dbg(&hi->cl->device,
+			"setting slot size to %u, buf size %u, align %u\n",
+			hi->slot_size, hi->buf_size, L1_CACHE_BYTES);
+
+	data_start = L1_CACHE_ALIGN(sizeof(*hi->mmap_cfg));
+	dev_dbg(&hi->cl->device,
+			"setting data start at %u, cfg block %u, align %u\n",
+			data_start, sizeof(*hi->mmap_cfg), L1_CACHE_BYTES);
+
+	for (i = 0; i < hi->mmap_cfg->rx_bufs; i++) {
+		hi->rx_offsets[i] = data_start + i * hi->slot_size;
+		hi->mmap_cfg->rx_offsets[i] = hi->rx_offsets[i];
+		dev_dbg(&hi->cl->device, "DL buf #%u at %u\n",
+					i, hi->rx_offsets[i]);
+	}
+	for (i = 0; i < hi->mmap_cfg->tx_bufs; i++) {
+		hi->tx_offsets[i] = data_start +
+			(i + hi->mmap_cfg->rx_bufs) * hi->slot_size;
+		hi->mmap_cfg->tx_offsets[i] = hi->tx_offsets[i];
+		dev_dbg(&hi->cl->device, "UL buf #%u at %u\n",
+					i, hi->rx_offsets[i]);
+	}
+
+	hi->iface_state = CS_STATE_CONFIGURED;
+}
+
+static void cs_hsi_data_disable(struct cs_hsi_iface *hi, int old_state)
+{
+	if (old_state == CS_STATE_CONFIGURED) {
+		dev_dbg(&hi->cl->device,
+			"closing data channel with slot size 0\n");
+		hi->iface_state = CS_STATE_OPENED;
+	}
+}
+
+static int cs_hsi_buf_config(struct cs_hsi_iface *hi,
+					struct cs_buffer_config *buf_cfg)
+{
+	int r = 0;
+	unsigned int old_state = hi->iface_state;
+
+	spin_lock_bh(&hi->lock);
+	/* Prevent new transactions during buffer reconfig */
+	if (old_state == CS_STATE_CONFIGURED)
+		hi->iface_state = CS_STATE_OPENED;
+	spin_unlock_bh(&hi->lock);
+
+	/*
+	 * make sure that no non-zero data reads are ongoing before
+	 * proceeding to change the buffer layout
+	 */
+	r = cs_hsi_data_sync(hi);
+	if (r < 0)
+		return r;
+
+	WARN_ON(cs_state_xfer_active(hi->data_state));
+
+	spin_lock_bh(&hi->lock);
+	r = check_buf_params(hi, buf_cfg);
+	if (r < 0)
+		goto error;
+
+	hi->buf_size = buf_cfg->buf_size;
+	hi->mmap_cfg->buf_size = hi->buf_size;
+	hi->flags = buf_cfg->flags;
+
+	hi->rx_slot = 0;
+	hi->tx_slot = 0;
+	hi->slot_size = 0;
+
+	if (hi->buf_size)
+		cs_hsi_data_enable(hi, buf_cfg);
+	else
+		cs_hsi_data_disable(hi, old_state);
+
+	spin_unlock_bh(&hi->lock);
+
+	if (old_state != hi->iface_state) {
+		if (hi->iface_state == CS_STATE_CONFIGURED) {
+			pm_qos_add_request(&hi->pm_qos_req,
+				PM_QOS_CPU_DMA_LATENCY,
+				CS_QOS_LATENCY_FOR_DATA_USEC);
+			local_bh_disable();
+			cs_hsi_read_on_data(hi);
+			local_bh_enable();
+		} else if (old_state == CS_STATE_CONFIGURED) {
+			pm_qos_remove_request(&hi->pm_qos_req);
+		}
+	}
+	return r;
+
+error:
+	spin_unlock_bh(&hi->lock);
+	return r;
+}
+
+static int cs_hsi_start(struct cs_hsi_iface **hi, struct hsi_client *cl,
+			unsigned long mmap_base, unsigned long mmap_size)
+{
+	int err = 0;
+	struct cs_hsi_iface *hsi_if = kzalloc(sizeof(*hsi_if), GFP_KERNEL);
+
+	dev_dbg(&cl->device, "cs_hsi_start\n");
+
+	if (!hsi_if) {
+		err = -ENOMEM;
+		goto leave0;
+	}
+	spin_lock_init(&hsi_if->lock);
+	hsi_if->cl = cl;
+	hsi_if->iface_state = CS_STATE_CLOSED;
+	hsi_if->mmap_cfg = (struct cs_mmap_config_block *)mmap_base;
+	hsi_if->mmap_base = mmap_base;
+	hsi_if->mmap_size = mmap_size;
+	memset(hsi_if->mmap_cfg, 0, sizeof(*hsi_if->mmap_cfg));
+	init_waitqueue_head(&hsi_if->datawait);
+	err = cs_alloc_cmds(hsi_if);
+	if (err < 0) {
+		dev_err(&cl->device, "Unable to alloc HSI messages\n");
+		goto leave1;
+	}
+	err = cs_hsi_alloc_data(hsi_if);
+	if (err < 0) {
+		dev_err(&cl->device, "Unable to alloc HSI messages for data\n");
+		goto leave2;
+	}
+	err = hsi_claim_port(cl, 1);
+	if (err < 0) {
+		dev_err(&cl->device,
+				"Could not open, HSI port already claimed\n");
+		goto leave3;
+	}
+	hsi_if->master = ssip_slave_get_master(cl);
+	if (IS_ERR(hsi_if->master)) {
+		dev_err(&cl->device, "Could not get HSI master client\n");
+		goto leave4;
+	}
+	if (!ssip_slave_running(hsi_if->master)) {
+		err = -ENODEV;
+		dev_err(&cl->device,
+				"HSI port not initialized\n");
+		goto leave4;
+	}
+
+	hsi_if->iface_state = CS_STATE_OPENED;
+	local_bh_disable();
+	cs_hsi_read_on_control(hsi_if);
+	local_bh_enable();
+
+	dev_dbg(&cl->device, "cs_hsi_start...done\n");
+
+	BUG_ON(!hi);
+	*hi = hsi_if;
+
+	return 0;
+
+leave4:
+	hsi_release_port(cl);
+leave3:
+	cs_hsi_free_data(hsi_if);
+leave2:
+	cs_free_cmds(hsi_if);
+leave1:
+	kfree(hsi_if);
+leave0:
+	dev_dbg(&cl->device, "cs_hsi_start...done/error\n\n");
+
+	return err;
+}
+
+static void cs_hsi_stop(struct cs_hsi_iface *hi)
+{
+	dev_dbg(&hi->cl->device, "cs_hsi_stop\n");
+	cs_hsi_set_wakeline(hi, 0);
+	ssip_slave_put_master(hi->master);
+
+	/* hsi_release_port() needs to be called with CS_STATE_CLOSED */
+	hi->iface_state = CS_STATE_CLOSED;
+	hsi_release_port(hi->cl);
+
+	/*
+	 * hsi_release_port() should flush out all the pending
+	 * messages, so cs_state_idle() should be true for both
+	 * control and data channels.
+	 */
+	WARN_ON(!cs_state_idle(hi->control_state));
+	WARN_ON(!cs_state_idle(hi->data_state));
+
+	if (pm_qos_request_active(&hi->pm_qos_req))
+		pm_qos_remove_request(&hi->pm_qos_req);
+
+	spin_lock_bh(&hi->lock);
+	cs_hsi_free_data(hi);
+	cs_free_cmds(hi);
+	spin_unlock_bh(&hi->lock);
+	kfree(hi);
+}
+
+static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct cs_char *csdata = vma->vm_private_data;
+	struct page *page;
+
+	page = virt_to_page(csdata->mmap_base);
+	get_page(page);
+	vmf->page = page;
+
+	return 0;
+}
+
+static struct vm_operations_struct cs_char_vm_ops = {
+	.fault	= cs_char_vma_fault,
+};
+
+static int cs_char_fasync(int fd, struct file *file, int on)
+{
+	struct cs_char *csdata = file->private_data;
+
+	if (fasync_helper(fd, file, on, &csdata->async_queue) < 0)
+		return -EIO;
+
+	return 0;
+}
+
+static unsigned int cs_char_poll(struct file *file, poll_table *wait)
+{
+	struct cs_char *csdata = file->private_data;
+	unsigned int ret = 0;
+
+	poll_wait(file, &cs_char_data.wait, wait);
+	spin_lock_bh(&csdata->lock);
+	if (!list_empty(&csdata->chardev_queue))
+		ret = POLLIN | POLLRDNORM;
+	else if (!list_empty(&csdata->dataind_queue))
+		ret = POLLIN | POLLRDNORM;
+	spin_unlock_bh(&csdata->lock);
+
+	return ret;
+}
+
+static ssize_t cs_char_read(struct file *file, char __user *buf, size_t count,
+								loff_t *unused)
+{
+	struct cs_char *csdata = file->private_data;
+	u32 data;
+	ssize_t retval;
+
+	if (count < sizeof(data))
+		return -EINVAL;
+
+	for (;;) {
+		DEFINE_WAIT(wait);
+
+		spin_lock_bh(&csdata->lock);
+		if (!list_empty(&csdata->chardev_queue)) {
+			data = cs_pop_entry(&csdata->chardev_queue);
+		} else if (!list_empty(&csdata->dataind_queue)) {
+			data = cs_pop_entry(&csdata->dataind_queue);
+			csdata->dataind_pending--;
+		} else {
+			data = 0;
+		}
+		spin_unlock_bh(&csdata->lock);
+
+		if (data)
+			break;
+		if (file->f_flags & O_NONBLOCK) {
+			retval = -EAGAIN;
+			goto out;
+		} else if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			goto out;
+		}
+		prepare_to_wait_exclusive(&csdata->wait, &wait,
+						TASK_INTERRUPTIBLE);
+		schedule();
+		finish_wait(&csdata->wait, &wait);
+	}
+
+	retval = put_user(data, (u32 __user *)buf);
+	if (!retval)
+		retval = sizeof(data);
+
+out:
+	return retval;
+}
+
+static ssize_t cs_char_write(struct file *file, const char __user *buf,
+						size_t count, loff_t *unused)
+{
+	struct cs_char *csdata = file->private_data;
+	u32 data;
+	int err;
+	ssize_t	retval;
+
+	if (count < sizeof(data))
+		return -EINVAL;
+
+	if (get_user(data, (u32 __user *)buf))
+		retval = -EFAULT;
+	else
+		retval = count;
+
+	err = cs_hsi_command(csdata->hi, data);
+	if (err < 0)
+		retval = err;
+
+	return retval;
+}
+
+static long cs_char_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	struct cs_char *csdata = file->private_data;
+	int r = 0;
+
+	switch (cmd) {
+	case CS_GET_STATE: {
+		unsigned int state;
+
+		state = cs_hsi_get_state(csdata->hi);
+		if (copy_to_user((void __user *)arg, &state, sizeof(state)))
+			r = -EFAULT;
+
+		break;
+	}
+	case CS_SET_WAKELINE: {
+		unsigned int state;
+
+		if (copy_from_user(&state, (void __user *)arg, sizeof(state))) {
+			r = -EFAULT;
+			break;
+		}
+
+		if (state > 1) {
+			r = -EINVAL;
+			break;
+		}
+
+		cs_hsi_set_wakeline(csdata->hi, !!state);
+
+		break;
+	}
+	case CS_GET_IF_VERSION: {
+		unsigned int ifver = CS_IF_VERSION;
+
+		if (copy_to_user((void __user *)arg, &ifver, sizeof(ifver)))
+			r = -EFAULT;
+
+		break;
+	}
+	case CS_CONFIG_BUFS: {
+		struct cs_buffer_config buf_cfg;
+
+		if (copy_from_user(&buf_cfg, (void __user *)arg,
+							sizeof(buf_cfg)))
+			r = -EFAULT;
+		else
+			r = cs_hsi_buf_config(csdata->hi, &buf_cfg);
+
+		break;
+	}
+	default:
+		r = -ENOTTY;
+		break;
+	}
+
+	return r;
+}
+
+static int cs_char_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+
+	if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) != 1)
+		return -EINVAL;
+
+	vma->vm_flags |= VM_IO | VM_DONTDUMP | VM_DONTEXPAND;
+	vma->vm_ops = &cs_char_vm_ops;
+	vma->vm_private_data = file->private_data;
+
+	return 0;
+}
+
+static int cs_char_open(struct inode *unused, struct file *file)
+{
+	int ret = 0;
+	unsigned long p;
+
+	spin_lock_bh(&cs_char_data.lock);
+	if (cs_char_data.opened) {
+		ret = -EBUSY;
+		spin_unlock_bh(&cs_char_data.lock);
+		goto out1;
+	}
+	cs_char_data.opened = 1;
+	cs_char_data.dataind_pending = 0;
+	spin_unlock_bh(&cs_char_data.lock);
+
+	p = get_zeroed_page(GFP_KERNEL);
+	if (!p) {
+		ret = -ENOMEM;
+		goto out2;
+	}
+
+	ret = cs_hsi_start(&cs_char_data.hi, cs_char_data.cl, p, CS_MMAP_SIZE);
+	if (ret) {
+		dev_err(&cs_char_data.cl->device, "Unable to initialize HSI\n");
+		goto out3;
+	}
+
+	/* these are only used in release so lock not needed */
+	cs_char_data.mmap_base = p;
+	cs_char_data.mmap_size = CS_MMAP_SIZE;
+
+	file->private_data = &cs_char_data;
+
+	return 0;
+
+out3:
+	free_page(p);
+out2:
+	spin_lock_bh(&cs_char_data.lock);
+	cs_char_data.opened = 0;
+	spin_unlock_bh(&cs_char_data.lock);
+out1:
+	return ret;
+}
+
+static void cs_free_char_queue(struct list_head *head)
+{
+	struct char_queue *entry;
+	struct list_head *cursor, *next;
+
+	if (!list_empty(head)) {
+		list_for_each_safe(cursor, next, head) {
+			entry = list_entry(cursor, struct char_queue, list);
+			list_del(&entry->list);
+			kfree(entry);
+		}
+	}
+
+}
+
+static int cs_char_release(struct inode *unused, struct file *file)
+{
+	struct cs_char *csdata = file->private_data;
+
+	cs_hsi_stop(csdata->hi);
+	spin_lock_bh(&csdata->lock);
+	csdata->hi = NULL;
+	free_page(csdata->mmap_base);
+	cs_free_char_queue(&csdata->chardev_queue);
+	cs_free_char_queue(&csdata->dataind_queue);
+	csdata->opened = 0;
+	spin_unlock_bh(&csdata->lock);
+
+	return 0;
+}
+
+static const struct file_operations cs_char_fops = {
+	.owner		= THIS_MODULE,
+	.read		= cs_char_read,
+	.write		= cs_char_write,
+	.poll		= cs_char_poll,
+	.unlocked_ioctl	= cs_char_ioctl,
+	.mmap		= cs_char_mmap,
+	.open		= cs_char_open,
+	.release	= cs_char_release,
+	.fasync		= cs_char_fasync,
+};
+
+static struct miscdevice cs_char_miscdev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "cmt_speech",
+	.fops	= &cs_char_fops
+};
+
+static int cs_hsi_client_probe(struct device *dev)
+{
+	int err = 0;
+	struct hsi_client *cl = to_hsi_client(dev);
+
+	dev_dbg(dev, "hsi_client_probe\n");
+	init_waitqueue_head(&cs_char_data.wait);
+	spin_lock_init(&cs_char_data.lock);
+	cs_char_data.opened = 0;
+	cs_char_data.cl = cl;
+	cs_char_data.hi = NULL;
+	INIT_LIST_HEAD(&cs_char_data.chardev_queue);
+	INIT_LIST_HEAD(&cs_char_data.dataind_queue);
+
+	cs_char_data.channel_id_cmd = hsi_get_channel_id_by_name(cl,
+		"speech-control");
+	if (cs_char_data.channel_id_cmd < 0) {
+		err = cs_char_data.channel_id_cmd;
+		dev_err(dev, "Could not get cmd channel (%d)\n", err);
+		return err;
+	}
+
+	cs_char_data.channel_id_data = hsi_get_channel_id_by_name(cl,
+		"speech-data");
+	if (cs_char_data.channel_id_data < 0) {
+		err = cs_char_data.channel_id_data;
+		dev_err(dev, "Could not get data channel (%d)\n", err);
+		return err;
+	}
+
+	err = misc_register(&cs_char_miscdev);
+	if (err)
+		dev_err(dev, "Failed to register: %d\n", err);
+
+	return err;
+}
+
+static int cs_hsi_client_remove(struct device *dev)
+{
+	struct cs_hsi_iface *hi;
+
+	dev_dbg(dev, "hsi_client_remove\n");
+	misc_deregister(&cs_char_miscdev);
+	spin_lock_bh(&cs_char_data.lock);
+	hi = cs_char_data.hi;
+	cs_char_data.hi = NULL;
+	spin_unlock_bh(&cs_char_data.lock);
+	if (hi)
+		cs_hsi_stop(hi);
+
+	return 0;
+}
+
+static struct hsi_client_driver cs_hsi_driver = {
+	.driver = {
+		.name	= "cmt-speech",
+		.owner	= THIS_MODULE,
+		.probe	= cs_hsi_client_probe,
+		.remove	= cs_hsi_client_remove,
+	},
+};
+
+static int __init cs_char_init(void)
+{
+	pr_info("CMT speech driver added\n");
+	return hsi_register_client_driver(&cs_hsi_driver);
+}
+module_init(cs_char_init);
+
+static void __exit cs_char_exit(void)
+{
+	hsi_unregister_client_driver(&cs_hsi_driver);
+	pr_info("CMT speech driver removed\n");
+}
+module_exit(cs_char_exit);
+
+MODULE_ALIAS("hsi:cmt-speech");
+MODULE_AUTHOR("Kai Vehmanen <kai.vehmanen-xNZwKgViW5gAvxtiuMwx3w@public.gmane.org>");
+MODULE_AUTHOR("Peter Ujfalusi <peter.ujfalusi-xNZwKgViW5gAvxtiuMwx3w@public.gmane.org>");
+MODULE_DESCRIPTION("CMT speech driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/uapi/linux/hsi/Kbuild b/include/uapi/linux/hsi/Kbuild
index 30ab3cd..a16a005 100644
--- a/include/uapi/linux/hsi/Kbuild
+++ b/include/uapi/linux/hsi/Kbuild
@@ -1,2 +1,2 @@
 # UAPI Header export list
-header-y += hsi_char.h
+header-y += hsi_char.h cs-protocol.h
diff --git a/include/uapi/linux/hsi/cs-protocol.h b/include/uapi/linux/hsi/cs-protocol.h
new file mode 100644
index 0000000..4957bba
--- /dev/null
+++ b/include/uapi/linux/hsi/cs-protocol.h
@@ -0,0 +1,113 @@
+/*
+ * cmt-speech interface definitions
+ *
+ * Copyright (C) 2008,2009,2010 Nokia Corporation. All rights reserved.
+ *
+ * Contact: Kai Vehmanen <kai.vehmanen-xNZwKgViW5gAvxtiuMwx3w@public.gmane.org>
+ * Original author: Peter Ujfalusi <peter.ujfalusi-xNZwKgViW5gAvxtiuMwx3w@public.gmane.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#ifndef _CS_PROTOCOL_H
+#define _CS_PROTOCOL_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/* chardev parameters */
+#define CS_DEV_FILE_NAME		"/dev/cmt_speech"
+
+/* user-space API versioning */
+#define CS_IF_VERSION			2
+
+/* APE kernel <-> user space messages */
+#define CS_CMD_SHIFT			28
+#define CS_DOMAIN_SHIFT			24
+
+#define CS_CMD_MASK			0xff000000
+#define CS_PARAM_MASK			0xffffff
+
+#define CS_CMD(id, dom) \
+	(((id) << CS_CMD_SHIFT) | ((dom) << CS_DOMAIN_SHIFT))
+
+#define CS_ERROR			CS_CMD(1, 0)
+#define CS_RX_DATA_RECEIVED		CS_CMD(2, 0)
+#define CS_TX_DATA_READY		CS_CMD(3, 0)
+#define CS_TX_DATA_SENT			CS_CMD(4, 0)
+
+/* params to CS_ERROR indication */
+#define CS_ERR_PEER_RESET		0
+
+/* ioctl interface */
+
+/* parameters to CS_CONFIG_BUFS ioctl */
+#define CS_FEAT_TSTAMP_RX_CTRL		(1 << 0)
+#define CS_FEAT_ROLLING_RX_COUNTER	(2 << 0)
+
+/* parameters to CS_GET_STATE ioctl */
+#define CS_STATE_CLOSED			0
+#define CS_STATE_OPENED			1 /* resource allocated */
+#define CS_STATE_CONFIGURED		2 /* data path active */
+
+/* maximum number of TX/RX buffers */
+#define CS_MAX_BUFFERS_SHIFT		4
+#define CS_MAX_BUFFERS			(1 << CS_MAX_BUFFERS_SHIFT)
+
+/* Parameters for setting up the data buffers */
+struct cs_buffer_config {
+	__u32 rx_bufs;	/* number of RX buffer slots */
+	__u32 tx_bufs;	/* number of TX buffer slots */
+	__u32 buf_size;	/* bytes */
+	__u32 flags;	/* see CS_FEAT_* */
+	__u32 reserved[4];
+};
+
+/*
+ * Struct describing the layout and contents of the driver mmap area.
+ * This information is meant as read-only information for the application.
+ */
+struct cs_mmap_config_block {
+	__u32 reserved1;
+	__u32 buf_size;		/* 0=disabled, otherwise the transfer size */
+	__u32 rx_bufs;		/* # of RX buffers */
+	__u32 tx_bufs;		/* # of TX buffers */
+	__u32 reserved2;
+	/* array of offsets within the mmap area for each RX and TX buffer */
+	__u32 rx_offsets[CS_MAX_BUFFERS];
+	__u32 tx_offsets[CS_MAX_BUFFERS];
+	__u32 rx_ptr;
+	__u32 rx_ptr_boundary;
+	__u32 reserved3[2];
+	/*
+	 * if enabled with CS_FEAT_TSTAMP_RX_CTRL, monotonic
+	 * timestamp taken when the last control command was received
+	 */
+	struct timespec tstamp_rx_ctrl;
+};
+
+#define CS_IO_MAGIC		'C'
+
+#define CS_IOW(num, dtype)	_IOW(CS_IO_MAGIC, num, dtype)
+#define CS_IOR(num, dtype)	_IOR(CS_IO_MAGIC, num, dtype)
+#define CS_IOWR(num, dtype)	_IOWR(CS_IO_MAGIC, num, dtype)
+#define CS_IO(num)		_IO(CS_IO_MAGIC, num)
+
+#define CS_GET_STATE		CS_IOR(21, unsigned int)
+#define CS_SET_WAKELINE		CS_IOW(23, unsigned int)
+#define CS_GET_IF_VERSION	CS_IOR(30, unsigned int)
+#define CS_CONFIG_BUFS		CS_IOW(31, struct cs_buffer_config)
+
+#endif /* _CS_PROTOCOL_H */
-- 
2.1.4

^ permalink raw reply related

* [PATCHv3 2/2] HSI: nokia-modem: Add cmt-speech support
From: Sebastian Reichel @ 2015-03-21 19:09 UTC (permalink / raw)
  To: Sebastian Reichel
  Cc: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api
In-Reply-To: <1426964957-5023-1-git-send-email-sre@kernel.org>

Register cmt-speech driver in nokia-modem driver and forward
hsi channel information.

Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/hsi/clients/Kconfig       |  2 +-
 drivers/hsi/clients/nokia-modem.c | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/drivers/hsi/clients/Kconfig b/drivers/hsi/clients/Kconfig
index 86c8495..d612620 100644
--- a/drivers/hsi/clients/Kconfig
+++ b/drivers/hsi/clients/Kconfig
@@ -6,7 +6,7 @@ comment "HSI clients"
 
 config NOKIA_MODEM
 	tristate "Nokia Modem"
-	depends on HSI && SSI_PROTOCOL
+	depends on HSI && SSI_PROTOCOL && CMT_SPEECH
 	help
 	Say Y here if you want to add support for the modem on Nokia
 	N900 (Nokia RX-51) hardware.
diff --git a/drivers/hsi/clients/nokia-modem.c b/drivers/hsi/clients/nokia-modem.c
index 9be4867..bbb1923 100644
--- a/drivers/hsi/clients/nokia-modem.c
+++ b/drivers/hsi/clients/nokia-modem.c
@@ -46,6 +46,7 @@ struct nokia_modem_device {
 	struct nokia_modem_gpio	*gpios;
 	int			gpio_amount;
 	struct hsi_client	*ssi_protocol;
+	struct hsi_client	*cmt_speech;
 };
 
 static void do_nokia_modem_rst_ind_tasklet(unsigned long data)
@@ -149,6 +150,7 @@ static int nokia_modem_probe(struct device *dev)
 	struct hsi_port *port = hsi_get_port(cl);
 	int irq, pflags, err;
 	struct hsi_board_info ssip;
+	struct hsi_board_info cmtspeech;
 
 	np = dev->of_node;
 	if (!np) {
@@ -214,12 +216,35 @@ static int nokia_modem_probe(struct device *dev)
 		goto error3;
 	}
 
-	/* TODO: register cmt-speech hsi client */
+	cmtspeech.name = "cmt-speech";
+	cmtspeech.tx_cfg = cl->tx_cfg;
+	cmtspeech.rx_cfg = cl->rx_cfg;
+	cmtspeech.platform_data = NULL;
+	cmtspeech.archdata = NULL;
+
+	modem->cmt_speech = hsi_new_client(port, &cmtspeech);
+	if (!modem->cmt_speech) {
+		dev_err(dev, "Could not register cmt-speech device\n");
+		err = -ENOMEM;
+		goto error3;
+	}
+
+	err = device_attach(&modem->cmt_speech->device);
+	if (err == 0) {
+		dev_err(dev, "Missing cmt-speech driver\n");
+		err = -EPROBE_DEFER;
+		goto error4;
+	} else if (err < 0) {
+		dev_err(dev, "Could not load cmt-speech driver (%d)\n", err);
+		goto error4;
+	}
 
 	dev_info(dev, "Registered Nokia HSI modem\n");
 
 	return 0;
 
+error4:
+	hsi_remove_client(&modem->cmt_speech->device, NULL);
 error3:
 	hsi_remove_client(&modem->ssi_protocol->device, NULL);
 error2:
@@ -238,6 +263,11 @@ static int nokia_modem_remove(struct device *dev)
 	if (!modem)
 		return 0;
 
+	if (modem->cmt_speech) {
+		hsi_remove_client(&modem->cmt_speech->device, NULL);
+		modem->cmt_speech = NULL;
+	}
+
 	if (modem->ssi_protocol) {
 		hsi_remove_client(&modem->ssi_protocol->device, NULL);
 		modem->ssi_protocol = NULL;
-- 
2.1.4


^ permalink raw reply related

* Re: [PATCHv3 0/2] N900 Modem Speech Support
From: Sebastian Reichel @ 2015-03-21 21:38 UTC (permalink / raw)
  To: Peter Ujfalusi, Kai Vehmanen, Pavel Machek, Pali Rohar,
	Aaro Koskinen, Ivaylo Dimitrov, linux-omap, linux-kernel,
	linux-api
In-Reply-To: <1426964957-5023-1-git-send-email-sre@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 287 bytes --]

Hi,

On Sat, Mar 21, 2015 at 08:09:15PM +0100, Sebastian Reichel wrote:
> [1] git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git branch/cmt-speech-2

This should actually be:

git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git branch/cmt-speech-3

-- Sebastian

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PATCH 1/1] Add virtio-input driver.
From: Michael S. Tsirkin @ 2015-03-21 22:22 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: virtio-dev, open list, open list:ABI/API, virtualization
In-Reply-To: <1426847327.32097.60.camel@nilsson.home.kraxel.org>

On Fri, Mar 20, 2015 at 11:28:47AM +0100, Gerd Hoffmann wrote:
>   Hi,
> 
> > > +static int virtinput_send_status(struct virtio_input *vi,
> > > +				 u16 type, u16 code, s32 value)
> > > +{
> > > +	struct virtio_input_event *stsbuf;
> > > +	struct scatterlist sg[1];
> > > +
> > > +	stsbuf = kzalloc(sizeof(*stsbuf), GFP_ATOMIC);
> > > +	if (!stsbuf)
> > > +		return -ENOMEM;
> > 
> > Does this return an error to userspace?
> > If so it's not a good idea I think, GFP_ATOMIC failures are
> > transient conditions and should not be reported
> > to userspace.
> > Can use GFP_KERNEL here?
> 
> Waiting for an answer from the ioput guys here.
> 
> > > +
> > > +	stsbuf->type  = cpu_to_le16(type);
> > > +	stsbuf->code  = cpu_to_le16(code);
> > > +	stsbuf->value = cpu_to_le32(value);
> > > +	sg_init_one(sg, stsbuf, sizeof(*stsbuf));
> > > +	virtqueue_add_outbuf(vi->sts, sg, 1, stsbuf, GFP_ATOMIC);
> > 
> > This can fail if queue is full. What prevents this from happening?
> 
> Nothing.  It's highly unlikely though given the throughput we have for
> input devices, not sure it is that useful to put too much effort into
> this.  Except for freeing stsbuf in the error case.
> 
> > > +	virtqueue_kick(vi->sts);
> > 
> > Also what prevents multiple virtinput_send_status calls
> > from racing with each other? Is there locking at a higher level?
> 
> I think input_dev->event_lock does this.
> 
> > > +static void virtinput_recv_status(struct virtqueue *vq)
> > > +{
> > > +	struct virtio_input *vi = vq->vdev->priv;
> > > +	struct virtio_input_event *stsbuf;
> > > +	unsigned int len;
> > > +
> > > +	while ((stsbuf = virtqueue_get_buf(vi->sts, &len)) != NULL)
> > 
> > looks like this get_buf can race with add above.
> 
> Yes, it can.
> 
> > Need some locking.
> 
> I'll add it.
> 
> > Also pls avoid != NULL, removing it makes code less verbose.
> > 
> > > +		kfree(stsbuf);
> > > +	virtqueue_kick(vq);
> > 
> > Why are you kicking here?
> 
> Hmm, it is pointless indeed.
> 
> > > +	if (select == VIRTIO_INPUT_CFG_EV_BITS)
> > > +		set_bit(subsel, vi->idev->evbit);
> > > +	for (bit = 0; bit < bitcount; bit++) {
> > > +		if ((bit % 8) == 0)
> > > +			virtio_cread(vi->vdev, struct virtio_input_config,
> > > +				     u.bitmap[bit/8], &cfg);
> > 
> > coding style violations above. you need spaces around ops like / and *.
> > Please run checkpatch.pl
> > 
> > > +		if (cfg & (1 << (bit % 8)))
> > > +			set_bit(bit, bits);
> > 
> > what if not set? does something clear the mask?
> 
> kzalloc?

So you are really just reading in array of bytes?
All this set bit trickery is just to convert things from LE?

> > > +	}
> > 
> > doesn't above just implement bitmap_copy or bitmap_or?
> 
> Not fully sure how bitmaps are defined.  virtio has a stream of bytes,
> first byte carries bits 0-7, second 8-15 etc.  linux kernel bitmaps ops
> are operating on longs, and native byteorder longs would be something
> else ...

This still looks too complex.
At least, this needs a comment explaining what the function does,
and maybe wrap it in a helper like virtio_input_bitmap_copy or
virtio_bitmap_or.


> > > +	size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ABS_INFO, abs);
> > > +	virtio_cread(vi->vdev, struct virtio_input_config, u.abs.min, &mi);
> > > +	virtio_cread(vi->vdev, struct virtio_input_config, u.abs.max, &ma);
> > > +	virtio_cread(vi->vdev, struct virtio_input_config, u.abs.fuzz, &fu);
> > > +	virtio_cread(vi->vdev, struct virtio_input_config, u.abs.flat, &fl);
> > 
> > you read le field into u32 value.
> > Please run sparse on this code. you will get a ton
> > of warnings. Same error appears elsewhere.
> 
> Indeed.  IIRC that wasn't the case a while back.  Guess those bitwise
> annotations have been added with the virtio 1.0 patches?
> 
> In any case I'll fix it up.

I see you still didn't in v2?

> > > +static int virtinput_probe(struct virtio_device *vdev)
> > > +{
> > > +	struct virtio_input *vi;
> > > +	size_t size;
> > > +	int abs, err;
> > 
> > How about checking VERSION_1 and bailing out of not there?
> 
> I don't think this is needed.  There isn't a hard dependency on virtio
> 1.0.  It's just that config space is relatively large and because of
> that I want it be 1.0 on the host (qemu) side to not allocate large
> portions of I/O address space for the legacy virtio pci bar.

You are doing leXXX everywhere, that's VERSION_1 dependency.
virtio_cread will do byteswaps differently without VERSION_1.
Just don't go there.

> > > +	vi->idev->name = vi->name;
> > > +	vi->idev->phys = vi->phys;
> > > +	vi->idev->id.bustype = BUS_VIRTUAL;
> > > +	vi->idev->id.vendor  = 0x0001;
> > > +	vi->idev->id.product = 0x0001;
> > > +	vi->idev->id.version = 0x0100;
> > 
> > Add comments explaining why these #s make sense?
> 
> See other subthread, will be changed to be host-provided (like name).
> 
> > > +	err = input_register_device(vi->idev);
> > 
> > Once you do this, virtinput_status can get called,
> > and that will kick, correct?
> 
> Correct.
> 
> > If so, you must call device_ready before this.
> 
> Ok.
> 
> > > +	if (err)
> > > +		goto out4;
> > > +
> > > +	return 0;
> > > +
> > > +out4:
> > > +	input_free_device(vi->idev);
> > > +out3:
> > > +	vdev->config->del_vqs(vdev);
> > > +out2:
> > > +	kfree(vi);
> > > +out1:
> > > +	return err;
> > 
> > free on error is out of order with initialization.
> > Might lead to leaks or other bugs.
> > Also - can you name labels something sensible pls?
> > out is usually for exiting on success too...
> > E.g. out4 -> err_register etc.
> 
> Will fix.
> 
> > > +static void virtinput_remove(struct virtio_device *vdev)
> > > +{
> > > +	struct virtio_input *vi = vdev->priv;
> > > +
> > > +	input_unregister_device(vi->idev);
> > > +	vdev->config->reset(vdev);
> > 
> > You don't really need a reset if you just to del_vqs.
> > People do this if they want to prevent interrupts
> > without deleting vqs.
> 
> Ok.
> 
> > > +	vdev->config->del_vqs(vdev);
> > > +	kfree(vi);
> > 
> > free_device seems to be missing?
> 
> Indeed, good catch.
> 
> thanks,
>   Gerd

^ permalink raw reply

* Re: [PATCH v2] Add virtio-input driver.
From: Michael S. Tsirkin @ 2015-03-21 22:23 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: virtio-dev, Dmitry Torokhov, open list:ABI/API, open list,
	virtualization, David Herrmann
In-Reply-To: <1426855170-10997-1-git-send-email-kraxel@redhat.com>

On Fri, Mar 20, 2015 at 01:39:29PM +0100, Gerd Hoffmann wrote:
> virtio-input is basically evdev-events-over-virtio, so this driver isn't
> much more than reading configuration from config space and forwarding
> incoming events to the linux input layer.
> 
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>  MAINTAINERS                       |   6 +
>  drivers/virtio/Kconfig            |  10 ++
>  drivers/virtio/Makefile           |   1 +
>  drivers/virtio/virtio_input.c     | 335 ++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/Kbuild         |   1 +
>  include/uapi/linux/virtio_ids.h   |   1 +
>  include/uapi/linux/virtio_input.h |  75 +++++++++
>  7 files changed, 429 insertions(+)
>  create mode 100644 drivers/virtio/virtio_input.c
>  create mode 100644 include/uapi/linux/virtio_input.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 0e1abe8..585e6cd 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -10435,6 +10435,12 @@ S:	Maintained
>  F:	drivers/vhost/
>  F:	include/uapi/linux/vhost.h
>  
> +VIRTIO INPUT DRIVER
> +M:	Gerd Hoffmann <kraxel@redhat.com>
> +S:	Maintained
> +F:	drivers/virtio/virtio_input.c
> +F:	include/uapi/linux/virtio_input.h
> +
>  VIA RHINE NETWORK DRIVER
>  M:	Roger Luethi <rl@hellgate.ch>
>  S:	Maintained
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index b546da5..cab9f3f 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -48,6 +48,16 @@ config VIRTIO_BALLOON
>  
>  	 If unsure, say M.
>  
> +config VIRTIO_INPUT
> +	tristate "Virtio input driver"
> +	depends on VIRTIO
> +	depends on INPUT
> +	---help---
> +	 This driver supports virtio input devices such as
> +	 keyboards, mice and tablets.
> +
> +	 If unsure, say M.
> +
>   config VIRTIO_MMIO
>  	tristate "Platform bus driver for memory mapped virtio devices"
>  	depends on HAS_IOMEM
> diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
> index d85565b..41e30e3 100644
> --- a/drivers/virtio/Makefile
> +++ b/drivers/virtio/Makefile
> @@ -4,3 +4,4 @@ obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
>  virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
>  virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
>  obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
> +obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
> diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c
> new file mode 100644
> index 0000000..dd3215e
> --- /dev/null
> +++ b/drivers/virtio/virtio_input.c
> @@ -0,0 +1,335 @@
> +#include <linux/module.h>
> +#include <linux/virtio.h>
> +#include <linux/input.h>
> +
> +#include <uapi/linux/virtio_ids.h>
> +#include <uapi/linux/virtio_input.h>
> +
> +struct virtio_input {
> +	struct virtio_device       *vdev;
> +	struct input_dev           *idev;
> +	char                       name[64];
> +	char                       serial[64];
> +	char                       phys[64];
> +	struct virtqueue           *evt, *sts;
> +	struct virtio_input_event  evts[64];
> +	struct mutex               lock;
> +};
> +
> +static ssize_t serial_show(struct device *dev,
> +			   struct device_attribute *attr, char *buf)
> +{
> +	struct input_dev *idev = to_input_dev(dev);
> +	struct virtio_input *vi = input_get_drvdata(idev);
> +
> +	return sprintf(buf, "%s\n", vi->serial);
> +}
> +static DEVICE_ATTR_RO(serial);
> +
> +static struct attribute *dev_attrs[] = {
> +	&dev_attr_serial.attr,
> +	NULL
> +};
> +
> +static umode_t dev_attrs_are_visible(struct kobject *kobj,
> +				     struct attribute *a, int n)
> +{
> +	struct device *dev = container_of(kobj, struct device, kobj);
> +	struct input_dev *idev = to_input_dev(dev);
> +	struct virtio_input *vi = input_get_drvdata(idev);
> +
> +	if (a == &dev_attr_serial.attr && !strlen(vi->serial))
> +		return 0;
> +
> +	return a->mode;
> +}
> +
> +static struct attribute_group dev_attr_grp = {
> +	.attrs =	dev_attrs,
> +	.is_visible =	dev_attrs_are_visible,
> +};
> +
> +static const struct attribute_group *dev_attr_groups[] = {
> +	&dev_attr_grp,
> +	NULL
> +};
> +
> +static void virtinput_queue_evtbuf(struct virtio_input *vi,
> +				   struct virtio_input_event *evtbuf)
> +{
> +	struct scatterlist sg[1];
> +
> +	sg_init_one(sg, evtbuf, sizeof(*evtbuf));
> +	virtqueue_add_inbuf(vi->evt, sg, 1, evtbuf, GFP_ATOMIC);
> +}
> +
> +static void virtinput_recv_events(struct virtqueue *vq)
> +{
> +	struct virtio_input *vi = vq->vdev->priv;
> +	struct virtio_input_event *event;
> +	unsigned int len;
> +
> +	mutex_lock(&vi->lock);
> +	while ((event = virtqueue_get_buf(vi->evt, &len)) != NULL) {
> +		input_event(vi->idev,
> +			    le16_to_cpu(event->type),
> +			    le16_to_cpu(event->code),
> +			    le32_to_cpu(event->value));
> +		virtinput_queue_evtbuf(vi, event);
> +	}
> +	virtqueue_kick(vq);
> +	mutex_unlock(&vi->lock);
> +}
> +
> +static int virtinput_send_status(struct virtio_input *vi,
> +				 u16 type, u16 code, s32 value)
> +{
> +	struct virtio_input_event *stsbuf;
> +	struct scatterlist sg[1];
> +	int rc;
> +
> +	stsbuf = kzalloc(sizeof(*stsbuf), GFP_ATOMIC);
> +	if (!stsbuf)
> +		return -ENOMEM;
> +
> +	stsbuf->type  = cpu_to_le16(type);
> +	stsbuf->code  = cpu_to_le16(code);
> +	stsbuf->value = cpu_to_le32(value);
> +	sg_init_one(sg, stsbuf, sizeof(*stsbuf));
> +
> +	mutex_lock(&vi->lock);
> +	rc = virtqueue_add_outbuf(vi->sts, sg, 1, stsbuf, GFP_ATOMIC);
> +	virtqueue_kick(vi->sts);
> +	mutex_unlock(&vi->lock);
> +
> +	if (rc != 0)
> +		kfree(stsbuf);
> +	return rc;
> +}
> +
> +static void virtinput_recv_status(struct virtqueue *vq)
> +{
> +	struct virtio_input *vi = vq->vdev->priv;
> +	struct virtio_input_event *stsbuf;
> +	unsigned int len;
> +
> +	mutex_lock(&vi->lock);
> +	while ((stsbuf = virtqueue_get_buf(vi->sts, &len)) != NULL)
> +		kfree(stsbuf);
> +	mutex_unlock(&vi->lock);
> +}
> +

You can't take a mutex in an interrupt callback.
since you do everything with GFP_ATOMIC, I would say
just switch lock to a spinlock.
You can virtqueue_kick if contention becomes a problem.

Sent more comments on v1.

-- 
MST

^ permalink raw reply

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
From: Aliaksey Kandratsenka @ 2015-03-22  6:06 UTC (permalink / raw)
  To: Daniel Micay
  Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
	Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
	Andy Lutomirski, google-perftools@googlegroups.com
In-Reply-To: <550A5FF8.90504@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 9888 bytes --]

On Wed, Mar 18, 2015 at 10:34 PM, Daniel Micay <danielmicay@gmail.com>
wrote:
>
> On 18/03/15 06:31 PM, Andrew Morton wrote:
> > On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
> >
> >> There was a similar patch posted before, but it doesn't get merged.
I'd like
> >> to try again if there are more discussions.
> >> http://marc.info/?l=linux-mm&m=141230769431688&w=2
> >>
> >> mremap can be used to accelerate realloc. The problem is mremap will
> >> punch a hole in original VMA, which makes specific memory allocator
> >> unable to utilize it. Jemalloc is an example. It manages memory in 4M
> >> chunks. mremap a range of the chunk will punch a hole, which other
> >> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> >> can't handle it.
> >
> > Daniel's changelog had additional details regarding the userspace
> > allocators' behaviour.  It would be best to incorporate that into your
> > changelog.
> >
> > Daniel also had microbenchmark testing results for glibc and jemalloc.
> > Can you please do this?
> >
> > I'm not seeing any testing results for tcmalloc and I'm not seeing
> > confirmation that this patch will be useful for tcmalloc.  Has anyone
> > tried it, or sought input from tcmalloc developers?
>
> TCMalloc and jemalloc are currently equally slow in this benchmark, as
> neither makes use of mremap. They're ~2-3x slower than glibc. I CC'ed
> the currently most active TCMalloc developer so they can give input
> into whether this patch would let them use it.


Hi.

Thanks for looping us in for feedback (I'm CC-ing gperftools mailing list).

Yes, that might be useful feature. (Assuming I understood it correctly) I
believe
tcmalloc would likely use:

mremap(old_ptr, move_size, move_size,
       MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
       new_ptr);

as optimized equivalent of:

memcpy(new_ptr, old_ptr, move_size);
madvise(old_ptr, move_size, MADV_DONTNEED);

And btw I find MREMAP_RETAIN name from original patch to be slightly more
intuitive than MREMAP_NOHOLE. In my humble opinion the later name does not
reflect semantic of this feature at all (assuming of course I correctly
understood what the patch does).

I do have a couple of questions about this approach however. Please feel
free to
educate me on them.

a) what is the smallest size where mremap is going to be faster ?

My initial thinking was that we'd likely use mremap in all cases where we
know
that touching destination would cause minor page faults (i.e. when
destination
chunk was MADV_DONTNEED-ed or is brand new mapping). And then also always
when
size is large enough, i.e. because "teleporting" large count of pages is
likely
to be faster than copying them.

But now I realize that it is more interesting than that. I.e. because as
Daniel
pointed out, mremap holds mmap_sem exclusively, while page faults are
holding it
for read. That could be optimized of course. Either by separate "teleport
ptes"
syscall (again, as noted by Daniel), or by having mremap drop mmap_sem for
write
and retaking it for read for "moving pages" part of work. Being not really
familiar with kernel code I have no idea if that's doable or not. But it
looks
like it might be quite important.

Another aspect where I am similarly illiterate is performance effect of tlb
flushes needed for such operation.

We can certainly experiment and find that limit. But if mremap threshold is
going to be large, then perhaps this kernel feature is not as useful as we
may
hope.

b) is that optimization worth having at all ?

After all, memcpy is actually known to be fast. I understand that copying
memory
in user space can be slowed down by minor page faults (results below seem to
confirm that). But this is something where either allocator may retain
populated
pages a bit longer or where kernel could help. E.g. maybe by exposing
something
similar to MAP_POPULATE in madvise, or even doing some safe combination of
madvise and MAP_UNINITIALIZED.

I've played with Daniel's original benchmark (copied from
http://marc.info/?l=linux-mm&m=141230769431688&w=2) with some tiny
modifications:

#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/mman.h>

int main(int argc, char **argv)
{
        if (argc > 1 && strcmp(argv[1], "--mlock") == 0) {
                int rv = mlockall(MCL_CURRENT | MCL_FUTURE);
                if (rv) {
                        perror("mlockall");
                        abort();
                }
                puts("mlocked!");
        }

        for (size_t i = 0; i < 64; i++) {
                void *ptr = NULL;
                size_t old_size = 0;
                for (size_t size = 4; size < (1 << 30); size *= 2) {
                        /*
                         * void *hole = malloc(1 << 20);
                         * if (!hole) {
                         *      perror("malloc");
                         *      abort();
                         * }
                         */
                        ptr = realloc(ptr, size);
                        if (!ptr) {
                                perror("realloc");
                                abort();
                        }
                        /* free(hole); */
                        memset(ptr + old_size, 0xff, size - old_size);
                        old_size = size;
                }
                free(ptr);
        }
}

I cannot say if this benchmark's vectors of up to 0.5 gigs are common in
important applications or not. It can be argued that apps that care about
such
large vectors can do mremap themselves.

On the other hand, I believe that this micro benchmark could be plausibly
changed to grow vector by smaller factor (i.e. see
https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md#memory-handling).
And
with smaller growth factor, is seems reasonable to expect larger overhead
from
memcpy and smaller overhead from mremap. And thus favor mremap more.

And I confirm that with all default settings tcmalloc and jemalloc lose to
glibc. Also, notably, recent dev build of jemalloc (what is going to be 4.0
AFAIK) actually matches or exceeds glibc speed, despite still not doing
mremap. Apparently it is smarter about avoiding moving allocation for those
realloc-s. And it was even able to resist my attempt to force it to move
allocation. I haven't investigated why. Note that I built it couple weeks
or so
ago from dev branch, so it might simply have bugs.

Results also vary greatly depending in transparent huge pages setting.
Here's
what I've got:

allocator |   mode    | time  | sys time | pgfaults |             extra
----------+-----------+-------+----------+----------+-------------------------------
glibc     |           | 10.75 |     8.44 |  8388770 |
glibc     |    thp    |  5.67 |     3.44 |   310882 |
glibc     |   mlock   | 13.22 |     9.41 |  8388821 |
glibc     | thp+mlock |  8.43 |     4.63 |   310933 |
tcmalloc  |           | 11.46 |     2.00 |  2104826 |
TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  |    thp    | 10.61 |     0.89 |   386206 |
TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  |   mlock   | 10.11 |     0.27 |   264721 |
TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  | thp+mlock | 10.28 |     0.17 |    46011 |
TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  |           | 23.63 |    17.16 | 16770107 |
TCMALLOC_AGGRESSIVE_DECOMMIT=t
tcmalloc  |    thp    | 11.82 |     5.14 |   352477 |
TCMALLOC_AGGRESSIVE_DECOMMIT=t
tcmalloc  |   mlock   | 10.10 |     0.28 |   264724 |
TCMALLOC_AGGRESSIVE_DECOMMIT=t
tcmalloc  | thp+mlock | 10.30 |     0.17 |    49168 |
TCMALLOC_AGGRESSIVE_DECOMMIT=t
jemalloc1 |           | 23.71 |    17.33 | 16744572 |
jemalloc1 |    thp    | 11.65 |     4.68 |    64988 |
jemalloc1 |   mlock   | 10.13 |     0.29 |   263305 |
jemalloc1 | thp+mlock | 10.05 |     0.17 |    50217 |
jemalloc2 |           | 10.87 |     8.64 |  8521796 |
jemalloc2 |    thp    |  4.64 |     2.32 |    56060 |
jemalloc2 |   mlock   |  4.22 |     0.28 |   263181 |
jemalloc2 | thp+mlock |  4.12 |     0.19 |    50411 |
----------+-----------+-------+----------+----------+-------------------------------

NOTE: usual disclaimer applies about possibility of screwing something up
and
getting invalid benchmark results without being able to see it. I apologize
in
advance.

NOTE: jemalloc1 is 3.6 as shipped by up-to-date Debian Sid. jemalloc2 is
home-built snapshot of upcoming jemalloc 4.0.

NOTE: TCMALLOC_AGGRESSIVE_DECOMMIT=t (and default since 2.4) makes tcmalloc
MADV_DONTNEED large free blocks immediately. As opposed to less rare with
setting of "false". And it makes big difference on page faults counts and
thus
on runtime.

Another notable thing is how mlock effectively disables MADV_DONTNEED for
jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
slightly better on runtime to glibc. The later spends a ton of time in
kernel,
probably handling minor page faults, and the former burns cpu in user space
doing memcpy-s. So "tons of memcpys" seems to be competitive to what glibc
is
doing in this benchmark.

THP changes things however. Where apparently minor page faults become a lot
cheaper. Which makes glibc case a lot faster than even tcmalloc+mlock case.
So
in THP case, cost of page faults is smaller than cost of large memcpy.

So results are somewhat mixed, but overall I'm not sure that I'm able to see
very convincing story for MREMAP_HOLE yet. However:

1) it is possible that I am missing something. If so, please, educate me.

2) if kernel implements this API, I'm going to use it in tcmalloc.

P.S. benchmark results also seem to indicate that tcmalloc could do
something to
explicitly enable THP and maybe better adapt to it's presence. Perhaps with
some
collaboration with kernel, i.e. to prevent that famous delay-ful-ness which
causes people to disable THP.

[-- Attachment #2: Type: text/html, Size: 11931 bytes --]

^ permalink raw reply

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
From: Daniel Micay @ 2015-03-22  7:22 UTC (permalink / raw)
  To: Aliaksey Kandratsenka
  Cc: Andrew Morton, Shaohua Li, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org
In-Reply-To: <CADpJO7zBLhjecbiQeTubnTReiicVLr0-K43KbB4uCL5w_dyqJg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 7020 bytes --]

> Yes, that might be useful feature. (Assuming I understood it correctly)
> I believe
> tcmalloc would likely use:
> 
> mremap(old_ptr, move_size, move_size,
>        MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
>        new_ptr);
> 
> as optimized equivalent of:
> 
> memcpy(new_ptr, old_ptr, move_size);
> madvise(old_ptr, move_size, MADV_DONTNEED);

Yeah, it's essentially an optimized memcpy for when you don't need the
source allocation anymore.

> a) what is the smallest size where mremap is going to be faster ?

There are probably a lot of variables here like the CPU design and the
speed of system calls (syscall auditing makes them much slower!) in
addition to the stuff you've pointed out.

> My initial thinking was that we'd likely use mremap in all cases where
> we know
> that touching destination would cause minor page faults (i.e. when
> destination
> chunk was MADV_DONTNEED-ed or is brand new mapping). And then also
> always when
> size is large enough, i.e. because "teleporting" large count of pages is
> likely
> to be faster than copying them.
> 
> But now I realize that it is more interesting than that. I.e. because as
> Daniel
> pointed out, mremap holds mmap_sem exclusively, while page faults are
> holding it
> for read. That could be optimized of course. Either by separate
> "teleport ptes"
> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem
> for write
> and retaking it for read for "moving pages" part of work. Being not really
> familiar with kernel code I have no idea if that's doable or not. But it
> looks
> like it might be quite important.

I think it's doable but it would pessimize the case where the dest VMA
isn't reusable. It would need to optimistically take the reader lock to
find out and then drop it. However, userspace knows when this is surely
going to work and could give it a hint.

I have a good idea about what the *ideal* API for the jemalloc/tcmalloc
case would be. It would be extremely specific though... they want the
kernel to move pages from a source VMA to a destination VMA where both
are anon/private with identical flags so only the reader lock is
necessary. On top of that, they really want to keep around as many
destination pages as possible, maybe by swapping as many as possible
back to the source.

That's *extremely* specific though and I now think the best way to get
there is by landing this feature and then extending it as necessary down
the road. An allocator may actually want to manage other kinds of
mappings itself and it would want the mmap_sem optimization to be an
optional hint.

> And I confirm that with all default settings tcmalloc and jemalloc lose to
> glibc. Also, notably, recent dev build of jemalloc (what is going to be 4.0
> AFAIK) actually matches or exceeds glibc speed, despite still not doing
> mremap. Apparently it is smarter about avoiding moving allocation for those
> realloc-s. And it was even able to resist my attempt to force it to move
> allocation. I haven't investigated why. Note that I built it couple
> weeks or so
> ago from dev branch, so it might simply have bugs.

I submitted patches teaching jemalloc to expand/shrink huge allocations
in-place, so it's hitting the in-place resize path after the initial
iteration on a repeated reallocation benchmark that's not doing any
other allocations.

In jemalloc, everything is allocated via naturally aligned chunks (4M
before, recently down to 256k in master) so if you want to block
in-place huge reallocation you'll either need to force a new non-huge
chunk to be allocated or make one that's at least as large as the chunk
size.

I don't think in-place reallocation is very common in long-running
programs. It's probably more common now that jemalloc is experimenting
with first-fit for chunk/huge allocation rather than address-ordered
best-fit. The best-fit algorithm is designed to keep the opportunity for
in-place reallocation to a minimum, although address ordering does
counter it :).

> NOTE: TCMALLOC_AGGRESSIVE_DECOMMIT=t (and default since 2.4) makes tcmalloc
> MADV_DONTNEED large free blocks immediately. As opposed to less rare with
> setting of "false". And it makes big difference on page faults counts
> and thus
> on runtime.
> 
> Another notable thing is how mlock effectively disables MADV_DONTNEED for
> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
> slightly better on runtime to glibc. The later spends a ton of time in
> kernel,
> probably handling minor page faults, and the former burns cpu in user space
> doing memcpy-s. So "tons of memcpys" seems to be competitive to what
> glibc is
> doing in this benchmark.

When I taught jemalloc to use the MREMAP_RETAIN flag it was getting
significant wins over glibc, so this might be caused by the time spent
managing metadata, etc.

> THP changes things however. Where apparently minor page faults become a lot
> cheaper. Which makes glibc case a lot faster than even tcmalloc+mlock
> case. So
> in THP case, cost of page faults is smaller than cost of large memcpy.
> 
> So results are somewhat mixed, but overall I'm not sure that I'm able to see
> very convincing story for MREMAP_HOLE yet. However:
> 
> 1) it is possible that I am missing something. If so, please, educate me.
> 
> 2) if kernel implements this API, I'm going to use it in tcmalloc.
> 
> P.S. benchmark results also seem to indicate that tcmalloc could do
> something to
> explicitly enable THP and maybe better adapt to it's presence. Perhaps
> with some
> collaboration with kernel, i.e. to prevent that famous delay-ful-ness which
> causes people to disable THP.

BTW, THP currently interacts very poorly with the jemalloc/tcmalloc
madvise purging. The part where khugepaged assigns huge pages to dense
spans of pages is *great*. The part where the kernel hands out a huge
page on for a fault in a 2M span can be awful. It causes the model
inside the allocator of uncommitted vs. committed pages to break down.

For example, the allocator might use 1M of a huge page and then start
purging. The purging will split it into 4k pages, so there will be 1M of
zeroed 4k pages that are considered purged by the allocator. Over time,
this can cripple purging. Search for "jemalloc huge pages" and you'll
find lots of horror stories about this.

I think a THP implementation playing that played well with purging would
need to drop the page fault heuristic and rely on a significantly better
khugepaged. This would mean faulting in a span of memory would no longer
be faster. Having a flag to populate a range with madvise would help a
lot though, since the allocator knows exactly how much it's going to
clobber with the memcpy. There will still be a threshold where mremap
gets significantly faster, but it would move it higher.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: Re: [PATCH v9 tip 3/9] tracing: attach BPF programs to kprobes
From: Masami Hiramatsu @ 2015-03-22 10:06 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Steven Rostedt, Namhyung Kim,
	Arnaldo Carvalho de Melo, Jiri Olsa, David S. Miller,
	Daniel Borkmann, Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <550D962E.7010400-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

(2015/03/22 1:02), Alexei Starovoitov wrote:
> On 3/21/15 5:14 AM, Masami Hiramatsu wrote:
>> (2015/03/21 8:30), Alexei Starovoitov wrote:
>>>
>>> Note, kprobes are _not_ a stable kernel ABI, so bpf programs attached to
>>> kprobes must be recompiled for every kernel version and user must supply correct
>>> LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call.
>>>
>>
>> Would you mean that the ABI of kprobe-based BPF programs? Kprobe API/ABIs
>> (register_kprobe() etc.) are stable, but the code who use kprobes certainly
>> depends the kernel binary by design. So, if you meant it, BPF programs must
>> be recompiled for every kernel binaries (including configuration changes,
>> not only its version).
> 
> yes. I mainly meant that bpf+kprobe programs must be recompiled
> for every kernel binary.

Hmm, if so, as we do in perf (and systemtap too), you'd better check
kernel's build-id instead of the kernel version when loading the
BPF program. It is safer than the KERNEL_VERSION_CODE.

> But you're incorrect saying that register_kprobe API is stable.
> It's equally kernel dependent.
> register_kprobe(struct kprobe *p) is export_gpl, but it takes
> kernel internal 'struct kprobe' and it's not declared in uapi header.
> Prototype of kprobe_handler_t is also kernel internal, so whoever
> is using kprobes must recompile their code every time.
> If we want, we can change register_kprobe function name to something
> else. Just like kernel modules cannot expect that exported symbols
> will stay around from version to version. We don't care when we
> break out of tree modules.

Indeed the register_kprobe (and other kprobe related functions)
should not be treated as a stable userland exported ABI/API. :)

Thank you,

-- 
Masami HIRAMATSU
Software Platform Research Dept. Linux Technology Research Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hiramatsu.pt-FCd8Q96Dh0JBDgjK7y7TUQ@public.gmane.org

^ permalink raw reply

* Re: [PATCH v9 tip 5/9] tracing: allow BPF programs to call bpf_trace_printk()
From: Ingo Molnar @ 2015-03-22 11:10 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1426894210-27441-6-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>


* Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:

> +static const struct bpf_func_proto bpf_trace_printk_proto = {
> +	.func = bpf_trace_printk,
> +	.gpl_only = true,
> +	.ret_type = RET_INTEGER,
> +	.arg1_type = ARG_PTR_TO_STACK,
> +	.arg2_type = ARG_CONST_STACK_SIZE,
> +};

A nit, please align such initializations vertically, for more 
readability:

static const struct bpf_func_proto bpf_trace_printk_proto = {
	.func		= bpf_trace_printk,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_STACK,
	.arg2_type	= ARG_CONST_STACK_SIZE,
};

(this applies to other patches as well.)

Thanks,

	Ingo

^ permalink raw reply

* Re: [PATCH v9 tip 3/9] tracing: attach BPF programs to kprobes
From: Alexei Starovoitov @ 2015-03-22 18:03 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Ingo Molnar, Steven Rostedt, Namhyung Kim,
	Arnaldo Carvalho de Melo, Jiri Olsa, David S. Miller,
	Daniel Borkmann, Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <550E9421.7030507@hitachi.com>

On 3/22/15 3:06 AM, Masami Hiramatsu wrote:
> (2015/03/22 1:02), Alexei Starovoitov wrote:
>> On 3/21/15 5:14 AM, Masami Hiramatsu wrote:
>>> (2015/03/21 8:30), Alexei Starovoitov wrote:
>>>>
>>>> Note, kprobes are _not_ a stable kernel ABI, so bpf programs attached to
>>>> kprobes must be recompiled for every kernel version and user must supply correct
>>>> LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call.
>>>>
>>>
>>> Would you mean that the ABI of kprobe-based BPF programs? Kprobe API/ABIs
>>> (register_kprobe() etc.) are stable, but the code who use kprobes certainly
>>> depends the kernel binary by design. So, if you meant it, BPF programs must
>>> be recompiled for every kernel binaries (including configuration changes,
>>> not only its version).
>>
>> yes. I mainly meant that bpf+kprobe programs must be recompiled
>> for every kernel binary.
> 
> Hmm, if so, as we do in perf (and systemtap too), you'd better check
> kernel's build-id instead of the kernel version when loading the
> BPF program. It is safer than the KERNEL_VERSION_CODE.

It's not about safety. As I mentioned in cover letter:
"version check is not used for safety, but for enforcing 'non-ABI-ness'"
In other words it's like check-box next to 'terms and conditions'
paragraph that the user has to click before he can continue.
By providing 'kern_version' during loading the user accepts the fact
that bpf+kprobe is not a stable ABI. Nothing more and nothing less.
build-id cannot achieve that, because it cannot be checked from inside
the kernel.
User space tools that will compile ktap/dtrace scripts into bpf might
use build-id for their own purpose, but that's a different discussion.

^ permalink raw reply

* Re: [PATCH v9 tip 5/9] tracing: allow BPF programs to call bpf_trace_printk()
From: Alexei Starovoitov @ 2015-03-22 18:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150322111040.GA18695-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On 3/22/15 4:10 AM, Ingo Molnar wrote:
>
> * Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:
>
>> +static const struct bpf_func_proto bpf_trace_printk_proto = {
>> +	.func = bpf_trace_printk,
>> +	.gpl_only = true,
>> +	.ret_type = RET_INTEGER,
>> +	.arg1_type = ARG_PTR_TO_STACK,
>> +	.arg2_type = ARG_CONST_STACK_SIZE,
>> +};
>
> A nit, please align such initializations vertically, for more
> readability:
>
> static const struct bpf_func_proto bpf_trace_printk_proto = {
> 	.func		= bpf_trace_printk,
> 	.gpl_only	= true,
> 	.ret_type	= RET_INTEGER,
> 	.arg1_type	= ARG_PTR_TO_STACK,
> 	.arg2_type	= ARG_CONST_STACK_SIZE,
> };

sure. will respin.

^ permalink raw reply

* Re: [PATCH] seccomp.2: Add note about alarm(2) not being sufficient to limit runtime
From: Michael Kerrisk (man-pages) @ 2015-03-22 19:28 UTC (permalink / raw)
  To: Jann Horn, Mikael Pettersson
  Cc: mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
	linux-man-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Russell King,
	Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, x86-DgEjT+Ai2ygdnm+yROfE0A, Jeff Dike,
	Richard Weinberger, Kees Cook, Andy Lutomirski, Will Drewry
In-Reply-To: <20150312130701.GA11073-J1fxOzX/cBvk1uMJSBkQmQ@public.gmane.org>

Hello Jann,

On 03/12/2015 02:07 PM, Jann Horn wrote:
> On Wed, Mar 11, 2015 at 10:43:50PM +0100, Mikael Pettersson wrote:
>> Jann Horn writes:
>>  > Or should I throw this patch away and write a patch
>>  > for the prctl() manpage instead that documents that
>>  > being able to call sigreturn() implies being able to
>>  > effectively call sigprocmask(), at least on some
>>  > architectures like X86?
>>
>> Well, that is the semantics of sigreturn().  It is essentially
>> setcontext() [which includes the actions of sigprocmask()], but
>> with restrictions on parameter placement (at least on x86).
>>
>> You could introduce some setting to restrict that aspect for
>> seccomp processes, but you can't change this for normal processes
>> without breaking things.
> 
> Then I think it's probably better and easier to just document the existing
> behavior? If a new setting would have to be introduced and developers would
> need to be aware of that, it's probably easier to just tell everyone to use
> SIGKILL.
> 
> Does this manpage patch look good?

Patch applied, with Acks from Andy, Mikael, and Kees (I don't
usually get patches whose pedigree is that good. Thanks!)

I tweaked a few wordings. You can find the changes in Git [1]

Cheers,

Michael

[1] 
http://git.kernel.org/cgit/docs/man-pages/man-pages.git/commit/?id=65be1b46fb88e14f0af494ac6b53a2d6a63bb860

> ---
>  man2/seccomp.2 | 25 +++++++++++++++++++++++++
>  1 file changed, 25 insertions(+)
> 
> diff --git a/man2/seccomp.2 b/man2/seccomp.2
> index 702ceb8..f762d07 100644
> --- a/man2/seccomp.2
> +++ b/man2/seccomp.2
> @@ -64,6 +64,31 @@ Strict secure computing mode is useful for number-crunching
>  applications that may need to execute untrusted byte code, perhaps
>  obtained by reading from a pipe or socket.
>  
> +Note that although the calling thread can no longer call
> +.BR sigprocmask (2),
> +it can use
> +.BR sigreturn (2)
> +to block all signals apart from
> +.BR SIGKILL
> +and
> +.BR SIGSTOP .
> +Therefore, to reliably terminate it,
> +.BR SIGKILL
> +has to be used, meaning that e.g.
> +.BR alarm (2)
> +is not sufficient for restricting its runtime. Instead, use
> +.BR timer_create (2)
> +with
> +.BR SIGEV_SIGNAL
> +and
> +.BR sigev_signo
> +set to
> +.BR SIGKILL
> +or use
> +.BR setrlimit (2)
> +to set the hard limit for
> +.BR RLIMIT_CPU .
> +
>  This operation is available only if the kernel is configured with
>  .BR CONFIG_SECCOMP
>  enabled.
> 


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v10 tip 0/9] tracing: attach eBPF programs to kprobes
From: Alexei Starovoitov @ 2015-03-22 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

Hi Ingo,

Patch 1 is already in net-next. Patch 3 depends on it.
I'm assuming it's not going to be a problem during merge window.
Patch 3 will have a minor conflict in uapi/linux/bpf.h in linux-next,
since net-next has added new lines to the bpf_prog_type and bpf_func_id enums.
I'm assuming it's not a problem either.

V9->V10:
- prettified formatting of struct intializers in kernel
- added Masami's Reviewed-by. Thanks Masami!

V8->V9:
- fixed comment style and allowed ispunct after %p
- added Steven's Reviewed-by. Thanks Steven!

V7->V8:
- split addition of kprobe flag into separate patch
- switched to __this_cpu_inc in now documented trace_call_bpf()
- converted array into standalone bpf_func_proto and switch statement
  (this apporach looks cleanest, especially considering patch 5)
- refactored patch 5 bpf_trace_printk to do strict checking

V6->V7:
- rebase and remove confusing _notrace suffix from preempt_disable/enable
  everything else unchanged

V5->V6:
- added simple recursion check to trace_call_bpf()
- added tracex4 example that does kmem_cache_alloc/free tracking.
  It remembers every allocated object in a map and user space periodically
  prints a set of old objects. With more work in can be made into
  simple kmemleak detector.
  It was used as a test of recursive kmalloc/kfree: attached to
  kprobe/__kmalloc and let program to call kmalloc again.

V4->V5:
- switched to ktime_get_mono_fast_ns() as suggested by Peter
- in libbpf.c fixed zero init of 'union bpf_attr' padding
- fresh rebase on tip/master

V3 discussion:
https://lkml.org/lkml/2015/2/9/738

V3->V4:
- since the boundary of stable ABI in bpf+tracepoints is not clear yet,
  I've dropped them for now.
- bpf+syscalls are ok from stable ABI point of view, but bpf+seccomp
  would want to do very similar analysis of syscalls, so I've dropped
  them as well to take time and define common bpf+syscalls and bpf+seccomp
  infra in the future.
- so only bpf+kprobes left. kprobes by definition is not a stable ABI,
  so bpf+kprobe is not stable ABI either. To stress on that point added
  kernel version attribute that user space must pass along with the program
  and kernel will reject programs when version code doesn't match.
  So bpf+kprobe is very similar to kernel modules, but unlike modules
  version check is not used for safety, but for enforcing 'non-ABI-ness'.
  (version check doesn't apply to bpf+sockets which are stable)

Programs are attached to kprobe events via API:

prog_fd = bpf_prog_load(...);
struct perf_event_attr attr = {
  .type = PERF_TYPE_TRACEPOINT,
  .config = event_id, /* ID of just created kprobe event */
};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

Next step is to prototype TCP stack instrumentation (like web10g) using
bpf+kprobe, but without adding any new code tcp stack.
Though kprobes are slow comparing to tracepoints, they are good enough
for prototyping and trace_marker/debug_tracepoint ideas can accelerate
them in the future.

Alexei Starovoitov (8):
  tracing: add kprobe flag
  tracing: attach BPF programs to kprobes
  tracing: allow BPF programs to call bpf_ktime_get_ns()
  tracing: allow BPF programs to call bpf_trace_printk()
  samples: bpf: simple non-portable kprobe filter example
  samples: bpf: counting example for kfree_skb and write syscall
  samples: bpf: IO latency analysis (iosnoop/heatmap)
  samples: bpf: kmem_alloc/free tracker

Daniel Borkmann (1):
  bpf: make internal bpf API independent of CONFIG_BPF_SYSCALL ifdefs

 include/linux/bpf.h             |   20 +++-
 include/linux/ftrace_event.h    |   14 +++
 include/uapi/linux/bpf.h        |    5 +
 include/uapi/linux/perf_event.h |    1 +
 kernel/bpf/syscall.c            |    7 +-
 kernel/events/core.c            |   59 +++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |  222 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c     |   10 +-
 samples/bpf/Makefile            |   16 +++
 samples/bpf/bpf_helpers.h       |    6 ++
 samples/bpf/bpf_load.c          |  125 ++++++++++++++++++++--
 samples/bpf/bpf_load.h          |    3 +
 samples/bpf/libbpf.c            |   14 ++-
 samples/bpf/libbpf.h            |    5 +-
 samples/bpf/sock_example.c      |    2 +-
 samples/bpf/test_verifier.c     |    2 +-
 samples/bpf/tracex1_kern.c      |   50 +++++++++
 samples/bpf/tracex1_user.c      |   25 +++++
 samples/bpf/tracex2_kern.c      |   86 +++++++++++++++
 samples/bpf/tracex2_user.c      |   95 +++++++++++++++++
 samples/bpf/tracex3_kern.c      |   89 ++++++++++++++++
 samples/bpf/tracex3_user.c      |  150 ++++++++++++++++++++++++++
 samples/bpf/tracex4_kern.c      |   54 ++++++++++
 samples/bpf/tracex4_user.c      |   69 ++++++++++++
 25 files changed, 1112 insertions(+), 18 deletions(-)
 create mode 100644 kernel/trace/bpf_trace.c
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c
 create mode 100644 samples/bpf/tracex4_kern.c
 create mode 100644 samples/bpf/tracex4_user.c

-- 
1.7.9.5

^ permalink raw reply

* [PATCH v10 tip 1/9] bpf: make internal bpf API independent of CONFIG_BPF_SYSCALL ifdefs
From: Alexei Starovoitov @ 2015-03-22 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1427053150-32213-1-git-send-email-ast@plumgrid.com>

From: Daniel Borkmann <daniel@iogearbox.net>

Socket filter code and other subsystems with upcoming eBPF support should
not need to deal with the fact that we have CONFIG_BPF_SYSCALL defined or
not.

Having the bpf syscall as a config option is a nice thing and I'd expect
it to stay that way for expert users (I presume one day the default setting
of it might change, though), but code making use of it should not care if
it's actually enabled or not.

Instead, hide this via header files and let the rest deal with it.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
---
 include/linux/bpf.h |   20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbfceb756452..c2e21113ecc0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -113,8 +113,6 @@ struct bpf_prog_type_list {
 	enum bpf_prog_type type;
 };
 
-void bpf_register_prog_type(struct bpf_prog_type_list *tl);
-
 struct bpf_prog;
 
 struct bpf_prog_aux {
@@ -129,11 +127,25 @@ struct bpf_prog_aux {
 };
 
 #ifdef CONFIG_BPF_SYSCALL
+void bpf_register_prog_type(struct bpf_prog_type_list *tl);
+
 void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
 #else
-static inline void bpf_prog_put(struct bpf_prog *prog) {}
+static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+}
+
+static inline struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_prog_put(struct bpf_prog *prog)
+{
+}
 #endif
-struct bpf_prog *bpf_prog_get(u32 ufd);
+
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v10 tip 2/9] tracing: add kprobe flag
From: Alexei Starovoitov @ 2015-03-22 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1427053150-32213-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

add TRACE_EVENT_FL_KPROBE flag to differentiate kprobe type of tracepoints,
since bpf programs can only be attached to kprobe type of
PERF_TYPE_TRACEPOINT perf events.

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
Reviewed-by: Steven Rostedt <rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt-FCd8Q96Dh0JBDgjK7y7TUQ@public.gmane.org>
---
 include/linux/ftrace_event.h |    3 +++
 kernel/trace/trace_kprobe.c  |    2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c674ee8f7fca..77325e1a1816 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -252,6 +252,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_KPROBE_BIT,
 };
 
 /*
@@ -265,6 +266,7 @@ enum {
  *                     it is best to clear the buffers that used it).
  *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
+ *  KPROBE        - Event is a kprobe
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -274,6 +276,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 };
 
 struct ftrace_event_call {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d73f565b4e06..8fa549f6f528 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1286,7 +1286,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		kfree(call->print_fmt);
 		return -ENODEV;
 	}
-	call->flags = 0;
+	call->flags = TRACE_EVENT_FL_KPROBE;
 	call->class->reg = kprobe_register;
 	call->data = tk;
 	ret = trace_add_event_call(call);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v10 tip 3/9] tracing: attach BPF programs to kprobes
From: Alexei Starovoitov @ 2015-03-22 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1427053150-32213-1-git-send-email-ast@plumgrid.com>

User interface:
struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT, .config = event_id, ...};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

prog_fd is a file descriptor associated with BPF program previously loaded.
event_id is an ID of created kprobe

close(event_fd) - automatically detaches BPF program from it

BPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- probe_read - wraper of probe_kernel_read() used to access any kernel
  data structures

BPF programs receive 'struct pt_regs *' as an input
('struct pt_regs' is architecture dependent)
and return 0 to ignore the event and 1 to store kprobe event into ring buffer.

Note, kprobes are _not_ a stable kernel ABI, so bpf programs attached to
kprobes must be recompiled for every kernel version and user must supply correct
LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h    |   11 ++++
 include/uapi/linux/bpf.h        |    3 +
 include/uapi/linux/perf_event.h |    1 +
 kernel/bpf/syscall.c            |    7 ++-
 kernel/events/core.c            |   59 ++++++++++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |  130 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c     |    8 +++
 8 files changed, 219 insertions(+), 1 deletion(-)
 create mode 100644 kernel/trace/bpf_trace.c

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 77325e1a1816..0aa535bc9f05 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
 struct trace_buffer;
 struct tracer;
 struct dentry;
+struct bpf_prog;
 
 struct trace_print_flags {
 	unsigned long		mask;
@@ -306,6 +307,7 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
+	struct bpf_prog			*prog;
 
 	int	(*perf_perm)(struct ftrace_event_call *,
 			     struct perf_event *);
@@ -551,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	return 1;
+}
+#endif
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..b2948feeb70b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_KPROBE,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -151,6 +152,7 @@ union bpf_attr {
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
+		__u32		kern_version;	/* checked when prog_type=kprobe */
 	};
 } __attribute__((aligned(8)));
 
@@ -162,6 +164,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3c8b45de57ec..ad4dade2a502 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -382,6 +382,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..504c10b990ef 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/license.h>
 #include <linux/filter.h>
+#include <linux/version.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD log_buf
+#define	BPF_PROG_LOAD_LAST_FIELD kern_version
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (attr->insn_cnt >= BPF_MAXINSNS)
 		return -EINVAL;
 
+	if (type == BPF_PROG_TYPE_KPROBE &&
+	    attr->kern_version != LINUX_VERSION_CODE)
+		return -EINVAL;
+
 	/* plain bpf_prog allocation */
 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 	if (!prog)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2709063eb26b..3a45e7f6b2df 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,8 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -3402,6 +3404,7 @@ errout:
 }
 
 static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -3411,6 +3414,7 @@ static void free_event_rcu(struct rcu_head *head)
 	if (event->ns)
 		put_pid_ns(event->ns);
 	perf_event_free_filter(event);
+	perf_event_free_bpf_prog(event);
 	kfree(event);
 }
 
@@ -3923,6 +3927,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 
 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 {
@@ -3976,6 +3981,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_SET_BPF:
+		return perf_event_set_bpf_prog(event, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -6436,6 +6444,49 @@ static void perf_event_free_filter(struct perf_event *event)
 	ftrace_profile_free_filter(event);
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	struct bpf_prog *prog;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	if (event->tp_event->prog)
+		return -EEXIST;
+
+	if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+		/* bpf programs can only be attached to kprobes */
+		return -EINVAL;
+
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) {
+		/* valid fd, but invalid bpf program type */
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	event->tp_event->prog = prog;
+
+	return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog *prog;
+
+	if (!event->tp_event)
+		return;
+
+	prog = event->tp_event->prog;
+	if (prog) {
+		event->tp_event->prog = NULL;
+		bpf_prog_put(prog);
+	}
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -6451,6 +6502,14 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 98f26588255e..c575a300103b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..f1e87da91da3
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,130 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include "trace.h"
+
+static DEFINE_PER_CPU(int, bpf_prog_active);
+
+/**
+ * trace_call_bpf - invoke BPF program
+ * @prog: BPF program
+ * @ctx: opaque context pointer
+ *
+ * kprobe handlers execute BPF programs via this helper.
+ * Can be used from static tracepoints in the future.
+ *
+ * Return: BPF programs always return an integer which is interpreted by
+ * kprobe handler as:
+ * 0 - return from kprobe (event is filtered out)
+ * 1 - store kprobe event into ring buffer
+ * Other values are reserved and currently alias to 1
+ */
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	unsigned int ret;
+
+	if (in_nmi()) /* not supported yet */
+		return 1;
+
+	preempt_disable();
+
+	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+		/*
+		 * since some bpf program is already running on this cpu,
+		 * don't call into another bpf program (same or different)
+		 * and don't send kprobe event into ring-buffer,
+		 * so return zero here
+		 */
+		ret = 0;
+		goto out;
+	}
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, ctx);
+	rcu_read_unlock();
+
+ out:
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *dst = (void *) (long) r1;
+	int size = (int) r2;
+	void *unsafe_ptr = (void *) (long) r3;
+
+	return probe_kernel_read(dst, unsafe_ptr, size);
+}
+
+static const struct bpf_func_proto bpf_probe_read_proto = {
+	.func		= bpf_probe_read,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_probe_read:
+		return &bpf_probe_read_proto;
+	default:
+		return NULL;
+	}
+}
+
+/* bpf+kprobe programs can access fields of 'struct pt_regs' */
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	/* check bounds */
+	if (off < 0 || off >= sizeof(struct pt_regs))
+		return false;
+
+	/* only read is allowed */
+	if (type != BPF_READ)
+		return false;
+
+	/* disallow misaligned access */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+static struct bpf_verifier_ops kprobe_prog_ops = {
+	.get_func_proto  = kprobe_prog_func_proto,
+	.is_valid_access = kprobe_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+	.ops	= &kprobe_prog_ops,
+	.type	= BPF_PROG_TYPE_KPROBE,
+};
+
+static int __init register_kprobe_prog_ops(void)
+{
+	bpf_register_prog_type(&kprobe_tl);
+	return 0;
+}
+late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8fa549f6f528..dc3462507d7c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1134,11 +1134,15 @@ static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
@@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tk->tp.call;
+	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
+	if (prog && !trace_call_bpf(prog, regs))
+		return;
+
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
 		return;
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v10 tip 4/9] tracing: allow BPF programs to call bpf_ktime_get_ns()
From: Alexei Starovoitov @ 2015-03-22 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1427053150-32213-1-git-send-email-ast@plumgrid.com>

bpf_ktime_get_ns() is used by programs to compute time delta between events
or as a timestamp

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   14 ++++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b2948feeb70b..238c6883877b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -165,6 +165,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
+	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f1e87da91da3..8f5787294971 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -78,6 +78,18 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* NMI safe access to clock monotonic */
+	return ktime_get_mono_fast_ns();
+}
+
+static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+	.func		= bpf_ktime_get_ns,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+};
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -89,6 +101,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_map_delete_elem_proto;
 	case BPF_FUNC_probe_read:
 		return &bpf_probe_read_proto;
+	case BPF_FUNC_ktime_get_ns:
+		return &bpf_ktime_get_ns_proto;
 	default:
 		return NULL;
 	}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v10 tip 5/9] tracing: allow BPF programs to call bpf_trace_printk()
From: Alexei Starovoitov @ 2015-03-22 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1427053150-32213-1-git-send-email-ast@plumgrid.com>

Debugging of BPF programs needs some form of printk from the program,
so let programs call limited trace_printk() with %d %u %x %p modifiers only.

Similar to kernel modules, during program load verifier checks whether program
is calling bpf_trace_printk() and if so, kernel allocates trace_printk buffers
and emits big 'this is debug only' banner.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   78 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 238c6883877b..cc47ef41076a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -166,6 +166,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
 	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
+	BPF_FUNC_trace_printk,    /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8f5787294971..2d56ce501632 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -10,6 +10,7 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/uaccess.h>
+#include <linux/ctype.h>
 #include "trace.h"
 
 static DEFINE_PER_CPU(int, bpf_prog_active);
@@ -90,6 +91,74 @@ static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
 	.ret_type	= RET_INTEGER,
 };
 
+/*
+ * limited trace_printk()
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ */
+static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+{
+	char *fmt = (char *) (long) r1;
+	int mod[3] = {};
+	int fmt_cnt = 0;
+	int i;
+
+	/*
+	 * bpf_check()->check_func_arg()->check_stack_boundary()
+	 * guarantees that fmt points to bpf program stack,
+	 * fmt_size bytes of it were initialized and fmt_size > 0
+	 */
+	if (fmt[--fmt_size] != 0)
+		return -EINVAL;
+
+	/* check format string for allowed specifiers */
+	for (i = 0; i < fmt_size; i++) {
+		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
+			return -EINVAL;
+
+		if (fmt[i] != '%')
+			continue;
+
+		if (fmt_cnt >= 3)
+			return -EINVAL;
+
+		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+		i++;
+		if (fmt[i] == 'l') {
+			mod[fmt_cnt]++;
+			i++;
+		} else if (fmt[i] == 'p') {
+			mod[fmt_cnt]++;
+			i++;
+			if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
+				return -EINVAL;
+			fmt_cnt++;
+			continue;
+		}
+
+		if (fmt[i] == 'l') {
+			mod[fmt_cnt]++;
+			i++;
+		}
+
+		if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+			return -EINVAL;
+		fmt_cnt++;
+	}
+
+	return __trace_printk(1/* fake ip will not be printed */, fmt,
+			      mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
+			      mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
+			      mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
+}
+
+static const struct bpf_func_proto bpf_trace_printk_proto = {
+	.func		= bpf_trace_printk,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE,
+};
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -103,6 +172,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_probe_read_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
+
+	case BPF_FUNC_trace_printk:
+		/*
+		 * this program might be calling bpf_trace_printk,
+		 * so allocate per-cpu printk buffers
+		 */
+		trace_printk_init_buffers();
+
+		return &bpf_trace_printk_proto;
 	default:
 		return NULL;
 	}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v10 tip 6/9] samples: bpf: simple non-portable kprobe filter example
From: Alexei Starovoitov @ 2015-03-22 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1427053150-32213-1-git-send-email-ast@plumgrid.com>

tracex1_kern.c - C program compiled into BPF.
It attaches to kprobe:netif_receive_skb
When skb->dev->name == "lo", it prints sample debug message into trace_pipe
via bpf_trace_printk() helper function.

tracex1_user.c - corresponding user space component that:
- loads bpf program via bpf() syscall
- opens kprobes:netif_receive_skb event via perf_event_open() syscall
- attaches the program to event via ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
- prints from trace_pipe

Note, this bpf program is completely non-portable. It must be recompiled
with current kernel headers. kprobe is not a stable ABI and bpf+kprobe scripts
may stop working any time.

bpf verifier will detect that it's using bpf_trace_printk() and kernel will
print warning banner:
** trace_printk() being used. Allocating extra memory.  **
**                                                      **
** This means that this is a DEBUG kernel and it is     **
** unsafe for production use.                           **

bpf_trace_printk() should be used for debugging of bpf program only.

Usage:
$ sudo tracex1
            ping-19826 [000] d.s2 63103.382648: : skb ffff880466b1ca00 len 84
            ping-19826 [000] d.s2 63103.382684: : skb ffff880466b1d300 len 84

            ping-19826 [000] d.s2 63104.382533: : skb ffff880466b1ca00 len 84
            ping-19826 [000] d.s2 63104.382594: : skb ffff880466b1d300 len 84

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile        |    4 ++
 samples/bpf/bpf_helpers.h   |    6 +++
 samples/bpf/bpf_load.c      |  125 ++++++++++++++++++++++++++++++++++++++++---
 samples/bpf/bpf_load.h      |    3 ++
 samples/bpf/libbpf.c        |   14 ++++-
 samples/bpf/libbpf.h        |    5 +-
 samples/bpf/sock_example.c  |    2 +-
 samples/bpf/test_verifier.c |    2 +-
 samples/bpf/tracex1_kern.c  |   50 +++++++++++++++++
 samples/bpf/tracex1_user.c  |   25 +++++++++
 10 files changed, 224 insertions(+), 12 deletions(-)
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..51f6f01e5a3a 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,23 +6,27 @@ hostprogs-y := test_verifier test_maps
 hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
+hostprogs-y += tracex1
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
+always += tracex1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..1c872bcf5a80 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,12 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
 	(void *) BPF_FUNC_map_update_elem;
 static int (*bpf_map_delete_elem)(void *map, void *key) =
 	(void *) BPF_FUNC_map_delete_elem;
+static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) =
+	(void *) BPF_FUNC_probe_read;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+	(void *) BPF_FUNC_ktime_get_ns;
+static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
+	(void *) BPF_FUNC_trace_printk;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..95c106e4bcdb 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -8,29 +8,70 @@
 #include <unistd.h>
 #include <string.h>
 #include <stdbool.h>
+#include <stdlib.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
 #include "libbpf.h"
 #include "bpf_helpers.h"
 #include "bpf_load.h"
 
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
 static char license[128];
+static int kern_version;
 static bool processed_sec[128];
 int map_fd[MAX_MAPS];
 int prog_fd[MAX_PROGS];
+int event_fd[MAX_PROGS];
 int prog_cnt;
 
 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 {
-	int fd;
 	bool is_socket = strncmp(event, "socket", 6) == 0;
-
-	if (!is_socket)
-		/* tracing events tbd */
+	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
+	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+	enum bpf_prog_type prog_type;
+	char buf[256];
+	int fd, efd, err, id;
+	struct perf_event_attr attr = {};
+
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	if (is_socket) {
+		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	} else if (is_kprobe || is_kretprobe) {
+		prog_type = BPF_PROG_TYPE_KPROBE;
+	} else {
+		printf("Unknown event '%s'\n", event);
 		return -1;
+	}
+
+	if (is_kprobe || is_kretprobe) {
+		if (is_kprobe)
+			event += 7;
+		else
+			event += 10;
+
+		snprintf(buf, sizeof(buf),
+			 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
+			 is_kprobe ? 'p' : 'r', event, event);
+		err = system(buf);
+		if (err < 0) {
+			printf("failed to create kprobe '%s' error '%s'\n",
+			       event, strerror(errno));
+			return -1;
+		}
+	}
 
-	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
-			   prog, size, license);
+	fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
 
 	if (fd < 0) {
 		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +80,41 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
+	if (is_socket)
+		return 0;
+
+	strcpy(buf, DEBUGFS);
+	strcat(buf, "events/kprobes/");
+	strcat(buf, event);
+	strcat(buf, "/id");
+
+	efd = open(buf, O_RDONLY, 0);
+	if (efd < 0) {
+		printf("failed to open event %s\n", event);
+		return -1;
+	}
+
+	err = read(efd, buf, sizeof(buf));
+	if (err < 0 || err >= sizeof(buf)) {
+		printf("read from '%s' failed '%s'\n", event, strerror(errno));
+		return -1;
+	}
+
+	close(efd);
+
+	buf[err] = 0;
+	id = atoi(buf);
+	attr.config = id;
+
+	efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+	if (efd < 0) {
+		printf("event %d fd %d err %s\n", id, efd, strerror(errno));
+		return -1;
+	}
+	event_fd[prog_cnt - 1] = efd;
+	ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+	ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+
 	return 0;
 }
 
@@ -135,6 +211,9 @@ int load_bpf_file(char *path)
 	if (gelf_getehdr(elf, &ehdr) != &ehdr)
 		return 1;
 
+	/* clear all kprobes */
+	i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
+
 	/* scan over all elf sections to get license and map info */
 	for (i = 1; i < ehdr.e_shnum; i++) {
 
@@ -149,6 +228,14 @@ int load_bpf_file(char *path)
 		if (strcmp(shname, "license") == 0) {
 			processed_sec[i] = true;
 			memcpy(license, data->d_buf, data->d_size);
+		} else if (strcmp(shname, "version") == 0) {
+			processed_sec[i] = true;
+			if (data->d_size != sizeof(int)) {
+				printf("invalid size of version section %zd\n",
+				       data->d_size);
+				return 1;
+			}
+			memcpy(&kern_version, data->d_buf, sizeof(int));
 		} else if (strcmp(shname, "maps") == 0) {
 			processed_sec[i] = true;
 			if (load_maps(data->d_buf, data->d_size))
@@ -178,7 +265,8 @@ int load_bpf_file(char *path)
 			if (parse_relo_and_apply(data, symbols, &shdr, insns))
 				continue;
 
-			if (memcmp(shname_prog, "events/", 7) == 0 ||
+			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
+			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
 			    memcmp(shname_prog, "socket", 6) == 0)
 				load_and_attach(shname_prog, insns, data_prog->d_size);
 		}
@@ -193,7 +281,8 @@ int load_bpf_file(char *path)
 		if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
 			continue;
 
-		if (memcmp(shname, "events/", 7) == 0 ||
+		if (memcmp(shname, "kprobe/", 7) == 0 ||
+		    memcmp(shname, "kretprobe/", 10) == 0 ||
 		    memcmp(shname, "socket", 6) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
@@ -201,3 +290,23 @@ int load_bpf_file(char *path)
 	close(fd);
 	return 0;
 }
+
+void read_trace_pipe(void)
+{
+	int trace_fd;
+
+	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+	if (trace_fd < 0)
+		return;
+
+	while (1) {
+		static char buf[4096];
+		ssize_t sz;
+
+		sz = read(trace_fd, buf, sizeof(buf));
+		if (sz) {
+			buf[sz] = 0;
+			puts(buf);
+		}
+	}
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..cbd7c2b532b9 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -6,6 +6,7 @@
 
 extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
+extern int event_fd[MAX_PROGS];
 
 /* parses elf file compiled by llvm .c->.o
  * . parses 'maps' section and creates maps via BPF syscall
@@ -21,4 +22,6 @@ extern int prog_fd[MAX_PROGS];
  */
 int load_bpf_file(char *path);
 
+void read_trace_pipe(void);
+
 #endif
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 46d50b7ddf79..7e1efa7e2ed7 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -81,7 +81,7 @@ char bpf_log_buf[LOG_BUF_SIZE];
 
 int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int prog_len,
-		  const char *license)
+		  const char *license, int kern_version)
 {
 	union bpf_attr attr = {
 		.prog_type = prog_type,
@@ -93,6 +93,11 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 		.log_level = 1,
 	};
 
+	/* assign one field outside of struct init to make sure any
+	 * padding is zero initialized
+	 */
+	attr.kern_version = kern_version;
+
 	bpf_log_buf[0] = 0;
 
 	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
@@ -121,3 +126,10 @@ int open_raw_sock(const char *name)
 
 	return sock;
 }
+
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags)
+{
+	return syscall(__NR_perf_event_open, attr, pid, cpu,
+		       group_fd, flags);
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 58c5fe1bdba1..ac7b09672b46 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -13,7 +13,7 @@ int bpf_get_next_key(int fd, void *key, void *next_key);
 
 int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, int insn_len,
-		  const char *license);
+		  const char *license, int kern_version);
 
 #define LOG_BUF_SIZE 65536
 extern char bpf_log_buf[LOG_BUF_SIZE];
@@ -182,4 +182,7 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
 /* create RAW socket and bind to interface 'name' */
 int open_raw_sock(const char *name);
 
+struct perf_event_attr;
+int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
+		    int group_fd, unsigned long flags);
 #endif
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index c8ad0404416f..a0ce251c5390 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -56,7 +56,7 @@ static int test_sock(void)
 	};
 
 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog),
-				"GPL");
+				"GPL", 0);
 	if (prog_fd < 0) {
 		printf("failed to load prog '%s'\n", strerror(errno));
 		goto cleanup;
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index b96175e90363..740ce97cda5e 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -689,7 +689,7 @@ static int test(void)
 
 		prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog,
 					prog_len * sizeof(struct bpf_insn),
-					"GPL");
+					"GPL", 0);
 
 		if (tests[i].result == ACCEPT) {
 			if (prog_fd < 0) {
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..42176fce4847
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,50 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of argumnets and their posistions can change, etc.
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/__netif_receive_skb_core")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	/* attaches to kprobe netif_receive_skb,
+	 * looks for packets on loobpack device and prints them
+	 */
+	char devname[IFNAMSIZ] = {};
+	struct net_device *dev;
+	struct sk_buff *skb;
+	int len;
+
+	/* non-portable! works for the given kernel only */
+	skb = (struct sk_buff *) ctx->di;
+
+	dev = _(skb->dev);
+
+	len = _(skb->len);
+
+	bpf_probe_read(devname, sizeof(devname), dev->name);
+
+	if (devname[0] == 'l' && devname[1] == 'o') {
+		char fmt[] = "skb %p len %d\n";
+		/* using bpf_trace_printk() for DEBUG ONLY */
+		bpf_trace_printk(fmt, sizeof(fmt), skb, len);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..31a48183beea
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+	FILE *f;
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	f = popen("taskset 1 ping -c5 localhost", "r");
+	(void) f;
+
+	read_trace_pipe();
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v10 tip 7/9] samples: bpf: counting example for kfree_skb and write syscall
From: Alexei Starovoitov @ 2015-03-22 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1427053150-32213-1-git-send-email-ast@plumgrid.com>

this example has two probes in one C file that attach to different kprove events
and use two different maps.

1st probe is x64 specific equivalent of dropmon. It attaches to kfree_skb,
retrevies 'ip' address of kfree_skb() caller and counts number of packet drops
at that 'ip' address. User space prints 'location - count' map every second.

2nd probe attaches to kprobe:sys_write and computes a histogram of different
write sizes

Usage:
$ sudo tracex2
location 0xffffffff81695995 count 1
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 2
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 3
location 0xffffffff816d0da9 count 2

557145+0 records in
557145+0 records out
285258240 bytes (285 MB) copied, 1.02379 s, 279 MB/s
           syscall write() stats
     byte_size       : count     distribution
       1 -> 1        : 3        |                                      |
       2 -> 3        : 0        |                                      |
       4 -> 7        : 0        |                                      |
       8 -> 15       : 0        |                                      |
      16 -> 31       : 2        |                                      |
      32 -> 63       : 3        |                                      |
      64 -> 127      : 1        |                                      |
     128 -> 255      : 1        |                                      |
     256 -> 511      : 0        |                                      |
     512 -> 1023     : 1118968  |************************************* |

Ctrl-C at any time. Kernel will auto cleanup maps and programs

$ addr2line -ape ./bld_x64/vmlinux 0xffffffff81695995 0xffffffff816d0da9
0xffffffff81695995: ./bld_x64/../net/ipv4/icmp.c:1038
0xffffffff816d0da9: ./bld_x64/../net/unix/af_unix.c:1231

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/tracex2_kern.c |   86 +++++++++++++++++++++++++++++++++++++++
 samples/bpf/tracex2_user.c |   95 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 185 insertions(+)
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 51f6f01e5a3a..6dd272143733 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -7,6 +7,7 @@ hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
 hostprogs-y += tracex1
+hostprogs-y += tracex2
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -14,12 +15,14 @@ sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
+tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
 always += tracex1_kern.o
+always += tracex2_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -27,6 +30,7 @@ HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
+HOSTLOADLIBES_tracex2 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000000..334348335795
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,86 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(long),
+	.max_entries = 1024,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/kfree_skb")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long loc = 0;
+	long init_val = 1;
+	long *value;
+
+	/* x64 specific: read ip of kfree_skb caller.
+	 * non-portable version of __builtin_return_address(0)
+	 */
+	bpf_probe_read(&loc, sizeof(loc), (void *)ctx->sp);
+
+	value = bpf_map_lookup_elem(&my_map, &loc);
+	if (value)
+		*value += 1;
+	else
+		bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 64,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog3(struct pt_regs *ctx)
+{
+	long write_size = ctx->dx; /* arg3 */
+	long init_val = 1;
+	long *value;
+	u32 index = log2l(write_size);
+
+	value = bpf_map_lookup_elem(&my_hist_map, &index);
+	if (value)
+		__sync_fetch_and_add(value, 1);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000000..91b8d0896fbb
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX	64
+#define MAX_STARS	38
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+	int key;
+	long value;
+	long data[MAX_INDEX] = {};
+	char starstr[MAX_STARS];
+	int i;
+	int max_ind = -1;
+	long max_value = 0;
+
+	for (key = 0; key < MAX_INDEX; key++) {
+		bpf_lookup_elem(fd, &key, &value);
+		data[key] = value;
+		if (value && key > max_ind)
+			max_ind = key;
+		if (value > max_value)
+			max_value = value;
+	}
+
+	printf("           syscall write() stats\n");
+	printf("     byte_size       : count     distribution\n");
+	for (i = 1; i <= max_ind + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+}
+static void int_exit(int sig)
+{
+	print_hist(map_fd[1]);
+	exit(0);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	long key, next_key, value;
+	FILE *f;
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	signal(SIGINT, int_exit);
+
+	/* start 'ping' in the background to have some kfree_skb events */
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	/* start 'dd' in the background to have plenty of 'write' syscalls */
+	f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+	(void) f;
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; i < 5; i++) {
+		key = 0;
+		while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd[0], &next_key, &value);
+			printf("location 0x%lx count %ld\n", next_key, value);
+			key = next_key;
+		}
+		if (key)
+			printf("\n");
+		sleep(1);
+	}
+	print_hist(map_fd[1]);
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v10 tip 8/9] samples: bpf: IO latency analysis (iosnoop/heatmap)
From: Alexei Starovoitov @ 2015-03-22 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1427053150-32213-1-git-send-email-ast@plumgrid.com>

BPF C program attaches to blk_mq_start_request/blk_update_request kprobe events
to calculate IO latency.
For every completed block IO event it computes the time delta in nsec
and records in a histogram map: map[log10(delta)*10]++
User space reads this histogram map every 2 seconds and prints it as a 'heatmap'
using gray shades of text terminal. Black spaces have many events and white
spaces have very few events. Left most space is the smallest latency, right most
space is the largest latency in the range.

Usage:
$ sudo ./tracex3
and do 'sudo dd if=/dev/sda of=/dev/null' in other terminal.
Observe IO latencies and how different activity (like 'make kernel') affects it.

Similar experiments can be done for network transmit latencies, syscalls, etc

'-t' flag prints the heatmap using normal ascii characters:

$ sudo ./tracex3 -t
  heatmap of IO latency
  # - many events with this latency
    - few events
|1us      |10us     |100us    |1ms      |10ms     |100ms    |1s       |10s
                         *ooo. *O.#.                                    # 221
                      .  *#     .                                       # 125
                         ..   .o#*..                                    # 55
                    .  . .  .  .#O                                      # 37
                         .#                                             # 175
                               .#*.                                     # 37
                          #                                             # 199
              .              . *#*.                                     # 55
                               *#..*                                    # 42
                          #                                             # 266
                      ...***Oo#*OO**o#* .                               # 629
                          #                                             # 271
                              . .#o* o.*o*                              # 221
                        . . o* *#O..                                    # 50

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/tracex3_kern.c |   89 ++++++++++++++++++++++++++
 samples/bpf/tracex3_user.c |  150 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 243 insertions(+)
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 6dd272143733..dcd850546d52 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -8,6 +8,7 @@ hostprogs-y += sockex1
 hostprogs-y += sockex2
 hostprogs-y += tracex1
 hostprogs-y += tracex2
+hostprogs-y += tracex3
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -16,6 +17,7 @@ sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -23,6 +25,7 @@ always += sockex1_kern.o
 always += sockex2_kern.o
 always += tracex1_kern.o
 always += tracex2_kern.o
+always += tracex3_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -31,6 +34,7 @@ HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..b9d66766f07a
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(u64),
+	.max_entries = 4096,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/blk_mq_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	long rq = ctx->di;
+	u64 val = bpf_ktime_get_ns();
+
+	bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+	int i = -(n == 0);
+	S(32); S(16); S(8); S(4); S(2); S(1);
+	return i;
+#undef S
+}
+
+#define SLOTS 100
+
+struct bpf_map_def SEC("maps") lat_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u64),
+	.max_entries = SLOTS,
+};
+
+SEC("kprobe/blk_update_request")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long rq = ctx->di;
+	u64 *value, l, base;
+	u32 index;
+
+	value = bpf_map_lookup_elem(&my_map, &rq);
+	if (!value)
+		return 0;
+
+	u64 cur_time = bpf_ktime_get_ns();
+	u64 delta = cur_time - *value;
+
+	bpf_map_delete_elem(&my_map, &rq);
+
+	/* the lines below are computing index = log10(delta)*10
+	 * using integer arithmetic
+	 * index = 29 ~ 1 usec
+	 * index = 59 ~ 1 msec
+	 * index = 89 ~ 1 sec
+	 * index = 99 ~ 10sec or more
+	 * log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
+	 */
+	l = log2l(delta);
+	base = 1ll << l;
+	index = (l * 64 + (delta - base) * 64 / base) * 3 / 64;
+
+	if (index >= SLOTS)
+		index = SLOTS - 1;
+
+	value = bpf_map_lookup_elem(&lat_map, &index);
+	if (value)
+		__sync_fetch_and_add((long *)value, 1);
+
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..0aaa933ab938
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define SLOTS 100
+
+static void clear_stats(int fd)
+{
+	__u32 key;
+	__u64 value = 0;
+
+	for (key = 0; key < SLOTS; key++)
+		bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+	"\033[48;5;255m",
+	"\033[48;5;252m",
+	"\033[48;5;250m",
+	"\033[48;5;248m",
+	"\033[48;5;246m",
+	"\033[48;5;244m",
+	"\033[48;5;242m",
+	"\033[48;5;240m",
+	"\033[48;5;238m",
+	"\033[48;5;236m",
+	"\033[48;5;234m",
+	"\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+const char *sym[] = {
+	" ",
+	" ",
+	".",
+	".",
+	"*",
+	"*",
+	"o",
+	"o",
+	"O",
+	"O",
+	"#",
+	"#",
+};
+
+bool full_range = false;
+bool text_only = false;
+
+static void print_banner(void)
+{
+	if (full_range)
+		printf("|1ns     |10ns     |100ns    |1us      |10us     |100us"
+		       "    |1ms      |10ms     |100ms    |1s       |10s\n");
+	else
+		printf("|1us      |10us     |100us    |1ms      |10ms     "
+		       "|100ms    |1s       |10s\n");
+}
+
+static void print_hist(int fd)
+{
+	__u32 key;
+	__u64 value;
+	__u64 cnt[SLOTS];
+	__u64 max_cnt = 0;
+	__u64 total_events = 0;
+
+	for (key = 0; key < SLOTS; key++) {
+		value = 0;
+		bpf_lookup_elem(fd, &key, &value);
+		cnt[key] = value;
+		total_events += value;
+		if (value > max_cnt)
+			max_cnt = value;
+	}
+	clear_stats(fd);
+	for (key = full_range ? 0 : 29; key < SLOTS; key++) {
+		int c = num_colors * cnt[key] / (max_cnt + 1);
+
+		if (text_only)
+			printf("%s", sym[c]);
+		else
+			printf("%s %s", color[c], nocolor);
+	}
+	printf(" # %lld\n", total_events);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 1; i < ac; i++) {
+		if (strcmp(argv[i], "-a") == 0) {
+			full_range = true;
+		} else if (strcmp(argv[i], "-t") == 0) {
+			text_only = true;
+		} else if (strcmp(argv[i], "-h") == 0) {
+			printf("Usage:\n"
+			       "  -a display wider latency range\n"
+			       "  -t text only\n");
+			return 1;
+		}
+	}
+
+	printf("  heatmap of IO latency\n");
+	if (text_only)
+		printf("  %s", sym[num_colors - 1]);
+	else
+		printf("  %s %s", color[num_colors - 1], nocolor);
+	printf(" - many events with this latency\n");
+
+	if (text_only)
+		printf("  %s", sym[0]);
+	else
+		printf("  %s %s", color[0], nocolor);
+	printf(" - few events\n");
+
+	for (i = 0; ; i++) {
+		if (i % 20 == 0)
+			print_banner();
+		print_hist(map_fd[1]);
+		sleep(2);
+	}
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v10 tip 9/9] samples: bpf: kmem_alloc/free tracker
From: Alexei Starovoitov @ 2015-03-22 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1427053150-32213-1-git-send-email-ast@plumgrid.com>

One bpf program attaches to kmem_cache_alloc_node() and remembers all allocated
objects in the map.
Another program attaches to kmem_cache_free() and deletes corresponding
object from the map.

User space walks the map every second and prints any objects which are
older than 1 second.

Usage:
$ sudo tracex4

Then start few long living processes. The tracex4 will print:

obj 0xffff880465928000 is 13sec old was allocated at ip ffffffff8105dc32
obj 0xffff88043181c280 is 13sec old was allocated at ip ffffffff8105dc32
obj 0xffff880465848000 is  8sec old was allocated at ip ffffffff8105dc32
obj 0xffff8804338bc280 is 15sec old was allocated at ip ffffffff8105dc32

$ addr2line -fispe vmlinux ffffffff8105dc32
do_fork at fork.c:1665

As soon as processes exit the memory is reclaimed and tracex4 prints nothing.

Similar experiment can be done with __kmalloc/kfree pair.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 +++
 samples/bpf/tracex4_kern.c |   54 ++++++++++++++++++++++++++++++++++
 samples/bpf/tracex4_user.c |   69 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 samples/bpf/tracex4_kern.c
 create mode 100644 samples/bpf/tracex4_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index dcd850546d52..fe98fb226e6e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -9,6 +9,7 @@ hostprogs-y += sockex2
 hostprogs-y += tracex1
 hostprogs-y += tracex2
 hostprogs-y += tracex3
+hostprogs-y += tracex4
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -18,6 +19,7 @@ sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
+tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -26,6 +28,7 @@ always += sockex2_kern.o
 always += tracex1_kern.o
 always += tracex2_kern.o
 always += tracex3_kern.o
+always += tracex4_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -35,6 +38,7 @@ HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
 HOSTLOADLIBES_tracex3 += -lelf
+HOSTLOADLIBES_tracex4 += -lelf -lrt
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000000..5754e21caf3b
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,54 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct pair {
+	u64 val;
+	u64 ip;
+};
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(struct pair),
+	.max_entries = 1000000,
+};
+
+/* kprobe is NOT a stable ABI
+ * This bpf+kprobe example can stop working any time.
+ */
+SEC("kprobe/kmem_cache_free")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	long ptr = ctx->si;
+
+	bpf_map_delete_elem(&my_map, &ptr);
+	return 0;
+}
+
+SEC("kretprobe/kmem_cache_alloc_node")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long ptr = ctx->ax;
+	long ip = 0;
+
+	/* get ip address of kmem_cache_alloc_node() caller */
+	bpf_probe_read(&ip, sizeof(ip), (void *)(ctx->bp + sizeof(ip)));
+
+	struct pair v = {
+		.val = bpf_ktime_get_ns(),
+		.ip = ip,
+	};
+
+	bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000000..bc4a3bdea6ed
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,69 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <time.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+struct pair {
+	long long val;
+	__u64 ip;
+};
+
+static __u64 time_get_ns(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static void print_old_objects(int fd)
+{
+	long long val = time_get_ns();
+	__u64 key, next_key;
+	struct pair v;
+
+	key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */
+
+	key = -1;
+	while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+		bpf_lookup_elem(map_fd[0], &next_key, &v);
+		key = next_key;
+		if (val - v.val < 1000000000ll)
+			/* object was allocated more then 1 sec ago */
+			continue;
+		printf("obj 0x%llx is %2lldsec old was allocated at ip %llx\n",
+		       next_key, (val - v.val) / 1000000000ll, v.ip);
+	}
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; ; i++) {
+		print_old_objects(map_fd[1]);
+		sleep(1);
+	}
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 01/11] stm class: Introduce an abstraction for System Trace Module devices
From: Alexander Shishkin @ 2015-03-22 20:32 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: linux-kernel, mathieu.poirier, pebolle, peter.lachner,
	norbert.schulz, keven.boell, yann.fouassier, laurent.fert,
	Alexander Shishkin, linux-api, Pratik Patel
In-Reply-To: <1427056381-27614-1-git-send-email-alexander.shishkin@linux.intel.com>

A System Trace Module (STM) is a device exporting data in System Trace
Protocol (STP) format as defined by MIPI STP standards. Examples of such
devices are Intel Trace Hub and Coresight STM.

This abstraction provides a unified interface for software trace sources
to send their data over an STM device to a debug host. In order to do
that, such a trace source needs to be assigned a pair of master/channel
identifiers that all the data from this source will be tagged with. The
STP decoder on the debug host side will use these master/channel tags to
distinguish different trace streams from one another inside one STP
stream.

This abstraction provides a configfs-based policy management mechanism
for dynamic allocation of these master/channel pairs based on trace
source-supplied string identifier. It has the flexibility of being
defined at runtime and at the same time (provided that the policy
definition is aligned with the decoding end) consistency.

For userspace trace sources, this abstraction provides write()-based and
mmap()-based (if the underlying stm device allows this) output mechanism.

For kernel-side trace sources, we provide "stm_source" device class that
can be connected to an stm device at run time.

Cc: linux-api@vger.kernel.org
Cc: Pratik Patel <pratikp@codeaurora.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
 Documentation/ABI/testing/configfs-stp-policy    |  44 ++
 Documentation/ABI/testing/sysfs-class-stm        |  14 +
 Documentation/ABI/testing/sysfs-class-stm_source |  11 +
 Documentation/trace/stm.txt                      |  77 ++
 drivers/Kconfig                                  |   2 +
 drivers/Makefile                                 |   1 +
 drivers/hwtracing/stm/Kconfig                    |   8 +
 drivers/hwtracing/stm/Makefile                   |   3 +
 drivers/hwtracing/stm/core.c                     | 897 +++++++++++++++++++++++
 drivers/hwtracing/stm/policy.c                   | 467 ++++++++++++
 drivers/hwtracing/stm/stm.h                      |  79 ++
 include/linux/stm.h                              | 102 +++
 include/uapi/linux/stm.h                         |  47 ++
 13 files changed, 1752 insertions(+)
 create mode 100644 Documentation/ABI/testing/configfs-stp-policy
 create mode 100644 Documentation/ABI/testing/sysfs-class-stm
 create mode 100644 Documentation/ABI/testing/sysfs-class-stm_source
 create mode 100644 Documentation/trace/stm.txt
 create mode 100644 drivers/hwtracing/stm/Kconfig
 create mode 100644 drivers/hwtracing/stm/Makefile
 create mode 100644 drivers/hwtracing/stm/core.c
 create mode 100644 drivers/hwtracing/stm/policy.c
 create mode 100644 drivers/hwtracing/stm/stm.h
 create mode 100644 include/linux/stm.h
 create mode 100644 include/uapi/linux/stm.h

diff --git a/Documentation/ABI/testing/configfs-stp-policy b/Documentation/ABI/testing/configfs-stp-policy
new file mode 100644
index 0000000000..1c7ab3dbcd
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-stp-policy
@@ -0,0 +1,44 @@
+What:		/config/stp-policy
+Date:		Jan 2015
+KernelVersion:	3.20
+Description:
+		This group contains policies mandating Master/Channel allocation
+		for software sources wishing to send trace data over an STM
+		device.
+
+What:		/config/stp-policy/<policy>
+Date:		Jan 2015
+KernelVersion:	3.20
+Description:
+		Root of a policy. This name is an arbitrary string.
+
+What:		/config/stp-policy/<policy>/device
+Date:		Jan 2015
+KernelVersion:	3.20
+Description:
+		STM device to which this policy applies. Write a valid stm class
+		device name here to assign this policy to that device.
+
+What:		/config/stp-policy/<policy>/<node>
+Date:		Jan 2015
+KernelVersion:	3.20
+Description:
+		Policy node is a string identifier that software clients will
+		use to request a master/channel to be allocated and assigned to
+		them.
+
+What:		/config/stp-policy/<policy>/<node>/masters
+Date:		Jan 2015
+KernelVersion:	3.20
+Description:
+		Range of masters from which to allocate for users of this node.
+		Write two numbers: the first master and the last master number.
+
+What:		/config/stp-policy/<policy>/<node>/channels
+Date:		Jan 2015
+KernelVersion:	3.20
+Description:
+		Range of channels from which to allocate for users of this node.
+		Write two numbers: the first channel and the last channel
+		number.
+
diff --git a/Documentation/ABI/testing/sysfs-class-stm b/Documentation/ABI/testing/sysfs-class-stm
new file mode 100644
index 0000000000..186f8e66e1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-stm
@@ -0,0 +1,14 @@
+What:		/sys/class/stm/<stm>/masters
+Date:		Jan 2015
+KernelVersion:	3.20
+Contact:	Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Description:
+		Shows first and last available to software master numbers on
+		this STM device.
+
+What:		/sys/class/stm/<stm>/channels
+Date:		Jan 2015
+KernelVersion:	3.20
+Contact:	Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Description:
+		Shows the number of channels per master on this STM device.
diff --git a/Documentation/ABI/testing/sysfs-class-stm_source b/Documentation/ABI/testing/sysfs-class-stm_source
new file mode 100644
index 0000000000..735b0d657f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-stm_source
@@ -0,0 +1,11 @@
+What:		/sys/class/stm_source/<stm_source>/stm_source_link
+Date:		Jan 2015
+KernelVersion:	3.20
+Contact:	Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Description:
+		stm_source device linkage to stm device, where its tracing data
+		is directed. Reads return an existing connection or "<none>" if
+		this stm_source is not connected to any stm device yet.
+		Write an existing (registered) stm device's name here to
+		connect that device. If a device is already connected to this
+		stm_source, it will first be disconnected.
diff --git a/Documentation/trace/stm.txt b/Documentation/trace/stm.txt
new file mode 100644
index 0000000000..0ba5c9115c
--- /dev/null
+++ b/Documentation/trace/stm.txt
@@ -0,0 +1,77 @@
+System Trace Module
+===================
+
+System Trace Module (STM) is a device described in MIPI STP specs as
+STP trace stream generator. STP (System Trace Protocol) is a trace
+protocol multiplexing data from multiple trace sources, each one of
+which is assigned a unique pair of master and channel. While some of
+these masters and channels are statically allocated to certain
+hardware trace sources, others are available to software. Software
+trace sources are usually free to pick for themselves any
+master/channel combination from this pool.
+
+On the receiving end of this STP stream (the decoder side), trace
+sources can only be identified by master/channel combination, so in
+order for the decoder to be able to make sense of the trace that
+involves multiple trace sources, it needs to be able to map those
+master/channel pairs to the trace sources that it understands.
+
+For instance, it is helpful to know that syslog messages come on
+master 7 channel 15, while arbitrary user applications can use masters
+48 to 63 and channels 0 to 127.
+
+To solve this mapping problem, stm class provides a policy management
+mechanism via configfs, that allows defining rules that map string
+identifiers to ranges of masters and channels. If these rules (policy)
+are consistent with what decoder expects, it will be able to properly
+process the trace data.
+
+This policy is a tree structure containing rules (policy_node) that
+have a name (string identifier) and a range of masters and channels
+associated with it, located in "stp-policy" subsystem directory in
+configfs. From the examle above, a rule may look like this:
+
+$ ls /config/stp-policy/my-policy/user
+channels masters
+$ cat /config/stp-policy/my-policy/user/masters
+48 63
+$ cat /config/stp-policy/my-policy/user/channels
+0 127
+
+which means that the master allocation pool for this rule consists of
+masters 48 through 63 and channel allocation pool has channels 0
+through 127 in it. Now, any producer (trace source) identifying itself
+with "user" identification string will be allocated a master and
+channel from within these ranges.
+
+These rules can be nested, for example, one can define a rule "dummy"
+under "user" directory from the example above and this new rule will
+be used for trace sources with the id string of "user/dummy".
+
+Trace sources have to open the stm class device's node and write their
+trace data into its file descriptor. In order to identify themselves
+to the policy, they need to do a STP_POLICY_ID_SET ioctl on this file
+descriptor providing their id string. Otherwise, they will be
+automatically allocated a master/channel pair upon first write to this
+file descriptor according to the "default" rule of the policy, if such
+exists.
+
+Some STM devices may allow direct mapping of the channel mmio regions
+to userspace for zero-copy writing. One mappable page (in terms of
+mmu) will usually contain multiple channels' mmios, so the user will
+need to allocate that many channels to themselves (via the
+aforementioned ioctl() call) to be able to do this. That is, if your
+stm device's channel mmio region is 64 bytes and hardware page size is
+4096 bytes, after a successful STP_POLICY_ID_SET ioctl() call with
+width==64, you should be able to mmap() one page on this file
+descriptor and obtain direct access to an mmio region for 64 channels.
+
+For kernel-based trace sources, there is "stm_source" device
+class. Devices of this class can be connected and disconnected to/from
+stm devices at runtime via a sysfs attribute.
+
+Examples of STM devices are Intel Trace Hub [1] and Coresight STM
+[2].
+
+[1] https://software.intel.com/sites/default/files/managed/d3/3c/intel-th-developer-manual.pdf
+[2] http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0444b/index.html
diff --git a/drivers/Kconfig b/drivers/Kconfig
index c0cc96bab9..9850ab81cc 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -182,4 +182,6 @@ source "drivers/thunderbolt/Kconfig"
 
 source "drivers/android/Kconfig"
 
+source "drivers/hwtracing/stm/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 527a6da8d5..87d7c74e39 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -165,3 +165,4 @@ obj-$(CONFIG_RAS)		+= ras/
 obj-$(CONFIG_THUNDERBOLT)	+= thunderbolt/
 obj-$(CONFIG_CORESIGHT)		+= coresight/
 obj-$(CONFIG_ANDROID)		+= android/
+obj-$(CONFIG_STM)		+= hwtracing/stm/
diff --git a/drivers/hwtracing/stm/Kconfig b/drivers/hwtracing/stm/Kconfig
new file mode 100644
index 0000000000..90ed327461
--- /dev/null
+++ b/drivers/hwtracing/stm/Kconfig
@@ -0,0 +1,8 @@
+config STM
+	tristate "System Trace Module devices"
+	help
+	  A System Trace Module (STM) is a device exporting data in System
+	  Trace Protocol (STP) format as defined by MIPI STP standards.
+	  Examples of such devices are Intel Trace Hub and Coresight STM.
+
+	  Say Y here to enable System Trace Module device support.
diff --git a/drivers/hwtracing/stm/Makefile b/drivers/hwtracing/stm/Makefile
new file mode 100644
index 0000000000..adec701649
--- /dev/null
+++ b/drivers/hwtracing/stm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_STM)	+= stm_core.o
+
+stm_core-y		:= core.o policy.o
diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c
new file mode 100644
index 0000000000..9e82634590
--- /dev/null
+++ b/drivers/hwtracing/stm/core.c
@@ -0,0 +1,897 @@
+/*
+ * System Trace Module (STM) infrastructure
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * STM class implements generic infrastructure for  System Trace Module devices
+ * as defined in MIPI STPv2 specification.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/compat.h>
+#include <linux/kdev_t.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/stm.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "stm.h"
+
+#include <uapi/linux/stm.h>
+
+static unsigned int stm_core_up;
+
+/*
+ * The SRCU here makes sure that STM device doesn't disappear from under a
+ * stm_source_write() caller, which may want to have as little overhead as
+ * possible.
+ */
+static struct srcu_struct stm_source_srcu;
+
+static ssize_t masters_show(struct device *dev,
+			    struct device_attribute *attr,
+			    char *buf)
+{
+	struct stm_device *stm = dev_get_drvdata(dev);
+	int ret;
+
+	ret = sprintf(buf, "%u %u\n", stm->data->sw_start, stm->data->sw_end);
+
+	return ret;
+}
+
+static DEVICE_ATTR_RO(masters);
+
+static ssize_t channels_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct stm_device *stm = dev_get_drvdata(dev);
+	int ret;
+
+	ret = sprintf(buf, "%u\n", stm->data->sw_nchannels);
+
+	return ret;
+}
+
+static DEVICE_ATTR_RO(channels);
+
+static struct attribute *stm_attrs[] = {
+	&dev_attr_masters.attr,
+	&dev_attr_channels.attr,
+	NULL,
+};
+
+static const struct attribute_group stm_group = {
+	.attrs	= stm_attrs,
+};
+
+static const struct attribute_group *stm_groups[] = {
+	&stm_group,
+	NULL,
+};
+
+static struct class stm_class = {
+	.name		= "stm",
+	.dev_groups	= stm_groups,
+};
+
+static int stm_dev_match(struct device *dev, const void *data)
+{
+	const char *name = data;
+
+	return sysfs_streq(name, dev_name(dev));
+}
+
+/**
+ * stm_find_device() - find stm device by name
+ * @buf:	character buffer containing the name
+ * @len:	length of the name in @buf
+ *
+ * This is called from attributes' store methods, so it will
+ * also trim the trailing newline if necessary.
+ *
+ * Return:	device pointer or null if lookup failed.
+ */
+struct device *stm_find_device(const char *buf, size_t len)
+{
+	if (!stm_core_up)
+		return NULL;
+
+	return class_find_device(&stm_class, NULL, buf, stm_dev_match);
+}
+
+/*
+ * Internally we only care about software-writable masters here, that is the
+ * ones in the range [stm_data->sw_start..stm_data..sw_end], however we need
+ * original master numbers to be visible externally, since they are the ones
+ * that will appear in the STP stream. Thus, the internal bookkeeping uses
+ * $master - stm_data->sw_start to reference master descriptors and such.
+ */
+
+#define __stm_master(_s, _m)				\
+	((_s)->masters[(_m) - (_s)->data->sw_start])
+
+static inline struct stp_master *
+stm_master(struct stm_device *stm, unsigned int idx)
+{
+	if (idx < stm->data->sw_start || idx > stm->data->sw_end)
+		return NULL;
+
+	return __stm_master(stm, idx);
+}
+
+static int stp_master_alloc(struct stm_device *stm, unsigned int idx)
+{
+	struct stp_master *master;
+	size_t size;
+
+	size = ALIGN(stm->data->sw_nchannels, 8) / 8;
+	size += sizeof(struct stp_master);
+	master = kzalloc(size, GFP_ATOMIC);
+	if (!master)
+		return -ENOMEM;
+
+	master->nr_free = stm->data->sw_nchannels;
+	__stm_master(stm, idx) = master;
+
+	return 0;
+}
+
+static void stp_master_free(struct stm_device *stm, unsigned int idx)
+{
+	struct stp_master *master = stm_master(stm, idx);
+
+	if (!master)
+		return;
+
+	__stm_master(stm, idx) = NULL;
+	kfree(master);
+}
+
+static void stm_output_claim(struct stm_device *stm, struct stm_output *output)
+{
+	struct stp_master *master = stm_master(stm, output->master);
+
+	if (WARN_ON_ONCE(master->nr_free < output->nr_chans))
+		return;
+
+	bitmap_allocate_region(&master->chan_map[0], output->channel,
+			       ilog2(output->nr_chans));
+
+	master->nr_free -= output->nr_chans;
+}
+
+static void
+stm_output_disclaim(struct stm_device *stm, struct stm_output *output)
+{
+	struct stp_master *master = stm_master(stm, output->master);
+
+	bitmap_release_region(&master->chan_map[0], output->channel,
+			      ilog2(output->nr_chans));
+
+	master->nr_free += output->nr_chans;
+}
+
+/*
+ * This is like bitmap_find_free_region(), except it can ignore @start bits
+ * at the beginning.
+ */
+static int find_free_channels(unsigned long *bitmap, unsigned int start,
+			      unsigned int end, unsigned int width)
+{
+	unsigned int pos;
+	int i;
+
+	for (pos = start; pos < end + 1; pos = ALIGN(pos, width)) {
+		pos = find_next_zero_bit(bitmap, end + 1, pos);
+		if (pos + width > end + 1)
+			break;
+
+		if (pos & (width - 1))
+			continue;
+
+		for (i = 1; i < width && !test_bit(pos + i, bitmap); i++)
+			;
+		if (i == width)
+			return pos;
+	}
+
+	return -1;
+}
+
+static unsigned int
+stm_find_master_chan(struct stm_device *stm, unsigned int width,
+		     unsigned int *mstart, unsigned int mend,
+		     unsigned int *cstart, unsigned int cend)
+{
+	struct stp_master *master;
+	unsigned int midx;
+	int pos, err;
+
+	for (midx = *mstart; midx <= mend; midx++) {
+		if (!stm_master(stm, midx)) {
+			err = stp_master_alloc(stm, midx);
+			if (err)
+				return err;
+		}
+
+		master = stm_master(stm, midx);
+
+		if (!master->nr_free)
+			continue;
+
+		pos = find_free_channels(master->chan_map, *cstart, cend,
+					 width);
+		if (pos < 0)
+			continue;
+
+		*mstart = midx;
+		*cstart = pos;
+		return 0;
+	}
+
+	return -ENOSPC;
+}
+
+static int stm_output_assign(struct stm_device *stm, unsigned int width,
+			     struct stp_policy_node *policy_node,
+			     struct stm_output *output)
+{
+	unsigned int midx, cidx, mend, cend;
+	int ret = -EBUSY;
+
+	if (width > stm->data->sw_nchannels)
+		return -EINVAL;
+
+	if (policy_node) {
+		stp_policy_node_get_ranges(policy_node,
+					   &midx, &mend, &cidx, &cend);
+	} else {
+		midx = stm->data->sw_start;
+		cidx = 0;
+		mend = stm->data->sw_end;
+		cend = stm->data->sw_nchannels - 1;
+	}
+
+	spin_lock(&stm->mc_lock);
+	if (output->nr_chans)
+		goto unlock;
+
+	ret = stm_find_master_chan(stm, width, &midx, mend, &cidx, cend);
+	if (ret)
+		goto unlock;
+
+	output->master = midx;
+	output->channel = cidx;
+	output->nr_chans = width;
+	stm_output_claim(stm, output);
+	dev_dbg(stm->dev, "assigned %u:%u (+%u)\n", midx, cidx, width);
+
+	ret = 0;
+unlock:
+	spin_unlock(&stm->mc_lock);
+
+	return ret;
+}
+
+static void stm_output_free(struct stm_device *stm, struct stm_output *output)
+{
+	spin_lock(&stm->mc_lock);
+	if (output->nr_chans)
+		stm_output_disclaim(stm, output);
+	spin_unlock(&stm->mc_lock);
+}
+
+static int major_match(struct device *dev, const void *data)
+{
+	unsigned int major = *(unsigned int *)data;
+
+	return MAJOR(dev->devt) == major;
+}
+
+static int stm_char_open(struct inode *inode, struct file *file)
+{
+	struct stm_file *stmf;
+	struct device *dev;
+	unsigned int major = imajor(inode);
+	int err = -ENODEV;
+
+	dev = class_find_device(&stm_class, NULL, &major, major_match);
+	if (!dev)
+		return -ENODEV;
+
+	stmf = kzalloc(sizeof(*stmf), GFP_KERNEL);
+	if (!stmf)
+		return -ENOMEM;
+
+	stmf->stm = dev_get_drvdata(dev);
+
+	if (!try_module_get(stmf->stm->owner))
+		goto err_free;
+
+	file->private_data = stmf;
+
+	return nonseekable_open(inode, file);
+
+err_free:
+	kfree(stmf);
+
+	return err;
+}
+
+static int stm_char_release(struct inode *inode, struct file *file)
+{
+	struct stm_file *stmf = file->private_data;
+
+	stm_output_free(stmf->stm, &stmf->output);
+	module_put(stmf->stm->owner);
+	kfree(stmf);
+
+	return 0;
+}
+
+static int stm_file_assign(struct stm_file *stmf, char *id, unsigned int width)
+{
+	struct stm_device *stm = stmf->stm;
+	int ret;
+
+	mutex_lock(&stm->policy_mutex);
+	if (stm->policy)
+		stmf->policy_node = stp_policy_node_lookup(stm->policy, id);
+
+	ret = stm_output_assign(stm, width, stmf->policy_node, &stmf->output);
+	mutex_unlock(&stm->policy_mutex);
+
+	return ret;
+}
+
+static ssize_t stm_char_write(struct file *file, const char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	struct stm_file *stmf = file->private_data;
+	struct stm_device *stm = stmf->stm;
+	char *kbuf;
+	int err;
+
+	/*
+	 * if no m/c have been assigned to this writer up to this
+	 * point, use "default" policy entry
+	 */
+	if (!stmf->output.nr_chans) {
+		err = stm_file_assign(stmf, "default", 1);
+		/*
+		 * EBUSY means that somebody else just assigned this
+		 * output, which is just fine for write()
+		 */
+		if (err && err != -EBUSY)
+			return err;
+	}
+
+	kbuf = kmalloc(count + 1, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	err = copy_from_user(kbuf, buf, count);
+	if (err) {
+		kfree(kbuf);
+		return -EFAULT;
+	}
+
+	stm->data->write(stm->data, stmf->output.master,
+			 stmf->output.channel, kbuf, count);
+
+
+	kfree(kbuf);
+
+	return count;
+}
+
+static int stm_char_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct stm_file *stmf = file->private_data;
+	struct stm_device *stm = stmf->stm;
+	unsigned long size, phys;
+
+	if (!stm->data->mmio_addr)
+		return -EOPNOTSUPP;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	size = vma->vm_end - vma->vm_start;
+
+	if (stmf->output.nr_chans * stm->data->sw_mmiosz != size)
+		return -EINVAL;
+
+	phys = stm->data->mmio_addr(stm->data, stmf->output.master,
+				    stmf->output.channel,
+				    stmf->output.nr_chans);
+
+	if (!phys)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
+	vm_iomap_memory(vma, phys, size);
+
+	return 0;
+}
+
+static int stm_char_policy_set_ioctl(struct stm_file *stmf, void __user *arg)
+{
+	struct stm_device *stm = stmf->stm;
+	struct stp_policy_id *id;
+	int ret = -EINVAL;
+	u32 size;
+
+	if (stmf->output.nr_chans)
+		return -EBUSY;
+
+	if (copy_from_user(&size, arg, sizeof(size)))
+		return -EFAULT;
+
+	if (size >= PATH_MAX + sizeof(*id))
+		return -EINVAL;
+
+	/* size + 1 to make sure the .id string at the bottom is terminated */
+	id = kzalloc(size + 1, GFP_KERNEL);
+	if (!id)
+		return -ENOMEM;
+
+	if (copy_from_user(id, arg, size)) {
+		ret = -EFAULT;
+		goto err_free;
+	}
+
+	if (id->__reserved_0 || id->__reserved_1)
+		goto err_free;
+
+	if (id->width < 1 ||
+	    id->width > PAGE_SIZE / stm->data->sw_mmiosz)
+		goto err_free;
+
+	ret = stm_file_assign(stmf, id->id, id->width);
+	if (ret)
+		goto err_free;
+
+	if (stm->data->link)
+		stm->data->link(stm->data, stmf->output.master,
+				stmf->output.channel);
+
+	ret = 0;
+
+err_free:
+	kfree(id);
+
+	return ret;
+}
+
+static int stm_char_policy_get_ioctl(struct stm_file *stmf, void __user *arg)
+{
+	struct stp_policy_id id = {
+		.size		= sizeof(id),
+		.master		= stmf->output.master,
+		.channel	= stmf->output.channel,
+		.width		= stmf->output.nr_chans,
+		.__reserved_0	= 0,
+		.__reserved_1	= 0,
+	};
+
+	return copy_to_user(arg, &id, id.size) ? -EFAULT : 0;
+}
+
+static long
+stm_char_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct stm_file *stmf = file->private_data;
+	struct stm_data *stm_data = stmf->stm->data;
+	int err = -ENOTTY;
+
+	switch (cmd) {
+	case STP_POLICY_ID_SET:
+		err = stm_char_policy_set_ioctl(stmf, (void __user *)arg);
+		if (err)
+			return err;
+
+		return stm_char_policy_get_ioctl(stmf, (void __user *)arg);
+
+	case STP_POLICY_ID_GET:
+		return stm_char_policy_get_ioctl(stmf, (void __user *)arg);
+
+	default:
+		if (stm_data->ioctl)
+			err = stm_data->ioctl(stm_data, cmd, arg);
+
+		break;
+	}
+
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+static long
+stm_char_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	return stm_char_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#else
+#define stm_char_compat_ioctl	NULL
+#endif
+
+static const struct file_operations stm_fops = {
+	.open		= stm_char_open,
+	.release	= stm_char_release,
+	.write		= stm_char_write,
+	.mmap		= stm_char_mmap,
+	.unlocked_ioctl	= stm_char_ioctl,
+	.compat_ioctl	= stm_char_compat_ioctl,
+	.llseek		= no_llseek,
+};
+
+int stm_register_device(struct device *parent, struct stm_data *stm_data,
+			struct module *owner)
+{
+	struct stm_device *stm;
+	struct device *dev;
+	unsigned int nmasters;
+	int err = -ENOMEM;
+
+	if (!stm_core_up)
+		return -EPROBE_DEFER;
+
+	if (!stm_data->write || !stm_data->sw_nchannels)
+		return -EINVAL;
+
+	nmasters = stm_data->sw_end - stm_data->sw_start;
+	stm = kzalloc(sizeof(*stm) + nmasters * sizeof(void *), GFP_KERNEL);
+	if (!stm)
+		return -ENOMEM;
+
+	stm->major = register_chrdev(0, stm_data->name, &stm_fops);
+	if (stm->major < 0)
+		goto err_free;
+
+	dev = device_create(&stm_class, parent, MKDEV(stm->major, 0), NULL,
+			    "%s", stm_data->name);
+	if (IS_ERR(dev)) {
+		err = PTR_ERR(dev);
+		goto err_device;
+	}
+
+	spin_lock_init(&stm->link_lock);
+	INIT_LIST_HEAD(&stm->link_list);
+
+	spin_lock_init(&stm->mc_lock);
+	mutex_init(&stm->policy_mutex);
+	stm->sw_nmasters = nmasters;
+	stm->owner = owner;
+	stm->data = stm_data;
+	stm->dev = dev;
+	stm_data->stm = stm;
+
+	dev_set_drvdata(dev, stm);
+
+	return 0;
+
+err_device:
+	device_unregister(dev);
+err_free:
+	kfree(stm);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(stm_register_device);
+
+static void stm_source_link_drop(struct stm_source_device *src);
+
+void stm_unregister_device(struct stm_data *stm_data)
+{
+	struct stm_device *stm = stm_data->stm;
+	struct stm_source_device *src, *iter;
+	int i;
+
+	spin_lock(&stm->link_lock);
+	list_for_each_entry_safe(src, iter, &stm->link_list, link_entry) {
+		stm_source_link_drop(src);
+	}
+	spin_unlock(&stm->link_lock);
+
+	synchronize_srcu(&stm_source_srcu);
+
+	unregister_chrdev(stm->major, stm_data->name);
+
+	if (stm->policy)
+		stp_policy_unbind(stm->policy);
+
+	for (i = 0; i < stm->sw_nmasters; i++)
+		stp_master_free(stm, i);
+
+	device_unregister(stm->dev);
+	kfree(stm);
+	stm_data->stm = NULL;
+}
+EXPORT_SYMBOL_GPL(stm_unregister_device);
+
+/**
+ * stm_source_link_add() - connect an stm_source device to an stm device
+ * @src:	stm_source device
+ * @stm:	stm device
+ *
+ * This function establishes a link from stm_source to an stm device so that
+ * the former can send out trace data to the latter.
+ *
+ * Return:	0 on success, -errno otherwise.
+ */
+static int stm_source_link_add(struct stm_source_device *src,
+			       struct stm_device *stm)
+{
+	int err;
+
+	spin_lock(&stm->link_lock);
+	spin_lock(&src->link_lock);
+
+	/* src->link is dereferenced under stm_source_srcu but not the list */
+	rcu_assign_pointer(src->link, stm);
+	list_add_tail(&src->link_entry, &stm->link_list);
+
+	spin_unlock(&src->link_lock);
+	spin_unlock(&stm->link_lock);
+
+	if (stm->policy) {
+		char *id = kstrdup(src->data->name, GFP_KERNEL);
+
+		if (id) {
+			src->policy_node =
+				stp_policy_node_lookup(stm->policy, id);
+
+			kfree(id);
+		}
+	}
+
+	err = stm_output_assign(stm, src->data->nr_chans,
+				src->policy_node, &src->output);
+	if (err)
+		return err;
+
+	/* this is to notify the STM device that a new link has been made */
+	if (stm->data->link)
+		stm->data->link(stm->data, src->output.master,
+				src->output.channel);
+
+	/* this is to let the source carry out all necessary preparations */
+	if (src->data->link)
+		src->data->link(src->data);
+
+	return 0;
+}
+
+/**
+ * stm_source_link_drop() - detach stm_source from its stm device
+ * @src:	stm_source device
+ *
+ * Unlinking means disconnecting from source's STM device; after this
+ * writes will be unsuccessful until it is linked to a new STM device.
+ *
+ * This will happen on "stm_source_link" sysfs attribute write to undo
+ * the existing link (if any), or on linked STM device's de-registration.
+ */
+static void stm_source_link_drop(struct stm_source_device *src)
+{
+	int idx = srcu_read_lock(&stm_source_srcu);
+
+	if (src->link && src->data->unlink)
+		src->data->unlink(src->data);
+
+	srcu_read_unlock(&stm_source_srcu, idx);
+
+	spin_lock(&src->link_lock);
+	if (src->link) {
+		stm_output_free(src->link, &src->output);
+		list_del_init(&src->link_entry);
+		rcu_assign_pointer(src->link, NULL);
+	}
+	spin_unlock(&src->link_lock);
+}
+
+static ssize_t stm_source_link_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct stm_source_device *src = dev_get_drvdata(dev);
+	int idx, ret;
+
+	idx = srcu_read_lock(&stm_source_srcu);
+	ret = sprintf(buf, "%s\n",
+		      src->link ? dev_name(src->link->dev) : "<none>");
+	srcu_read_unlock(&stm_source_srcu, idx);
+
+	return ret;
+}
+
+static ssize_t stm_source_link_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct stm_source_device *src = dev_get_drvdata(dev);
+	struct stm_device *link;
+	struct device *linkdev;
+	int err;
+
+	stm_source_link_drop(src);
+
+	linkdev = stm_find_device(buf, count);
+	if (!linkdev)
+		return -EINVAL;
+
+	link = dev_get_drvdata(linkdev);
+
+	err = stm_source_link_add(src, link);
+
+	return err ? : count;
+}
+
+static DEVICE_ATTR_RW(stm_source_link);
+
+static struct attribute *stm_source_attrs[] = {
+	&dev_attr_stm_source_link.attr,
+	NULL,
+};
+
+static const struct attribute_group stm_source_group = {
+	.attrs	= stm_source_attrs,
+};
+
+static const struct attribute_group *stm_source_groups[] = {
+	&stm_source_group,
+	NULL,
+};
+
+static struct class stm_source_class = {
+	.name		= "stm_source",
+	.dev_groups	= stm_source_groups,
+};
+
+/**
+ * stm_source_register_device() - register an stm_source device
+ * @parent:	parent device
+ * @data:	device description structure
+ *
+ * This will create a device of stm_source class that can write
+ * data to an stm device once linked.
+ *
+ * Return:	0 on success, -errno otherwise.
+ */
+int stm_source_register_device(struct device *parent,
+			       struct stm_source_data *data)
+{
+	struct stm_source_device *src;
+	struct device *dev;
+
+	if (!stm_core_up)
+		return -EPROBE_DEFER;
+
+	src = kzalloc(sizeof(*src), GFP_KERNEL);
+	if (!src)
+		return -ENOMEM;
+
+	dev = device_create(&stm_source_class, parent, MKDEV(0, 0), NULL, "%s",
+			    data->name);
+	if (IS_ERR(dev)) {
+		kfree(src);
+		return PTR_ERR(dev);
+	}
+
+	spin_lock_init(&src->link_lock);
+	INIT_LIST_HEAD(&src->link_entry);
+	src->dev = dev;
+	src->data = data;
+	data->src = src;
+	dev_set_drvdata(dev, src);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(stm_source_register_device);
+
+/**
+ * stm_source_unregister_device() - unregister an stm_source device
+ * @data:	device description that was used to register the device
+ *
+ * This will remove a previously created stm_source device from the system.
+ */
+void stm_source_unregister_device(struct stm_source_data *data)
+{
+	struct stm_source_device *src = data->src;
+
+	stm_source_link_drop(src);
+
+	device_destroy(&stm_source_class, src->dev->devt);
+
+	kfree(src);
+}
+EXPORT_SYMBOL_GPL(stm_source_unregister_device);
+
+int stm_source_write(struct stm_source_data *data, unsigned int chan,
+		     const char *buf, size_t count)
+{
+	struct stm_source_device *src = data->src;
+	struct stm_device *stm;
+	int idx;
+
+	if (!src->output.nr_chans)
+		return -ENODEV;
+
+	if (chan >= src->output.nr_chans)
+		return -EINVAL;
+
+	idx = srcu_read_lock(&stm_source_srcu);
+
+	stm = srcu_dereference(src->link, &stm_source_srcu);
+	if (stm)
+		count = stm->data->write(stm->data, src->output.master,
+					 src->output.channel + chan, buf,
+					 count);
+	else
+		count = -ENODEV;
+
+	srcu_read_unlock(&stm_source_srcu, idx);
+
+	return count;
+}
+EXPORT_SYMBOL_GPL(stm_source_write);
+
+static int __init stm_core_init(void)
+{
+	int err;
+
+	err = class_register(&stm_class);
+	if (err)
+		return err;
+
+	err = class_register(&stm_source_class);
+	if (err)
+		goto err_stm;
+
+	err = stp_configfs_init();
+	if (err)
+		goto err_src;
+
+	init_srcu_struct(&stm_source_srcu);
+
+	stm_core_up++;
+
+	return 0;
+
+err_src:
+	class_unregister(&stm_source_class);
+err_stm:
+	class_unregister(&stm_class);
+
+	return err;
+}
+
+postcore_initcall(stm_core_init);
+
+static void __exit stm_core_exit(void)
+{
+	class_unregister(&stm_source_class);
+	class_unregister(&stm_class);
+	stp_configfs_exit();
+}
+
+module_exit(stm_core_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("System Trace Module device class");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@linux.intel.com>");
diff --git a/drivers/hwtracing/stm/policy.c b/drivers/hwtracing/stm/policy.c
new file mode 100644
index 0000000000..b5c59a0e0c
--- /dev/null
+++ b/drivers/hwtracing/stm/policy.c
@@ -0,0 +1,467 @@
+/*
+ * System Trace Module (STM) master/channel allocation policy management
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * A master/channel allocation policy allows mapping string identifiers to
+ * master and channel ranges, where allocation can be done.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/configfs.h>
+#include <linux/slab.h>
+#include <linux/stm.h>
+#include "stm.h"
+
+/*
+ * STP Master/Channel allocation policy configfs layout.
+ */
+
+struct stp_policy {
+	struct config_group	group;
+	struct stm_device	*stm;
+};
+
+struct stp_policy_node {
+	struct config_group	group;
+	struct stm_device	*stm;
+	struct stp_policy	*policy;
+	unsigned int		first_master;
+	unsigned int		last_master;
+	unsigned int		first_channel;
+	unsigned int		last_channel;
+};
+
+void stp_policy_node_get_ranges(struct stp_policy_node *policy_node,
+				unsigned int *mstart, unsigned int *mend,
+				unsigned int *cstart, unsigned int *cend)
+{
+	*mstart	= policy_node->first_master;
+	*mend	= policy_node->last_master;
+	*cstart	= policy_node->first_channel;
+	*cend	= policy_node->last_channel;
+}
+
+static inline char *stp_policy_node_name(struct stp_policy_node *policy_node)
+{
+	return policy_node->group.cg_item.ci_name ? : "<none>";
+}
+
+static inline struct stp_policy *to_stp_policy(struct config_item *item)
+{
+	return item ?
+		container_of(to_config_group(item), struct stp_policy, group) :
+		NULL;
+}
+
+static inline struct stp_policy_node *
+to_stp_policy_node(struct config_item *item)
+{
+	return item ?
+		container_of(to_config_group(item), struct stp_policy_node,
+			     group) :
+		NULL;
+}
+
+static ssize_t stp_policy_node_masters_show(struct stp_policy_node *policy_node,
+					    char *page)
+{
+	ssize_t count;
+
+	count = sprintf(page, "%u %u\n", policy_node->first_master,
+			policy_node->last_master);
+
+	return count;
+}
+
+static ssize_t
+stp_policy_node_masters_store(struct stp_policy_node *policy_node,
+			      const char *page, size_t count)
+{
+	struct stm_device *stm = policy_node->stm;
+	unsigned int first, last;
+	char *p = (char *) page;
+
+	if (sscanf(p, "%u %u", &first, &last) != 2)
+		return -EINVAL;
+
+	/* must be within [sw_start..sw_end], which is an inclusive range */
+	if (first > INT_MAX || last > INT_MAX || first > last ||
+	    first < stm->data->sw_start ||
+	    last > stm->data->sw_end)
+		return -ERANGE;
+
+	policy_node->first_master = first;
+	policy_node->last_master = last;
+
+	return count;
+}
+
+static ssize_t
+stp_policy_node_channels_show(struct stp_policy_node *policy_node, char *page)
+{
+	ssize_t count;
+
+	count = sprintf(page, "%u %u\n", policy_node->first_channel,
+			policy_node->last_channel);
+
+	return count;
+}
+
+static ssize_t
+stp_policy_node_channels_store(struct stp_policy_node *policy_node,
+			       const char *page, size_t count)
+{
+	unsigned int first, last;
+	char *p = (char *) page;
+
+	if (sscanf(p, "%u %u", &first, &last) != 2)
+		return -EINVAL;
+
+	if (first > INT_MAX || last > INT_MAX || first > last ||
+	    last >= policy_node->stm->data->sw_nchannels)
+		return -ERANGE;
+
+	policy_node->first_channel = first;
+	policy_node->last_channel = last;
+
+	return count;
+}
+
+static void stp_policy_node_release(struct config_item *item)
+{
+	kfree(to_stp_policy_node(item));
+}
+
+struct stp_policy_node_attribute {
+	struct configfs_attribute	attr;
+	ssize_t (*show)(struct stp_policy_node *, char *);
+	ssize_t (*store)(struct stp_policy_node *, const char *, size_t);
+};
+
+static ssize_t stp_policy_node_attr_show(struct config_item *item,
+					 struct configfs_attribute *attr,
+					 char *page)
+{
+	struct stp_policy_node *policy_node = to_stp_policy_node(item);
+	struct stp_policy_node_attribute *pn_attr =
+		container_of(attr, struct stp_policy_node_attribute, attr);
+	ssize_t count = 0;
+
+	if (pn_attr->show)
+		count = pn_attr->show(policy_node, page);
+
+	return count;
+}
+
+static ssize_t stp_policy_node_attr_store(struct config_item *item,
+					  struct configfs_attribute *attr,
+					  const char *page, size_t len)
+{
+	struct stp_policy_node *policy_node = to_stp_policy_node(item);
+	struct stp_policy_node_attribute *pn_attr =
+		container_of(attr, struct stp_policy_node_attribute, attr);
+	ssize_t count = -EINVAL;
+
+	if (pn_attr->store)
+		count = pn_attr->store(policy_node, page, len);
+
+	return count;
+}
+
+static struct configfs_item_operations stp_policy_node_item_ops = {
+	.release		= stp_policy_node_release,
+	.show_attribute		= stp_policy_node_attr_show,
+	.store_attribute	= stp_policy_node_attr_store,
+};
+
+static struct stp_policy_node_attribute stp_policy_node_attr_range = {
+	.attr	= {
+		.ca_owner = THIS_MODULE,
+		.ca_name = "masters",
+		.ca_mode = S_IRUGO | S_IWUSR,
+	},
+	.show	= stp_policy_node_masters_show,
+	.store	= stp_policy_node_masters_store,
+};
+
+static struct stp_policy_node_attribute stp_policy_node_attr_channels = {
+	.attr	= {
+		.ca_owner = THIS_MODULE,
+		.ca_name = "channels",
+		.ca_mode = S_IRUGO | S_IWUSR,
+	},
+	.show	= stp_policy_node_channels_show,
+	.store	= stp_policy_node_channels_store,
+};
+
+static struct configfs_attribute *stp_policy_node_attrs[] = {
+	&stp_policy_node_attr_range.attr,
+	&stp_policy_node_attr_channels.attr,
+	NULL,
+};
+
+static struct config_item_type stp_policy_type;
+static struct config_item_type stp_policy_node_type;
+
+static struct config_group *
+stp_policy_node_make(struct config_group *group, const char *name)
+{
+	struct stp_policy_node *policy_node, *parent_node;
+	struct stp_policy *policy;
+
+	if (group->cg_item.ci_type == &stp_policy_type) {
+		policy = container_of(group, struct stp_policy, group);
+	} else {
+		parent_node = container_of(group, struct stp_policy_node,
+					   group);
+		policy = parent_node->policy;
+	}
+
+	if (!policy->stm)
+		return ERR_PTR(-ENODEV);
+
+	policy_node = kzalloc(sizeof(struct stp_policy_node), GFP_KERNEL);
+	if (!policy_node)
+		return ERR_PTR(-ENOMEM);
+
+	config_group_init_type_name(&policy_node->group, name,
+				    &stp_policy_node_type);
+
+	policy_node->policy = policy;
+	policy_node->stm = policy->stm;
+
+	/* default values for the attributes */
+	policy_node->first_master = policy->stm->data->sw_start;
+	policy_node->last_master = policy->stm->data->sw_end;
+	policy_node->first_channel = 0;
+	policy_node->last_channel = policy->stm->data->sw_nchannels - 1;
+
+	return &policy_node->group;
+}
+
+static void
+stp_policy_node_drop(struct config_group *group, struct config_item *item)
+{
+	config_item_put(item);
+}
+
+static struct configfs_group_operations stp_policy_node_group_ops = {
+	.make_group	= stp_policy_node_make,
+	.drop_item	= stp_policy_node_drop,
+};
+
+static struct config_item_type stp_policy_node_type = {
+	.ct_item_ops	= &stp_policy_node_item_ops,
+	.ct_group_ops	= &stp_policy_node_group_ops,
+	.ct_attrs	= stp_policy_node_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/*
+ * Root group: policies.
+ */
+static struct configfs_attribute stp_policy_attr_device = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "device",
+	.ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *stp_policy_attrs[] = {
+	&stp_policy_attr_device,
+	NULL,
+};
+
+static ssize_t stp_policy_attr_show(struct config_item *item,
+				    struct configfs_attribute *attr,
+				    char *page)
+{
+	struct stp_policy *policy = to_stp_policy(item);
+
+	return sprintf(page, "%s\n",
+		       (policy && policy->stm) ?
+		       policy->stm->data->name :
+		       "<none>");
+}
+
+static ssize_t stp_policy_attr_store(struct config_item *item,
+				     struct configfs_attribute *attr,
+				     const char *page, size_t len)
+{
+	struct stp_policy *policy = to_stp_policy(item);
+	ssize_t count = -EINVAL;
+	struct device *dev;
+
+	dev = stm_find_device(page, len);
+	if (dev) {
+		count = len;
+		if (policy->stm)
+			put_device(policy->stm->dev);
+
+		policy->stm = dev_get_drvdata(dev);
+
+		mutex_lock(&policy->stm->policy_mutex);
+		policy->stm->policy = policy;
+		mutex_unlock(&policy->stm->policy_mutex);
+	}
+
+	return count;
+}
+
+void stp_policy_unbind(struct stp_policy *policy)
+{
+	put_device(policy->stm->dev);
+
+	mutex_lock(&policy->stm->policy_mutex);
+	policy->stm->policy = NULL;
+	mutex_unlock(&policy->stm->policy_mutex);
+
+	policy->stm = NULL;
+}
+
+static void stp_policy_release(struct config_item *item)
+{
+	struct stp_policy *policy = to_stp_policy(item);
+
+	stp_policy_unbind(policy);
+	kfree(policy);
+}
+
+static struct configfs_item_operations stp_policy_item_ops = {
+	.release		= stp_policy_release,
+	.show_attribute		= stp_policy_attr_show,
+	.store_attribute	= stp_policy_attr_store,
+};
+
+static struct configfs_group_operations stp_policy_group_ops = {
+	.make_group	= stp_policy_node_make,
+};
+
+static struct config_item_type stp_policy_type = {
+	.ct_item_ops	= &stp_policy_item_ops,
+	.ct_group_ops	= &stp_policy_group_ops,
+	.ct_attrs	= stp_policy_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct config_group *
+stp_policies_make(struct config_group *group, const char *name)
+{
+	struct stp_policy *policy;
+
+	policy = kzalloc(sizeof(*policy), GFP_KERNEL);
+	if (!policy)
+		return ERR_PTR(-ENOMEM);
+
+	config_group_init_type_name(&policy->group, name,
+				    &stp_policy_type);
+	policy->stm = NULL;
+
+	return &policy->group;
+}
+
+static struct configfs_group_operations stp_policies_group_ops = {
+	.make_group	= stp_policies_make,
+};
+
+static struct config_item_type stp_policies_type = {
+	.ct_group_ops	= &stp_policies_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct configfs_subsystem stp_policy_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf	= "stp-policy",
+			.ci_type	= &stp_policies_type,
+		},
+	},
+};
+
+/*
+ * Lock the policy mutex from the outside
+ */
+static struct stp_policy_node *
+__stp_policy_node_lookup(struct stp_policy *policy, char *s)
+{
+	struct stp_policy_node *policy_node, *ret;
+	struct list_head *head = &policy->group.cg_children;
+	struct config_item *item;
+	char *start, *end = s;
+
+	if (list_empty(head))
+		return NULL;
+
+	/* return the first entry if everything else fails */
+	item = list_entry(head->next, struct config_item, ci_entry);
+	ret = to_stp_policy_node(item);
+
+next:
+	for (;;) {
+		start = strsep(&end, "/");
+		if (!start)
+			break;
+
+		if (!*start)
+			continue;
+
+		list_for_each_entry(item, head, ci_entry) {
+			policy_node = to_stp_policy_node(item);
+
+			if (!strcmp(start,
+				    policy_node->group.cg_item.ci_name)) {
+				ret = policy_node;
+
+				if (!end)
+					goto out;
+
+				head = &policy_node->group.cg_children;
+				goto next;
+			}
+		}
+		break;
+	}
+
+out:
+	return ret;
+}
+
+struct stp_policy_node *
+stp_policy_node_lookup(struct stp_policy *policy, char *s)
+{
+	struct stp_policy_node *policy_node;
+
+	mutex_lock(&stp_policy_subsys.su_mutex);
+	policy_node = __stp_policy_node_lookup(policy, s);
+	mutex_unlock(&stp_policy_subsys.su_mutex);
+
+	return policy_node;
+}
+
+int __init stp_configfs_init(void)
+{
+	int err;
+
+	config_group_init(&stp_policy_subsys.su_group);
+	mutex_init(&stp_policy_subsys.su_mutex);
+	err = configfs_register_subsystem(&stp_policy_subsys);
+
+	return err;
+}
+
+void __exit stp_configfs_exit(void)
+{
+	configfs_unregister_subsystem(&stp_policy_subsys);
+}
diff --git a/drivers/hwtracing/stm/stm.h b/drivers/hwtracing/stm/stm.h
new file mode 100644
index 0000000000..4d088a1400
--- /dev/null
+++ b/drivers/hwtracing/stm/stm.h
@@ -0,0 +1,79 @@
+/*
+ * System Trace Module (STM) infrastructure
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * STM class implements generic infrastructure for  System Trace Module devices
+ * as defined in MIPI STPv2 specification.
+ */
+
+#ifndef _CLASS_STM_H_
+#define _CLASS_STM_H_
+
+struct stp_policy;
+struct stp_policy_node;
+
+struct stp_policy_node *
+stp_policy_node_lookup(struct stp_policy *policy, char *s);
+void stp_policy_unbind(struct stp_policy *policy);
+
+void stp_policy_node_get_ranges(struct stp_policy_node *policy_node,
+				unsigned int *mstart, unsigned int *mend,
+				unsigned int *cstart, unsigned int *cend);
+int stp_configfs_init(void);
+void stp_configfs_exit(void);
+
+struct stp_master {
+	unsigned int	nr_free;
+	unsigned long	chan_map[0];
+};
+
+struct stm_device {
+	struct device		*dev;
+	struct module		*owner;
+	struct stp_policy	*policy;
+	struct mutex		policy_mutex;
+	int			major;
+	unsigned int		sw_nmasters;
+	struct stm_data		*data;
+	spinlock_t		link_lock;
+	struct list_head	link_list;
+	/* master allocation */
+	spinlock_t		mc_lock;
+	struct stp_master	*masters[0];
+};
+
+struct stm_output {
+	unsigned int		master;
+	unsigned int		channel;
+	unsigned int		nr_chans;
+};
+
+struct stm_file {
+	struct stm_device	*stm;
+	struct stp_policy_node	*policy_node;
+	struct stm_output	output;
+};
+
+struct device *stm_find_device(const char *name, size_t len);
+
+struct stm_source_device {
+	struct device		*dev;
+	struct stm_source_data	*data;
+	spinlock_t		link_lock;
+	struct stm_device	*link;
+	struct list_head	link_entry;
+	/* one output per stm_source device */
+	struct stp_policy_node	*policy_node;
+	struct stm_output	output;
+};
+
+#endif /* _CLASS_STM_H_ */
diff --git a/include/linux/stm.h b/include/linux/stm.h
new file mode 100644
index 0000000000..976c94d8f1
--- /dev/null
+++ b/include/linux/stm.h
@@ -0,0 +1,102 @@
+/*
+ * System Trace Module (STM) infrastructure apis
+ * Copyright (C) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _STM_H_
+#define _STM_H_
+
+#include <linux/device.h>
+
+struct stp_policy;
+
+struct stm_device;
+
+/**
+ * struct stm_data - STM device description and callbacks
+ * @name:		device name
+ * @stm:		internal structure, only used by stm class code
+ * @sw_start:		first STP master available to software
+ * @sw_end:		last STP master available to software
+ * @sw_nchannels:	number of STP channels per master
+ * @sw_mmiosz:		size of one channel's IO space, for mmap, optional
+ * @write:		write callback
+ * @mmio_addr:		mmap callback, optional
+ * @link:		called when a new stm_source gets linked to us, optional
+ * @unlink:		likewise for unlinking, again optional
+ * @ioctl:		ioctl callback for device-specific commands
+ *
+ * Fill out this structure before calling stm_register_device() to create
+ * an STM device and stm_unregister_device() to destroy it. It will also be
+ * passed back to @write(), @mmio_addr(), @link(), @unlink() and @ioctl()
+ * callbacks.
+ *
+ * Normally, an STM device will have a range of masters available to software
+ * and the rest being statically assigned to various hardware trace sources.
+ * The former is defined by the the range [@sw_start..@sw_end] of the device
+ * description. That is, the lowest master that can be allocated to software
+ * writers is @sw_start and data from this writer will appear is @sw_start
+ * master in the STP stream.
+ */
+struct stm_data {
+	const char		*name;
+	struct stm_device	*stm;
+	unsigned int		sw_start;
+	unsigned int		sw_end;
+	unsigned int		sw_nchannels;
+	unsigned int		sw_mmiosz;
+	ssize_t			(*write)(struct stm_data *, unsigned int,
+					 unsigned int, const char *, size_t);
+	phys_addr_t		(*mmio_addr)(struct stm_data *, unsigned int,
+					     unsigned int, unsigned int);
+	void			(*link)(struct stm_data *, unsigned int,
+					unsigned int);
+	void			(*unlink)(struct stm_data *, unsigned int,
+					  unsigned int);
+	long			(*ioctl)(struct stm_data *, unsigned int,
+					 unsigned long);
+};
+
+int stm_register_device(struct device *parent, struct stm_data *stm_data,
+			struct module *owner);
+void stm_unregister_device(struct stm_data *stm_data);
+
+struct stm_source_device;
+
+/**
+ * struct stm_source_data - STM source device description and callbacks
+ * @name:	device name, will be used for policy lookup
+ * @src:	internal structure, only used by stm class code
+ * @nr_chans:	number of channels to allocate
+ * @link:	called when this source gets linked to an STM device
+ * @unlink:	called when this source is about to get unlinked from its STM
+ *
+ * Fill in this structure before calling stm_source_register_device() to
+ * register a source device. Also pass it to unregister and write calls.
+ */
+struct stm_source_data {
+	const char		*name;
+	struct stm_source_device *src;
+	unsigned int		percpu;
+	unsigned int		nr_chans;
+	int			(*link)(struct stm_source_data *data);
+	void			(*unlink)(struct stm_source_data *data);
+};
+
+int stm_source_register_device(struct device *parent,
+			       struct stm_source_data *data);
+void stm_source_unregister_device(struct stm_source_data *data);
+
+int stm_source_write(struct stm_source_data *data, unsigned int chan,
+		     const char *buf, size_t count);
+
+#endif /* _STM_H_ */
diff --git a/include/uapi/linux/stm.h b/include/uapi/linux/stm.h
new file mode 100644
index 0000000000..042b58b53b
--- /dev/null
+++ b/include/uapi/linux/stm.h
@@ -0,0 +1,47 @@
+/*
+ * System Trace Module (STM) userspace interfaces
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * STM class implements generic infrastructure for  System Trace Module devices
+ * as defined in MIPI STPv2 specification.
+ */
+
+#ifndef _UAPI_LINUX_STM_H
+#define _UAPI_LINUX_STM_H
+
+/**
+ * struct stp_policy_id - identification for the STP policy
+ * @size:	size of the structure including real id[] length
+ * @master:	assigned master
+ * @channel:	first assigned channel
+ * @width:	number of requested channels
+ * @id:		identification string
+ *
+ * User must calculate the total size of the structure and put it into
+ * @size field, fill out the @id and desired @width. In return, kernel
+ * fills out @master, @channel and @width.
+ */
+struct stp_policy_id {
+	__u32		size;
+	__u16		master;
+	__u16		channel;
+	__u16		width;
+	/* padding */
+	__u16		__reserved_0;
+	__u32		__reserved_1;
+	char		id[0];
+};
+
+#define STP_POLICY_ID_SET	_IOWR('%', 0, struct stp_policy_id)
+#define STP_POLICY_ID_GET	_IOR('%', 1, struct stp_policy_id)
+
+#endif /* _UAPI_LINUX_STM_H */
-- 
2.1.4

^ permalink raw reply related

* [PATCH 0/5] Enhancements to twl4030 phy to support better charging - V2
From: NeilBrown @ 2015-03-22 22:35 UTC (permalink / raw)
  To: Kishon Vijay Abraham I
  Cc: Tony Lindgren, linux-api-u79uwXL29TY76Z2rM5mHXA, GTA04 owners,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Pavel Machek

Hi Kishon,
 I wonder if you could queue the following for the next merge window.
 They allow the twl4030 phy to provide more information to the
 twl4030 battery charger.
 There are only minimal changes since the first version, particularly
 documentation has been improved.

Thanks,
NeilBrown


---

NeilBrown (5):
      usb: phy: twl4030: make runtime pm more reliable.
      usb: phy: twl4030: allow charger to see usb current draw limits.
      usb: phy: twl4030: add ABI documentation
      usb: phy: twl4030: add support for reading restore on ID pin.
      usb: phy: twl4030: test ID resistance to see if charger is present.


 .../ABI/testing/sysfs-platform-twl4030-usb         |   30 ++++
 drivers/phy/phy-twl4030-usb.c                      |  147 ++++++++++++++++++--
 2 files changed, 162 insertions(+), 15 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-platform-twl4030-usb

--
Signature

^ permalink raw reply

* [PATCH 1/5] usb: phy: twl4030: make runtime pm more reliable.
From: NeilBrown @ 2015-03-22 22:35 UTC (permalink / raw)
  To: Kishon Vijay Abraham I
  Cc: Tony Lindgren, linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, GTA04 owners, NeilBrown,
	Pavel Machek
In-Reply-To: <20150322223307.21765.62974.stgit-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>

From: NeilBrown <neilb-l3A5Bk7waGM@public.gmane.org>

A construct like:

        if (pm_runtime_suspended(twl->dev))
               pm_runtime_get_sync(twl->dev);

is against the spirit of the runtime_pm interface as it
makes the internal refcounting useless.

In this case it is also racy, particularly as 'put_autosuspend'
is use to drop a reference.
When that happens a timer is started and the device is
runtime-suspended after the timeout.
If the above code runs in this window, the device will not be
found to be suspended so no pm_runtime reference is taken.
When the timer expires the device will be suspended, which is
against the intention of the code.

So be more direct is taking and dropping references.
If twl->linkstat is VBUS_VALID or ID_GROUND, then hold a
pm_runtime reference, otherwise don't.
Define "cable_present()" to test for this condition.

Tested-by: Tony Lindgren <tony-4v6yS6AI5VpBDgjK7y7TUQ@public.gmane.org>
Signed-off-by: NeilBrown <neilb-l3A5Bk7waGM@public.gmane.org>
---
 drivers/phy/phy-twl4030-usb.c |   29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/phy/phy-twl4030-usb.c b/drivers/phy/phy-twl4030-usb.c
index 8e87f54671f3..1a244f34b748 100644
--- a/drivers/phy/phy-twl4030-usb.c
+++ b/drivers/phy/phy-twl4030-usb.c
@@ -144,6 +144,16 @@
 #define PMBR1				0x0D
 #define GPIO_USB_4PIN_ULPI_2430C	(3 << 0)
 
+/*
+ * If VBUS is valid or ID is ground, then we know a
+ * cable is present and we need to be runtime-enabled
+ */
+static inline bool cable_present(enum omap_musb_vbus_id_status stat)
+{
+	return stat == OMAP_MUSB_VBUS_VALID ||
+		stat == OMAP_MUSB_ID_GROUND;
+}
+
 struct twl4030_usb {
 	struct usb_phy		phy;
 	struct device		*dev;
@@ -536,8 +546,10 @@ static irqreturn_t twl4030_usb_irq(int irq, void *_twl)
 
 	mutex_lock(&twl->lock);
 	if (status >= 0 && status != twl->linkstat) {
+		status_changed =
+			cable_present(twl->linkstat) !=
+			cable_present(status);
 		twl->linkstat = status;
-		status_changed = true;
 	}
 	mutex_unlock(&twl->lock);
 
@@ -553,15 +565,11 @@ static irqreturn_t twl4030_usb_irq(int irq, void *_twl)
 		 * USB_LINK_VBUS state.  musb_hdrc won't care until it
 		 * starts to handle softconnect right.
 		 */
-		if ((status == OMAP_MUSB_VBUS_VALID) ||
-		    (status == OMAP_MUSB_ID_GROUND)) {
-			if (pm_runtime_suspended(twl->dev))
-				pm_runtime_get_sync(twl->dev);
+		if (cable_present(status)) {
+			pm_runtime_get_sync(twl->dev);
 		} else {
-			if (pm_runtime_active(twl->dev)) {
-				pm_runtime_mark_last_busy(twl->dev);
-				pm_runtime_put_autosuspend(twl->dev);
-			}
+			pm_runtime_mark_last_busy(twl->dev);
+			pm_runtime_put_autosuspend(twl->dev);
 		}
 		omap_musb_mailbox(status);
 	}
@@ -768,6 +776,9 @@ static int twl4030_usb_remove(struct platform_device *pdev)
 
 	/* disable complete OTG block */
 	twl4030_usb_clear_bits(twl, POWER_CTRL, POWER_CTRL_OTG_ENAB);
+
+	if (cable_present(twl->linkstat))
+		pm_runtime_put_noidle(twl->dev);
 	pm_runtime_mark_last_busy(twl->dev);
 	pm_runtime_put(twl->dev);
 

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox