Netdev List
 help / color / mirror / Atom feed
* [PATCH v2 net-next 5/6] tls: RX path for ktls
From: Dave Watson @ 2018-03-22 17:10 UTC (permalink / raw)
  To: David S. Miller, Tom Herbert, Alexei Starovoitov, herbert,
	linux-crypto, netdev, borisp
  Cc: Atul Gupta, Vakul Garg, Hannes Frederic Sowa, Steffen Klassert,
	John Fastabend, Daniel Borkmann
In-Reply-To: <cover.1521738244.git.davejwatson@fb.com>

Add rx path for tls software implementation.

recvmsg, splice_read, and poll implemented.

An additional sockopt TLS_RX is added, with the same interface as
TLS_TX.  Either TLX_RX or TLX_TX may be provided separately, or
together (with two different setsockopt calls with appropriate keys).

Control messages are passed via CMSG in a similar way to transmit.
If no cmsg buffer is passed, then only application data records
will be passed to userspace, and EIO is returned for other types of
alerts.

EBADMSG is passed for decryption errors, and EMSGSIZE is passed for
framing too big, and EBADMSG for framing too small (matching openssl
semantics). EINVAL is returned for TLS versions that do not match the
original setsockopt call.  All are unrecoverable.

strparser is used to parse TLS framing.   Decryption is done directly
in to userspace buffers if they are large enough to support it, otherwise
sk_cow_data is called (similar to ipsec), and buffers are decrypted in
place and copied.  splice_read always decrypts in place, since no
buffers are provided to decrypt in to.

sk_poll is overridden, and only returns POLLIN if a full TLS message is
received.  Otherwise we wait for strparser to finish reading a full frame.
Actual decryption is only done during recvmsg or splice_read calls.

Signed-off-by: Dave Watson <davejwatson@fb.com>
---
 include/net/tls.h        |  27 ++-
 include/uapi/linux/tls.h |   2 +
 net/tls/Kconfig          |   1 +
 net/tls/tls_main.c       |  62 ++++-
 net/tls/tls_sw.c         | 587 ++++++++++++++++++++++++++++++++++++++++++-----
 5 files changed, 609 insertions(+), 70 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 095b722..437a746 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -40,6 +40,7 @@
 #include <linux/socket.h>
 #include <linux/tcp.h>
 #include <net/tcp.h>
+#include <net/strparser.h>
 
 #include <uapi/linux/tls.h>
 
@@ -58,8 +59,18 @@
 
 struct tls_sw_context {
 	struct crypto_aead *aead_send;
+	struct crypto_aead *aead_recv;
 	struct crypto_wait async_wait;
 
+	/* Receive context */
+	struct strparser strp;
+	void (*saved_data_ready)(struct sock *sk);
+	unsigned int (*sk_poll)(struct file *file, struct socket *sock,
+				struct poll_table_struct *wait);
+	struct sk_buff *recv_pkt;
+	u8 control;
+	bool decrypted;
+
 	/* Sending context */
 	char aad_space[TLS_AAD_SPACE_SIZE];
 
@@ -96,12 +107,17 @@ struct tls_context {
 		struct tls_crypto_info crypto_send;
 		struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128;
 	};
+	union {
+		struct tls_crypto_info crypto_recv;
+		struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128;
+	};
 
 	void *priv_ctx;
 
 	u8 conf:2;
 
 	struct cipher_context tx;
+	struct cipher_context rx;
 
 	struct scatterlist *partially_sent_record;
 	u16 partially_sent_offset;
@@ -128,12 +144,19 @@ int tls_sk_attach(struct sock *sk, int optname, char __user *optval,
 		  unsigned int optlen);
 
 
-int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx);
+int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx);
 int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tls_sw_sendpage(struct sock *sk, struct page *page,
 		    int offset, size_t size, int flags);
 void tls_sw_close(struct sock *sk, long timeout);
-void tls_sw_free_tx_resources(struct sock *sk);
+void tls_sw_free_resources(struct sock *sk);
+int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+		   int nonblock, int flags, int *addr_len);
+unsigned int tls_sw_poll(struct file *file, struct socket *sock,
+			 struct poll_table_struct *wait);
+ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+			   struct pipe_inode_info *pipe,
+			   size_t len, unsigned int flags);
 
 void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
 void tls_icsk_clean_acked(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index 293b2cd..c6633e9 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -38,6 +38,7 @@
 
 /* TLS socket options */
 #define TLS_TX			1	/* Set transmit parameters */
+#define TLS_RX			2	/* Set receive parameters */
 
 /* Supported versions */
 #define TLS_VERSION_MINOR(ver)	((ver) & 0xFF)
@@ -59,6 +60,7 @@
 #define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE		8
 
 #define TLS_SET_RECORD_TYPE	1
+#define TLS_GET_RECORD_TYPE	2
 
 struct tls_crypto_info {
 	__u16 version;
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index eb58303..89b8745a 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -7,6 +7,7 @@ config TLS
 	select CRYPTO
 	select CRYPTO_AES
 	select CRYPTO_GCM
+	select STREAM_PARSER
 	default n
 	---help---
 	Enable kernel support for TLS protocol. This allows symmetric
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index c405bee..6f5c114 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -54,12 +54,15 @@ enum {
 enum {
 	TLS_BASE,
 	TLS_SW_TX,
+	TLS_SW_RX,
+	TLS_SW_RXTX,
 	TLS_NUM_CONFIG,
 };
 
 static struct proto *saved_tcpv6_prot;
 static DEFINE_MUTEX(tcpv6_prot_mutex);
 static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG];
+static struct proto_ops tls_sw_proto_ops;
 
 static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx)
 {
@@ -261,9 +264,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 
 	kfree(ctx->tx.rec_seq);
 	kfree(ctx->tx.iv);
+	kfree(ctx->rx.rec_seq);
+	kfree(ctx->rx.iv);
 
-	if (ctx->conf == TLS_SW_TX)
-		tls_sw_free_tx_resources(sk);
+	if (ctx->conf == TLS_SW_TX ||
+	    ctx->conf == TLS_SW_RX ||
+	    ctx->conf == TLS_SW_RXTX) {
+		tls_sw_free_resources(sk);
+	}
 
 skip_tx_cleanup:
 	release_sock(sk);
@@ -365,8 +373,8 @@ static int tls_getsockopt(struct sock *sk, int level, int optname,
 	return do_tls_getsockopt(sk, optname, optval, optlen);
 }
 
-static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
-				unsigned int optlen)
+static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
+				  unsigned int optlen, int tx)
 {
 	struct tls_crypto_info *crypto_info;
 	struct tls_context *ctx = tls_get_ctx(sk);
@@ -378,7 +386,11 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
 		goto out;
 	}
 
-	crypto_info = &ctx->crypto_send;
+	if (tx)
+		crypto_info = &ctx->crypto_send;
+	else
+		crypto_info = &ctx->crypto_recv;
+
 	/* Currently we don't support set crypto info more than one time */
 	if (TLS_CRYPTO_INFO_READY(crypto_info)) {
 		rc = -EBUSY;
@@ -417,15 +429,31 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
 	}
 
 	/* currently SW is default, we will have ethtool in future */
-	rc = tls_set_sw_offload(sk, ctx);
-	conf = TLS_SW_TX;
+	if (tx) {
+		rc = tls_set_sw_offload(sk, ctx, 1);
+		if (ctx->conf == TLS_SW_RX)
+			conf = TLS_SW_RXTX;
+		else
+			conf = TLS_SW_TX;
+	} else {
+		rc = tls_set_sw_offload(sk, ctx, 0);
+		if (ctx->conf == TLS_SW_TX)
+			conf = TLS_SW_RXTX;
+		else
+			conf = TLS_SW_RX;
+	}
+
 	if (rc)
 		goto err_crypto_info;
 
 	ctx->conf = conf;
 	update_sk_prot(sk, ctx);
-	ctx->sk_write_space = sk->sk_write_space;
-	sk->sk_write_space = tls_write_space;
+	if (tx) {
+		ctx->sk_write_space = sk->sk_write_space;
+		sk->sk_write_space = tls_write_space;
+	} else {
+		sk->sk_socket->ops = &tls_sw_proto_ops;
+	}
 	goto out;
 
 err_crypto_info:
@@ -441,8 +469,10 @@ static int do_tls_setsockopt(struct sock *sk, int optname,
 
 	switch (optname) {
 	case TLS_TX:
+	case TLS_RX:
 		lock_sock(sk);
-		rc = do_tls_setsockopt_tx(sk, optval, optlen);
+		rc = do_tls_setsockopt_conf(sk, optval, optlen,
+					    optname == TLS_TX);
 		release_sock(sk);
 		break;
 	default:
@@ -473,6 +503,14 @@ static void build_protos(struct proto *prot, struct proto *base)
 	prot[TLS_SW_TX] = prot[TLS_BASE];
 	prot[TLS_SW_TX].sendmsg		= tls_sw_sendmsg;
 	prot[TLS_SW_TX].sendpage	= tls_sw_sendpage;
+
+	prot[TLS_SW_RX] = prot[TLS_BASE];
+	prot[TLS_SW_RX].recvmsg		= tls_sw_recvmsg;
+	prot[TLS_SW_RX].close		= tls_sk_proto_close;
+
+	prot[TLS_SW_RXTX] = prot[TLS_SW_TX];
+	prot[TLS_SW_RXTX].recvmsg	= tls_sw_recvmsg;
+	prot[TLS_SW_RXTX].close		= tls_sk_proto_close;
 }
 
 static int tls_init(struct sock *sk)
@@ -531,6 +569,10 @@ static int __init tls_register(void)
 {
 	build_protos(tls_prots[TLSV4], &tcp_prot);
 
+	tls_sw_proto_ops = inet_stream_ops;
+	tls_sw_proto_ops.poll = tls_sw_poll;
+	tls_sw_proto_ops.splice_read = tls_sw_splice_read;
+
 	tcp_register_ulp(&tcp_tls_ulp_ops);
 
 	return 0;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 1c79d9a..4dc766b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -34,11 +34,60 @@
  * SOFTWARE.
  */
 
+#include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <crypto/aead.h>
 
+#include <net/strparser.h>
 #include <net/tls.h>
 
+static int tls_do_decryption(struct sock *sk,
+			     struct scatterlist *sgin,
+			     struct scatterlist *sgout,
+			     char *iv_recv,
+			     size_t data_len,
+			     struct sk_buff *skb,
+			     gfp_t flags)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct strp_msg *rxm = strp_msg(skb);
+	struct aead_request *aead_req;
+
+	int ret;
+	unsigned int req_size = sizeof(struct aead_request) +
+		crypto_aead_reqsize(ctx->aead_recv);
+
+	aead_req = kzalloc(req_size, flags);
+	if (!aead_req)
+		return -ENOMEM;
+
+	aead_request_set_tfm(aead_req, ctx->aead_recv);
+	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
+	aead_request_set_crypt(aead_req, sgin, sgout,
+			       data_len + tls_ctx->rx.tag_size,
+			       (u8 *)iv_recv);
+	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				  crypto_req_done, &ctx->async_wait);
+
+	ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait);
+
+	if (ret < 0)
+		goto out;
+
+	rxm->offset += tls_ctx->rx.prepend_size;
+	rxm->full_len -= tls_ctx->rx.overhead_size;
+	tls_advance_record_sn(sk, &tls_ctx->rx);
+
+	ctx->decrypted = true;
+
+	ctx->saved_data_ready(sk);
+
+out:
+	kfree(aead_req);
+	return ret;
+}
+
 static void trim_sg(struct sock *sk, struct scatterlist *sg,
 		    int *sg_num_elem, unsigned int *sg_size, int target_size)
 {
@@ -581,13 +630,404 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 	return ret;
 }
 
-void tls_sw_free_tx_resources(struct sock *sk)
+static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
+				     long timeo, int *err)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct sk_buff *skb;
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	while (!(skb = ctx->recv_pkt)) {
+		if (sk->sk_err) {
+			*err = sock_error(sk);
+			return NULL;
+		}
+
+		if (sock_flag(sk, SOCK_DONE))
+			return NULL;
+
+		if ((flags & MSG_DONTWAIT) || !timeo) {
+			*err = -EAGAIN;
+			return NULL;
+		}
+
+		add_wait_queue(sk_sleep(sk), &wait);
+		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+		sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait);
+		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+		remove_wait_queue(sk_sleep(sk), &wait);
+
+		/* Handle signals */
+		if (signal_pending(current)) {
+			*err = sock_intr_errno(timeo);
+			return NULL;
+		}
+	}
+
+	return skb;
+}
+
+static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
+		       struct scatterlist *sgout)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + tls_ctx->rx.iv_size];
+	struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2];
+	struct scatterlist *sgin = &sgin_arr[0];
+	struct strp_msg *rxm = strp_msg(skb);
+	int ret, nsg = ARRAY_SIZE(sgin_arr);
+	char aad_recv[TLS_AAD_SPACE_SIZE];
+	struct sk_buff *unused;
+
+	ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE,
+			    iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+			    tls_ctx->rx.iv_size);
+	if (ret < 0)
+		return ret;
+
+	memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	if (!sgout) {
+		nsg = skb_cow_data(skb, 0, &unused) + 1;
+		sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation);
+		if (!sgout)
+			sgout = sgin;
+	}
+
+	sg_init_table(sgin, nsg);
+	sg_set_buf(&sgin[0], aad_recv, sizeof(aad_recv));
+
+	nsg = skb_to_sgvec(skb, &sgin[1],
+			   rxm->offset + tls_ctx->rx.prepend_size,
+			   rxm->full_len - tls_ctx->rx.prepend_size);
+
+	tls_make_aad(aad_recv,
+		     rxm->full_len - tls_ctx->rx.overhead_size,
+		     tls_ctx->rx.rec_seq,
+		     tls_ctx->rx.rec_seq_size,
+		     ctx->control);
+
+	ret = tls_do_decryption(sk, sgin, sgout, iv,
+				rxm->full_len - tls_ctx->rx.overhead_size,
+				skb, sk->sk_allocation);
+
+	if (sgin != &sgin_arr[0])
+		kfree(sgin);
+
+	return ret;
+}
+
+static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
+			       unsigned int len)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct strp_msg *rxm = strp_msg(skb);
+
+	if (len < rxm->full_len) {
+		rxm->offset += len;
+		rxm->full_len -= len;
+
+		return false;
+	}
+
+	/* Finished with message */
+	ctx->recv_pkt = NULL;
+	kfree_skb(skb);
+	strp_unpause(&ctx->strp);
+
+	return true;
+}
+
+int tls_sw_recvmsg(struct sock *sk,
+		   struct msghdr *msg,
+		   size_t len,
+		   int nonblock,
+		   int flags,
+		   int *addr_len)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	unsigned char control;
+	struct strp_msg *rxm;
+	struct sk_buff *skb;
+	ssize_t copied = 0;
+	bool cmsg = false;
+	int err = 0;
+	long timeo;
+
+	flags |= nonblock;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	do {
+		bool zc = false;
+		int chunk = 0;
+
+		skb = tls_wait_data(sk, flags, timeo, &err);
+		if (!skb)
+			goto recv_end;
+
+		rxm = strp_msg(skb);
+		if (!cmsg) {
+			int cerr;
+
+			cerr = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE,
+					sizeof(ctx->control), &ctx->control);
+			cmsg = true;
+			control = ctx->control;
+			if (ctx->control != TLS_RECORD_TYPE_DATA) {
+				if (cerr || msg->msg_flags & MSG_CTRUNC) {
+					err = -EIO;
+					goto recv_end;
+				}
+			}
+		} else if (control != ctx->control) {
+			goto recv_end;
+		}
+
+		if (!ctx->decrypted) {
+			int page_count;
+			int to_copy;
+
+			page_count = iov_iter_npages(&msg->msg_iter,
+						     MAX_SKB_FRAGS);
+			to_copy = rxm->full_len - tls_ctx->rx.overhead_size;
+			if (to_copy <= len && page_count < MAX_SKB_FRAGS &&
+			    likely(!(flags & MSG_PEEK)))  {
+				struct scatterlist sgin[MAX_SKB_FRAGS + 1];
+				char unused[21];
+				int pages = 0;
+
+				zc = true;
+				sg_init_table(sgin, MAX_SKB_FRAGS + 1);
+				sg_set_buf(&sgin[0], unused, 13);
+
+				err = zerocopy_from_iter(sk, &msg->msg_iter,
+							 to_copy, &pages,
+							 &chunk, &sgin[1],
+							 MAX_SKB_FRAGS,	false);
+				if (err < 0)
+					goto fallback_to_reg_recv;
+
+				err = decrypt_skb(sk, skb, sgin);
+				for (; pages > 0; pages--)
+					put_page(sg_page(&sgin[pages]));
+				if (err < 0) {
+					tls_err_abort(sk, EBADMSG);
+					goto recv_end;
+				}
+			} else {
+fallback_to_reg_recv:
+				err = decrypt_skb(sk, skb, NULL);
+				if (err < 0) {
+					tls_err_abort(sk, EBADMSG);
+					goto recv_end;
+				}
+			}
+			ctx->decrypted = true;
+		}
+
+		if (!zc) {
+			chunk = min_t(unsigned int, rxm->full_len, len);
+			err = skb_copy_datagram_msg(skb, rxm->offset, msg,
+						    chunk);
+			if (err < 0)
+				goto recv_end;
+		}
+
+		copied += chunk;
+		len -= chunk;
+		if (likely(!(flags & MSG_PEEK))) {
+			u8 control = ctx->control;
+
+			if (tls_sw_advance_skb(sk, skb, chunk)) {
+				/* Return full control message to
+				 * userspace before trying to parse
+				 * another message type
+				 */
+				msg->msg_flags |= MSG_EOR;
+				if (control != TLS_RECORD_TYPE_DATA)
+					goto recv_end;
+			}
+		}
+	} while (len);
+
+recv_end:
+	release_sock(sk);
+	return copied ? : err;
+}
+
+ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
+			   struct pipe_inode_info *pipe,
+			   size_t len, unsigned int flags)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sock->sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct strp_msg *rxm = NULL;
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	ssize_t copied = 0;
+	int err = 0;
+	long timeo;
+	int chunk;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	skb = tls_wait_data(sk, flags, timeo, &err);
+	if (!skb)
+		goto splice_read_end;
+
+	/* splice does not support reading control messages */
+	if (ctx->control != TLS_RECORD_TYPE_DATA) {
+		err = -ENOTSUPP;
+		goto splice_read_end;
+	}
+
+	if (!ctx->decrypted) {
+		err = decrypt_skb(sk, skb, NULL);
+
+		if (err < 0) {
+			tls_err_abort(sk, EBADMSG);
+			goto splice_read_end;
+		}
+		ctx->decrypted = true;
+	}
+	rxm = strp_msg(skb);
+
+	chunk = min_t(unsigned int, rxm->full_len, len);
+	copied = skb_splice_bits(skb, sk, rxm->offset, pipe, chunk, flags);
+	if (copied < 0)
+		goto splice_read_end;
+
+	if (likely(!(flags & MSG_PEEK)))
+		tls_sw_advance_skb(sk, skb, copied);
+
+splice_read_end:
+	release_sock(sk);
+	return copied ? : err;
+}
+
+unsigned int tls_sw_poll(struct file *file, struct socket *sock,
+			 struct poll_table_struct *wait)
+{
+	unsigned int ret;
+	struct sock *sk = sock->sk;
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+
+	/* Grab POLLOUT and POLLHUP from the underlying socket */
+	ret = ctx->sk_poll(file, sock, wait);
+
+	/* Clear POLLIN bits, and set based on recv_pkt */
+	ret &= ~(POLLIN | POLLRDNORM);
+	if (ctx->recv_pkt)
+		ret |= POLLIN | POLLRDNORM;
+
+	return ret;
+}
+
+static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	char header[tls_ctx->rx.prepend_size];
+	struct strp_msg *rxm = strp_msg(skb);
+	size_t cipher_overhead;
+	size_t data_len = 0;
+	int ret;
+
+	/* Verify that we have a full TLS header, or wait for more data */
+	if (rxm->offset + tls_ctx->rx.prepend_size > skb->len)
+		return 0;
+
+	/* Linearize header to local buffer */
+	ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size);
+
+	if (ret < 0)
+		goto read_failure;
+
+	ctx->control = header[0];
+
+	data_len = ((header[4] & 0xFF) | (header[3] << 8));
+
+	cipher_overhead = tls_ctx->rx.tag_size + tls_ctx->rx.iv_size;
+
+	if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead) {
+		ret = -EMSGSIZE;
+		goto read_failure;
+	}
+	if (data_len < cipher_overhead) {
+		ret = -EBADMSG;
+		goto read_failure;
+	}
+
+	if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) ||
+	    header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) {
+		ret = -EINVAL;
+		goto read_failure;
+	}
+
+	return data_len + TLS_HEADER_SIZE;
+
+read_failure:
+	tls_err_abort(strp->sk, ret);
+
+	return ret;
+}
+
+static void tls_queue(struct strparser *strp, struct sk_buff *skb)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct strp_msg *rxm;
+
+	rxm = strp_msg(skb);
+
+	ctx->decrypted = false;
+
+	ctx->recv_pkt = skb;
+	strp_pause(strp);
+
+	strp->sk->sk_state_change(strp->sk);
+}
+
+static void tls_data_ready(struct sock *sk)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+
+	strp_data_ready(&ctx->strp);
+}
+
+void tls_sw_free_resources(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
 
 	if (ctx->aead_send)
 		crypto_free_aead(ctx->aead_send);
+	if (ctx->aead_recv) {
+		if (ctx->recv_pkt) {
+			kfree_skb(ctx->recv_pkt);
+			ctx->recv_pkt = NULL;
+		}
+		crypto_free_aead(ctx->aead_recv);
+		strp_stop(&ctx->strp);
+		write_lock_bh(&sk->sk_callback_lock);
+		sk->sk_data_ready = ctx->saved_data_ready;
+		write_unlock_bh(&sk->sk_callback_lock);
+		release_sock(sk);
+		strp_done(&ctx->strp);
+		lock_sock(sk);
+	}
 
 	tls_free_both_sg(sk);
 
@@ -595,12 +1035,15 @@ void tls_sw_free_tx_resources(struct sock *sk)
 	kfree(tls_ctx);
 }
 
-int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
+int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 {
 	char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
 	struct tls_crypto_info *crypto_info;
 	struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
 	struct tls_sw_context *sw_ctx;
+	struct cipher_context *cctx;
+	struct crypto_aead **aead;
+	struct strp_callbacks cb;
 	u16 nonce_size, tag_size, iv_size, rec_seq_size;
 	char *iv, *rec_seq;
 	int rc = 0;
@@ -610,22 +1053,29 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
 		goto out;
 	}
 
-	if (ctx->priv_ctx) {
-		rc = -EEXIST;
-		goto out;
-	}
-
-	sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL);
-	if (!sw_ctx) {
-		rc = -ENOMEM;
-		goto out;
+	if (!ctx->priv_ctx) {
+		sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL);
+		if (!sw_ctx) {
+			rc = -ENOMEM;
+			goto out;
+		}
+		crypto_init_wait(&sw_ctx->async_wait);
+	} else {
+		sw_ctx = ctx->priv_ctx;
 	}
 
-	crypto_init_wait(&sw_ctx->async_wait);
-
 	ctx->priv_ctx = (struct tls_offload_context *)sw_ctx;
 
-	crypto_info = &ctx->crypto_send;
+	if (tx) {
+		crypto_info = &ctx->crypto_send;
+		cctx = &ctx->tx;
+		aead = &sw_ctx->aead_send;
+	} else {
+		crypto_info = &ctx->crypto_recv;
+		cctx = &ctx->rx;
+		aead = &sw_ctx->aead_recv;
+	}
+
 	switch (crypto_info->cipher_type) {
 	case TLS_CIPHER_AES_GCM_128: {
 		nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
@@ -644,48 +1094,49 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
 		goto free_priv;
 	}
 
-	ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size;
-	ctx->tx.tag_size = tag_size;
-	ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size;
-	ctx->tx.iv_size = iv_size;
-	ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
-			     GFP_KERNEL);
-	if (!ctx->tx.iv) {
+	cctx->prepend_size = TLS_HEADER_SIZE + nonce_size;
+	cctx->tag_size = tag_size;
+	cctx->overhead_size = cctx->prepend_size + cctx->tag_size;
+	cctx->iv_size = iv_size;
+	cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+			   GFP_KERNEL);
+	if (!cctx->iv) {
 		rc = -ENOMEM;
 		goto free_priv;
 	}
-	memcpy(ctx->tx.iv, gcm_128_info->salt,
-	       TLS_CIPHER_AES_GCM_128_SALT_SIZE);
-	memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
-	ctx->tx.rec_seq_size = rec_seq_size;
-	ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
-	if (!ctx->tx.rec_seq) {
+	memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
+	cctx->rec_seq_size = rec_seq_size;
+	cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+	if (!cctx->rec_seq) {
 		rc = -ENOMEM;
 		goto free_iv;
 	}
-	memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size);
-
-	sg_init_table(sw_ctx->sg_encrypted_data,
-		      ARRAY_SIZE(sw_ctx->sg_encrypted_data));
-	sg_init_table(sw_ctx->sg_plaintext_data,
-		      ARRAY_SIZE(sw_ctx->sg_plaintext_data));
-
-	sg_init_table(sw_ctx->sg_aead_in, 2);
-	sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space,
-		   sizeof(sw_ctx->aad_space));
-	sg_unmark_end(&sw_ctx->sg_aead_in[1]);
-	sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data);
-	sg_init_table(sw_ctx->sg_aead_out, 2);
-	sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space,
-		   sizeof(sw_ctx->aad_space));
-	sg_unmark_end(&sw_ctx->sg_aead_out[1]);
-	sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data);
-
-	if (!sw_ctx->aead_send) {
-		sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0);
-		if (IS_ERR(sw_ctx->aead_send)) {
-			rc = PTR_ERR(sw_ctx->aead_send);
-			sw_ctx->aead_send = NULL;
+	memcpy(cctx->rec_seq, rec_seq, rec_seq_size);
+
+	if (tx) {
+		sg_init_table(sw_ctx->sg_encrypted_data,
+			      ARRAY_SIZE(sw_ctx->sg_encrypted_data));
+		sg_init_table(sw_ctx->sg_plaintext_data,
+			      ARRAY_SIZE(sw_ctx->sg_plaintext_data));
+
+		sg_init_table(sw_ctx->sg_aead_in, 2);
+		sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space,
+			   sizeof(sw_ctx->aad_space));
+		sg_unmark_end(&sw_ctx->sg_aead_in[1]);
+		sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data);
+		sg_init_table(sw_ctx->sg_aead_out, 2);
+		sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space,
+			   sizeof(sw_ctx->aad_space));
+		sg_unmark_end(&sw_ctx->sg_aead_out[1]);
+		sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data);
+	}
+
+	if (!*aead) {
+		*aead = crypto_alloc_aead("gcm(aes)", 0, 0);
+		if (IS_ERR(*aead)) {
+			rc = PTR_ERR(*aead);
+			*aead = NULL;
 			goto free_rec_seq;
 		}
 	}
@@ -694,21 +1145,41 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
 
 	memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
 
-	rc = crypto_aead_setkey(sw_ctx->aead_send, keyval,
+	rc = crypto_aead_setkey(*aead, keyval,
 				TLS_CIPHER_AES_GCM_128_KEY_SIZE);
 	if (rc)
 		goto free_aead;
 
-	rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tx.tag_size);
-	if (!rc)
-		return 0;
+	rc = crypto_aead_setauthsize(*aead, cctx->tag_size);
+	if (rc)
+		goto free_aead;
+
+	if (!tx) {
+		/* Set up strparser */
+		memset(&cb, 0, sizeof(cb));
+		cb.rcv_msg = tls_queue;
+		cb.parse_msg = tls_read_size;
+
+		strp_init(&sw_ctx->strp, sk, &cb);
+
+		write_lock_bh(&sk->sk_callback_lock);
+		sw_ctx->saved_data_ready = sk->sk_data_ready;
+		sk->sk_data_ready = tls_data_ready;
+		write_unlock_bh(&sk->sk_callback_lock);
+
+		sw_ctx->sk_poll = sk->sk_socket->ops->poll;
+
+		strp_check_rcv(&sw_ctx->strp);
+	}
+
+	goto out;
 
 free_aead:
-	crypto_free_aead(sw_ctx->aead_send);
-	sw_ctx->aead_send = NULL;
+	crypto_free_aead(*aead);
+	*aead = NULL;
 free_rec_seq:
-	kfree(ctx->tx.rec_seq);
-	ctx->tx.rec_seq = NULL;
+	kfree(cctx->rec_seq);
+	cctx->rec_seq = NULL;
 free_iv:
 	kfree(ctx->tx.iv);
 	ctx->tx.iv = NULL;
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH net-next 00/11] fix some bugs for HNS3 driver
From: David Miller @ 2018-03-22 17:12 UTC (permalink / raw)
  To: lipeng321; +Cc: netdev, linux-kernel, linuxarm, salil.mehta
In-Reply-To: <1521618570-129694-1-git-send-email-lipeng321@huawei.com>

From: Peng Li <lipeng321@huawei.com>
Date: Wed, 21 Mar 2018 15:49:19 +0800

> This patchset fixes some bugs for HNS3 driver:
> [Patch 1/11 - 5/11] fix various bugs reported by hisilicon test team.
> [Patch 6/11 - 7/11] fix bugs about interrupt coalescing self-adaptive
> function.
> [Patch 8/11 - 11/11] fix bugs about ethtool_ops.get_link_ksettings.

Series applied, thank you.

^ permalink raw reply

* Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
From: Linus Torvalds @ 2018-03-22 17:16 UTC (permalink / raw)
  To: David Laight
  Cc: Alexander Duyck, Rahul Lakkireddy, Thomas Gleixner,
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org, Ganesh GR,
	Nirranjan Kirubaharan, Indranil Choudhury
In-Reply-To: <035ac754e54a4b14844f3300ae1432a9@AcuMS.aculab.com>

On Thu, Mar 22, 2018 at 3:48 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Linus Torvalds
>>
>> Note that we definitely have seen hardware that *depends* on the
>> regular memcpy_fromio()" not doing big reads. I don't know how
>> hardware people screw it up, but it's clearly possible.

[ That should have been "big writes" ]

> I wonder if that hardware works with the current kernel on recent cpus?
> I bet it doesn't like the byte accesses that get generated either.

The one case we knew about we just fixed to use the special "16-bit
word at a time" scr_memcpyw().

If I recall correctly, it was some "enterprise graphics console".

If it's something I've found over the years, is that "enterprise"
hardware is absolutely the dregs. It's the low-volume stuff that
almost nobody uses and where the customer is used to the whole notion
of boutique hardware, so they're used to having to do special things
for special hardware.

And I very much use the word "special" in the "my mom told me I was
special" sense. It's not the *good* kind of "special". It's the "short
bus" kind of special.

> For x86 being able to request a copy done as 'rep movsx' (for each x)
> would be useful.

The portability issues are horrendous. Particularly if the memory area
(source or dest) might be unaligned.

The rule of thumb should be: don't use PIO, and if you *do* use PIO,
don't be picky about what you get.

And most *definitely* don't complain about performance to software
people. Blame the hardware people. Get a competent piece of hardware,
or a competent hardware designer.

Let's face it, PIO is stupid. Use it for command and status words. Not
for data transfer.

                 Linus

^ permalink raw reply

* Re: [PATCH net-next 0/2] mlxsw: Update supported firmware version
From: David Miller @ 2018-03-22 17:16 UTC (permalink / raw)
  To: idosch; +Cc: netdev, jiri, talb, dsahern, mlxsw
In-Reply-To: <20180321073406.23131-1-idosch@mellanox.com>

From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 21 Mar 2018 09:34:04 +0200

> The first patch bumps the firmware version supported by the driver. The
> second patch enables a feature introduced in the new version,
> auto-negotiation disable.

Series applied, thank you.

^ permalink raw reply

* Re: pull-request: mac80211 2018-03-21
From: David Miller @ 2018-03-22 17:19 UTC (permalink / raw)
  To: johannes-cdvu00un1VgdHxzADdlk8Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20180321120655.7354-1-johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>

From: Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>
Date: Wed, 21 Mar 2018 13:06:54 +0100

> Another few fixes - one for hwsim, so not really all that interesting,
> and two patches to work around an ath9k_htc problem.
> 
> Note that I pulled your net tree today, so you may need to be careful
> to not fast-forward if you don't merge anything else before this.
> 
> Please pull and let me know if there's any problem.

Pulled, thanks a lot Johannes.

^ permalink raw reply

* Re: [bpf-next V4 PATCH 11/15] page_pool: refurbish version of page_pool code
From: Alexei Starovoitov @ 2018-03-22 17:21 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, BjörnTöpel, magnus.karlsson, eugenia,
	Jason Wang, John Fastabend, Eran Ben Elisha, Saeed Mahameed, galp,
	Daniel Borkmann, Tariq Toukan
In-Reply-To: <152172852402.20979.12261180749945313455.stgit@firesoul>

On Thu, Mar 22, 2018 at 03:22:04PM +0100, Jesper Dangaard Brouer wrote:
> Need a fast page recycle mechanism for ndo_xdp_xmit API for returning
> pages on DMA-TX completion time, which have good cross CPU
> performance, given DMA-TX completion time can happen on a remote CPU.
> 
> Refurbish my page_pool code, that was presented[1] at MM-summit 2016.
> Adapted page_pool code to not depend the page allocator and
> integration into struct page.  The DMA mapping feature is kept,
> even-though it will not be activated/used in this patchset.
> 
> [1] http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf
> 
> V2: Adjustments requested by Tariq
>  - Changed page_pool_create return codes, don't return NULL, only
>    ERR_PTR, as this simplifies err handling in drivers.
> 
> V4: many small improvements and cleanups
> - Add DOC comment section, that can be used by kernel-doc
> - Improve fallback mode, to work better with refcnt based recycling
>   e.g. remove a WARN as pointed out by Tariq
>   e.g. quicker fallback if ptr_ring is empty.
> 
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
...
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> new file mode 100644
> index 000000000000..1ff11e641b2e
> --- /dev/null
> +++ b/include/net/page_pool.h
> @@ -0,0 +1,133 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
...
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> new file mode 100644
> index 000000000000..04112feb2df6
> --- /dev/null
> +++ b/net/core/page_pool.c
> @@ -0,0 +1,329 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note

These are kernel internal files. 'WITH ...' doesn't apply.
See LICENSES/exceptions/Linux-syscall-note

^ permalink raw reply

* Re: [PATCH 06/28] aio: implement IOCB_CMD_POLL
From: Christoph Hellwig @ 2018-03-22 17:24 UTC (permalink / raw)
  To: Al Viro
  Cc: Christoph Hellwig, Avi Kivity, linux-aio, linux-fsdevel, netdev,
	linux-api, linux-kernel
In-Reply-To: <20180322165255.GI30522@ZenIV.linux.org.uk>

On Thu, Mar 22, 2018 at 04:52:55PM +0000, Al Viro wrote:
> On Wed, Mar 21, 2018 at 08:40:10AM +0100, Christoph Hellwig wrote:
> > Simple one-shot poll through the io_submit() interface.  To poll for
> > a file descriptor the application should submit an iocb of type
> > IOCB_CMD_POLL.  It will poll the fd for the events specified in the
> > the first 32 bits of the aio_buf field of the iocb.
> > 
> > Unlike poll or epoll without EPOLLONESHOT this interface always works
> > in one shot mode, that is once the iocb is completed, it will have to be
> > resubmitted.
> 
> AFAICS, your wakeup can race with io_cancel(), leading to double fput().
> You are checking the "somebody had committed itself to cancelling that
> thing" bit outside of ->ctx_lock on the wakeup side, and I don't see
> anything to prevent both getting to __aio_poll_complete() on the same
> iocb, with obvious results.

True.  Probably wants something like this to fix, although for this
is entirely untested:

diff --git a/fs/aio.c b/fs/aio.c
index 38b408129697..66d5cc272617 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -187,8 +187,9 @@ struct aio_kiocb {
 						 * for cancellation */
 
 	unsigned int		flags;		/* protected by ctx->ctx_lock */
-#define AIO_IOCB_DELAYED_CANCEL	(1 << 0)
-#define AIO_IOCB_CANCELLED	(1 << 1)
+#define AIO_IOCB_CAN_CANCEL	(1 << 0)
+#define AIO_IOCB_DELAYED_CANCEL	(1 << 1)
+#define AIO_IOCB_CANCELLED	(1 << 2)
 
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
@@ -568,7 +569,7 @@ static void __kiocb_set_cancel_fn(struct aio_kiocb *req,
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
 	list_add_tail(&req->ki_list, &ctx->active_reqs);
 	req->ki_cancel = cancel;
-	req->flags |= iocb_flags;
+	req->flags |= (AIO_IOCB_CAN_CANCEL | iocb_flags);
 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 }
 
@@ -1086,22 +1087,30 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	return ret;
 }
 
+#define AIO_COMPLETE_CANCEL	(1 << 0)
+
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  */
-static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
+static bool aio_complete(struct aio_kiocb *iocb, long res, long res2,
+		unsigned complete_flags)
 {
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
 	unsigned tail, pos, head;
-	unsigned long	flags;
-
-	if (!list_empty_careful(iocb->ki_list.next)) {
-		unsigned long flags;
+	unsigned long flags;
 
+	if (iocb->flags & AIO_IOCB_CAN_CANCEL) {
 		spin_lock_irqsave(&ctx->ctx_lock, flags);
-		list_del(&iocb->ki_list);
+		if (!(complete_flags & AIO_COMPLETE_CANCEL) &&
+		    (iocb->flags & AIO_IOCB_CANCELLED)) {
+			spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+			return false;
+		}
+
+		if (!list_empty(&iocb->ki_list))
+			list_del(&iocb->ki_list);
 		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 	}
 
@@ -1177,6 +1186,7 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 		wake_up(&ctx->wait);
 
 	percpu_ref_put(&ctx->reqs);
+	return true;
 }
 
 /* aio_read_events_ring
@@ -1425,6 +1435,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
+	struct file *file = kiocb->ki_filp;
 
 	WARN_ON_ONCE(is_sync_kiocb(kiocb));
 
@@ -1440,8 +1451,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
 		file_end_write(kiocb->ki_filp);
 	}
 
-	fput(kiocb->ki_filp);
-	aio_complete(iocb, res, res2);
+	if (aio_complete(iocb, res, res2, 0))
+		fput(file);
 }
 
 static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
@@ -1584,11 +1595,13 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
 static void aio_fsync_work(struct work_struct *work)
 {
 	struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
+	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, fsync);
+	struct file *file = req->file;
 	int ret;
 
 	ret = vfs_fsync(req->file, req->datasync);
-	fput(req->file);
-	aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+	if (aio_complete(iocb, ret, 0, 0))
+		fput(file);
 }
 
 static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
@@ -1617,27 +1630,23 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
 	return ret;
 }
 
-static void __aio_complete_poll(struct poll_iocb *req, __poll_t mask)
-{
-	fput(req->file);
-	aio_complete(container_of(req, struct aio_kiocb, poll),
-			mangle_poll(mask), 0);
-}
-
 static void aio_complete_poll(struct poll_iocb *req, __poll_t mask)
 {
 	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+	struct file *file = req->file;
 
-	if (!(iocb->flags & AIO_IOCB_CANCELLED))
-		__aio_complete_poll(req, mask);
+	if (aio_complete(iocb, mangle_poll(mask), 0, 0))
+		fput(file);
 }
 
 static int aio_poll_cancel(struct kiocb *rw)
 {
 	struct aio_kiocb *iocb = container_of(rw, struct aio_kiocb, rw);
+	struct file *file = iocb->poll.file;
 
 	remove_wait_queue(iocb->poll.head, &iocb->poll.wait);
-	__aio_complete_poll(&iocb->poll, 0); /* no events to report */
+	if (aio_complete(iocb, 0, 0, AIO_COMPLETE_CANCEL))
+		fput(file);
 	return 0;
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* Re: [PATCH v2 6/8] page_frag_cache: Use a mask instead of offset
From: Alexander Duyck @ 2018-03-22 17:31 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Matthew Wilcox, Netdev, linux-mm, Jesper Dangaard Brouer,
	Eric Dumazet
In-Reply-To: <20180322164157.GE28468@bombadil.infradead.org>

On Thu, Mar 22, 2018 at 9:41 AM, Matthew Wilcox <willy@infradead.org> wrote:
> On Thu, Mar 22, 2018 at 09:22:31AM -0700, Alexander Duyck wrote:
>> On Thu, Mar 22, 2018 at 8:31 AM, Matthew Wilcox <willy@infradead.org> wrote:
>> > By combining 'va' and 'offset' into 'addr' and using a mask instead,
>> > we can save a compare-and-branch in the fast-path of the allocator.
>> > This removes 4 instructions on x86 (both 32 and 64 bit).
>>
>> What is the point of renaming "va"? I'm seeing a lot of unneeded
>> renaming in these patches that doesn't really seem needed and is just
>> making things harder to review.
>
> By renaming 'va', I made sure that I saw everywhere that 'va' was touched,
> and reviewed it to be sure it was still correct with the new meaning.
> The advantage of keeping that in the patch submission, rather than
> renaming it back again, is that you can see everywhere that it's been
> touched and verify that for yourself.

Okay, I guess that makes some sense. I was just mentally lumping it in
with the fragsz -> size and nc -> pfc changes.

>> > We can avoid storing the mask at all if we know that we're only allocating
>> > a single page.  This shrinks page_frag_cache from 12 to 8 bytes on 32-bit
>> > CONFIG_BASE_SMALL build.
>> >
>> > Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
>>
>> So I am not really a fan of CONFIG_BASE_SMALL in general, so
>> advertising gains in size is just going back down the reducing size at
>> the expense of performance train of thought.
>
> There's no tradeoff for performance *in this patch* with
> CONFIG_BASE_SMALL.  Indeed, being able to assume that the cache contains a
> single PAGE_SIZE page reduces the number of instructions by two on x86-64
> (and is neutral on x86-32).  IIRC it saves a register, so there's one fewer
> 'push' at the beginning of the function and one fewer 'pop' at the end.

The issue is that you are now going to have to perform 8x as many
allocations and take the slow path that many more times.

> I think the more compelling argument for conditioning the number of pages
> allocated on CONFIG_BASE_SMALL is that a machine which needs to shrink
> its data structures so badly isn't going to have 32k of memory available,
> nor want to spend it on a networking allocation.  Eric's commit which
> introduced NETDEV_FRAG_PAGE_MAX_ORDER back in 2012 (69b08f62e174) didn't
> mention small machines as a consideration.

The problem is that is assuming that something is doing small enough
allocations that there is an advantage to using 4K. In the case of
this API I'm not certain that is the case. More often then not this is
used when allocating an skb in the Rx path. The typical Rx skb size is
headroom + 1514 + skb_shared_info. If you take a look that is
dangerously close to 2K. With your change you now get 2 allocations
per page instead of the 16 that was seen with a 32K page. If you have
a device that cannot control the Rx along that boundary things get
worse since you are looking at something like headroom + 2K +
skb_shared_info. In such a case there wouldn't be any point using the
API anymore since you might as well just use the page allocator.

>> Do we know for certain that a higher order page is always aligned to
>> the size of the higher order page itself? That is one thing I have
>> never been certain about. I know for example there are head and tail
>> pages so I was never certain if it was possible to create a higher
>> order page that is not aligned to to the size of the page itself.
>
> It's intrinsic to the buddy allocator that pages are naturally aligned
> to their order.  There's a lot of code in the kernel which relies on
> it, including much of the mm (particularly THP).  I suppose one could
> construct a non-aligned compound page, but it'd be really weird, and you'd
> have to split it up manually before handing it back to the page allocator.
> I don't see this ever changing.
>
>> >  struct page_frag_cache {
>> > -       void * va;
>> > +       void *addr;
>> >  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> > -       __u16 offset;
>> > -       __u16 size;
>> > -#else
>> > -       __u32 offset;
>> > +       unsigned int mask;
>>
>> So this is just an akward layout. You now have essentially:
>> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> #else
>>     unsigned int mask;
>> #endif
>
> Huh?  There's a '-' in front of the '#else'.  It looks like this:

Yeah, I might need to increase the font size on my email client. The
"-" had blended into the "#".

>
> struct page_frag_cache {
>         void *addr;
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>         unsigned int mask;

We might need to check here and see if this needs to be an "unsigned
int" or if we can get away with an "unsigned short". If the maximum
page size supported for most architectures is 64K or less. For those
cases we could just use an unsigned short. There are a rare few that
are larger where this would be forced into an "unsigned int".

> #endif
>         /* we maintain a pagecount bias, so that we dont dirty cache line
>          * containing page->_refcount every time we allocate a fragment.
>          */
>         unsigned int            pagecnt_bias;

We could probably look at doing something similar for pagecnt_bias.
For real use cases we currently force this to be aligned to something
like the L1 cache bytes. That usually reduces the number of actual
uses for this. If we put a limitation where the fragsz has to be
aligned to some value we could use that to limit this so the upper
limit for pagecnt_bias would be PAGE_FRAG_CACHE_MAX_SIZE / (required
alignment size).

> };
>
>> > @@ -4364,27 +4361,24 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *pfc,
>> >                                 PAGE_FRAG_CACHE_MAX_ORDER);PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>> >         if (page)
>> >                 size = PAGE_FRAG_CACHE_MAX_SIZE;
>> > -       pfc->size = size;
>> > +       pfc->mask = size - 1;
>> >  #endif
>> >         if (unlikely(!page))
>> >                 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
>> >         if (!page) {
>> > -               pfc->va = NULL;
>> > +               pfc->addr = NULL;
>> >                 return NULL;
>> >         }
>> >
>> > -       pfc->va = page_address(page);
>> > -
>> >         /* Using atomic_set() would break get_page_unless_zero() users. */
>> >         page_ref_add(page, size - 1);
>>
>> You could just use the pfc->mask here instead of size - 1 just to
>> avoid having to do the subtraction more than once assuming the
>> compiler doesn't optimize it.
>
> Either way I'm assuming a compiler optimisation -- that it won't reload
> from memory, or that it'll remember the subtraction.  I don't much care
> which, and I'll happily use the page_frag_cache_mask() if that reads better
> for you.

If the compiler is doing it then you are probably fine.

Thanks.

- Alex

^ permalink raw reply

* Re: [PATCH v2 6/8] page_frag_cache: Use a mask instead of offset
From: Matthew Wilcox @ 2018-03-22 17:34 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Matthew Wilcox, Netdev, linux-mm, Jesper Dangaard Brouer,
	Eric Dumazet
In-Reply-To: <20180322164157.GE28468@bombadil.infradead.org>

On Thu, Mar 22, 2018 at 09:41:57AM -0700, Matthew Wilcox wrote:
> On Thu, Mar 22, 2018 at 09:22:31AM -0700, Alexander Duyck wrote:
> > You could just use the pfc->mask here instead of size - 1 just to
> > avoid having to do the subtraction more than once assuming the
> > compiler doesn't optimize it.
> 
> Either way I'm assuming a compiler optimisation -- that it won't reload
> from memory, or that it'll remember the subtraction.  I don't much care
> which, and I'll happily use the page_frag_cache_mask() if that reads better
> for you.

Looks like it does reload from memory if I make that change.  Before:

    37e7:       c7 43 08 ff 7f 00 00    movl   $0x7fff,0x8(%rbx)
    37ee:       b9 00 80 00 00          mov    $0x8000,%ecx
    37f3:       be ff 7f 00 00          mov    $0x7fff,%esi
    37f8:       ba 00 80 00 00          mov    $0x8000,%edx
...
    380b:       01 70 1c                add    %esi,0x1c(%rax)

After:

    37e7:       c7 43 08 ff 7f 00 00    movl   $0x7fff,0x8(%rbx)
    37ee:       b9 00 80 00 00          mov    $0x8000,%ecx
    37f3:       ba 00 80 00 00          mov    $0x8000,%edx
...
    3806:       8b 73 08                mov    0x8(%rbx),%esi
    3809:       01 70 1c                add    %esi,0x1c(%rax)

Of course, it's shorter because it's fewer bytes to reload from memory
than it is to put a 32-bit immediate in the instruction stream, but
it's one additional memory reference (cache-hot, of course).  I don't
really care because it's the cold path.

^ permalink raw reply

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
From: Alexei Starovoitov @ 2018-03-22 17:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Thomas Gleixner, David Laight, Rahul Lakkireddy,
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric 
In-Reply-To: <20180322093343.aatl3prhheha4dlm@gmail.com>

On Thu, Mar 22, 2018 at 10:33:43AM +0100, Ingo Molnar wrote:
> 
>  - I think the BPF JIT, whose byte code machine languge is used by an
>    increasing number of kernel subsystems, could benefit from having vector ops.
>    It would possibly allow the handling of floating point types.

this is on our todo list already.
To process certain traffic inside BPF in XDP we'd like to have access to
floating point. The current workaround is to precompute the math and do
bpf map lookup instead.
Since XDP processing of packets is batched (typically up to napi budget
of 64 packets at a time), we can, in theory, wrap the loop with
kernel_fpu_begin/end and it will be cleaner and faster,
but the work hasn't started yet.
The microbenchmark numbers you quoted for xsave/xrestore look promising,
so we probably will focus on it soon.

Another use case for vector insns is to accelerate fib/lpm lookups
which is likely beneficial for kernel overall regardless of bpf usage.

^ permalink raw reply

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
From: Andy Lutomirski @ 2018-03-22 17:44 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Linus Torvalds, Thomas Gleixner, David Laight,
	Rahul Lakkireddy, x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua 
In-Reply-To: <20180322174024.tuillc5ojh4oadf4@ast-mbp.dhcp.thefacebook.com>

On Thu, Mar 22, 2018 at 5:40 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Thu, Mar 22, 2018 at 10:33:43AM +0100, Ingo Molnar wrote:
>>
>>  - I think the BPF JIT, whose byte code machine languge is used by an
>>    increasing number of kernel subsystems, could benefit from having vector ops.
>>    It would possibly allow the handling of floating point types.
>
> this is on our todo list already.
> To process certain traffic inside BPF in XDP we'd like to have access to
> floating point. The current workaround is to precompute the math and do
> bpf map lookup instead.
> Since XDP processing of packets is batched (typically up to napi budget
> of 64 packets at a time), we can, in theory, wrap the loop with
> kernel_fpu_begin/end and it will be cleaner and faster,
> but the work hasn't started yet.
> The microbenchmark numbers you quoted for xsave/xrestore look promising,
> so we probably will focus on it soon.
>
> Another use case for vector insns is to accelerate fib/lpm lookups
> which is likely beneficial for kernel overall regardless of bpf usage.
>

This is a further argument for the deferred restore approach IMO.
With deferred restore, kernel_fpu_begin() + kernel_fpu_end() has very
low amortized cost as long as we do lots of work in the kernel before
re-entering userspace.  For XDP workloads, this could be a pretty big
win, I imagine.

Someone just needs to do the nasty x86 work.

^ permalink raw reply

* Re: [patch net-next RFC 00/12] devlink: introduce port flavours and common phys_port_name generation
From: Jiri Pirko @ 2018-03-22 17:49 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, davem, idosch, jakub.kicinski, mlxsw, andrew,
	vivien.didelot, f.fainelli, michael.chan, ganeshgr, saeedm,
	simon.horman, pieter.jansenvanvuuren, john.hurley,
	dirk.vandermerwe, alexander.h.duyck, ogerlitz, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <7217d1fb-665f-92cf-2704-364b91cb8248@gmail.com>

Thu, Mar 22, 2018 at 04:34:07PM CET, dsahern@gmail.com wrote:
>On 3/22/18 4:55 AM, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>> 
>> This patchset resolves 2 issues we have right now:
>> 1) There are many netdevices / ports in the system, for port, pf, vf
>>    represenatation but the user has no way to see which is which
>> 2) The ndo_get_phys_port_name is implemented in each driver separatelly,
>>    which may lead to inconsistent names between drivers.
>
>Similar to ndo_get_phys_port_{name,id}, devlink requires drivers to opt
>in with an implementation right, so you can't really force a solution to
>the consistent naming.

Yeah, drivers would still have free choice to implemen the ndo
themselves. But most of them, like all sriov switch drivers should use
the devlink helper to have consistent naming. In other words, devlink
helper should be the standard way, in weird cases (like rocker), driver
implements it himself.


>
>> 
>> This patchset introduces port flavours which should address the first
>> problem. I'm testing this with Netronome nfp hardware. When the user
>> has 2 physical ports, 1 pf, and 4 vfs, he should see something like this:
>> # devlink port
>> pci/0000:05:00.0/0: type eth netdev enp5s0np0 flavour physical number 0
>> pci/0000:05:00.0/268435456: type eth netdev eth0 flavour physical number 0
>> pci/0000:05:00.0/268435460: type eth netdev enp5s0np1 flavour physical number 1
>> pci/0000:05:00.0/536875008: type eth netdev eth2 flavour pf_rep number 536875008
>> pci/0000:05:00.0/536870912: type eth netdev eth1 flavour vf_rep number 0
>> pci/0000:05:00.0/536870976: type eth netdev eth3 flavour vf_rep number 1
>> pci/0000:05:00.0/536871040: type eth netdev eth4 flavour vf_rep number 2
>> pci/0000:05:00.0/536871104: type eth netdev eth5 flavour vf_rep number 3
>
>How about 'kind' instead of flavo{u}r?

Yeah, kind is often used in kernel already with different meaning
git grep kind net/core
I wanted to avoid confusions

>

^ permalink raw reply

* [PATCH net 0/3] mlxsw: Handle changes to MTU in GRE tunnels
From: Ido Schimmel @ 2018-03-22 17:53 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, lucien.xin, mlxsw, Ido Schimmel

Petr says:

When offloading GRE tunnels, the MTU setting is kept fixed after the
initial offload even as the slow-path configuration changed. Worse: the
offloaded MTU setting is actually just a transient value set at the time
of NETDEV_REGISTER of the tunnel. As of commit ffc2b6ee4174 ("ip_gre:
fix IFLA_MTU ignored on NEWLINK"), that transient value is zero, and
unless there's e.g. a VRF migration that prompts re-offload, it stays at
zero, and all GRE packets end up trapping.

Thus, in patch #1, change the way the MTU is changed post-registration,
so that the full event protocol is observed. That way the drivers get to
see the change and have a chance to react.

In the remaining two patches, implement support for MTU change in mlxsw
driver.

Petr Machata (3):
  ip_tunnel: Emit events for post-register MTU changes
  mlxsw: spectrum_router: Move mlxsw_sp_rif_ipip_lb_op()
  mlxsw: spectrum_router: Handle MTU change of GRE netdevs

 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 78 ++++++++++++++--------
 net/ipv4/ip_tunnel.c                               | 26 ++++++--
 2 files changed, 72 insertions(+), 32 deletions(-)

-- 
2.14.3

^ permalink raw reply

* [PATCH net 1/3] ip_tunnel: Emit events for post-register MTU changes
From: Ido Schimmel @ 2018-03-22 17:53 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, lucien.xin, mlxsw, Petr Machata, Ido Schimmel
In-Reply-To: <20180322175335.26232-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

For tunnels created with IFLA_MTU, MTU of the netdevice is set by
rtnl_create_link() (called from rtnl_newlink()) before the device is
registered. However without IFLA_MTU that's not done.

rtnl_newlink() proceeds by calling struct rtnl_link_ops.newlink, which
via ip_tunnel_newlink() calls register_netdevice(), and that emits
NETDEV_REGISTER. Thus any listeners that inspect the netdevice get the
MTU of 0.

After ip_tunnel_newlink() corrects the MTU after registering the
netdevice, but since there's no event, the listeners don't get to know
about the MTU until something else happens--such as a NETDEV_UP event.
That's not ideal.

So instead of setting the MTU directly, go through dev_set_mtu(), which
takes care of distributing the necessary NETDEV_PRECHANGEMTU and
NETDEV_CHANGEMTU events.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 net/ipv4/ip_tunnel.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 6d21068f9b55..7b85ffad5d74 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -362,13 +362,18 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 	struct ip_tunnel *nt;
 	struct net_device *dev;
 	int t_hlen;
+	int mtu;
+	int err;
 
 	BUG_ON(!itn->fb_tunnel_dev);
 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 	if (IS_ERR(dev))
 		return ERR_CAST(dev);
 
-	dev->mtu = ip_tunnel_bind_dev(dev);
+	mtu = ip_tunnel_bind_dev(dev);
+	err = dev_set_mtu(dev, mtu);
+	if (err)
+		goto err_dev_set_mtu;
 
 	nt = netdev_priv(dev);
 	t_hlen = nt->hlen + sizeof(struct iphdr);
@@ -376,6 +381,10 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
 	ip_tunnel_add(itn, nt);
 	return nt;
+
+err_dev_set_mtu:
+	unregister_netdevice(dev);
+	return ERR_PTR(err);
 }
 
 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
@@ -1102,17 +1111,24 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 	nt->fwmark = fwmark;
 	err = register_netdevice(dev);
 	if (err)
-		goto out;
+		goto err_register_netdevice;
 
 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
 		eth_hw_addr_random(dev);
 
 	mtu = ip_tunnel_bind_dev(dev);
-	if (!tb[IFLA_MTU])
-		dev->mtu = mtu;
+	if (!tb[IFLA_MTU]) {
+		err = dev_set_mtu(dev, mtu);
+		if (err)
+			goto err_dev_set_mtu;
+	}
 
 	ip_tunnel_add(itn, nt);
-out:
+	return 0;
+
+err_dev_set_mtu:
+	unregister_netdevice(dev);
+err_register_netdevice:
 	return err;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
-- 
2.14.3

^ permalink raw reply related

* [PATCH net 2/3] mlxsw: spectrum_router: Move mlxsw_sp_rif_ipip_lb_op()
From: Ido Schimmel @ 2018-03-22 17:53 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, lucien.xin, mlxsw, Petr Machata, Ido Schimmel
In-Reply-To: <20180322175335.26232-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

Move the function so that it can be called without forward declaration
from a function that will be added in a follow-up patch.

Fixes: 0063587d3587 ("mlxsw: spectrum: Support decap-only IP-in-IP tunnels")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 54 +++++++++++-----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index f7948e983637..a7d0adf068ec 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1380,6 +1380,33 @@ mlxsw_sp_ipip_entry_ol_up_event(struct mlxsw_sp *mlxsw_sp,
 						  decap_fib_entry);
 }
 
+static int
+mlxsw_sp_rif_ipip_lb_op(struct mlxsw_sp_rif_ipip_lb *lb_rif,
+			struct mlxsw_sp_vr *ul_vr, bool enable)
+{
+	struct mlxsw_sp_rif_ipip_lb_config lb_cf = lb_rif->lb_config;
+	struct mlxsw_sp_rif *rif = &lb_rif->common;
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	char ritr_pl[MLXSW_REG_RITR_LEN];
+	u32 saddr4;
+
+	switch (lb_cf.ul_protocol) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		saddr4 = be32_to_cpu(lb_cf.saddr.addr4);
+		mlxsw_reg_ritr_pack(ritr_pl, enable, MLXSW_REG_RITR_LOOPBACK_IF,
+				    rif->rif_index, rif->vr_id, rif->dev->mtu);
+		mlxsw_reg_ritr_loopback_ipip4_pack(ritr_pl, lb_cf.lb_ipipt,
+			    MLXSW_REG_RITR_LOOPBACK_IPIP_OPTIONS_GRE_KEY_PRESET,
+			    ul_vr->id, saddr4, lb_cf.okey);
+		break;
+
+	case MLXSW_SP_L3_PROTO_IPV6:
+		return -EAFNOSUPPORT;
+	}
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+}
+
 static void mlxsw_sp_netdevice_ipip_ol_up_event(struct mlxsw_sp *mlxsw_sp,
 						struct net_device *ol_dev)
 {
@@ -6843,33 +6870,6 @@ mlxsw_sp_rif_ipip_lb_setup(struct mlxsw_sp_rif *rif,
 	rif_lb->lb_config = params_lb->lb_config;
 }
 
-static int
-mlxsw_sp_rif_ipip_lb_op(struct mlxsw_sp_rif_ipip_lb *lb_rif,
-			struct mlxsw_sp_vr *ul_vr, bool enable)
-{
-	struct mlxsw_sp_rif_ipip_lb_config lb_cf = lb_rif->lb_config;
-	struct mlxsw_sp_rif *rif = &lb_rif->common;
-	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
-	char ritr_pl[MLXSW_REG_RITR_LEN];
-	u32 saddr4;
-
-	switch (lb_cf.ul_protocol) {
-	case MLXSW_SP_L3_PROTO_IPV4:
-		saddr4 = be32_to_cpu(lb_cf.saddr.addr4);
-		mlxsw_reg_ritr_pack(ritr_pl, enable, MLXSW_REG_RITR_LOOPBACK_IF,
-				    rif->rif_index, rif->vr_id, rif->dev->mtu);
-		mlxsw_reg_ritr_loopback_ipip4_pack(ritr_pl, lb_cf.lb_ipipt,
-			    MLXSW_REG_RITR_LOOPBACK_IPIP_OPTIONS_GRE_KEY_PRESET,
-			    ul_vr->id, saddr4, lb_cf.okey);
-		break;
-
-	case MLXSW_SP_L3_PROTO_IPV6:
-		return -EAFNOSUPPORT;
-	}
-
-	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
-}
-
 static int
 mlxsw_sp_rif_ipip_lb_configure(struct mlxsw_sp_rif *rif)
 {
-- 
2.14.3

^ permalink raw reply related

* [PATCH net 3/3] mlxsw: spectrum_router: Handle MTU change of GRE netdevs
From: Ido Schimmel @ 2018-03-22 17:53 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, lucien.xin, mlxsw, Petr Machata, Ido Schimmel
In-Reply-To: <20180322175335.26232-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

Update MTU of overlay loopback in accordance with the setting on the
tunnel netdevice.

Fixes: 0063587d3587 ("mlxsw: spectrum: Support decap-only IP-in-IP tunnels")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index a7d0adf068ec..997e24dcb053 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1407,6 +1407,28 @@ mlxsw_sp_rif_ipip_lb_op(struct mlxsw_sp_rif_ipip_lb *lb_rif,
 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
 }
 
+static int mlxsw_sp_netdevice_ipip_ol_update_mtu(struct mlxsw_sp *mlxsw_sp,
+						 struct net_device *ol_dev)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+	struct mlxsw_sp_rif_ipip_lb *lb_rif;
+	struct mlxsw_sp_vr *ul_vr;
+	int err = 0;
+
+	ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev);
+	if (ipip_entry) {
+		lb_rif = ipip_entry->ol_lb;
+		ul_vr = &mlxsw_sp->router->vrs[lb_rif->ul_vr_id];
+		err = mlxsw_sp_rif_ipip_lb_op(lb_rif, ul_vr, true);
+		if (err)
+			goto out;
+		lb_rif->common.mtu = ol_dev->mtu;
+	}
+
+out:
+	return err;
+}
+
 static void mlxsw_sp_netdevice_ipip_ol_up_event(struct mlxsw_sp *mlxsw_sp,
 						struct net_device *ol_dev)
 {
@@ -1687,6 +1709,8 @@ int mlxsw_sp_netdevice_ipip_ol_event(struct mlxsw_sp *mlxsw_sp,
 		extack = info->extack;
 		return mlxsw_sp_netdevice_ipip_ol_change_event(mlxsw_sp,
 							       ol_dev, extack);
+	case NETDEV_CHANGEMTU:
+		return mlxsw_sp_netdevice_ipip_ol_update_mtu(mlxsw_sp, ol_dev);
 	}
 	return 0;
 }
-- 
2.14.3

^ permalink raw reply related

* [PATCH v3 bpf-next 00/10] bpf, tracing: introduce bpf raw tracepoints
From: Alexei Starovoitov @ 2018-03-22 18:01 UTC (permalink / raw)
  To: davem; +Cc: daniel, torvalds, peterz, rostedt, netdev, kernel-team, linux-api

From: Alexei Starovoitov <ast@kernel.org>

v2->v3:
- with Linus's suggestion introduced generic COUNT_ARGS and CONCATENATE macros
  (or rather moved them from apparmor)
  that cleaned up patches 6 and 7
- added patch 4 to refactor trace_iwlwifi_dev_ucode_error() from 17 args to 4
  Now any tracepoint with >12 args will have build error

v1->v2:
- simplified api by combing bpf_raw_tp_open(name) + bpf_attach(prog_fd) into
  bpf_raw_tp_open(name, prog_fd) as suggested by Daniel.
  That simplifies bpf_detach as well which is now simple close() of fd.
- fixed memory leak in error path which was spotted by Daniel.
- fixed bpf_get_stackid(), bpf_perf_event_output() called from raw tracepoints
- added more tests
- fixed allyesconfig build caught by buildbot

v1:
This patch set is a different way to address the pressing need to access
task_struct pointers in sched tracepoints from bpf programs.

The first approach simply added these pointers to sched tracepoints:
https://lkml.org/lkml/2017/12/14/753
which Peter nacked.
Few options were discussed and eventually the discussion converged on
doing bpf specific tracepoint_probe_register() probe functions.
Details here:
https://lkml.org/lkml/2017/12/20/929

Patch 1 is kernel wide cleanup of pass-struct-by-value into
pass-struct-by-reference into tracepoints.

Patches 2 and 3 are minor cleanups to address allyesconfig build

Patch 4 refactor trace_iwlwifi_dev_ucode_error from 17 to 4 args

Patch 5 introduces COUNT_ARGS macro

Patch 6 minor prep work to expose number of arguments passed
into tracepoints.

Patch 7 introduces BPF_RAW_TRACEPOINT api.
the auto-cleanup and multiple concurrent users are must have
features of tracing api. For bpf raw tracepoints it looks like:
  // load bpf prog with BPF_PROG_TYPE_RAW_TRACEPOINT type
  prog_fd = bpf_prog_load(...);

  // receive anon_inode fd for given bpf_raw_tracepoint
  // and attach bpf program to it
  raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd);

Ctrl-C of tracing daemon or cmdline tool will automatically
detach bpf program, unload it and unregister tracepoint probe.
More details in patch 7.

Patch 8 - trivial support in libbpf
Patches 9, 10 - user space tests

samples/bpf/test_overhead performance on 1 cpu:

tracepoint    base  kprobe+bpf tracepoint+bpf raw_tracepoint+bpf
task_rename   1.1M   769K        947K            1.0M
urandom_read  789K   697K        750K            755K

Alexei Starovoitov (10):
  treewide: remove struct-pass-by-value from tracepoints arguments
  net/mediatek: disambiguate mt76 vs mt7601u trace events
  net/mac802154: disambiguate mac80215 vs mac802154 trace events
  net/wireless/iwlwifi: fix iwlwifi_dev_ucode_error tracepoint
  macro: introduce COUNT_ARGS() macro
  tracepoint: compute num_args at build time
  bpf: introduce BPF_RAW_TRACEPOINT
  libbpf: add bpf_raw_tracepoint_open helper
  samples/bpf: raw tracepoint test
  selftests/bpf: test for bpf_get_stackid() from raw tracepoints

 arch/x86/xen/mmu_pv.c                              |  16 +-
 drivers/gpu/drm/i915/i915_trace.h                  |  13 +-
 drivers/infiniband/hw/hfi1/file_ops.c              |   2 +-
 drivers/infiniband/hw/hfi1/trace_ctxts.h           |  12 +-
 drivers/net/wireless/intel/iwlwifi/dvm/main.c      |   7 +-
 .../wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h  |  39 ++-
 drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c  |   1 +
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c     |   7 +-
 drivers/net/wireless/mediatek/mt7601u/trace.h      |   6 +-
 drivers/s390/cio/ioasm.c                           |  18 +-
 drivers/s390/cio/trace.h                           |  50 ++--
 fs/dax.c                                           |   2 +-
 include/linux/bpf_types.h                          |   1 +
 include/linux/kernel.h                             |   7 +
 include/linux/trace_events.h                       |  37 +++
 include/linux/tracepoint-defs.h                    |   1 +
 include/linux/tracepoint.h                         |  28 ++-
 include/trace/bpf_probe.h                          |  82 +++++++
 include/trace/define_trace.h                       |  15 +-
 include/trace/events/f2fs.h                        |   2 +-
 include/trace/events/fs_dax.h                      |   6 +-
 include/trace/events/rcu.h                         |   4 +-
 include/trace/events/xen.h                         |  32 +--
 include/uapi/linux/bpf.h                           |  11 +
 kernel/bpf/syscall.c                               |  87 +++++++
 kernel/rcu/tree.c                                  |  10 +-
 kernel/trace/bpf_trace.c                           | 270 +++++++++++++++++++++
 kernel/tracepoint.c                                |  27 ++-
 net/mac802154/trace.h                              |   8 +-
 net/wireless/trace.h                               |   2 +-
 samples/bpf/Makefile                               |   1 +
 samples/bpf/bpf_load.c                             |  14 ++
 samples/bpf/test_overhead_raw_tp_kern.c            |  17 ++
 samples/bpf/test_overhead_user.c                   |  12 +
 security/apparmor/include/path.h                   |   7 +-
 sound/firewire/amdtp-stream-trace.h                |   2 +-
 tools/include/uapi/linux/bpf.h                     |  11 +
 tools/lib/bpf/bpf.c                                |  11 +
 tools/lib/bpf/bpf.h                                |   1 +
 tools/testing/selftests/bpf/test_progs.c           |  91 +++++--
 40 files changed, 794 insertions(+), 176 deletions(-)
 create mode 100644 include/trace/bpf_probe.h
 create mode 100644 samples/bpf/test_overhead_raw_tp_kern.c

-- 
2.9.5

^ permalink raw reply

* [PATCH v3 bpf-next 09/10] samples/bpf: raw tracepoint test
From: Alexei Starovoitov @ 2018-03-22 18:01 UTC (permalink / raw)
  To: davem; +Cc: daniel, torvalds, peterz, rostedt, netdev, kernel-team, linux-api
In-Reply-To: <20180322180157.742725-1-ast@fb.com>

From: Alexei Starovoitov <ast@kernel.org>

add empty raw_tracepoint bpf program to test overhead similar
to kprobe and traditional tracepoint tests

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile                    |  1 +
 samples/bpf/bpf_load.c                  | 14 ++++++++++++++
 samples/bpf/test_overhead_raw_tp_kern.c | 17 +++++++++++++++++
 samples/bpf/test_overhead_user.c        | 12 ++++++++++++
 4 files changed, 44 insertions(+)
 create mode 100644 samples/bpf/test_overhead_raw_tp_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 2c2a587e0942..4d6a6edd4bf6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -119,6 +119,7 @@ always += offwaketime_kern.o
 always += spintest_kern.o
 always += map_perf_test_kern.o
 always += test_overhead_tp_kern.o
+always += test_overhead_raw_tp_kern.o
 always += test_overhead_kprobe_kern.o
 always += parse_varlen.o parse_simple.o parse_ldabs.o
 always += test_cgrp2_tc_kern.o
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index b1a310c3ae89..bebe4188b4b3 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -61,6 +61,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
 	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
 	bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
+	bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0;
 	bool is_xdp = strncmp(event, "xdp", 3) == 0;
 	bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
 	bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
@@ -85,6 +86,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_KPROBE;
 	} else if (is_tracepoint) {
 		prog_type = BPF_PROG_TYPE_TRACEPOINT;
+	} else if (is_raw_tracepoint) {
+		prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
 	} else if (is_xdp) {
 		prog_type = BPF_PROG_TYPE_XDP;
 	} else if (is_perf_event) {
@@ -131,6 +134,16 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		return populate_prog_array(event, fd);
 	}
 
+	if (is_raw_tracepoint) {
+		efd = bpf_raw_tracepoint_open(event + 15, fd);
+		if (efd < 0) {
+			printf("tracepoint %s %s\n", event + 15, strerror(errno));
+			return -1;
+		}
+		event_fd[prog_cnt - 1] = efd;
+		return 0;
+	}
+
 	if (is_kprobe || is_kretprobe) {
 		if (is_kprobe)
 			event += 7;
@@ -587,6 +600,7 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 		if (memcmp(shname, "kprobe/", 7) == 0 ||
 		    memcmp(shname, "kretprobe/", 10) == 0 ||
 		    memcmp(shname, "tracepoint/", 11) == 0 ||
+		    memcmp(shname, "raw_tracepoint/", 15) == 0 ||
 		    memcmp(shname, "xdp", 3) == 0 ||
 		    memcmp(shname, "perf_event", 10) == 0 ||
 		    memcmp(shname, "socket", 6) == 0 ||
diff --git a/samples/bpf/test_overhead_raw_tp_kern.c b/samples/bpf/test_overhead_raw_tp_kern.c
new file mode 100644
index 000000000000..d2af8bc1c805
--- /dev/null
+++ b/samples/bpf/test_overhead_raw_tp_kern.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Facebook */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("raw_tracepoint/task_rename")
+int prog(struct bpf_raw_tracepoint_args *ctx)
+{
+	return 0;
+}
+
+SEC("raw_tracepoint/urandom_read")
+int prog2(struct bpf_raw_tracepoint_args *ctx)
+{
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c
index d291167fd3c7..e1d35e07a10e 100644
--- a/samples/bpf/test_overhead_user.c
+++ b/samples/bpf/test_overhead_user.c
@@ -158,5 +158,17 @@ int main(int argc, char **argv)
 		unload_progs();
 	}
 
+	if (test_flags & 0xC0) {
+		snprintf(filename, sizeof(filename),
+			 "%s_raw_tp_kern.o", argv[0]);
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+		printf("w/RAW_TRACEPOINT\n");
+		run_perf_test(num_cpu, test_flags >> 6);
+		unload_progs();
+	}
+
 	return 0;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH v3 bpf-next 03/10] net/mac802154: disambiguate mac80215 vs mac802154 trace events
From: Alexei Starovoitov @ 2018-03-22 18:01 UTC (permalink / raw)
  To: davem; +Cc: daniel, torvalds, peterz, rostedt, netdev, kernel-team, linux-api
In-Reply-To: <20180322180157.742725-1-ast@fb.com>

From: Alexei Starovoitov <ast@kernel.org>

two trace events defined with the same name and both unused.
They conflict in allyesconfig build. Rename one of them.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/mac802154/trace.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h
index 2c8a43d3607f..df855c33daf2 100644
--- a/net/mac802154/trace.h
+++ b/net/mac802154/trace.h
@@ -33,7 +33,7 @@
 
 /* Tracing for driver callbacks */
 
-DECLARE_EVENT_CLASS(local_only_evt,
+DECLARE_EVENT_CLASS(local_only_evt4,
 	TP_PROTO(struct ieee802154_local *local),
 	TP_ARGS(local),
 	TP_STRUCT__entry(
@@ -45,7 +45,7 @@ DECLARE_EVENT_CLASS(local_only_evt,
 	TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG)
 );
 
-DEFINE_EVENT(local_only_evt, 802154_drv_return_void,
+DEFINE_EVENT(local_only_evt4, 802154_drv_return_void,
 	TP_PROTO(struct ieee802154_local *local),
 	TP_ARGS(local)
 );
@@ -65,12 +65,12 @@ TRACE_EVENT(802154_drv_return_int,
 		  __entry->ret)
 );
 
-DEFINE_EVENT(local_only_evt, 802154_drv_start,
+DEFINE_EVENT(local_only_evt4, 802154_drv_start,
 	TP_PROTO(struct ieee802154_local *local),
 	TP_ARGS(local)
 );
 
-DEFINE_EVENT(local_only_evt, 802154_drv_stop,
+DEFINE_EVENT(local_only_evt4, 802154_drv_stop,
 	TP_PROTO(struct ieee802154_local *local),
 	TP_ARGS(local)
 );
-- 
2.9.5

^ permalink raw reply related

* [PATCH v3 bpf-next 04/10] net/wireless/iwlwifi: fix iwlwifi_dev_ucode_error tracepoint
From: Alexei Starovoitov @ 2018-03-22 18:01 UTC (permalink / raw)
  To: davem; +Cc: daniel, torvalds, peterz, rostedt, netdev, kernel-team, linux-api
In-Reply-To: <20180322180157.742725-1-ast@fb.com>

From: Alexei Starovoitov <ast@kernel.org>

fix iwlwifi_dev_ucode_error tracepoint to pass pointer to a table
instead of all 17 arguments by value.
dvm/main.c and mvm/utils.c have 'struct iwl_error_event_table'
defined with very similar yet subtly different fields and offsets.
tracepoint is still common and using definition of 'struct iwl_error_event_table'
from dvm/commands.h while copying fields.
Long term this tracepoint probably should be split into two.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/wireless/intel/iwlwifi/dvm/main.c      |  7 +---
 .../wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h  | 39 ++++++++++------------
 drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c  |  1 +
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c     |  7 +---
 4 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/main.c b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
index d11d72615de2..e68254e12764 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/main.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
@@ -1651,12 +1651,7 @@ static void iwl_dump_nic_error_log(struct iwl_priv *priv)
 			priv->status, table.valid);
 	}
 
-	trace_iwlwifi_dev_ucode_error(trans->dev, table.error_id, table.tsf_low,
-				      table.data1, table.data2, table.line,
-				      table.blink2, table.ilink1, table.ilink2,
-				      table.bcon_time, table.gp1, table.gp2,
-				      table.gp3, table.ucode_ver, table.hw_ver,
-				      0, table.brd_ver);
+	trace_iwlwifi_dev_ucode_error(trans->dev, &table, 0, table.brd_ver);
 	IWL_ERR(priv, "0x%08X | %-28s\n", table.error_id,
 		desc_lookup(table.error_id));
 	IWL_ERR(priv, "0x%08X | uPc\n", table.pc);
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h
index 9518a82f44c2..27e3e4e96aa2 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h
@@ -126,14 +126,11 @@ TRACE_EVENT(iwlwifi_dev_tx,
 		  __entry->framelen, __entry->skbaddr)
 );
 
+struct iwl_error_event_table;
 TRACE_EVENT(iwlwifi_dev_ucode_error,
-	TP_PROTO(const struct device *dev, u32 desc, u32 tsf_low,
-		 u32 data1, u32 data2, u32 line, u32 blink2, u32 ilink1,
-		 u32 ilink2, u32 bcon_time, u32 gp1, u32 gp2, u32 rev_type,
-		 u32 major, u32 minor, u32 hw_ver, u32 brd_ver),
-	TP_ARGS(dev, desc, tsf_low, data1, data2, line,
-		 blink2, ilink1, ilink2, bcon_time, gp1, gp2,
-		 rev_type, major, minor, hw_ver, brd_ver),
+	TP_PROTO(const struct device *dev, const struct iwl_error_event_table *table,
+		 u32 hw_ver, u32 brd_ver),
+	TP_ARGS(dev, table, hw_ver, brd_ver),
 	TP_STRUCT__entry(
 		DEV_ENTRY
 		__field(u32, desc)
@@ -155,20 +152,20 @@ TRACE_EVENT(iwlwifi_dev_ucode_error,
 	),
 	TP_fast_assign(
 		DEV_ASSIGN;
-		__entry->desc = desc;
-		__entry->tsf_low = tsf_low;
-		__entry->data1 = data1;
-		__entry->data2 = data2;
-		__entry->line = line;
-		__entry->blink2 = blink2;
-		__entry->ilink1 = ilink1;
-		__entry->ilink2 = ilink2;
-		__entry->bcon_time = bcon_time;
-		__entry->gp1 = gp1;
-		__entry->gp2 = gp2;
-		__entry->rev_type = rev_type;
-		__entry->major = major;
-		__entry->minor = minor;
+		__entry->desc = table->error_id;
+		__entry->tsf_low = table->tsf_low;
+		__entry->data1 = table->data1;
+		__entry->data2 = table->data2;
+		__entry->line = table->line;
+		__entry->blink2 = table->blink2;
+		__entry->ilink1 = table->ilink1;
+		__entry->ilink2 = table->ilink2;
+		__entry->bcon_time = table->bcon_time;
+		__entry->gp1 = table->gp1;
+		__entry->gp2 = table->gp2;
+		__entry->rev_type = table->gp3;
+		__entry->major = table->ucode_ver;
+		__entry->minor = table->hw_ver;
 		__entry->hw_ver = hw_ver;
 		__entry->brd_ver = brd_ver;
 	),
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
index 50510fb6ab8c..6aa719865a58 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
@@ -30,6 +30,7 @@
 #ifndef __CHECKER__
 #include "iwl-trans.h"
 
+#include "dvm/commands.h"
 #define CREATE_TRACE_POINTS
 #include "iwl-devtrace.h"
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index d65e1db7c097..5442ead876eb 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -549,12 +549,7 @@ static void iwl_mvm_dump_lmac_error_log(struct iwl_mvm *mvm, u32 base)
 
 	IWL_ERR(mvm, "Loaded firmware version: %s\n", mvm->fw->fw_version);
 
-	trace_iwlwifi_dev_ucode_error(trans->dev, table.error_id, table.tsf_low,
-				      table.data1, table.data2, table.data3,
-				      table.blink2, table.ilink1,
-				      table.ilink2, table.bcon_time, table.gp1,
-				      table.gp2, table.fw_rev_type, table.major,
-				      table.minor, table.hw_ver, table.brd_ver);
+	trace_iwlwifi_dev_ucode_error(trans->dev, &table, table.hw_ver, table.brd_ver);
 	IWL_ERR(mvm, "0x%08X | %-28s\n", table.error_id,
 		desc_lookup(table.error_id));
 	IWL_ERR(mvm, "0x%08X | trm_hw_status0\n", table.trm_hw_status0);
-- 
2.9.5

^ permalink raw reply related

* [PATCH v3 bpf-next 05/10] macro: introduce COUNT_ARGS() macro
From: Alexei Starovoitov @ 2018-03-22 18:01 UTC (permalink / raw)
  To: davem; +Cc: daniel, torvalds, peterz, rostedt, netdev, kernel-team, linux-api
In-Reply-To: <20180322180157.742725-1-ast@fb.com>

From: Alexei Starovoitov <ast@kernel.org>

move COUNT_ARGS() macro from apparmor to generic header and extend it
to count till twelve.

COUNT() was an alternative name for this logic, but it's used for
different purpose in many other places.

Similarly for CONCATENATE() macro.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/kernel.h           | 7 +++++++
 security/apparmor/include/path.h | 7 +------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3fd291503576..293fa0677fba 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -919,6 +919,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #define swap(a, b) \
 	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
 
+/* This counts to 12. Any more, it will return 13th argument. */
+#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
+#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+
+#define __CONCAT(a, b) a ## b
+#define CONCATENATE(a, b) __CONCAT(a, b)
+
 /**
  * container_of - cast a member of a structure out to the containing structure
  * @ptr:	the pointer to the member.
diff --git a/security/apparmor/include/path.h b/security/apparmor/include/path.h
index 05fb3305671e..e042b994f2b8 100644
--- a/security/apparmor/include/path.h
+++ b/security/apparmor/include/path.h
@@ -43,15 +43,10 @@ struct aa_buffers {
 
 DECLARE_PER_CPU(struct aa_buffers, aa_buffers);
 
-#define COUNT_ARGS(X...) COUNT_ARGS_HELPER(, ##X, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
-#define COUNT_ARGS_HELPER(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, X...) n
-#define CONCAT(X, Y) X ## Y
-#define CONCAT_AFTER(X, Y) CONCAT(X, Y)
-
 #define ASSIGN(FN, X, N) ((X) = FN(N))
 #define EVAL1(FN, X) ASSIGN(FN, X, 0) /*X = FN(0)*/
 #define EVAL2(FN, X, Y...) do { ASSIGN(FN, X, 1);  EVAL1(FN, Y); } while (0)
-#define EVAL(FN, X...) CONCAT_AFTER(EVAL, COUNT_ARGS(X))(FN, X)
+#define EVAL(FN, X...) CONCATENATE(EVAL, COUNT_ARGS(X))(FN, X)
 
 #define for_each_cpu_buffer(I) for ((I) = 0; (I) < MAX_PATH_BUFFERS; (I)++)
 
-- 
2.9.5

^ permalink raw reply related

* [PATCH v3 bpf-next 07/10] bpf: introduce BPF_RAW_TRACEPOINT
From: Alexei Starovoitov @ 2018-03-22 18:01 UTC (permalink / raw)
  To: davem; +Cc: daniel, torvalds, peterz, rostedt, netdev, kernel-team, linux-api
In-Reply-To: <20180322180157.742725-1-ast@fb.com>

From: Alexei Starovoitov <ast@kernel.org>

Introduce BPF_PROG_TYPE_RAW_TRACEPOINT bpf program type to access
kernel internal arguments of the tracepoints in their raw form.

>From bpf program point of view the access to the arguments look like:
struct bpf_raw_tracepoint_args {
       __u64 args[0];
};

int bpf_prog(struct bpf_raw_tracepoint_args *ctx)
{
  // program can read args[N] where N depends on tracepoint
  // and statically verified at program load+attach time
}

kprobe+bpf infrastructure allows programs access function arguments.
This feature allows programs access raw tracepoint arguments.

Similar to proposed 'dynamic ftrace events' there are no abi guarantees
to what the tracepoints arguments are and what their meaning is.
The program needs to type cast args properly and use bpf_probe_read()
helper to access struct fields when argument is a pointer.

For every tracepoint __bpf_trace_##call function is prepared.
In assembler it looks like:
(gdb) disassemble __bpf_trace_xdp_exception
Dump of assembler code for function __bpf_trace_xdp_exception:
   0xffffffff81132080 <+0>:     mov    %ecx,%ecx
   0xffffffff81132082 <+2>:     jmpq   0xffffffff811231f0 <bpf_trace_run3>

where

TRACE_EVENT(xdp_exception,
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp, u32 act),

The above assembler snippet is casting 32-bit 'act' field into 'u64'
to pass into bpf_trace_run3(), while 'dev' and 'xdp' args are passed as-is.
All of ~500 of __bpf_trace_*() functions are only 5-10 byte long
and in total this approach adds 7k bytes to .text and 8k bytes
to .rodata since the probe funcs need to appear in kallsyms.
The alternative of having __bpf_trace_##call being global in kallsyms
could have been to keep them static and add another pointer to these
static functions to 'struct trace_event_class' and 'struct trace_event_call',
but keeping them global simplifies implementation and keeps it indepedent
from the tracing side.

Also such approach gives the lowest possible overhead
while calling trace_xdp_exception() from kernel C code and
transitioning into bpf land.
Since tracepoint+bpf are used at speeds of 1M+ events per second
this is very valuable optimization.

Since ftrace and perf side are not involved the new
BPF_RAW_TRACEPOINT_OPEN sys_bpf command is introduced
that returns anon_inode FD of 'bpf-raw-tracepoint' object.

The user space looks like:
// load bpf prog with BPF_PROG_TYPE_RAW_TRACEPOINT type
prog_fd = bpf_prog_load(...);
// receive anon_inode fd for given bpf_raw_tracepoint with prog attached
raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd);

Ctrl-C of tracing daemon or cmdline tool that uses this feature
will automatically detach bpf program, unload it and
unregister tracepoint probe.

On the kernel side for_each_kernel_tracepoint() is used
to find a tracepoint with "xdp_exception" name
(that would be __tracepoint_xdp_exception record)

Then kallsyms_lookup_name() is used to find the addr
of __bpf_trace_xdp_exception() probe function.

And finally tracepoint_probe_register() is used to connect probe
with tracepoint.

Addition of bpf_raw_tracepoint doesn't interfere with ftrace and perf
tracepoint mechanisms. perf_event_open() can be used in parallel
on the same tracepoint.
Multiple bpf_raw_tracepoint_open("xdp_exception", prog_fd) are permitted.
Each with its own bpf program. The kernel will execute
all tracepoint probes and all attached bpf programs.

In the future bpf_raw_tracepoints can be extended with
query/introspection logic.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_types.h    |   1 +
 include/linux/trace_events.h |  37 ++++++
 include/trace/bpf_probe.h    |  82 +++++++++++++
 include/trace/define_trace.h |   1 +
 include/uapi/linux/bpf.h     |  11 ++
 kernel/bpf/syscall.c         |  87 ++++++++++++++
 kernel/trace/bpf_trace.c     | 270 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 489 insertions(+)
 create mode 100644 include/trace/bpf_probe.h

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5e2e8a49fb21..6d7243bfb0ff 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -19,6 +19,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
 BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
+BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8a1442c4e513..e37fcd7505da 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -468,6 +468,8 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
+int bpf_probe_register(struct tracepoint *tp, struct bpf_prog *prog);
+int bpf_probe_unregister(struct tracepoint *tp, struct bpf_prog *prog);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
@@ -487,6 +489,14 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info)
 {
 	return -EOPNOTSUPP;
 }
+static inline int bpf_probe_register(struct tracepoint *tp, struct bpf_prog *p)
+{
+	return -EOPNOTSUPP;
+}
+static inline int bpf_probe_unregister(struct tracepoint *tp, struct bpf_prog *p)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 enum {
@@ -546,6 +556,33 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
+void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
+void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
+void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3);
+void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4);
+void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5);
+void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6);
+void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7);
+void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		    u64 arg8);
+void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		    u64 arg8, u64 arg9);
+void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		     u64 arg8, u64 arg9, u64 arg10);
+void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		     u64 arg8, u64 arg9, u64 arg10, u64 arg11);
+void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		     u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12);
 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 			       struct trace_event_call *call, u64 count,
 			       struct pt_regs *regs, struct hlist_head *head,
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
new file mode 100644
index 000000000000..3e8ac3617cb4
--- /dev/null
+++ b/include/trace/bpf_probe.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#undef TRACE_SYSTEM_VAR
+
+#ifdef CONFIG_BPF_EVENTS
+
+#undef __entry
+#define __entry entry
+
+#undef __get_dynamic_array
+#define __get_dynamic_array(field)	\
+		((void *)__entry + (__entry->__data_loc_##field & 0xffff))
+
+#undef __get_dynamic_array_len
+#define __get_dynamic_array_len(field)	\
+		((__entry->__data_loc_##field >> 16) & 0xffff)
+
+#undef __get_str
+#define __get_str(field) ((char *)__get_dynamic_array(field))
+
+#undef __get_bitmask
+#define __get_bitmask(field) (char *)__get_dynamic_array(field)
+
+#undef __perf_count
+#define __perf_count(c)	(c)
+
+#undef __perf_task
+#define __perf_task(t)	(t)
+
+/*
+ * cast any integer or pointer type to u64 without warnings
+ * on 32 and 64 bit archs
+ */
+#define __CAST_TO_U64(expr) \
+	(u64) __builtin_choose_expr(sizeof(long) < sizeof(expr), \
+				    (expr), \
+				    (long) expr)
+#define __CAST1(a,...) __CAST_TO_U64(a)
+#define __CAST2(a,...) __CAST_TO_U64(a), __CAST1(__VA_ARGS__)
+#define __CAST3(a,...) __CAST_TO_U64(a), __CAST2(__VA_ARGS__)
+#define __CAST4(a,...) __CAST_TO_U64(a), __CAST3(__VA_ARGS__)
+#define __CAST5(a,...) __CAST_TO_U64(a), __CAST4(__VA_ARGS__)
+#define __CAST6(a,...) __CAST_TO_U64(a), __CAST5(__VA_ARGS__)
+#define __CAST7(a,...) __CAST_TO_U64(a), __CAST6(__VA_ARGS__)
+#define __CAST8(a,...) __CAST_TO_U64(a), __CAST7(__VA_ARGS__)
+#define __CAST9(a,...) __CAST_TO_U64(a), __CAST8(__VA_ARGS__)
+#define __CAST10(a,...) __CAST_TO_U64(a), __CAST9(__VA_ARGS__)
+#define __CAST11(a,...) __CAST_TO_U64(a), __CAST10(__VA_ARGS__)
+#define __CAST12(a,...) __CAST_TO_U64(a), __CAST11(__VA_ARGS__)
+/* tracepoints with more than 12 arguments will hit build error */
+#define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
+/* no 'static' here. The bpf probe functions are global */		\
+notrace void								\
+__bpf_trace_##call(void *__data, proto)					\
+{									\
+	struct bpf_prog *prog = __data;					\
+	\
+	CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args));	\
+}
+
+/*
+ * This part is compiled out, it is only here as a build time check
+ * to make sure that if the tracepoint handling changes, the
+ * bpf probe will fail to compile unless it too is updated.
+ */
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, call, proto, args)			\
+static inline void bpf_test_probe_##call(void)				\
+{									\
+	check_trace_callback_type_##call(__bpf_trace_##template);	\
+}
+
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+#endif /* CONFIG_BPF_EVENTS */
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 96b22ace9ae7..5f8216bc261f 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -95,6 +95,7 @@
 #ifdef TRACEPOINTS_ENABLED
 #include <trace/trace_events.h>
 #include <trace/perf.h>
+#include <trace/bpf_probe.h>
 #endif
 
 #undef TRACE_EVENT
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 18b7c510c511..1878201c2d77 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -94,6 +94,7 @@ enum bpf_cmd {
 	BPF_MAP_GET_FD_BY_ID,
 	BPF_OBJ_GET_INFO_BY_FD,
 	BPF_PROG_QUERY,
+	BPF_RAW_TRACEPOINT_OPEN,
 };
 
 enum bpf_map_type {
@@ -134,6 +135,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_SKB,
 	BPF_PROG_TYPE_CGROUP_DEVICE,
 	BPF_PROG_TYPE_SK_MSG,
+	BPF_PROG_TYPE_RAW_TRACEPOINT,
 };
 
 enum bpf_attach_type {
@@ -344,6 +346,11 @@ union bpf_attr {
 		__aligned_u64	prog_ids;
 		__u32		prog_cnt;
 	} query;
+
+	struct {
+		__u64 name;
+		__u32 prog_fd;
+	} raw_tracepoint;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
@@ -1152,4 +1159,8 @@ struct bpf_cgroup_dev_ctx {
 	__u32 minor;
 };
 
+struct bpf_raw_tracepoint_args {
+	__u64 args[0];
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3aeb4ea2a93a..96bc45a6e7d6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1311,6 +1311,90 @@ static int bpf_obj_get(const union bpf_attr *attr)
 				attr->file_flags);
 }
 
+struct bpf_raw_tracepoint {
+	struct tracepoint *tp;
+	struct bpf_prog *prog;
+};
+
+static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
+{
+	struct bpf_raw_tracepoint *raw_tp = filp->private_data;
+
+	if (raw_tp->prog) {
+		bpf_probe_unregister(raw_tp->tp, raw_tp->prog);
+		bpf_prog_put(raw_tp->prog);
+	}
+	kfree(raw_tp);
+	return 0;
+}
+
+static const struct file_operations bpf_raw_tp_fops = {
+	.release	= bpf_raw_tracepoint_release,
+	.read		= bpf_dummy_read,
+	.write		= bpf_dummy_write,
+};
+
+static void *__find_tp(struct tracepoint *tp, void *priv)
+{
+	char *name = priv;
+
+	if (!strcmp(tp->name, name))
+		return tp;
+	return NULL;
+}
+
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+{
+	struct bpf_raw_tracepoint *raw_tp;
+	struct tracepoint *tp;
+	struct bpf_prog *prog;
+	char tp_name[128];
+	int tp_fd, err;
+
+	if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
+			      sizeof(tp_name) - 1) < 0)
+		return -EFAULT;
+	tp_name[sizeof(tp_name) - 1] = 0;
+
+	tp = for_each_kernel_tracepoint(__find_tp, tp_name);
+	if (!tp)
+		return -ENOENT;
+
+	raw_tp = kmalloc(sizeof(*raw_tp), GFP_USER | __GFP_ZERO);
+	if (!raw_tp)
+		return -ENOMEM;
+	raw_tp->tp = tp;
+
+	prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
+				 BPF_PROG_TYPE_RAW_TRACEPOINT);
+	if (IS_ERR(prog)) {
+		err = PTR_ERR(prog);
+		goto out_free_tp;
+	}
+
+	err = bpf_probe_register(raw_tp->tp, prog);
+	if (err)
+		goto out_put_prog;
+
+	raw_tp->prog = prog;
+	tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp,
+				 O_CLOEXEC);
+	if (tp_fd < 0) {
+		bpf_probe_unregister(raw_tp->tp, prog);
+		err = tp_fd;
+		goto out_put_prog;
+	}
+	return tp_fd;
+
+out_put_prog:
+	bpf_prog_put(prog);
+out_free_tp:
+	kfree(raw_tp);
+	return err;
+}
+
 #ifdef CONFIG_CGROUP_BPF
 
 #define BPF_PROG_ATTACH_LAST_FIELD attach_flags
@@ -1921,6 +2005,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_OBJ_GET_INFO_BY_FD:
 		err = bpf_obj_get_info_by_fd(&attr, uattr);
 		break;
+	case BPF_RAW_TRACEPOINT_OPEN:
+		err = bpf_raw_tracepoint_open(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c634e093951f..c2b853cd941d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -723,6 +723,86 @@ const struct bpf_verifier_ops tracepoint_verifier_ops = {
 const struct bpf_prog_ops tracepoint_prog_ops = {
 };
 
+/*
+ * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
+ * to avoid potential recursive reuse issue when/if tracepoints are added
+ * inside bpf_*_event_output and/or bpf_get_stack_id
+ */
+static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
+BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
+	   struct bpf_map *, map, u64, flags, void *, data, u64, size)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+	perf_fetch_caller_regs(regs);
+	return ____bpf_perf_event_output(regs, map, flags, data, size);
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
+	.func		= bpf_perf_event_output_raw_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
+	   struct bpf_map *, map, u64, flags)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+	perf_fetch_caller_regs(regs);
+	/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
+	return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+			       flags, 0, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
+	.func		= bpf_get_stackid_raw_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_perf_event_output:
+		return &bpf_perf_event_output_proto_raw_tp;
+	case BPF_FUNC_get_stackid:
+		return &bpf_get_stackid_proto_raw_tp;
+	default:
+		return tracing_func_proto(func_id);
+	}
+}
+
+static bool raw_tp_prog_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					struct bpf_insn_access_aux *info)
+{
+	/* largest tracepoint in the kernel has 12 args */
+	if (off < 0 || off >= sizeof(__u64) * 12)
+		return false;
+	if (type != BPF_READ)
+		return false;
+	if (off % size != 0)
+		return false;
+	return true;
+}
+
+const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
+	.get_func_proto  = raw_tp_prog_func_proto,
+	.is_valid_access = raw_tp_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops raw_tracepoint_prog_ops = {
+};
+
 static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
 				    struct bpf_insn_access_aux *info)
 {
@@ -896,3 +976,193 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 
 	return ret;
 }
+
+static __always_inline
+void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
+{
+	rcu_read_lock();
+	preempt_disable();
+	(void) BPF_PROG_RUN(prog, args);
+	preempt_enable();
+	rcu_read_unlock();
+}
+
+#define EVAL1(FN, X) FN(X)
+#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
+#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
+#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
+#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
+#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
+
+#define COPY(X) args[X - 1] = arg##X;
+
+void bpf_trace_run1(struct bpf_prog *prog, u64 arg1)
+{
+	u64 args[1];
+
+	EVAL1(COPY, 1);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run1);
+void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2)
+{
+	u64 args[2];
+
+	EVAL2(COPY, 1, 2);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run2);
+void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3)
+{
+	u64 args[3];
+
+	EVAL3(COPY, 1, 2, 3);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run3);
+void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4)
+{
+	u64 args[4];
+
+	EVAL4(COPY, 1, 2, 3, 4);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run4);
+void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5)
+{
+	u64 args[5];
+
+	EVAL5(COPY, 1, 2, 3, 4, 5);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run5);
+void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6)
+{
+	u64 args[6];
+
+	EVAL6(COPY, 1, 2, 3, 4, 5, 6);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run6);
+void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7)
+{
+	u64 args[7];
+
+	EVAL6(COPY, 1, 2, 3, 4, 5, 6);
+	EVAL1(COPY, 7);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run7);
+void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		    u64 arg8)
+{
+	u64 args[8];
+
+	EVAL6(COPY, 1, 2, 3, 4, 5, 6);
+	EVAL2(COPY, 7, 8);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run8);
+void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		    u64 arg8, u64 arg9)
+{
+	u64 args[9];
+
+	EVAL6(COPY, 1, 2, 3, 4, 5, 6);
+	EVAL3(COPY, 7, 8, 9);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run9);
+void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		     u64 arg8, u64 arg9, u64 arg10)
+{
+	u64 args[10];
+
+	EVAL6(COPY, 1, 2, 3, 4, 5, 6);
+	EVAL4(COPY, 7, 8, 9, 10);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run10);
+void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		     u64 arg8, u64 arg9, u64 arg10, u64 arg11)
+{
+	u64 args[11];
+
+	EVAL6(COPY, 1, 2, 3, 4, 5, 6);
+	EVAL5(COPY, 7, 8, 9, 10, 11);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run11);
+void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		     u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12)
+{
+	u64 args[12];
+
+	EVAL6(COPY, 1, 2, 3, 4, 5, 6);
+	EVAL6(COPY, 7, 8, 9, 10, 11, 12);
+	__bpf_trace_run(prog, args);
+}
+EXPORT_SYMBOL_GPL(bpf_trace_run12);
+
+static int __bpf_probe_register(struct tracepoint *tp, struct bpf_prog *prog)
+{
+	unsigned long addr;
+	char buf[128];
+
+	/*
+	 * check that program doesn't access arguments beyond what's
+	 * available in this tracepoint
+	 */
+	if (prog->aux->max_ctx_offset > tp->num_args * sizeof(u64))
+		return -EINVAL;
+
+	snprintf(buf, sizeof(buf), "__bpf_trace_%s", tp->name);
+	addr = kallsyms_lookup_name(buf);
+	if (!addr)
+		return -ENOENT;
+
+	return tracepoint_probe_register(tp, (void *)addr, prog);
+}
+
+int bpf_probe_register(struct tracepoint *tp, struct bpf_prog *prog)
+{
+	int err;
+
+	mutex_lock(&bpf_event_mutex);
+	err = __bpf_probe_register(tp, prog);
+	mutex_unlock(&bpf_event_mutex);
+	return err;
+}
+
+static int __bpf_probe_unregister(struct tracepoint *tp, struct bpf_prog *prog)
+{
+	unsigned long addr;
+	char buf[128];
+
+	snprintf(buf, sizeof(buf), "__bpf_trace_%s", tp->name);
+	addr = kallsyms_lookup_name(buf);
+	if (!addr)
+		return -ENOENT;
+
+	return tracepoint_probe_unregister(tp, (void *)addr, prog);
+}
+
+int bpf_probe_unregister(struct tracepoint *tp, struct bpf_prog *prog)
+{
+	int err;
+
+	mutex_lock(&bpf_event_mutex);
+	err = __bpf_probe_unregister(tp, prog);
+	mutex_unlock(&bpf_event_mutex);
+	return err;
+}
-- 
2.9.5

^ permalink raw reply related

* [PATCH v3 bpf-next 02/10] net/mediatek: disambiguate mt76 vs mt7601u trace events
From: Alexei Starovoitov @ 2018-03-22 18:01 UTC (permalink / raw)
  To: davem; +Cc: daniel, torvalds, peterz, rostedt, netdev, kernel-team, linux-api
In-Reply-To: <20180322180157.742725-1-ast@fb.com>

From: Alexei Starovoitov <ast@kernel.org>

two trace events defined with the same name and both unused.
They conflict in allyesconfig build. Rename one of them.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/wireless/mediatek/mt7601u/trace.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt7601u/trace.h b/drivers/net/wireless/mediatek/mt7601u/trace.h
index 289897300ef0..82c8898b9076 100644
--- a/drivers/net/wireless/mediatek/mt7601u/trace.h
+++ b/drivers/net/wireless/mediatek/mt7601u/trace.h
@@ -34,7 +34,7 @@
 #define REG_PR_FMT	"%04x=%08x"
 #define REG_PR_ARG	__entry->reg, __entry->val
 
-DECLARE_EVENT_CLASS(dev_reg_evt,
+DECLARE_EVENT_CLASS(dev_reg_evtu,
 	TP_PROTO(struct mt7601u_dev *dev, u32 reg, u32 val),
 	TP_ARGS(dev, reg, val),
 	TP_STRUCT__entry(
@@ -51,12 +51,12 @@ DECLARE_EVENT_CLASS(dev_reg_evt,
 	)
 );
 
-DEFINE_EVENT(dev_reg_evt, reg_read,
+DEFINE_EVENT(dev_reg_evtu, reg_read,
 	TP_PROTO(struct mt7601u_dev *dev, u32 reg, u32 val),
 	TP_ARGS(dev, reg, val)
 );
 
-DEFINE_EVENT(dev_reg_evt, reg_write,
+DEFINE_EVENT(dev_reg_evtu, reg_write,
 	TP_PROTO(struct mt7601u_dev *dev, u32 reg, u32 val),
 	TP_ARGS(dev, reg, val)
 );
-- 
2.9.5

^ permalink raw reply related

* [PATCH v3 bpf-next 10/10] selftests/bpf: test for bpf_get_stackid() from raw tracepoints
From: Alexei Starovoitov @ 2018-03-22 18:01 UTC (permalink / raw)
  To: davem; +Cc: daniel, torvalds, peterz, rostedt, netdev, kernel-team, linux-api
In-Reply-To: <20180322180157.742725-1-ast@fb.com>

From: Alexei Starovoitov <ast@kernel.org>

similar to traditional traceopint test add bpf_get_stackid() test
from raw tracepoints
and reduce verbosity of existing stackmap test

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_progs.c | 91 ++++++++++++++++++++++++--------
 1 file changed, 70 insertions(+), 21 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index e9df48b306df..faadbe233966 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -877,7 +877,7 @@ static void test_stacktrace_map()
 
 	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
 	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
-		goto out;
+		return;
 
 	/* Get the ID for the sched/sched_switch tracepoint */
 	snprintf(buf, sizeof(buf),
@@ -888,8 +888,7 @@ static void test_stacktrace_map()
 
 	bytes = read(efd, buf, sizeof(buf));
 	close(efd);
-	if (CHECK(bytes <= 0 || bytes >= sizeof(buf),
-		  "read", "bytes %d errno %d\n", bytes, errno))
+	if (bytes <= 0 || bytes >= sizeof(buf))
 		goto close_prog;
 
 	/* Open the perf event and attach bpf progrram */
@@ -906,29 +905,24 @@ static void test_stacktrace_map()
 		goto close_prog;
 
 	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
-	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
-		  err, errno))
-		goto close_pmu;
+	if (err)
+		goto disable_pmu;
 
 	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
-	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
-		  err, errno))
+	if (err)
 		goto disable_pmu;
 
 	/* find map fds */
 	control_map_fd = bpf_find_map(__func__, obj, "control_map");
-	if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
-		  "err %d errno %d\n", err, errno))
+	if (control_map_fd < 0)
 		goto disable_pmu;
 
 	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
-	if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
-		  "err %d errno %d\n", err, errno))
+	if (stackid_hmap_fd < 0)
 		goto disable_pmu;
 
 	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
-	if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
-		  err, errno))
+	if (stackmap_fd < 0)
 		goto disable_pmu;
 
 	/* give some time for bpf program run */
@@ -945,24 +939,78 @@ static void test_stacktrace_map()
 	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
 	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
 		  "err %d errno %d\n", err, errno))
-		goto disable_pmu;
+		goto disable_pmu_noerr;
 
 	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
 	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
 		  "err %d errno %d\n", err, errno))
-		; /* fall through */
+		goto disable_pmu_noerr;
 
+	goto disable_pmu_noerr;
 disable_pmu:
+	error_cnt++;
+disable_pmu_noerr:
 	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
-
-close_pmu:
 	close(pmu_fd);
-
 close_prog:
 	bpf_object__close(obj);
+}
 
-out:
-	return;
+static void test_stacktrace_map_raw_tp()
+{
+	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	const char *file = "./test_stacktrace_map.o";
+	int efd, err, prog_fd;
+	__u32 key, val, duration = 0;
+	struct bpf_object *obj;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+		return;
+
+	efd = bpf_raw_tracepoint_open("sched_switch", prog_fd);
+	if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	/* find map fds */
+	control_map_fd = bpf_find_map(__func__, obj, "control_map");
+	if (control_map_fd < 0)
+		goto close_prog;
+
+	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+	if (stackid_hmap_fd < 0)
+		goto close_prog;
+
+	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
+	if (stackmap_fd < 0)
+		goto close_prog;
+
+	/* give some time for bpf program run */
+	sleep(1);
+
+	/* disable stack trace collection */
+	key = 0;
+	val = 1;
+	bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+	/* for every element in stackid_hmap, we can find a corresponding one
+	 * in stackmap, and vise versa.
+	 */
+	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+		  "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	goto close_prog_noerr;
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
 }
 
 static int extract_build_id(char *build_id, size_t size)
@@ -1138,6 +1186,7 @@ int main(void)
 	test_tp_attach_query();
 	test_stacktrace_map();
 	test_stacktrace_build_id();
+	test_stacktrace_map_raw_tp();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
-- 
2.9.5

^ permalink raw reply related

* [PATCH v3 bpf-next 01/10] treewide: remove struct-pass-by-value from tracepoints arguments
From: Alexei Starovoitov @ 2018-03-22 18:01 UTC (permalink / raw)
  To: davem; +Cc: daniel, torvalds, peterz, rostedt, netdev, kernel-team, linux-api
In-Reply-To: <20180322180157.742725-1-ast@fb.com>

From: Alexei Starovoitov <ast@kernel.org>

Fix all tracepoint arguments to pass structures (large and small) by reference
instead of by value.
Avoiding passing large structs by value is a good coding style.
Passing small structs sometimes is beneficial, but in all cases
it makes no difference vs readability of the code.
The subsequent patch enforces that all tracepoints args are either integers
or pointers and fit into 64-bit.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/xen/mmu_pv.c                    | 16 +++++-----
 drivers/gpu/drm/i915/i915_trace.h        | 13 +++++++--
 drivers/infiniband/hw/hfi1/file_ops.c    |  2 +-
 drivers/infiniband/hw/hfi1/trace_ctxts.h | 12 ++++----
 drivers/s390/cio/ioasm.c                 | 18 ++++++------
 drivers/s390/cio/trace.h                 | 50 ++++++++++++++++----------------
 fs/dax.c                                 |  2 +-
 include/trace/events/f2fs.h              |  2 +-
 include/trace/events/fs_dax.h            |  6 ++--
 include/trace/events/rcu.h               |  4 +--
 include/trace/events/xen.h               | 32 ++++++++++----------
 kernel/rcu/tree.c                        | 10 +++----
 net/wireless/trace.h                     |  2 +-
 sound/firewire/amdtp-stream-trace.h      |  2 +-
 14 files changed, 89 insertions(+), 82 deletions(-)

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index aae88fec9941..b1a8061c3b28 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -218,7 +218,7 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 
 static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
-	trace_xen_mmu_set_pmd(ptr, val);
+	trace_xen_mmu_set_pmd(ptr, &val);
 
 	/* If page is not pinned, we can just update the entry
 	   directly */
@@ -277,14 +277,14 @@ static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 
 static void xen_set_pte(pte_t *ptep, pte_t pteval)
 {
-	trace_xen_mmu_set_pte(ptep, pteval);
+	trace_xen_mmu_set_pte(ptep, &pteval);
 	__xen_set_pte(ptep, pteval);
 }
 
 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
-	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
+	trace_xen_mmu_set_pte_at(mm, addr, ptep, &pteval);
 	__xen_set_pte(ptep, pteval);
 }
 
@@ -292,7 +292,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 				 unsigned long addr, pte_t *ptep)
 {
 	/* Just return the pte as-is.  We preserve the bits on commit */
-	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
+	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, ptep);
 	return *ptep;
 }
 
@@ -301,7 +301,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 {
 	struct mmu_update u;
 
-	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
+	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, &pte);
 	xen_mc_batch();
 
 	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
@@ -409,7 +409,7 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 
 static void xen_set_pud(pud_t *ptr, pud_t val)
 {
-	trace_xen_mmu_set_pud(ptr, val);
+	trace_xen_mmu_set_pud(ptr, &val);
 
 	/* If page is not pinned, we can just update the entry
 	   directly */
@@ -424,7 +424,7 @@ static void xen_set_pud(pud_t *ptr, pud_t val)
 #ifdef CONFIG_X86_PAE
 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-	trace_xen_mmu_set_pte_atomic(ptep, pte);
+	trace_xen_mmu_set_pte_atomic(ptep, &pte);
 	set_64bit((u64 *)ptep, native_pte_val(pte));
 }
 
@@ -514,7 +514,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)
 	pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
 	pgd_t pgd_val;
 
-	trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
+	trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, &val);
 
 	/* If page is not pinned, we can just update the entry
 	   directly */
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index e1169c02eb2b..681da1f51911 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -849,8 +849,8 @@ TRACE_EVENT(i915_flip_complete,
 	    TP_printk("plane=%d, obj=%p", __entry->plane, __entry->obj)
 );
 
-TRACE_EVENT_CONDITION(i915_reg_rw,
-	TP_PROTO(bool write, i915_reg_t reg, u64 val, int len, bool trace),
+TRACE_EVENT_CONDITION(i915_reg_rw__,
+	TP_PROTO(bool write, u32 reg, u64 val, int len, bool trace),
 
 	TP_ARGS(write, reg, val, len, trace),
 
@@ -865,7 +865,7 @@ TRACE_EVENT_CONDITION(i915_reg_rw,
 
 	TP_fast_assign(
 		__entry->val = (u64)val;
-		__entry->reg = i915_mmio_reg_offset(reg);
+		__entry->reg = reg;
 		__entry->write = write;
 		__entry->len = len;
 		),
@@ -876,6 +876,13 @@ TRACE_EVENT_CONDITION(i915_reg_rw,
 		(u32)(__entry->val & 0xffffffff),
 		(u32)(__entry->val >> 32))
 );
+#if !defined(CREATE_TRACE_POINTS) && !defined(TRACE_HEADER_MULTI_READ)
+static inline void trace_i915_reg_rw(bool write, i915_reg_t reg, u64 val,
+				     int len, bool trace)
+{
+	trace_i915_reg_rw__(write, i915_mmio_reg_offset(reg), val, len, trace);
+}
+#endif
 
 TRACE_EVENT(intel_gpu_freq_change,
 	    TP_PROTO(u32 freq),
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 41fafebe3b0d..da4aa1a95b11 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -1153,7 +1153,7 @@ static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
 	cinfo.sdma_ring_size = fd->cq->nentries;
 	cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
 
-	trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
+	trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, &cinfo);
 	if (copy_to_user((void __user *)arg, &cinfo, len))
 		return -EFAULT;
 
diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h
index 4eb4cc798035..e00c8a7d559c 100644
--- a/drivers/infiniband/hw/hfi1/trace_ctxts.h
+++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h
@@ -106,7 +106,7 @@ TRACE_EVENT(hfi1_uctxtdata,
 TRACE_EVENT(hfi1_ctxt_info,
 	    TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt,
 		     unsigned int subctxt,
-		     struct hfi1_ctxt_info cinfo),
+		     struct hfi1_ctxt_info *cinfo),
 	    TP_ARGS(dd, ctxt, subctxt, cinfo),
 	    TP_STRUCT__entry(DD_DEV_ENTRY(dd)
 			     __field(unsigned int, ctxt)
@@ -120,11 +120,11 @@ TRACE_EVENT(hfi1_ctxt_info,
 	    TP_fast_assign(DD_DEV_ASSIGN(dd);
 			    __entry->ctxt = ctxt;
 			    __entry->subctxt = subctxt;
-			    __entry->egrtids = cinfo.egrtids;
-			    __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
-			    __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
-			    __entry->sdma_ring_size = cinfo.sdma_ring_size;
-			    __entry->rcvegr_size = cinfo.rcvegr_size;
+			    __entry->egrtids = cinfo->egrtids;
+			    __entry->rcvhdrq_cnt = cinfo->rcvhdrq_cnt;
+			    __entry->rcvhdrq_size = cinfo->rcvhdrq_entsize;
+			    __entry->sdma_ring_size = cinfo->sdma_ring_size;
+			    __entry->rcvegr_size = cinfo->rcvegr_size;
 			    ),
 	    TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
 		      __get_str(dev),
diff --git a/drivers/s390/cio/ioasm.c b/drivers/s390/cio/ioasm.c
index 4fa9ee1d09fa..0aecb6314e6f 100644
--- a/drivers/s390/cio/ioasm.c
+++ b/drivers/s390/cio/ioasm.c
@@ -35,7 +35,7 @@ int stsch(struct subchannel_id schid, struct schib *addr)
 	int ccode;
 
 	ccode = __stsch(schid, addr);
-	trace_s390_cio_stsch(schid, addr, ccode);
+	trace_s390_cio_stsch(&schid, addr, ccode);
 
 	return ccode;
 }
@@ -63,7 +63,7 @@ int msch(struct subchannel_id schid, struct schib *addr)
 	int ccode;
 
 	ccode = __msch(schid, addr);
-	trace_s390_cio_msch(schid, addr, ccode);
+	trace_s390_cio_msch(&schid, addr, ccode);
 
 	return ccode;
 }
@@ -88,7 +88,7 @@ int tsch(struct subchannel_id schid, struct irb *addr)
 	int ccode;
 
 	ccode = __tsch(schid, addr);
-	trace_s390_cio_tsch(schid, addr, ccode);
+	trace_s390_cio_tsch(&schid, addr, ccode);
 
 	return ccode;
 }
@@ -115,7 +115,7 @@ int ssch(struct subchannel_id schid, union orb *addr)
 	int ccode;
 
 	ccode = __ssch(schid, addr);
-	trace_s390_cio_ssch(schid, addr, ccode);
+	trace_s390_cio_ssch(&schid, addr, ccode);
 
 	return ccode;
 }
@@ -141,7 +141,7 @@ int csch(struct subchannel_id schid)
 	int ccode;
 
 	ccode = __csch(schid);
-	trace_s390_cio_csch(schid, ccode);
+	trace_s390_cio_csch(&schid, ccode);
 
 	return ccode;
 }
@@ -202,7 +202,7 @@ int rchp(struct chp_id chpid)
 	int ccode;
 
 	ccode = __rchp(chpid);
-	trace_s390_cio_rchp(chpid, ccode);
+	trace_s390_cio_rchp(&chpid, ccode);
 
 	return ccode;
 }
@@ -228,7 +228,7 @@ int rsch(struct subchannel_id schid)
 	int ccode;
 
 	ccode = __rsch(schid);
-	trace_s390_cio_rsch(schid, ccode);
+	trace_s390_cio_rsch(&schid, ccode);
 
 	return ccode;
 }
@@ -253,7 +253,7 @@ int hsch(struct subchannel_id schid)
 	int ccode;
 
 	ccode = __hsch(schid);
-	trace_s390_cio_hsch(schid, ccode);
+	trace_s390_cio_hsch(&schid, ccode);
 
 	return ccode;
 }
@@ -278,7 +278,7 @@ int xsch(struct subchannel_id schid)
 	int ccode;
 
 	ccode = __xsch(schid);
-	trace_s390_cio_xsch(schid, ccode);
+	trace_s390_cio_xsch(&schid, ccode);
 
 	return ccode;
 }
diff --git a/drivers/s390/cio/trace.h b/drivers/s390/cio/trace.h
index 1f8d1c1e566d..4aa6d1426106 100644
--- a/drivers/s390/cio/trace.h
+++ b/drivers/s390/cio/trace.h
@@ -22,7 +22,7 @@
 #include <linux/tracepoint.h>
 
 DECLARE_EVENT_CLASS(s390_class_schib,
-	TP_PROTO(struct subchannel_id schid, struct schib *schib, int cc),
+	TP_PROTO(struct subchannel_id *schid, struct schib *schib, int cc),
 	TP_ARGS(schid, schib, cc),
 	TP_STRUCT__entry(
 		__field(u8, cssid)
@@ -33,9 +33,9 @@ DECLARE_EVENT_CLASS(s390_class_schib,
 		__field(int, cc)
 	),
 	TP_fast_assign(
-		__entry->cssid = schid.cssid;
-		__entry->ssid = schid.ssid;
-		__entry->schno = schid.sch_no;
+		__entry->cssid = schid->cssid;
+		__entry->ssid = schid->ssid;
+		__entry->schno = schid->sch_no;
 		__entry->devno = schib->pmcw.dev;
 		__entry->schib = *schib;
 		__entry->cc = cc;
@@ -60,7 +60,7 @@ DECLARE_EVENT_CLASS(s390_class_schib,
  * @cc: Condition code
  */
 DEFINE_EVENT(s390_class_schib, s390_cio_stsch,
-	TP_PROTO(struct subchannel_id schid, struct schib *schib, int cc),
+	TP_PROTO(struct subchannel_id *schid, struct schib *schib, int cc),
 	TP_ARGS(schid, schib, cc)
 );
 
@@ -71,7 +71,7 @@ DEFINE_EVENT(s390_class_schib, s390_cio_stsch,
  * @cc: Condition code
  */
 DEFINE_EVENT(s390_class_schib, s390_cio_msch,
-	TP_PROTO(struct subchannel_id schid, struct schib *schib, int cc),
+	TP_PROTO(struct subchannel_id *schid, struct schib *schib, int cc),
 	TP_ARGS(schid, schib, cc)
 );
 
@@ -82,7 +82,7 @@ DEFINE_EVENT(s390_class_schib, s390_cio_msch,
  * @cc: Condition code
  */
 TRACE_EVENT(s390_cio_tsch,
-	TP_PROTO(struct subchannel_id schid, struct irb *irb, int cc),
+	TP_PROTO(struct subchannel_id *schid, struct irb *irb, int cc),
 	TP_ARGS(schid, irb, cc),
 	TP_STRUCT__entry(
 		__field(u8, cssid)
@@ -92,9 +92,9 @@ TRACE_EVENT(s390_cio_tsch,
 		__field(int, cc)
 	),
 	TP_fast_assign(
-		__entry->cssid = schid.cssid;
-		__entry->ssid = schid.ssid;
-		__entry->schno = schid.sch_no;
+		__entry->cssid = schid->cssid;
+		__entry->ssid = schid->ssid;
+		__entry->schno = schid->sch_no;
 		__entry->irb = *irb;
 		__entry->cc = cc;
 	),
@@ -151,7 +151,7 @@ TRACE_EVENT(s390_cio_tpi,
  * @cc: Condition code
  */
 TRACE_EVENT(s390_cio_ssch,
-	TP_PROTO(struct subchannel_id schid, union orb *orb, int cc),
+	TP_PROTO(struct subchannel_id *schid, union orb *orb, int cc),
 	TP_ARGS(schid, orb, cc),
 	TP_STRUCT__entry(
 		__field(u8, cssid)
@@ -161,9 +161,9 @@ TRACE_EVENT(s390_cio_ssch,
 		__field(int, cc)
 	),
 	TP_fast_assign(
-		__entry->cssid = schid.cssid;
-		__entry->ssid = schid.ssid;
-		__entry->schno = schid.sch_no;
+		__entry->cssid = schid->cssid;
+		__entry->ssid = schid->ssid;
+		__entry->schno = schid->sch_no;
 		__entry->orb = *orb;
 		__entry->cc = cc;
 	),
@@ -173,7 +173,7 @@ TRACE_EVENT(s390_cio_ssch,
 );
 
 DECLARE_EVENT_CLASS(s390_class_schid,
-	TP_PROTO(struct subchannel_id schid, int cc),
+	TP_PROTO(struct subchannel_id *schid, int cc),
 	TP_ARGS(schid, cc),
 	TP_STRUCT__entry(
 		__field(u8, cssid)
@@ -182,9 +182,9 @@ DECLARE_EVENT_CLASS(s390_class_schid,
 		__field(int, cc)
 	),
 	TP_fast_assign(
-		__entry->cssid = schid.cssid;
-		__entry->ssid = schid.ssid;
-		__entry->schno = schid.sch_no;
+		__entry->cssid = schid->cssid;
+		__entry->ssid = schid->ssid;
+		__entry->schno = schid->sch_no;
 		__entry->cc = cc;
 	),
 	TP_printk("schid=%x.%x.%04x cc=%d", __entry->cssid, __entry->ssid,
@@ -198,7 +198,7 @@ DECLARE_EVENT_CLASS(s390_class_schid,
  * @cc: Condition code
  */
 DEFINE_EVENT(s390_class_schid, s390_cio_csch,
-	TP_PROTO(struct subchannel_id schid, int cc),
+	TP_PROTO(struct subchannel_id *schid, int cc),
 	TP_ARGS(schid, cc)
 );
 
@@ -208,7 +208,7 @@ DEFINE_EVENT(s390_class_schid, s390_cio_csch,
  * @cc: Condition code
  */
 DEFINE_EVENT(s390_class_schid, s390_cio_hsch,
-	TP_PROTO(struct subchannel_id schid, int cc),
+	TP_PROTO(struct subchannel_id *schid, int cc),
 	TP_ARGS(schid, cc)
 );
 
@@ -218,7 +218,7 @@ DEFINE_EVENT(s390_class_schid, s390_cio_hsch,
  * @cc: Condition code
  */
 DEFINE_EVENT(s390_class_schid, s390_cio_xsch,
-	TP_PROTO(struct subchannel_id schid, int cc),
+	TP_PROTO(struct subchannel_id *schid, int cc),
 	TP_ARGS(schid, cc)
 );
 
@@ -228,7 +228,7 @@ DEFINE_EVENT(s390_class_schid, s390_cio_xsch,
  * @cc: Condition code
  */
 DEFINE_EVENT(s390_class_schid, s390_cio_rsch,
-	TP_PROTO(struct subchannel_id schid, int cc),
+	TP_PROTO(struct subchannel_id *schid, int cc),
 	TP_ARGS(schid, cc)
 );
 
@@ -238,7 +238,7 @@ DEFINE_EVENT(s390_class_schid, s390_cio_rsch,
  * @cc: Condition code
  */
 TRACE_EVENT(s390_cio_rchp,
-	TP_PROTO(struct chp_id chpid, int cc),
+	TP_PROTO(struct chp_id *chpid, int cc),
 	TP_ARGS(chpid, cc),
 	TP_STRUCT__entry(
 		__field(u8, cssid)
@@ -246,8 +246,8 @@ TRACE_EVENT(s390_cio_rchp,
 		__field(int, cc)
 	),
 	TP_fast_assign(
-		__entry->cssid = chpid.cssid;
-		__entry->id = chpid.id;
+		__entry->cssid = chpid->cssid;
+		__entry->id = chpid->id;
 		__entry->cc = cc;
 	),
 	TP_printk("chpid=%x.%02x cc=%d", __entry->cssid, __entry->id,
diff --git a/fs/dax.c b/fs/dax.c
index 0276df90e86c..6d03ead8e788 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1429,7 +1429,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			goto finish_iomap;
 		}
 
-		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
+		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, &pfn, entry);
 		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
 					    write);
 		break;
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 06c87f9f720c..795698925d20 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -491,7 +491,7 @@ DEFINE_EVENT(f2fs__truncate_node, f2fs_truncate_node,
 
 TRACE_EVENT(f2fs_truncate_partial_nodes,
 
-	TP_PROTO(struct inode *inode, nid_t nid[], int depth, int err),
+	TP_PROTO(struct inode *inode, nid_t *nid, int depth, int err),
 
 	TP_ARGS(inode, nid, depth, err),
 
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 97b09fcf7e52..5a6a8285750f 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -104,7 +104,7 @@ DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
 
 DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
 	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
-		long length, pfn_t pfn, void *radix_entry),
+		long length, pfn_t *pfn, void *radix_entry),
 	TP_ARGS(inode, vmf, length, pfn, radix_entry),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
@@ -123,7 +123,7 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
 		__entry->address = vmf->address;
 		__entry->write = vmf->flags & FAULT_FLAG_WRITE;
 		__entry->length = length;
-		__entry->pfn_val = pfn.val;
+		__entry->pfn_val = pfn->val;
 		__entry->radix_entry = radix_entry;
 	),
 	TP_printk("dev %d:%d ino %#lx %s %s address %#lx length %#lx "
@@ -145,7 +145,7 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
 #define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \
 DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
 	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
-		long length, pfn_t pfn, void *radix_entry), \
+		long length, pfn_t *pfn, void *radix_entry), \
 	TP_ARGS(inode, vmf, length, pfn, radix_entry))
 
 DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 0b50fda80db0..4b463294306f 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -436,7 +436,7 @@ TRACE_EVENT(rcu_fqs,
  */
 TRACE_EVENT(rcu_dyntick,
 
-	TP_PROTO(const char *polarity, long oldnesting, long newnesting, atomic_t dynticks),
+	TP_PROTO(const char *polarity, long oldnesting, long newnesting, atomic_t *dynticks),
 
 	TP_ARGS(polarity, oldnesting, newnesting, dynticks),
 
@@ -451,7 +451,7 @@ TRACE_EVENT(rcu_dyntick,
 		__entry->polarity = polarity;
 		__entry->oldnesting = oldnesting;
 		__entry->newnesting = newnesting;
-		__entry->dynticks = atomic_read(&dynticks);
+		__entry->dynticks = atomic_read(dynticks);
 	),
 
 	TP_printk("%s %lx %lx %#3x", __entry->polarity,
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
index 7dd8f34c37df..ea9e9014f0c5 100644
--- a/include/trace/events/xen.h
+++ b/include/trace/events/xen.h
@@ -128,14 +128,14 @@ TRACE_EVENT(xen_mc_extend_args,
 TRACE_DEFINE_SIZEOF(pteval_t);
 /* mmu */
 DECLARE_EVENT_CLASS(xen_mmu__set_pte,
-	    TP_PROTO(pte_t *ptep, pte_t pteval),
+	    TP_PROTO(pte_t *ptep, pte_t *pteval),
 	    TP_ARGS(ptep, pteval),
 	    TP_STRUCT__entry(
 		    __field(pte_t *, ptep)
 		    __field(pteval_t, pteval)
 		    ),
 	    TP_fast_assign(__entry->ptep = ptep;
-			   __entry->pteval = pteval.pte),
+			   __entry->pteval = pteval->pte),
 	    TP_printk("ptep %p pteval %0*llx (raw %0*llx)",
 		      __entry->ptep,
 		      (int)sizeof(pteval_t) * 2, (unsigned long long)pte_val(native_make_pte(__entry->pteval)),
@@ -144,14 +144,14 @@ DECLARE_EVENT_CLASS(xen_mmu__set_pte,
 
 #define DEFINE_XEN_MMU_SET_PTE(name)				\
 	DEFINE_EVENT(xen_mmu__set_pte, name,			\
-		     TP_PROTO(pte_t *ptep, pte_t pteval),	\
+		     TP_PROTO(pte_t *ptep, pte_t *pteval),	\
 		     TP_ARGS(ptep, pteval))
 
 DEFINE_XEN_MMU_SET_PTE(xen_mmu_set_pte);
 
 TRACE_EVENT(xen_mmu_set_pte_at,
 	    TP_PROTO(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t pteval),
+		     pte_t *ptep, pte_t *pteval),
 	    TP_ARGS(mm, addr, ptep, pteval),
 	    TP_STRUCT__entry(
 		    __field(struct mm_struct *, mm)
@@ -162,7 +162,7 @@ TRACE_EVENT(xen_mmu_set_pte_at,
 	    TP_fast_assign(__entry->mm = mm;
 			   __entry->addr = addr;
 			   __entry->ptep = ptep;
-			   __entry->pteval = pteval.pte),
+			   __entry->pteval = pteval->pte),
 	    TP_printk("mm %p addr %lx ptep %p pteval %0*llx (raw %0*llx)",
 		      __entry->mm, __entry->addr, __entry->ptep,
 		      (int)sizeof(pteval_t) * 2, (unsigned long long)pte_val(native_make_pte(__entry->pteval)),
@@ -172,14 +172,14 @@ TRACE_EVENT(xen_mmu_set_pte_at,
 TRACE_DEFINE_SIZEOF(pmdval_t);
 
 TRACE_EVENT(xen_mmu_set_pmd,
-	    TP_PROTO(pmd_t *pmdp, pmd_t pmdval),
+	    TP_PROTO(pmd_t *pmdp, pmd_t *pmdval),
 	    TP_ARGS(pmdp, pmdval),
 	    TP_STRUCT__entry(
 		    __field(pmd_t *, pmdp)
 		    __field(pmdval_t, pmdval)
 		    ),
 	    TP_fast_assign(__entry->pmdp = pmdp;
-			   __entry->pmdval = pmdval.pmd),
+			   __entry->pmdval = pmdval->pmd),
 	    TP_printk("pmdp %p pmdval %0*llx (raw %0*llx)",
 		      __entry->pmdp,
 		      (int)sizeof(pmdval_t) * 2, (unsigned long long)pmd_val(native_make_pmd(__entry->pmdval)),
@@ -220,14 +220,14 @@ TRACE_EVENT(xen_mmu_pmd_clear,
 TRACE_DEFINE_SIZEOF(pudval_t);
 
 TRACE_EVENT(xen_mmu_set_pud,
-	    TP_PROTO(pud_t *pudp, pud_t pudval),
+	    TP_PROTO(pud_t *pudp, pud_t *pudval),
 	    TP_ARGS(pudp, pudval),
 	    TP_STRUCT__entry(
 		    __field(pud_t *, pudp)
 		    __field(pudval_t, pudval)
 		    ),
 	    TP_fast_assign(__entry->pudp = pudp;
-			   __entry->pudval = native_pud_val(pudval)),
+			   __entry->pudval = native_pud_val(*pudval)),
 	    TP_printk("pudp %p pudval %0*llx (raw %0*llx)",
 		      __entry->pudp,
 		      (int)sizeof(pudval_t) * 2, (unsigned long long)pud_val(native_make_pud(__entry->pudval)),
@@ -237,7 +237,7 @@ TRACE_EVENT(xen_mmu_set_pud,
 TRACE_DEFINE_SIZEOF(p4dval_t);
 
 TRACE_EVENT(xen_mmu_set_p4d,
-	    TP_PROTO(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t p4dval),
+	    TP_PROTO(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t *p4dval),
 	    TP_ARGS(p4dp, user_p4dp, p4dval),
 	    TP_STRUCT__entry(
 		    __field(p4d_t *, p4dp)
@@ -246,7 +246,7 @@ TRACE_EVENT(xen_mmu_set_p4d,
 		    ),
 	    TP_fast_assign(__entry->p4dp = p4dp;
 			   __entry->user_p4dp = user_p4dp;
-			   __entry->p4dval = p4d_val(p4dval)),
+			   __entry->p4dval = p4d_val(*p4dval)),
 	    TP_printk("p4dp %p user_p4dp %p p4dval %0*llx (raw %0*llx)",
 		      __entry->p4dp, __entry->user_p4dp,
 		      (int)sizeof(p4dval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->p4dval)),
@@ -255,14 +255,14 @@ TRACE_EVENT(xen_mmu_set_p4d,
 #else
 
 TRACE_EVENT(xen_mmu_set_pud,
-	    TP_PROTO(pud_t *pudp, pud_t pudval),
+	    TP_PROTO(pud_t *pudp, pud_t *pudval),
 	    TP_ARGS(pudp, pudval),
 	    TP_STRUCT__entry(
 		    __field(pud_t *, pudp)
 		    __field(pudval_t, pudval)
 		    ),
 	    TP_fast_assign(__entry->pudp = pudp;
-			   __entry->pudval = native_pud_val(pudval)),
+			   __entry->pudval = native_pud_val(*pudval)),
 	    TP_printk("pudp %p pudval %0*llx (raw %0*llx)",
 		      __entry->pudp,
 		      (int)sizeof(pudval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pudval)),
@@ -273,7 +273,7 @@ TRACE_EVENT(xen_mmu_set_pud,
 
 DECLARE_EVENT_CLASS(xen_mmu_ptep_modify_prot,
 	    TP_PROTO(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t pteval),
+		     pte_t *ptep, pte_t *pteval),
 	    TP_ARGS(mm, addr, ptep, pteval),
 	    TP_STRUCT__entry(
 		    __field(struct mm_struct *, mm)
@@ -284,7 +284,7 @@ DECLARE_EVENT_CLASS(xen_mmu_ptep_modify_prot,
 	    TP_fast_assign(__entry->mm = mm;
 			   __entry->addr = addr;
 			   __entry->ptep = ptep;
-			   __entry->pteval = pteval.pte),
+			   __entry->pteval = pteval->pte),
 	    TP_printk("mm %p addr %lx ptep %p pteval %0*llx (raw %0*llx)",
 		      __entry->mm, __entry->addr, __entry->ptep,
 		      (int)sizeof(pteval_t) * 2, (unsigned long long)pte_val(native_make_pte(__entry->pteval)),
@@ -293,7 +293,7 @@ DECLARE_EVENT_CLASS(xen_mmu_ptep_modify_prot,
 #define DEFINE_XEN_MMU_PTEP_MODIFY_PROT(name)				\
 	DEFINE_EVENT(xen_mmu_ptep_modify_prot, name,			\
 		     TP_PROTO(struct mm_struct *mm, unsigned long addr,	\
-			      pte_t *ptep, pte_t pteval),		\
+			      pte_t *ptep, pte_t *pteval),		\
 		     TP_ARGS(mm, addr, ptep, pteval))
 
 DEFINE_XEN_MMU_PTEP_MODIFY_PROT(xen_mmu_ptep_modify_prot_start);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 491bdf39f276..43c0f899f78c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -772,7 +772,7 @@ static void rcu_eqs_enter(bool user)
 	}
 
 	lockdep_assert_irqs_disabled();
-	trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks);
+	trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, &rdtp->dynticks);
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	for_each_rcu_flavor(rsp) {
 		rdp = this_cpu_ptr(rsp->rda);
@@ -848,14 +848,14 @@ void rcu_nmi_exit(void)
 	 * leave it in non-RCU-idle state.
 	 */
 	if (rdtp->dynticks_nmi_nesting != 1) {
-		trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks);
+		trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, &rdtp->dynticks);
 		WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */
 			   rdtp->dynticks_nmi_nesting - 2);
 		return;
 	}
 
 	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
-	trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks);
+	trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, &rdtp->dynticks);
 	WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
 	rcu_dynticks_eqs_enter();
 }
@@ -930,7 +930,7 @@ static void rcu_eqs_exit(bool user)
 	rcu_dynticks_task_exit();
 	rcu_dynticks_eqs_exit();
 	rcu_cleanup_after_idle();
-	trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks);
+	trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, &rdtp->dynticks);
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	WRITE_ONCE(rdtp->dynticks_nesting, 1);
 	WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
@@ -1004,7 +1004,7 @@ void rcu_nmi_enter(void)
 	}
 	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
 			  rdtp->dynticks_nmi_nesting,
-			  rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks);
+			  rdtp->dynticks_nmi_nesting + incby, &rdtp->dynticks);
 	WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */
 		   rdtp->dynticks_nmi_nesting + incby);
 	barrier();
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 5152938b358d..018c81fa72fb 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3137,7 +3137,7 @@ TRACE_EVENT(rdev_start_radar_detection,
 
 TRACE_EVENT(rdev_set_mcast_rate,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
-		 int mcast_rate[NUM_NL80211_BANDS]),
+		 int *mcast_rate),
 	TP_ARGS(wiphy, netdev, mcast_rate),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
diff --git a/sound/firewire/amdtp-stream-trace.h b/sound/firewire/amdtp-stream-trace.h
index ea0d486652c8..54cdd4ffa9ce 100644
--- a/sound/firewire/amdtp-stream-trace.h
+++ b/sound/firewire/amdtp-stream-trace.h
@@ -14,7 +14,7 @@
 #include <linux/tracepoint.h>
 
 TRACE_EVENT(in_packet,
-	TP_PROTO(const struct amdtp_stream *s, u32 cycles, u32 cip_header[2], unsigned int payload_length, unsigned int index),
+	TP_PROTO(const struct amdtp_stream *s, u32 cycles, u32 *cip_header, unsigned int payload_length, unsigned int index),
 	TP_ARGS(s, cycles, cip_header, payload_length, index),
 	TP_STRUCT__entry(
 		__field(unsigned int, second)
-- 
2.9.5

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox