Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next v5 4/4] samples/bpf: use bpf_set_link_xdp_fd
From: Eric Leblond @ 2018-01-04  8:21 UTC (permalink / raw)
  To: daniel, Toshiaki Makita, Philippe Ombredanne
  Cc: Alexei Starovoitov, netdev, linux-kernel, Thomas Graf,
	Eric Leblond
In-Reply-To: <20180104082149.13758-1-eric@regit.org>

Use bpf_set_link_xdp_fd instead of set_link_xdp_fd to remove some
code duplication and benefit of netlink ext ack errors message.

Signed-off-by: Eric Leblond <eric@regit.org>
---
 samples/bpf/bpf_load.c              | 102 ------------------------------------
 samples/bpf/bpf_load.h              |   2 +-
 samples/bpf/xdp1_user.c             |   4 +-
 samples/bpf/xdp_redirect_cpu_user.c |   6 +--
 samples/bpf/xdp_redirect_map_user.c |   8 +--
 samples/bpf/xdp_redirect_user.c     |   8 +--
 samples/bpf/xdp_router_ipv4_user.c  |  10 ++--
 samples/bpf/xdp_tx_iptunnel_user.c  |   6 +--
 8 files changed, 22 insertions(+), 124 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 242631aa4ea2..69806d74fa53 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -695,105 +695,3 @@ struct ksym *ksym_search(long key)
 	return &syms[0];
 }
 
-int set_link_xdp_fd(int ifindex, int fd, __u32 flags)
-{
-	struct sockaddr_nl sa;
-	int sock, seq = 0, len, ret = -1;
-	char buf[4096];
-	struct nlattr *nla, *nla_xdp;
-	struct {
-		struct nlmsghdr  nh;
-		struct ifinfomsg ifinfo;
-		char             attrbuf[64];
-	} req;
-	struct nlmsghdr *nh;
-	struct nlmsgerr *err;
-
-	memset(&sa, 0, sizeof(sa));
-	sa.nl_family = AF_NETLINK;
-
-	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	if (sock < 0) {
-		printf("open netlink socket: %s\n", strerror(errno));
-		return -1;
-	}
-
-	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
-		printf("bind to netlink: %s\n", strerror(errno));
-		goto cleanup;
-	}
-
-	memset(&req, 0, sizeof(req));
-	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
-	req.nh.nlmsg_type = RTM_SETLINK;
-	req.nh.nlmsg_pid = 0;
-	req.nh.nlmsg_seq = ++seq;
-	req.ifinfo.ifi_family = AF_UNSPEC;
-	req.ifinfo.ifi_index = ifindex;
-
-	/* started nested attribute for XDP */
-	nla = (struct nlattr *)(((char *)&req)
-				+ NLMSG_ALIGN(req.nh.nlmsg_len));
-	nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
-	nla->nla_len = NLA_HDRLEN;
-
-	/* add XDP fd */
-	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
-	nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
-	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
-	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
-	nla->nla_len += nla_xdp->nla_len;
-
-	/* if user passed in any flags, add those too */
-	if (flags) {
-		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
-		nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/;
-		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
-		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
-		nla->nla_len += nla_xdp->nla_len;
-	}
-
-	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
-
-	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
-		printf("send to netlink: %s\n", strerror(errno));
-		goto cleanup;
-	}
-
-	len = recv(sock, buf, sizeof(buf), 0);
-	if (len < 0) {
-		printf("recv from netlink: %s\n", strerror(errno));
-		goto cleanup;
-	}
-
-	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
-	     nh = NLMSG_NEXT(nh, len)) {
-		if (nh->nlmsg_pid != getpid()) {
-			printf("Wrong pid %d, expected %d\n",
-			       nh->nlmsg_pid, getpid());
-			goto cleanup;
-		}
-		if (nh->nlmsg_seq != seq) {
-			printf("Wrong seq %d, expected %d\n",
-			       nh->nlmsg_seq, seq);
-			goto cleanup;
-		}
-		switch (nh->nlmsg_type) {
-		case NLMSG_ERROR:
-			err = (struct nlmsgerr *)NLMSG_DATA(nh);
-			if (!err->error)
-				continue;
-			printf("nlmsg error %s\n", strerror(-err->error));
-			goto cleanup;
-		case NLMSG_DONE:
-			break;
-		}
-	}
-
-	ret = 0;
-
-cleanup:
-	close(sock);
-	return ret;
-}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 7d57a4248893..453c200b389b 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -61,5 +61,5 @@ struct ksym {
 
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
-int set_link_xdp_fd(int ifindex, int fd, __u32 flags);
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
 #endif
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
index fdaefe91801d..b901ee2b3336 100644
--- a/samples/bpf/xdp1_user.c
+++ b/samples/bpf/xdp1_user.c
@@ -25,7 +25,7 @@ static __u32 xdp_flags;
 
 static void int_exit(int sig)
 {
-	set_link_xdp_fd(ifindex, -1, xdp_flags);
+	bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
 	exit(0);
 }
 
@@ -116,7 +116,7 @@ int main(int argc, char **argv)
 	signal(SIGINT, int_exit);
 	signal(SIGTERM, int_exit);
 
-	if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+	if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
 		printf("link set xdp fd failed\n");
 		return 1;
 	}
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
index 35fec9fecb57..23744a8aaf21 100644
--- a/samples/bpf/xdp_redirect_cpu_user.c
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -26,7 +26,7 @@ static const char *__doc__ =
 
 /* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead
  * use bpf/libbpf.h), but cannot as (currently) needed for XDP
- * attaching to a device via set_link_xdp_fd()
+ * attaching to a device via bpf_set_link_xdp_fd()
  */
 #include "libbpf.h"
 #include "bpf_load.h"
@@ -67,7 +67,7 @@ static void int_exit(int sig)
 		"Interrupted: Removing XDP program on ifindex:%d device:%s\n",
 		ifindex, ifname);
 	if (ifindex > -1)
-		set_link_xdp_fd(ifindex, -1, xdp_flags);
+		bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
 	exit(EXIT_OK);
 }
 
@@ -682,7 +682,7 @@ int main(int argc, char **argv)
 	/* Remove XDP program when program is interrupted */
 	signal(SIGINT, int_exit);
 
-	if (set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) {
+	if (bpf_set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) {
 		fprintf(stderr, "link set xdp fd failed\n");
 		return EXIT_FAIL_XDP;
 	}
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
index 978a532f0748..7eae07d7293e 100644
--- a/samples/bpf/xdp_redirect_map_user.c
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -34,9 +34,9 @@ static __u32 xdp_flags;
 
 static void int_exit(int sig)
 {
-	set_link_xdp_fd(ifindex_in, -1, xdp_flags);
+	bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags);
 	if (ifindex_out_xdp_dummy_attached)
-		set_link_xdp_fd(ifindex_out, -1, xdp_flags);
+		bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
 	exit(0);
 }
 
@@ -120,13 +120,13 @@ int main(int argc, char **argv)
 		return 1;
 	}
 
-	if (set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
+	if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
 		printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
 		return 1;
 	}
 
 	/* Loading dummy XDP prog on out-device */
-	if (set_link_xdp_fd(ifindex_out, prog_fd[1],
+	if (bpf_set_link_xdp_fd(ifindex_out, prog_fd[1],
 			    (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
 		printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
 		ifindex_out_xdp_dummy_attached = false;
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
index 4475d837bf2c..d54e91eb6cbf 100644
--- a/samples/bpf/xdp_redirect_user.c
+++ b/samples/bpf/xdp_redirect_user.c
@@ -33,9 +33,9 @@ static __u32 xdp_flags;
 
 static void int_exit(int sig)
 {
-	set_link_xdp_fd(ifindex_in, -1, xdp_flags);
+	bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags);
 	if (ifindex_out_xdp_dummy_attached)
-		set_link_xdp_fd(ifindex_out, -1, xdp_flags);
+		bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
 	exit(0);
 }
 
@@ -114,13 +114,13 @@ int main(int argc, char **argv)
 		return 1;
 	}
 
-	if (set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
+	if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
 		printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
 		return 1;
 	}
 
 	/* Loading dummy XDP prog on out-device */
-	if (set_link_xdp_fd(ifindex_out, prog_fd[1],
+	if (bpf_set_link_xdp_fd(ifindex_out, prog_fd[1],
 			    (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
 		printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
 		ifindex_out_xdp_dummy_attached = false;
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
index 916462112d55..6296741c1fbd 100644
--- a/samples/bpf/xdp_router_ipv4_user.c
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -37,7 +37,7 @@ static void int_exit(int sig)
 	int i = 0;
 
 	for (i = 0; i < total_ifindex; i++)
-		set_link_xdp_fd(ifindex_list[i], -1, flags);
+		bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
 	exit(0);
 }
 
@@ -49,7 +49,7 @@ static void close_and_exit(int sig)
 	close(sock_arp);
 
 	for (i = 0; i < total_ifindex; i++)
-		set_link_xdp_fd(ifindex_list[i], -1, flags);
+		bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
 	exit(0);
 }
 
@@ -183,7 +183,7 @@ static void read_route(struct nlmsghdr *nh, int nll)
 			int i = 0;
 
 			for (i = 0; i < total_ifindex; i++)
-				set_link_xdp_fd(ifindex_list[i], -1, flags);
+				bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
 			exit(0);
 		}
 		assert(bpf_map_update_elem(map_fd[4], &route.iface, &route.iface, 0) == 0);
@@ -633,12 +633,12 @@ int main(int ac, char **argv)
 		}
 	}
 	for (i = 0; i < total_ifindex; i++) {
-		if (set_link_xdp_fd(ifindex_list[i], prog_fd[0], flags) < 0) {
+		if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd[0], flags) < 0) {
 			printf("link set xdp fd failed\n");
 			int recovery_index = i;
 
 			for (i = 0; i < recovery_index; i++)
-				set_link_xdp_fd(ifindex_list[i], -1, flags);
+				bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
 
 			return 1;
 		}
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
index 715cd12eaca5..f0a787268a87 100644
--- a/samples/bpf/xdp_tx_iptunnel_user.c
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -30,7 +30,7 @@ static __u32 xdp_flags = 0;
 static void int_exit(int sig)
 {
 	if (ifindex > -1)
-		set_link_xdp_fd(ifindex, -1, xdp_flags);
+		bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
 	exit(0);
 }
 
@@ -254,14 +254,14 @@ int main(int argc, char **argv)
 		}
 	}
 
-	if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+	if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
 		printf("link set xdp fd failed\n");
 		return 1;
 	}
 
 	poll_stats(kill_after_s);
 
-	set_link_xdp_fd(ifindex, -1, xdp_flags);
+	bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
 
 	return 0;
 }
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next v5 3/4] libbpf: add missing SPDX-License-Identifier
From: Eric Leblond @ 2018-01-04  8:21 UTC (permalink / raw)
  To: daniel, Toshiaki Makita, Philippe Ombredanne
  Cc: Alexei Starovoitov, netdev, linux-kernel, Thomas Graf,
	Eric Leblond
In-Reply-To: <20180104082149.13758-1-eric@regit.org>

Signed-off-by: Eric Leblond <eric@regit.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/bpf.c    | 2 ++
 tools/lib/bpf/bpf.h    | 2 ++
 tools/lib/bpf/libbpf.c | 2 ++
 tools/lib/bpf/libbpf.h | 2 ++
 4 files changed, 8 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 10d71b9fdbd0..38d720466fe8 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: LGPL-2.1
+
 /*
  * common eBPF ELF operations.
  *
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 9f44c196931e..8d18fb73d7fb 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -1,3 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+
 /*
  * common eBPF ELF operations.
  *
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5fe8aaa2123e..924a8b8431ab 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: LGPL-2.1
+
 /*
  * Common eBPF ELF object loading operations.
  *
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index e42f96900318..f85906533cdd 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1,3 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+
 /*
  * Common eBPF ELF object loading operations.
  *
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next v5 2/4] libbpf: add error reporting in XDP
From: Eric Leblond @ 2018-01-04  8:21 UTC (permalink / raw)
  To: daniel, Toshiaki Makita, Philippe Ombredanne
  Cc: Alexei Starovoitov, netdev, linux-kernel, Thomas Graf,
	Eric Leblond
In-Reply-To: <20180104082149.13758-1-eric@regit.org>

Parse netlink ext attribute to get the error message returned by
the card. Code is partially take from libnl.

Signed-off-by: Eric Leblond <eric@regit.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile   |   2 +-
 tools/lib/bpf/Build    |   2 +-
 tools/lib/bpf/bpf.c    |  10 ++-
 tools/lib/bpf/nlattr.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/nlattr.h |  70 ++++++++++++++++++
 5 files changed, 268 insertions(+), 3 deletions(-)
 create mode 100644 tools/lib/bpf/nlattr.c
 create mode 100644 tools/lib/bpf/nlattr.h

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4fb944a7ecf8..c889ebcba9b3 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -44,7 +44,7 @@ hostprogs-y += xdp_monitor
 hostprogs-y += syscall_tp
 
 # Libbpf dependencies
-LIBBPF := ../../tools/lib/bpf/bpf.o
+LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
 CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
 
 test_lru_dist-objs := test_lru_dist.o $(LIBBPF)
diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index d8749756352d..64c679d67109 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1 +1 @@
-libbpf-y := libbpf.o bpf.o
+libbpf-y := libbpf.o bpf.o nlattr.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index e6c61035b64c..10d71b9fdbd0 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -26,6 +26,7 @@
 #include <linux/bpf.h>
 #include "bpf.h"
 #include "libbpf.h"
+#include "nlattr.h"
 #include <linux/rtnetlink.h>
 #include <sys/socket.h>
 #include <errno.h>
@@ -440,6 +441,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
 	struct nlmsghdr *nh;
 	struct nlmsgerr *err;
 	socklen_t addrlen;
+	int one;
 
 	memset(&sa, 0, sizeof(sa));
 	sa.nl_family = AF_NETLINK;
@@ -449,6 +451,11 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
 		return -errno;
 	}
 
+	if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
+		       &one, sizeof(one)) < 0) {
+		fprintf(stderr, "Netlink error reporting not supported\n");
+	}
+
 	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
 		ret = -errno;
 		goto cleanup;
@@ -524,7 +531,8 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
 			err = (struct nlmsgerr *)NLMSG_DATA(nh);
 			if (!err->error)
 				continue;
-			ret = err->error;
+			ret = -err->error;
+			nla_dump_errormsg(nh);
 			goto cleanup;
 		case NLMSG_DONE:
 			break;
diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
new file mode 100644
index 000000000000..4719434278b2
--- /dev/null
+++ b/tools/lib/bpf/nlattr.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: LGPL-2.1
+
+/*
+ * NETLINK      Netlink attributes
+ *
+ *	This library is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU Lesser General Public
+ *	License as published by the Free Software Foundation version 2.1
+ *	of the License.
+ *
+ * Copyright (c) 2003-2013 Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <errno.h>
+#include "nlattr.h"
+#include <linux/rtnetlink.h>
+#include <string.h>
+#include <stdio.h>
+
+static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = {
+	[NLA_U8]	= sizeof(uint8_t),
+	[NLA_U16]	= sizeof(uint16_t),
+	[NLA_U32]	= sizeof(uint32_t),
+	[NLA_U64]	= sizeof(uint64_t),
+	[NLA_STRING]	= 1,
+	[NLA_FLAG]	= 0,
+};
+
+static int nla_len(const struct nlattr *nla)
+{
+	return nla->nla_len - NLA_HDRLEN;
+}
+
+static struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
+{
+	int totlen = NLA_ALIGN(nla->nla_len);
+
+	*remaining -= totlen;
+	return (struct nlattr *) ((char *) nla + totlen);
+}
+
+static int nla_ok(const struct nlattr *nla, int remaining)
+{
+	return remaining >= sizeof(*nla) &&
+	       nla->nla_len >= sizeof(*nla) &&
+	       nla->nla_len <= remaining;
+}
+
+static void *nla_data(const struct nlattr *nla)
+{
+	return (char *) nla + NLA_HDRLEN;
+}
+
+static int nla_type(const struct nlattr *nla)
+{
+	return nla->nla_type & NLA_TYPE_MASK;
+}
+
+static int validate_nla(struct nlattr *nla, int maxtype,
+			struct nla_policy *policy)
+{
+	struct nla_policy *pt;
+	unsigned int minlen = 0;
+	int type = nla_type(nla);
+
+	if (type < 0 || type > maxtype)
+		return 0;
+
+	pt = &policy[type];
+
+	if (pt->type > NLA_TYPE_MAX)
+		return 0;
+
+	if (pt->minlen)
+		minlen = pt->minlen;
+	else if (pt->type != NLA_UNSPEC)
+		minlen = nla_attr_minlen[pt->type];
+
+	if (nla_len(nla) < minlen)
+		return -1;
+
+	if (pt->maxlen && nla_len(nla) > pt->maxlen)
+		return -1;
+
+	if (pt->type == NLA_STRING) {
+		char *data = nla_data(nla);
+		if (data[nla_len(nla) - 1] != '\0')
+			return -1;
+	}
+
+	return 0;
+}
+
+static inline int nlmsg_len(const struct nlmsghdr *nlh)
+{
+	return nlh->nlmsg_len - NLMSG_HDRLEN;
+}
+
+/**
+ * Create attribute index based on a stream of attributes.
+ * @arg tb		Index array to be filled (maxtype+1 elements).
+ * @arg maxtype		Maximum attribute type expected and accepted.
+ * @arg head		Head of attribute stream.
+ * @arg len		Length of attribute stream.
+ * @arg policy		Attribute validation policy.
+ *
+ * Iterates over the stream of attributes and stores a pointer to each
+ * attribute in the index array using the attribute type as index to
+ * the array. Attribute with a type greater than the maximum type
+ * specified will be silently ignored in order to maintain backwards
+ * compatibility. If \a policy is not NULL, the attribute will be
+ * validated using the specified policy.
+ *
+ * @see nla_validate
+ * @return 0 on success or a negative error code.
+ */
+static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
+		     struct nla_policy *policy)
+{
+	struct nlattr *nla;
+	int rem, err;
+
+	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
+
+	nla_for_each_attr(nla, head, len, rem) {
+		int type = nla_type(nla);
+
+		if (type > maxtype)
+			continue;
+
+		if (policy) {
+			err = validate_nla(nla, maxtype, policy);
+			if (err < 0)
+				goto errout;
+		}
+
+		if (tb[type])
+			fprintf(stderr, "Attribute of type %#x found multiple times in message, "
+				  "previous attribute is being ignored.\n", type);
+
+		tb[type] = nla;
+	}
+
+	err = 0;
+errout:
+	return err;
+}
+
+/* dump netlink extended ack error message */
+int nla_dump_errormsg(struct nlmsghdr *nlh)
+{
+	struct nla_policy extack_policy[NLMSGERR_ATTR_MAX + 1] = {
+		[NLMSGERR_ATTR_MSG]	= { .type = NLA_STRING },
+		[NLMSGERR_ATTR_OFFS]	= { .type = NLA_U32 },
+	};
+	struct nlattr *tb[NLMSGERR_ATTR_MAX + 1], *attr;
+	struct nlmsgerr *err;
+	char *errmsg = NULL;
+	int hlen, alen;
+
+	/* no TLVs, nothing to do here */
+	if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS))
+		return 0;
+
+	err = (struct nlmsgerr *)NLMSG_DATA(nlh);
+	hlen = sizeof(*err);
+
+	/* if NLM_F_CAPPED is set then the inner err msg was capped */
+	if (!(nlh->nlmsg_flags & NLM_F_CAPPED))
+		hlen += nlmsg_len(&err->msg);
+
+	attr = (struct nlattr *) ((void *) err + hlen);
+	alen = nlh->nlmsg_len - hlen;
+
+	if (nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen, extack_policy) != 0) {
+		fprintf(stderr,
+			"Failed to parse extended error attributes\n");
+		return 0;
+	}
+
+	if (tb[NLMSGERR_ATTR_MSG])
+		errmsg = (char *) nla_data(tb[NLMSGERR_ATTR_MSG]);
+
+	fprintf(stderr, "Kernel error message: %s\n", errmsg);
+
+	return 0;
+}
diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h
new file mode 100644
index 000000000000..373444fbdd7d
--- /dev/null
+++ b/tools/lib/bpf/nlattr.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+
+/*
+ * NETLINK      Netlink attributes
+ *
+ *	This library is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU Lesser General Public
+ *	License as published by the Free Software Foundation version 2.1
+ *	of the License.
+ *
+ * Copyright (c) 2003-2013 Thomas Graf <tgraf@suug.ch>
+ */
+
+#ifndef __NLATTR_H
+#define __NLATTR_H
+
+#include <stdint.h>
+#include <linux/netlink.h>
+
+/**
+ * Standard attribute types to specify validation policy
+ */
+enum {
+	NLA_UNSPEC,	/**< Unspecified type, binary data chunk */
+	NLA_U8,		/**< 8 bit integer */
+	NLA_U16,	/**< 16 bit integer */
+	NLA_U32,	/**< 32 bit integer */
+	NLA_U64,	/**< 64 bit integer */
+	NLA_STRING,	/**< NUL terminated character string */
+	NLA_FLAG,	/**< Flag */
+	NLA_MSECS,	/**< Micro seconds (64bit) */
+	NLA_NESTED,	/**< Nested attributes */
+	__NLA_TYPE_MAX,
+};
+
+#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1)
+
+/**
+ * @ingroup attr
+ * Attribute validation policy.
+ *
+ * See section @core_doc{core_attr_parse,Attribute Parsing} for more details.
+ */
+struct nla_policy {
+	/** Type of attribute or NLA_UNSPEC */
+	uint16_t	type;
+
+	/** Minimal length of payload required */
+	uint16_t	minlen;
+
+	/** Maximal length of payload allowed */
+	uint16_t	maxlen;
+};
+
+/**
+ * @ingroup attr
+ * Iterate over a stream of attributes
+ * @arg pos	loop counter, set to current attribute
+ * @arg head	head of attribute stream
+ * @arg len	length of attribute stream
+ * @arg rem	initialized to len, holds bytes currently remaining in stream
+ */
+#define nla_for_each_attr(pos, head, len, rem) \
+	for (pos = head, rem = len; \
+	     nla_ok(pos, rem); \
+	     pos = nla_next(pos, &(rem)))
+
+int nla_dump_errormsg(struct nlmsghdr *nlh);
+
+#endif /* __NLATTR_H */
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next v5 1/4] libbpf: add function to setup XDP
From: Eric Leblond @ 2018-01-04  8:21 UTC (permalink / raw)
  To: daniel, Toshiaki Makita, Philippe Ombredanne
  Cc: Alexei Starovoitov, netdev, linux-kernel, Thomas Graf,
	Eric Leblond
In-Reply-To: <1515023944.7473.7.camel@regit.org>

Most of the code is taken from set_link_xdp_fd() in bpf_load.c and
slightly modified to be library compliant.

Signed-off-by: Eric Leblond <eric@regit.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/bpf.c    | 126 +++++++++++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.c |   2 +
 tools/lib/bpf/libbpf.h |   4 ++
 3 files changed, 132 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 5128677e4117..e6c61035b64c 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -25,6 +25,16 @@
 #include <asm/unistd.h>
 #include <linux/bpf.h>
 #include "bpf.h"
+#include "libbpf.h"
+#include <linux/rtnetlink.h>
+#include <sys/socket.h>
+#include <errno.h>
+
+#ifndef IFLA_XDP_MAX
+#define IFLA_XDP	43
+#define IFLA_XDP_FD	1
+#define IFLA_XDP_FLAGS	3
+#endif
 
 /*
  * When building perf, unistd.h is overridden. __NR_bpf is
@@ -46,7 +56,9 @@
 # endif
 #endif
 
+#ifndef min
 #define min(x, y) ((x) < (y) ? (x) : (y))
+#endif
 
 static inline __u64 ptr_to_u64(const void *ptr)
 {
@@ -413,3 +425,117 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len)
 
 	return err;
 }
+
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
+{
+	struct sockaddr_nl sa;
+	int sock, seq = 0, len, ret = -1;
+	char buf[4096];
+	struct nlattr *nla, *nla_xdp;
+	struct {
+		struct nlmsghdr  nh;
+		struct ifinfomsg ifinfo;
+		char             attrbuf[64];
+	} req;
+	struct nlmsghdr *nh;
+	struct nlmsgerr *err;
+	socklen_t addrlen;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.nl_family = AF_NETLINK;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0) {
+		return -errno;
+	}
+
+	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	addrlen = sizeof(sa);
+	if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	if (addrlen != sizeof(sa)) {
+		ret = -LIBBPF_ERRNO__INTERNAL;
+		goto cleanup;
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_type = RTM_SETLINK;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.ifinfo.ifi_family = AF_UNSPEC;
+	req.ifinfo.ifi_index = ifindex;
+
+	/* started nested attribute for XDP */
+	nla = (struct nlattr *)(((char *)&req)
+				+ NLMSG_ALIGN(req.nh.nlmsg_len));
+	nla->nla_type = NLA_F_NESTED | IFLA_XDP;
+	nla->nla_len = NLA_HDRLEN;
+
+	/* add XDP fd */
+	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+	nla_xdp->nla_type = IFLA_XDP_FD;
+	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
+	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
+	nla->nla_len += nla_xdp->nla_len;
+
+	/* if user passed in any flags, add those too */
+	if (flags) {
+		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+		nla_xdp->nla_type = IFLA_XDP_FLAGS;
+		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
+		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
+		nla->nla_len += nla_xdp->nla_len;
+	}
+
+	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
+
+	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	len = recv(sock, buf, sizeof(buf), 0);
+	if (len < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+	     nh = NLMSG_NEXT(nh, len)) {
+		if (nh->nlmsg_pid != sa.nl_pid) {
+			ret = -LIBBPF_ERRNO__WRNGPID;
+			goto cleanup;
+		}
+		if (nh->nlmsg_seq != seq) {
+			ret = -LIBBPF_ERRNO__INVSEQ;
+			goto cleanup;
+		}
+		switch (nh->nlmsg_type) {
+		case NLMSG_ERROR:
+			err = (struct nlmsgerr *)NLMSG_DATA(nh);
+			if (!err->error)
+				continue;
+			ret = err->error;
+			goto cleanup;
+		case NLMSG_DONE:
+			break;
+		default:
+			break;
+		}
+	}
+
+	ret = 0;
+
+cleanup:
+	close(sock);
+	return ret;
+}
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e9c4b7cabcf2..5fe8aaa2123e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -106,6 +106,8 @@ static const char *libbpf_strerror_table[NR_ERRNO] = {
 	[ERRCODE_OFFSET(PROG2BIG)]	= "Program too big",
 	[ERRCODE_OFFSET(KVER)]		= "Incorrect kernel version",
 	[ERRCODE_OFFSET(PROGTYPE)]	= "Kernel doesn't support this program type",
+	[ERRCODE_OFFSET(WRNGPID)]	= "Wrong pid in netlink message",
+	[ERRCODE_OFFSET(INVSEQ)]	= "Invalid netlink sequence",
 };
 
 int libbpf_strerror(int err, char *buf, size_t size)
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 6e20003109e0..e42f96900318 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -42,6 +42,8 @@ enum libbpf_errno {
 	LIBBPF_ERRNO__PROG2BIG,	/* Program too big */
 	LIBBPF_ERRNO__KVER,	/* Incorrect kernel version */
 	LIBBPF_ERRNO__PROGTYPE,	/* Kernel doesn't support this program type */
+	LIBBPF_ERRNO__WRNGPID,	/* Wrong pid in netlink message */
+	LIBBPF_ERRNO__INVSEQ,	/* Invalid netlink sequence */
 	__LIBBPF_ERRNO__END,
 };
 
@@ -246,4 +248,6 @@ long libbpf_get_error(const void *ptr);
 
 int bpf_prog_load(const char *file, enum bpf_prog_type type,
 		  struct bpf_object **pobj, int *prog_fd);
+
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
 #endif
-- 
2.15.1

^ permalink raw reply related

* Re: bonding: Completion of error handling around bond_update_slave_arr()
From: SF Markus Elfring @ 2018-01-04  8:19 UTC (permalink / raw)
  To: Mahesh Bandewar (महेश बंडेवार),
	linux-netdev
  Cc: Andy Gospodarek, Jay Vosburgh, Veaceslav Falico, LKML,
	kernel-janitors
In-Reply-To: <CAF2d9jh1EHUx3BzWghBWRAZt4acO6W-_eUXtD7dk2YBJZfuwsg@mail.gmail.com>

> If you see 8 out of 9 call sites in this file ignore the return value.

How do you think about to fix error detection and corresponding
exception handling then?

Regards,
Markus

^ permalink raw reply

* Re: [PATCH v3 00/27] kill devm_ioremap_nocache
From: Christophe LEROY @ 2018-01-04  8:05 UTC (permalink / raw)
  To: Yisheng Xie, Greg KH
  Cc: linux-kernel, ysxie, ulf.hansson, linux-mmc, boris.brezillon,
	richard, marek.vasut, cyrille.pitchen, linux-mtd, alsa-devel, wim,
	linux, linux-watchdog, b.zolnierkie, linux-fbdev, linus.walleij,
	linux-gpio, ralf, linux-mips, lgirdwood, broonie, tglx, jason,
	marc.zyngier, arnd, andriy.shevchenko, industrypack-devel, wg,
	mkl, linux-can, mchehab, linux-media
In-Reply-To: <8dd19411-5b06-0aa4-fd0e-e5b112c25dcb@huawei.com>



Le 25/12/2017 à 02:34, Yisheng Xie a écrit :
> 
> 
> On 2017/12/24 17:05, christophe leroy wrote:
>>
>>
>> Le 23/12/2017 à 14:48, Greg KH a écrit :
>>> On Sat, Dec 23, 2017 at 06:55:25PM +0800, Yisheng Xie wrote:
>>>> Hi all,
>>>>
>>>> When I tried to use devm_ioremap function and review related code, I found
>>>> devm_ioremap and devm_ioremap_nocache is almost the same with each other,
>>>> except one use ioremap while the other use ioremap_nocache.
>>>
>>> For all arches?  Really?  Look at MIPS, and x86, they have different
>>> functions.
>>>
>>>> While ioremap's
>>>> default function is ioremap_nocache, so devm_ioremap_nocache also have the
>>>> same function with devm_ioremap, which can just be killed to reduce the size
>>>> of devres.o(from 20304 bytes to 18992 bytes in my compile environment).
>>>>
>>>> I have posted two versions, which use macro instead of function for
>>>> devm_ioremap_nocache[1] or devm_ioremap[2]. And Greg suggest me to kill
>>>> devm_ioremap_nocache for no need to keep a macro around for the duplicate
>>>> thing. So here comes v3 and please help to review.
>>>
>>> I don't think this can be done, what am I missing?  These functions are
>>> not identical, sorry for missing that before.
>>
>> devm_ioremap() and devm_ioremap_nocache() are quite similar, both use devm_ioremap_release() for the release, why not just defining:
>>
>> static void __iomem *__devm_ioremap(struct device *dev, resource_size_t offset,
>>                 resource_size_t size, bool nocache)
>> {
>> [...]
>>      if (nocache)
>>          addr = ioremap_nocache(offset, size);
>>      else
>>          addr = ioremap(offset, size);
>> [...]
>> }
>>
>> then in include/linux/io.h
>>
>> static inline void __iomem *devm_ioremap(struct device *dev, resource_size_t offset,
>>                 resource_size_t size)
>> {return __devm_ioremap(dev, offset, size, false);}
>>
>> static inline void __iomem *devm_ioremap_nocache(struct device *dev, resource_size_t offset,
>>                     resource_size_t size);
>> {return __devm_ioremap(dev, offset, size, true);}
> 
> Yeah, this seems good to me, right now we have devm_ioremap, devm_ioremap_wc, devm_ioremap_nocache
> May be we can use an enum like:
> typedef enum {
> 	DEVM_IOREMAP = 0,
> 	DEVM_IOREMAP_NOCACHE,
> 	DEVM_IOREMAP_WC,
> } devm_ioremap_type;
> 
> static inline void __iomem *devm_ioremap(struct device *dev, resource_size_t offset,
>                  resource_size_t size)
>   {return __devm_ioremap(dev, offset, size, DEVM_IOREMAP);}
> 
>   static inline void __iomem *devm_ioremap_nocache(struct device *dev, resource_size_t offset,
>                      resource_size_t size);
>   {return __devm_ioremap(dev, offset, size, DEVM_IOREMAP_NOCACHE);}
> 
>   static inline void __iomem *devm_ioremap_wc(struct device *dev, resource_size_t offset,
>                      resource_size_t size);
>   {return __devm_ioremap(dev, offset, size, DEVM_IOREMAP_WC);}
> 
>   static void __iomem *__devm_ioremap(struct device *dev, resource_size_t offset,
>                  resource_size_t size, devm_ioremap_type type)
>   {
>       void __iomem **ptr, *addr = NULL;
>   [...]
>       switch (type){
>       case DEVM_IOREMAP:
>           addr = ioremap(offset, size);
>           break;
>       case DEVM_IOREMAP_NOCACHE:
>           addr = ioremap_nocache(offset, size);
>           break;
>       case DEVM_IOREMAP_WC:
>           addr = ioremap_wc(offset, size);
>           break;
>       }
>   [...]
>   }


That looks good to me, will you submit a v4 ?

Christophe

> 
> Thanks
> Yisheng
> 
>>
>> Christophe
>>
>>>
>>> thanks,
>>>
>>> greg k-h
>>> -- 
>>> To unsubscribe from this list: send the line "unsubscribe linux-watchdog" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>
>> ---
>> L'absence de virus dans ce courrier électronique a été vérifiée par le logiciel antivirus Avast.
>> https://www.avast.com/antivirus
>>
>>
>> .
>>

^ permalink raw reply

* [PATCH 6/6] add test for aio poll and io_pgetevents
From: Christoph Hellwig @ 2018-01-04  8:03 UTC (permalink / raw)
  To: linux-aio; +Cc: Avi Kivity, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080325.14716-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 harness/cases/22.t | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 harness/cases/22.t

diff --git a/harness/cases/22.t b/harness/cases/22.t
new file mode 100644
index 0000000..a9fec6b
--- /dev/null
+++ b/harness/cases/22.t
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2006-2018 Free Software Foundation, Inc.
+ * Copyright (C) 2018 Christoph Hellwig.
+ * License: LGPLv2.1 or later.
+ *
+ * Description: test aio poll and io_pgetevents signal handling.
+ *
+ * Very roughly based on glibc tst-pselect.c.
+ */
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/poll.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+
+static volatile int handler_called;
+
+static void
+handler(int sig)
+{
+	handler_called = 1;
+}
+
+int test_main(void)
+{
+	struct timespec to = { .tv_sec = 0, .tv_nsec = 500000000 };
+	pid_t parent = getpid(), p;
+	int pipe1[2], pipe2[2];
+	struct sigaction sa = { .sa_flags = 0 };
+	sigset_t sigmask;
+	struct io_context *ctx = NULL;
+	struct io_event ev;
+	struct iocb iocb;
+	struct iocb *iocbs[] = { &iocb };
+	int ret;
+
+	sigemptyset(&sa.sa_mask);
+
+	sa.sa_handler = handler;
+	if (sigaction(SIGUSR1, &sa, NULL) != 0) {
+		printf("sigaction(1) failed\n");
+		return 1;
+	}
+
+	sa.sa_handler = SIG_IGN;
+	if (sigaction(SIGCHLD, &sa, NULL) != 0) {
+		printf("sigaction(2) failed\n");
+		return 1;
+	}
+
+	sigemptyset(&sigmask);
+	sigaddset(&sigmask, SIGUSR1);
+	if (sigprocmask(SIG_BLOCK, &sigmask, NULL) != 0) {
+		printf("sigprocmask failed\n");
+		return 1;
+	}
+
+	if (pipe(pipe1) != 0 || pipe(pipe2) != 0) {
+		printf("pipe failed\n");
+		return 1;
+	}
+
+	sigprocmask(SIG_SETMASK, NULL, &sigmask);
+	sigdelset(&sigmask, SIGUSR1);
+
+	p = fork();
+	switch (p) {
+	case -1:
+		printf("fork failed\n");
+		exit(2);
+	case 0:
+		close(pipe1[1]);
+		close(pipe2[0]);
+
+		ret = io_setup(1, &ctx);
+		if (ret) {
+			printf("child: io_setup failed\n");
+			return 1;
+		}
+
+		io_prep_poll(&iocb, pipe1[0], POLLIN);
+		ret = io_submit(ctx, 1, iocbs);
+		if (ret != 1) {
+			printf("child: io_submit failed\n");
+			return 1;
+		}
+
+		do {
+			if (getppid() != parent) {
+				printf("parent died\n");
+				exit(2);
+			}
+			ret = io_pgetevents(ctx, 1, 1, &ev, &to, &sigmask);
+		} while (ret == 0);
+
+		if (ret != -EINTR) {
+			printf("child: io_pgetevents did not set errno to EINTR\n");
+			return 1;
+		}
+
+		do {
+			errno = 0;
+			ret = write(pipe2[1], "foo", 3);
+		} while (ret == -1 && errno == EINTR);
+
+		exit(0);
+	default:
+		close(pipe1[0]);
+		close(pipe2[1]);
+
+		io_prep_poll(&iocb, pipe2[0], POLLIN);
+
+		ret = io_setup(1, &ctx);
+		if (ret) {
+			printf("child: io_setup failed\n");
+			return 1;
+		}
+
+		ret = io_submit(ctx, 1, iocbs);
+		if (ret != 1) {
+			printf("child: io_submit failed\n");
+			return 1;
+		}
+
+		kill(p, SIGUSR1);
+
+		ret = io_pgetevents(ctx, 1, 1, &ev, NULL, &sigmask);
+		if (ret < 0) {
+			printf("parent: io_pgetevents failed\n");
+			return 1;
+		}
+		if (ret != 1) {
+			printf("parent: io_pgetevents did not report event\n");
+			return 1;
+		}
+		if (ev.obj != &iocb) {
+			printf("parent: io_pgetevents reports wrong fd\n");
+			return 1;
+		}
+		if (ev.res != POLLIN) {
+			printf("parent: io_pgetevents did not report readable fd\n");
+			return 1;
+		}
+
+		return 0;
+	}
+}
-- 
2.14.2

^ permalink raw reply related

* [PATCH 5/6] add support for io_pgetevents
From: Christoph Hellwig @ 2018-01-04  8:03 UTC (permalink / raw)
  To: linux-aio; +Cc: Avi Kivity, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080325.14716-1-hch@lst.de>

This is ppoll/pselect equivalent for io_getevents.  It atomically executes
the following sequence:

	sigset_t origmask;

	pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
	ret = io_getevents(ctx, min_nr, nr, events, timeout);
	pthread_sigmask(SIG_SETMASK, &origmask, NULL);

And thus allows to safely mix aio and signals, especially together with
IO_CMD_POLL.  See the pselect(2) man page for a more detailed explanation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 man/io_getevents.3   | 61 +++++++++++++++++++++++++++++++++++++++++++++++++---
 src/Makefile         |  2 +-
 src/io_pgetevents.c  | 47 ++++++++++++++++++++++++++++++++++++++++
 src/libaio.h         |  4 ++++
 src/libaio.map       |  5 +++++
 src/syscall-i386.h   |  1 +
 src/syscall-x86_64.h |  1 +
 7 files changed, 117 insertions(+), 4 deletions(-)
 create mode 100644 src/io_pgetevents.c

diff --git a/man/io_getevents.3 b/man/io_getevents.3
index 8e9ddc8..5062daa 100644
--- a/man/io_getevents.3
+++ b/man/io_getevents.3
@@ -18,7 +18,7 @@
 ./"
 .TH io_getevents 2 2002-09-03 "Linux 2.4" "Linux AIO"
 .SH NAME
-io_getevents \- Read resulting events from io requests
+io_getevents, aio_pgetevents \- Read resulting events from io requests
 .SH SYNOPSIS
 .nf
 .B #include <errno.h>
@@ -43,7 +43,7 @@ struct io_event {
 };
 .sp
 .BI "int io_getevents(io_context_t " ctx ",  long " nr ", struct io_event *" events "[], struct timespec *" timeout ");"
-
+.BI "int io_pgetevents(io_context_t " ctx ",  long " nr ", struct io_event *" events "[], struct timespec *" timeout ", sigset_t *" sigmask ");"
 .fi
 .SH DESCRIPTION
 Attempts to read  up to nr events from
@@ -55,6 +55,60 @@ by when has elapsed, where when == NULL specifies an infinite
 timeout.  Note that the timeout pointed to by when is relative and
 will be updated if not NULL and the operation blocks.  Will fail
 with ENOSYS if not implemented.
+.SS io_pgetevents()
+The relationship between
+.BR io_getevents ()
+and
+.BR io_pgetevents ()
+is analogous to the relationship between
+.BR select (2)
+and
+.BR pselect (2):
+similar
+.BR pselect (2),
+.BR pgetevents ()
+allows an application to safely wait until either an aio completion
+events happens or until a signal is caught.
+.PP
+The following
+.BR io_pgetevents ()
+call:
+call:
+.PP
+.in +4n
+.EX
+ret = io_pgetevents(ctx, min_nr, nr, events, timeout, sigmask);
+.EE
+.in
+.PP
+is equivalent to
+.I atomically
+executing the following calls:
+.PP
+.in +4n
+.EX
+sigset_t origmask;
+
+pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
+ret = io_getevents(ctx, min_nr, nr, events, timeout);
+pthread_sigmask(SIG_SETMASK, &origmask, NULL);
+.EE
+.in
+.PP
+See the description of
+.BR pselect (2)
+for an explanation of why
+.BR io_pgetevents ()
+is necessary.
+.PP
+If the
+.I sigmask
+argument is specified as NULL, then no signal mask manipulation is
+performed (and thus
+.BR io_pgetevents ()
+behaves the same as
+.BR io_getevents()
+) .
 .SH ERRORS
 .TP
 .B EINVAL 
@@ -76,4 +130,5 @@ if any of the memory specified to is invalid.
 .BR io_queue_wait(3),
 .BR io_set_callback(3),
 .BR io_submit(3),
-.BR errno(3)
+.BR errno(3),
+.BR pselect(2)
diff --git a/src/Makefile b/src/Makefile
index eadb336..f5a57d3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -23,7 +23,7 @@ libaio_srcs += io_queue_wait.c io_queue_run.c
 
 # real syscalls
 libaio_srcs += io_getevents.c io_submit.c io_cancel.c
-libaio_srcs += io_setup.c io_destroy.c
+libaio_srcs += io_setup.c io_destroy.c io_pgetevents.c
 
 # internal functions
 libaio_srcs += raw_syscall.c
diff --git a/src/io_pgetevents.c b/src/io_pgetevents.c
new file mode 100644
index 0000000..47dbbb7
--- /dev/null
+++ b/src/io_pgetevents.c
@@ -0,0 +1,47 @@
+/*
+   libaio Linux async I/O interface
+   Copyright 2018 Christoph Hellwig.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#include <libaio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <time.h>
+#include "syscall.h"
+#include "aio_ring.h"
+
+#ifdef __NR_io_pgetevents
+io_syscall6(int, __io_pgetevents, io_pgetevents, io_context_t, ctx, long,
+		min_nr, long, nr, struct io_event *, events,
+		struct timespec *, timeout, sigset_t *, sigmask);
+
+int io_pgetevents(io_context_t ctx, long min_nr, long nr,
+		struct io_event *events, struct timespec *timeout,
+		sigset_t *sigmask)
+{
+	if (aio_ring_is_empty(ctx, timeout))
+		return 0;
+	return __io_pgetevents(ctx, min_nr, nr, events, timeout, sigmask);
+}
+#else
+int io_pgetevents(io_context_t ctx, long min_nr, long nr,
+		struct io_event *events, struct timespec *timeout,
+		sigset_t *sigmask)
+
+{
+	return -ENOSYS;
+}
+#endif /* __NR_io_pgetevents */
diff --git a/src/libaio.h b/src/libaio.h
index a1a8fc4..f356c97 100644
--- a/src/libaio.h
+++ b/src/libaio.h
@@ -29,6 +29,7 @@ extern "C" {
 
 #include <sys/types.h>
 #include <string.h>
+#include <signal.h>
 
 struct timespec;
 struct sockaddr;
@@ -161,6 +162,9 @@ extern int io_destroy(io_context_t ctx);
 extern int io_submit(io_context_t ctx, long nr, struct iocb *ios[]);
 extern int io_cancel(io_context_t ctx, struct iocb *iocb, struct io_event *evt);
 extern int io_getevents(io_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout);
+extern int io_pgetevents(io_context_t ctx_id, long min_nr, long nr,
+		struct io_event *events, struct timespec *timeout,
+		sigset_t *sigmask);
 
 
 static inline void io_set_callback(struct iocb *iocb, io_callback_t cb)
diff --git a/src/libaio.map b/src/libaio.map
index dc37725..ec9d13b 100644
--- a/src/libaio.map
+++ b/src/libaio.map
@@ -20,3 +20,8 @@ LIBAIO_0.4 {
 		io_getevents;
 		io_queue_wait;
 } LIBAIO_0.1;
+
+LIBAIO_0.5 {
+	global:
+		io_pgetevents;
+} LIBAIO_0.4;
diff --git a/src/syscall-i386.h b/src/syscall-i386.h
index 9576975..e312a3f 100644
--- a/src/syscall-i386.h
+++ b/src/syscall-i386.h
@@ -3,6 +3,7 @@
 #define __NR_io_getevents	247
 #define __NR_io_submit		248
 #define __NR_io_cancel		249
+#define __NR_io_pgetevents	385
 
 #define io_syscall1(type,fname,sname,type1,arg1)	\
 type fname(type1 arg1)					\
diff --git a/src/syscall-x86_64.h b/src/syscall-x86_64.h
index 9361856..f811ce4 100644
--- a/src/syscall-x86_64.h
+++ b/src/syscall-x86_64.h
@@ -3,6 +3,7 @@
 #define __NR_io_getevents	208
 #define __NR_io_submit		209
 #define __NR_io_cancel		210
+#define __NR_io_pgetevents	333
 
 #define __syscall_clobber "r11","rcx","memory" 
 #define __syscall "syscall"
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 4/6] move the aio_ring defintion and empty check into a header
From: Christoph Hellwig @ 2018-01-04  8:03 UTC (permalink / raw)
  To: linux-aio; +Cc: Avi Kivity, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080325.14716-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 src/aio_ring.h     | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/io_getevents.c | 28 +++-------------------------
 2 files changed, 52 insertions(+), 25 deletions(-)
 create mode 100644 src/aio_ring.h

diff --git a/src/aio_ring.h b/src/aio_ring.h
new file mode 100644
index 0000000..3842c4b
--- /dev/null
+++ b/src/aio_ring.h
@@ -0,0 +1,49 @@
+/*
+   libaio Linux async I/O interface
+   Copyright 2002 Red Hat, Inc.
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+#ifndef _AIO_RING_H
+#define _AIO_RING_H
+
+#define AIO_RING_MAGIC                  0xa10a10a1
+
+struct aio_ring {
+	unsigned        id;     /* kernel internal index number */
+	unsigned        nr;     /* number of io_events */
+	unsigned        head;
+	unsigned        tail;
+
+	unsigned        magic;
+	unsigned        compat_features;
+	unsigned        incompat_features;
+	unsigned        header_length;  /* size of aio_ring */
+};
+
+static inline int aio_ring_is_empty(io_context_t ctx, struct timespec *timeout)
+{
+	struct aio_ring *ring = (struct aio_ring *)ctx;
+
+	if (!ring || ring->magic != AIO_RING_MAGIC)
+		return 0;
+	if (!timeout || timeout->tv_sec || timeout->tv_nsec)
+		return 0;
+	if (ring->head != ring->tail)
+		return 0;
+	return 1;
+}
+
+#endif /* _AIO_RING_H */
diff --git a/src/io_getevents.c b/src/io_getevents.c
index 5a05174..90d6081 100644
--- a/src/io_getevents.c
+++ b/src/io_getevents.c
@@ -21,36 +21,14 @@
 #include <stdlib.h>
 #include <time.h>
 #include "syscall.h"
+#include "aio_ring.h"
 
 io_syscall5(int, __io_getevents_0_4, io_getevents, io_context_t, ctx, long, min_nr, long, nr, struct io_event *, events, struct timespec *, timeout)
 
-#define AIO_RING_MAGIC                  0xa10a10a1
-
-/* Ben will hate me for this */
-struct aio_ring {
-	unsigned        id;     /* kernel internal index number */
-	unsigned        nr;     /* number of io_events */
-	unsigned        head;
-	unsigned        tail;
- 
-	unsigned        magic;
-	unsigned        compat_features;
-	unsigned        incompat_features;
-	unsigned        header_length;  /* size of aio_ring */
-};
-
 int io_getevents_0_4(io_context_t ctx, long min_nr, long nr, struct io_event * events, struct timespec * timeout)
 {
-	struct aio_ring *ring;
-	ring = (struct aio_ring*)ctx;
-	if (ring==NULL || ring->magic != AIO_RING_MAGIC)
-		goto do_syscall;
-	if (timeout!=NULL && timeout->tv_sec == 0 && timeout->tv_nsec == 0) {
-		if (ring->head == ring->tail)
-			return 0;
-	}
-	
-do_syscall:	
+	if (aio_ring_is_empty(ctx, timeout))
+		return 0;
 	return __io_getevents_0_4(ctx, min_nr, nr, events, timeout);
 }
 
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 3/6] provide a generic io_syscall6
From: Christoph Hellwig @ 2018-01-04  8:03 UTC (permalink / raw)
  To: linux-aio; +Cc: Avi Kivity, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080325.14716-1-hch@lst.de>

This allows to call a 6-argument syscall using the generic syscall()
function from libc.  No arch-specific version is specified as it would
be a lot of effort for very little gain.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 src/syscall.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/syscall.h b/src/syscall.h
index 3819519..e7ff81b 100644
--- a/src/syscall.h
+++ b/src/syscall.h
@@ -39,3 +39,12 @@
 #warning "using generic syscall method"
 #include "syscall-generic.h"
 #endif
+
+#ifndef io_syscall6
+#define io_syscall6(type,fname,sname,type1,arg1,type2,arg2,type3,arg3, \
+		type4,arg4,type5,arg5,type6,arg6) \
+type fname (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5, \
+		type6 arg6) \
+_body_io_syscall(sname, (long)arg1, (long)arg2, (long)arg3, (long)arg4, \
+               (long)arg5, (long)arg5)
+#endif /* io_syscall6 */
-- 
2.14.2

^ permalink raw reply related

* [PATCH 2/6] move _body_io_syscall to the generic syscall.h
From: Christoph Hellwig @ 2018-01-04  8:03 UTC (permalink / raw)
  To: linux-aio; +Cc: Avi Kivity, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080325.14716-1-hch@lst.de>

This way it can be used for the fallback 6-argument version on
all architectures.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 src/syscall-generic.h | 6 ------
 src/syscall.h         | 7 +++++++
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/syscall-generic.h b/src/syscall-generic.h
index 24d7c7c..35b8580 100644
--- a/src/syscall-generic.h
+++ b/src/syscall-generic.h
@@ -2,12 +2,6 @@
 #include <unistd.h>
 #include <sys/syscall.h>
 
-#define _body_io_syscall(sname, args...) \
-{ \
-	int ret = syscall(__NR_##sname, ## args); \
-	return ret < 0 ? -errno : ret; \
-}
-
 #define io_syscall1(type,fname,sname,type1,arg1) \
 type fname(type1 arg1) \
 _body_io_syscall(sname, (long)arg1)
diff --git a/src/syscall.h b/src/syscall.h
index a2da030..3819519 100644
--- a/src/syscall.h
+++ b/src/syscall.h
@@ -10,6 +10,13 @@
 #define DEFSYMVER(compat_sym, orig_sym, ver_sym)	\
 	__asm__(".symver " SYMSTR(compat_sym) "," SYMSTR(orig_sym) "@@LIBAIO_" SYMSTR(ver_sym));
 
+/* generic fallback */
+#define _body_io_syscall(sname, args...) \
+{ \
+	int ret = syscall(__NR_##sname, ## args); \
+	return ret < 0 ? -errno : ret; \
+}
+
 #if defined(__i386__)
 #include "syscall-i386.h"
 #elif defined(__x86_64__)
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 1/6] resurrect aio poll support
From: Christoph Hellwig @ 2018-01-04  8:03 UTC (permalink / raw)
  To: linux-aio; +Cc: Avi Kivity, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080325.14716-1-hch@lst.de>

Now that we have poll support in mainline, remove comments about the
do not use status.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 src/libaio.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/libaio.h b/src/libaio.h
index a5cd2e1..a1a8fc4 100644
--- a/src/libaio.h
+++ b/src/libaio.h
@@ -43,7 +43,7 @@ typedef enum io_iocb_cmd {
 	IO_CMD_FSYNC = 2,
 	IO_CMD_FDSYNC = 3,
 
-	IO_CMD_POLL = 5, /* Never implemented in mainline, see io_prep_poll */
+	IO_CMD_POLL = 5,
 	IO_CMD_NOOP = 6,
 	IO_CMD_PREADV = 7,
 	IO_CMD_PWRITEV = 8,
@@ -236,8 +236,6 @@ static inline void io_prep_pwritev2(struct iocb *iocb, int fd, const struct iove
 	iocb->u.c.offset = offset;
 }
 
-/* Jeff Moyer says this was implemented in Red Hat AS2.1 and RHEL3.
- * AFAICT, it was never in mainline, and should not be used. --RR */
 static inline void io_prep_poll(struct iocb *iocb, int fd, int events)
 {
         memset(iocb, 0, sizeof(*iocb));
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* libaio: resurrect aio poll and add io_pgetevents support
From: Christoph Hellwig @ 2018-01-04  8:03 UTC (permalink / raw)
  To: linux-aio; +Cc: Avi Kivity, linux-fsdevel, netdev, linux-kernel

Hi all,

this series resurrects IOCB_CMD_POLL support and adds support for the
new io_pgetevents system call, as well as adding a test case.

Git branch:

    git://git.infradead.org/users/hch/libaio.git aio-poll

Gitweb:

    http://git.infradead.org/users/hch/libaio.git/shortlog/refs/heads/aio-poll

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply

* [PATCH 31/31] aio: implement io_pgetevents
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

This is the io_getevents equivalent of ppoll/pselect and allows to
properly mix signals and aio completions (especially with IOCB_CMD_POLL)
and atomically executes the following sequence:

	sigset_t origmask;

	pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
	ret = io_getevents(ctx, min_nr, nr, events, timeout);
	pthread_sigmask(SIG_SETMASK, &origmask, NULL);

Note that unlike many other signal related calls we do not pass a sigmask
size, as that would get us to 7 arguments, which aren't easily supported
by the syscall infrastructure.  It seems a lot less painful to just add a
new syscall variant in the unlikely case we're going to increase the
sigset size.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/x86/entry/syscalls/syscall_32.tbl |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/aio.c                               | 96 ++++++++++++++++++++++++++++++----
 include/linux/compat.h                 |  6 +++
 include/linux/syscalls.h               |  6 +++
 include/uapi/asm-generic/unistd.h      |  4 +-
 kernel/sys_ni.c                        |  2 +
 7 files changed, 105 insertions(+), 11 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac2161112..5997c3e9ac3e 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
 382	i386	pkey_free		sys_pkey_free
 383	i386	statx			sys_statx
 384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
+385	i386	io_pgetevents		sys_io_pgetevents		compat_sys_io_pgetevents
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..e995cd2b4e65 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	common	io_pgetevents		sys_io_pgetevents
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/aio.c b/fs/aio.c
index cae90ac6e4a3..57a4e8d89f78 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1299,10 +1299,6 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
 		wait_event_interruptible_hrtimeout(ctx->wait,
 				aio_read_events(ctx, min_nr, nr, event, &ret),
 				until);
-
-	if (!ret && signal_pending(current))
-		ret = -EINTR;
-
 	return ret;
 }
 
@@ -1978,13 +1974,54 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 		struct timespec __user *, timeout)
 {
 	struct timespec64	ts;
+	int			ret;
+
+	if (timeout && unlikely(get_timespec64(&ts, timeout)))
+		return -EFAULT;
 
-	if (timeout) {
-		if (unlikely(get_timespec64(&ts, timeout)))
+	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+	if (!ret && signal_pending(current))
+		ret = -EINTR;
+	return ret;
+}
+
+SYSCALL_DEFINE6(io_pgetevents,
+		aio_context_t, ctx_id,
+		long, min_nr,
+		long, nr,
+		struct io_event __user *, events,
+		struct timespec __user *, timeout,
+		const sigset_t __user *, sigmask)
+{
+	sigset_t		ksigmask, sigsaved;
+	struct timespec64	ts;
+	int ret;
+
+	if (timeout && unlikely(get_timespec64(&ts, timeout)))
+		return -EFAULT;
+
+	if (sigmask) {
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
 			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+	if (signal_pending(current)) {
+		if (sigmask) {
+			current->saved_sigmask = sigsaved;
+			set_restore_sigmask();
+		}
+
+		if (!ret)
+			ret = -ERESTARTNOHAND;
+	} else {
+		if (sigmask)
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	}
+
+	return ret;
 }
 
 #ifdef CONFIG_COMPAT
@@ -1995,13 +2032,52 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
 		       struct compat_timespec __user *, timeout)
 {
 	struct timespec64 t;
+	int ret;
+
+	if (timeout && compat_get_timespec64(&t, timeout))
+		return -EFAULT;
+
+	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+	if (!ret && signal_pending(current))
+		ret = -EINTR;
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE6(io_pgetevents,
+		compat_aio_context_t, ctx_id,
+		compat_long_t, min_nr,
+		compat_long_t, nr,
+		struct io_event __user *, events,
+		struct compat_timespec __user *, timeout,
+		const compat_sigset_t __user *, sigmask)
+{
+	sigset_t ksigmask, sigsaved;
+	struct timespec64 t;
+	int ret;
 
-	if (timeout) {
-		if (compat_get_timespec64(&t, timeout))
+	if (timeout && compat_get_timespec64(&t, timeout))
+		return -EFAULT;
+
+	if (sigmask) {
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
 
+	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+	if (signal_pending(current)) {
+		if (sigmask) {
+			current->saved_sigmask = sigsaved;
+			set_restore_sigmask();
+		}
+		if (!ret)
+			ret = -ERESTARTNOHAND;
+	} else {
+		if (sigmask)
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 	}
 
-	return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+	return ret;
 }
 #endif
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 0fc36406f32c..a4cda98073f1 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -536,6 +536,12 @@ asmlinkage long compat_sys_io_getevents(compat_aio_context_t ctx_id,
 					compat_long_t nr,
 					struct io_event __user *events,
 					struct compat_timespec __user *timeout);
+asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id,
+					compat_long_t min_nr,
+					compat_long_t nr,
+					struct io_event __user *events,
+					struct compat_timespec __user *timeout,
+					const compat_sigset_t __user *sigmask);
 asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr,
 				     u32 __user *iocb);
 asmlinkage long compat_sys_mount(const char __user *dev_name,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a78186d826d7..3bc9a130f4a9 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -539,6 +539,12 @@ asmlinkage long sys_io_getevents(aio_context_t ctx_id,
 				long nr,
 				struct io_event __user *events,
 				struct timespec __user *timeout);
+asmlinkage long sys_io_pgetevents(aio_context_t ctx_id,
+				long min_nr,
+				long nr,
+				struct io_event __user *events,
+				struct timespec __user *timeout,
+				const sigset_t __user *sigmask);
 asmlinkage long sys_io_submit(aio_context_t, long,
 				struct iocb __user * __user *);
 asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 8b87de067bc7..ce2ebbeece10 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -732,9 +732,11 @@ __SYSCALL(__NR_pkey_alloc,    sys_pkey_alloc)
 __SYSCALL(__NR_pkey_free,     sys_pkey_free)
 #define __NR_statx 291
 __SYSCALL(__NR_statx,     sys_statx)
+#define __NR_io_pgetevents 292
+__SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 
 #undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293
 
 /*
  * All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index b5189762d275..8f7705559b38 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -151,9 +151,11 @@ cond_syscall(sys_io_destroy);
 cond_syscall(sys_io_submit);
 cond_syscall(sys_io_cancel);
 cond_syscall(sys_io_getevents);
+cond_syscall(sys_io_pgetevents);
 cond_syscall(compat_sys_io_setup);
 cond_syscall(compat_sys_io_submit);
 cond_syscall(compat_sys_io_getevents);
+cond_syscall(compat_sys_io_pgetevents);
 cond_syscall(sys_sysfs);
 cond_syscall(sys_syslog);
 cond_syscall(sys_process_vm_readv);
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 30/31] aio: implement IOCB_CMD_POLL
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

Simple one-shot poll through the io_submit() interface.  To poll for
a file descriptor the application should submit an iocb of type
IOCB_CMD_POLL.  It will poll the fd for the events specified in the
the first 32 bits of the aio_buf field of the iocb.

Unlike poll or epoll without EPOLLONESHOT this interface always works
in one shot mode, that is once the iocb is completed, it will have to be
resubmitted.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c                     | 103 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/aio_abi.h |   6 +--
 2 files changed, 105 insertions(+), 4 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 6fca1408a8a8..cae90ac6e4a3 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -156,9 +156,17 @@ struct kioctx {
 	unsigned		id;
 };
 
+struct poll_iocb {
+	struct file		*file;
+	__poll_t		events;
+	struct wait_queue_head	*head;
+	struct wait_queue_entry	wait;
+};
+
 struct aio_kiocb {
 	union {
 		struct kiocb		rw;
+		struct poll_iocb	poll;
 	};
 
 	struct kioctx		*ki_ctx;
@@ -1570,6 +1578,98 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
 	return ret;
 }
 
+static void __aio_complete_poll(struct poll_iocb *req, __poll_t mask)
+{
+	fput(req->file);
+	aio_complete(container_of(req, struct aio_kiocb, poll),
+			mangle_poll(mask), 0);
+}
+
+static void aio_complete_poll(struct poll_iocb *req, __poll_t mask)
+{
+	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+
+	if (!(iocb->flags & AIO_IOCB_CANCELLED))
+		__aio_complete_poll(req, mask);
+}
+
+static int aio_poll_cancel(struct kiocb *rw)
+{
+	struct aio_kiocb *iocb = container_of(rw, struct aio_kiocb, rw);
+
+	remove_wait_queue(iocb->poll.head, &iocb->poll.wait);
+	__aio_complete_poll(&iocb->poll, 0); /* no events to report */
+	return 0;
+}
+
+static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+		void *key)
+{
+	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+	struct file *file = req->file;
+	__poll_t mask = key_to_poll(key);
+
+	assert_spin_locked(&req->head->lock);
+
+	/* for instances that support it check for an event match first: */
+	if (mask && !(mask & req->events))
+		return 0;
+
+	mask = vfs_poll_mask(file, req->events);
+	if (!mask)
+		return 0;
+
+	__remove_wait_queue(req->head, &req->wait);
+	aio_complete_poll(req, mask);
+	return 1;
+}
+
+static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+{
+	struct poll_iocb *req = &aiocb->poll;
+	struct file *file;
+	unsigned long flags;
+	__poll_t mask;
+	int ret;
+
+	/* reject any unknown events outside the normal event mask. */
+	if (unlikely((u16)iocb->aio_buf != iocb->aio_buf))
+		return -EINVAL;
+	/* reject fields that are not defined for poll */
+	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
+		return -EINVAL;
+
+	req->events = demangle_poll(iocb->aio_buf) | POLLERR | POLLHUP;
+	req->file = file = fget(iocb->aio_fildes);
+	if (unlikely(!req->file))
+		return -EBADF;
+
+	ret = -EOPNOTSUPP;
+	if (unlikely(!file->f_op->get_poll_head || !file->f_op->poll_mask))
+		goto out_fput;
+	req->head = file->f_op->get_poll_head(file, req->events);
+	if (!req->head)
+		goto out_fput;
+
+	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+
+	spin_lock_irqsave(&req->head->lock, flags);
+	mask = vfs_poll_mask(file, req->events);
+	if (!mask) {
+		__kiocb_set_cancel_fn(aiocb, aio_poll_cancel,
+				AIO_IOCB_DELAYED_CANCEL);
+		__add_wait_queue(req->head, &req->wait);
+	}
+	spin_unlock_irqrestore(&req->head->lock, flags);
+
+	if (mask)
+		aio_complete_poll(req, mask);
+	return -EIOCBQUEUED;
+out_fput:
+	fput(file);
+	return ret;
+}
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			 struct iocb *iocb, bool compat)
 {
@@ -1633,6 +1733,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	case IOCB_CMD_PWRITEV:
 		ret = aio_write(&req->rw, iocb, true, compat);
 		break;
+	case IOCB_CMD_POLL:
+		ret = aio_poll(req, iocb);
+		break;
 	default:
 		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
 		ret = -EINVAL;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index a04adbc70ddf..28330105a4b6 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -38,10 +38,8 @@ enum {
 	IOCB_CMD_PWRITE = 1,
 	IOCB_CMD_FSYNC = 2,
 	IOCB_CMD_FDSYNC = 3,
-	/* These two are experimental.
-	 * IOCB_CMD_PREADX = 4,
-	 * IOCB_CMD_POLL = 5,
-	 */
+	/* 4 was the experimental IOCB_CMD_PREADX */
+	IOCB_CMD_POLL = 5,
 	IOCB_CMD_NOOP = 6,
 	IOCB_CMD_PREADV = 7,
 	IOCB_CMD_PWRITEV = 8,
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 29/31] aio: add delayed cancel support
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

The upcoming aio poll support would like to be able to complete the
iocb inline from the cancellation context, but that would cause
a lock order reversal.  Add support for optionally moving the cancelation
outside the context lock to avoid this reversal.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c | 49 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 99c1d2ad67e9..6fca1408a8a8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -170,6 +170,10 @@ struct aio_kiocb {
 	struct list_head	ki_list;	/* the aio core uses this
 						 * for cancellation */
 
+	unsigned int		flags;		/* protected by ctx->ctx_lock */
+#define AIO_IOCB_DELAYED_CANCEL	(1 << 0)
+#define AIO_IOCB_CANCELLED	(1 << 1)
+
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
 	 * this is the underlying eventfd context to deliver events to.
@@ -536,9 +540,9 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 
-void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
+static void __kiocb_set_cancel_fn(struct aio_kiocb *req,
+		kiocb_cancel_fn *cancel, unsigned int iocb_flags)
 {
-	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
 	struct kioctx *ctx = req->ki_ctx;
 	unsigned long flags;
 
@@ -548,8 +552,15 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
 	list_add_tail(&req->ki_list, &ctx->active_reqs);
 	req->ki_cancel = cancel;
+	req->flags |= iocb_flags;
 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 }
+
+void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
+{
+	return __kiocb_set_cancel_fn(container_of(iocb, struct aio_kiocb, rw),
+			cancel, 0);
+}
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
 /*
@@ -603,17 +614,27 @@ static void free_ioctx_users(struct percpu_ref *ref)
 {
 	struct kioctx *ctx = container_of(ref, struct kioctx, users);
 	struct aio_kiocb *req;
+	LIST_HEAD(list);
 
 	spin_lock_irq(&ctx->ctx_lock);
-
 	while (!list_empty(&ctx->active_reqs)) {
 		req = list_first_entry(&ctx->active_reqs,
 				       struct aio_kiocb, ki_list);
-		kiocb_cancel(req);
-	}
 
+		if (req->flags & AIO_IOCB_DELAYED_CANCEL) {
+			req->flags |= AIO_IOCB_CANCELLED;
+			list_move_tail(&req->ki_list, &list);
+		} else {
+			kiocb_cancel(req);
+		}
+	}
 	spin_unlock_irq(&ctx->ctx_lock);
 
+	while (!list_empty(&list)) {
+		req = list_first_entry(&list, struct aio_kiocb, ki_list);
+		kiocb_cancel(req);
+	}
+
 	percpu_ref_kill(&ctx->reqs);
 	percpu_ref_put(&ctx->reqs);
 }
@@ -1786,15 +1807,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	if (unlikely(!ctx))
 		return -EINVAL;
 
-	spin_lock_irq(&ctx->ctx_lock);
+	ret = -EINVAL;
 
+	spin_lock_irq(&ctx->ctx_lock);
 	kiocb = lookup_kiocb(ctx, iocb, key);
+	if (kiocb) {
+		if (kiocb->flags & AIO_IOCB_DELAYED_CANCEL) {
+			kiocb->flags |= AIO_IOCB_CANCELLED;
+		} else {
+			ret = kiocb_cancel(kiocb);
+			kiocb = NULL;
+		}
+	}
+	spin_unlock_irq(&ctx->ctx_lock);
+
 	if (kiocb)
 		ret = kiocb_cancel(kiocb);
-	else
-		ret = -EINVAL;
-
-	spin_unlock_irq(&ctx->ctx_lock);
 
 	if (!ret) {
 		/*
@@ -1806,7 +1834,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	}
 
 	percpu_ref_put(&ctx->users);
-
 	return ret;
 }
 
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 28/31] aio: delete iocbs from the active_reqs list in kiocb_cancel
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

One we cancel an iocb there is no reason to keep it on the active_reqs
list, given that the list is only used to look for cancelation candidates.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 6204c1b81a36..99c1d2ad67e9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -561,6 +561,8 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
 {
 	kiocb_cancel_fn *cancel = kiocb->ki_cancel;
 
+	list_del_init(&kiocb->ki_list);
+
 	if (!cancel)
 		return -EINVAL;
 	kiocb->ki_cancel = NULL;
@@ -607,8 +609,6 @@ static void free_ioctx_users(struct percpu_ref *ref)
 	while (!list_empty(&ctx->active_reqs)) {
 		req = list_first_entry(&ctx->active_reqs,
 				       struct aio_kiocb, ki_list);
-
-		list_del_init(&req->ki_list);
 		kiocb_cancel(req);
 	}
 
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 27/31] aio: simplify cancellation
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

With the current aio code there is no need for the magic KIOCB_CANCELLED
value, as a cancelation just kicks the driver to queue the completion
ASAP, with all actual completion handling done in another thread. Given
that both the completion path and cancelation take the context lock there
is no need for magic cmpxchg loops either.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c | 37 +++++++++----------------------------
 1 file changed, 9 insertions(+), 28 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index fe241b5b44b2..6204c1b81a36 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -156,19 +156,6 @@ struct kioctx {
 	unsigned		id;
 };
 
-/*
- * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
- * cancelled or completed (this makes a certain amount of sense because
- * successful cancellation - io_cancel() - does deliver the completion to
- * userspace).
- *
- * And since most things don't implement kiocb cancellation and we'd really like
- * kiocb completion to be lockless when possible, we use ki_cancel to
- * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
- * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
- */
-#define KIOCB_CANCELLED		((void *) (~0ULL))
-
 struct aio_kiocb {
 	union {
 		struct kiocb		rw;
@@ -565,24 +552,18 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
+/*
+ * Only cancel if there ws a ki_cancel function to start with, and we
+ * are the one how managed to clear it (to protect against simulatinious
+ * cancel calls).
+ */
 static int kiocb_cancel(struct aio_kiocb *kiocb)
 {
-	kiocb_cancel_fn *old, *cancel;
-
-	/*
-	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
-	 * actually has a cancel function, hence the cmpxchg()
-	 */
-
-	cancel = READ_ONCE(kiocb->ki_cancel);
-	do {
-		if (!cancel || cancel == KIOCB_CANCELLED)
-			return -EINVAL;
-
-		old = cancel;
-		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
-	} while (cancel != old);
+	kiocb_cancel_fn *cancel = kiocb->ki_cancel;
 
+	if (!cancel)
+		return -EINVAL;
+	kiocb->ki_cancel = NULL;
 	return cancel(&kiocb->rw);
 }
 
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 26/31] aio: sanitize ki_list handling
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

Instead of handcoded non-null checks always initialize ki_list to an
empty list and use list_empty / list_empty_careful on it.  While we're
at it also error out on a double call to kiocb_set_cancel_fn instead
of ignoring it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 496b50f9e9b1..fe241b5b44b2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -555,13 +555,12 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 	struct kioctx *ctx = req->ki_ctx;
 	unsigned long flags;
 
-	spin_lock_irqsave(&ctx->ctx_lock, flags);
-
-	if (!req->ki_list.next)
-		list_add(&req->ki_list, &ctx->active_reqs);
+	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
+		return;
 
+	spin_lock_irqsave(&ctx->ctx_lock, flags);
+	list_add_tail(&req->ki_list, &ctx->active_reqs);
 	req->ki_cancel = cancel;
-
 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
@@ -1034,7 +1033,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 		goto out_put;
 
 	percpu_ref_get(&ctx->reqs);
-
+	INIT_LIST_HEAD(&req->ki_list);
 	req->ki_ctx = ctx;
 	return req;
 out_put:
@@ -1080,7 +1079,7 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 	unsigned tail, pos, head;
 	unsigned long	flags;
 
-	if (iocb->ki_list.next) {
+	if (!list_empty_careful(iocb->ki_list.next)) {
 		unsigned long flags;
 
 		spin_lock_irqsave(&ctx->ctx_lock, flags);
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 25/31] aio: refactor read/write iocb setup
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

Don't reference the kiocb structure from the common aio code, and move
any use of it into helper specific to the read/write path.  This is in
preparation for aio_poll support that wants to use the space for different
fields.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c | 172 ++++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 98 insertions(+), 74 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 41fc8ce6bc7f..496b50f9e9b1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -170,7 +170,9 @@ struct kioctx {
 #define KIOCB_CANCELLED		((void *) (~0ULL))
 
 struct aio_kiocb {
-	struct kiocb		common;
+	union {
+		struct kiocb		rw;
+	};
 
 	struct kioctx		*ki_ctx;
 	kiocb_cancel_fn		*ki_cancel;
@@ -549,7 +551,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 
 void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 {
-	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
+	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
 	struct kioctx *ctx = req->ki_ctx;
 	unsigned long flags;
 
@@ -582,7 +584,7 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
 		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
 	} while (cancel != old);
 
-	return cancel(&kiocb->common);
+	return cancel(&kiocb->rw);
 }
 
 static void free_ioctx(struct work_struct *work)
@@ -1040,15 +1042,6 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 	return NULL;
 }
 
-static void kiocb_free(struct aio_kiocb *req)
-{
-	if (req->common.ki_filp)
-		fput(req->common.ki_filp);
-	if (req->ki_eventfd != NULL)
-		eventfd_ctx_put(req->ki_eventfd);
-	kmem_cache_free(kiocb_cachep, req);
-}
-
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
 	struct aio_ring __user *ring  = (void __user *)ctx_id;
@@ -1079,29 +1072,14 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  */
-static void aio_complete(struct kiocb *kiocb, long res, long res2)
+static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 {
-	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
 	unsigned tail, pos, head;
 	unsigned long	flags;
 
-	BUG_ON(is_sync_kiocb(kiocb));
-
-	if (kiocb->ki_flags & IOCB_WRITE) {
-		struct file *file = kiocb->ki_filp;
-
-		/*
-		 * Tell lockdep we inherited freeze protection from submission
-		 * thread.
-		 */
-		if (S_ISREG(file_inode(file)->i_mode))
-			__sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
-		file_end_write(file);
-	}
-
 	if (iocb->ki_list.next) {
 		unsigned long flags;
 
@@ -1163,11 +1141,12 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
 	 * eventfd. The eventfd_signal() function is safe to be called
 	 * from IRQ context.
 	 */
-	if (iocb->ki_eventfd != NULL)
+	if (iocb->ki_eventfd) {
 		eventfd_signal(iocb->ki_eventfd, 1);
+		eventfd_ctx_put(iocb->ki_eventfd);
+	}
 
-	/* everything turned out well, dispose of the aiocb. */
-	kiocb_free(iocb);
+	kmem_cache_free(kiocb_cachep, iocb);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -1430,6 +1409,48 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 	return -EINVAL;
 }
 
+static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
+
+	WARN_ON_ONCE(is_sync_kiocb(kiocb));
+
+	if (kiocb->ki_flags & IOCB_WRITE) {
+		struct inode *inode = file_inode(kiocb->ki_filp);
+
+		/*
+		 * Tell lockdep we inherited freeze protection from submission
+		 * thread.
+		 */
+		if (S_ISREG(inode->i_mode))
+			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+		file_end_write(kiocb->ki_filp);
+	}
+
+	fput(kiocb->ki_filp);
+	aio_complete(iocb, res, res2);
+}
+
+static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+{
+	int ret;
+
+	req->ki_filp = fget(iocb->aio_fildes);
+	if (unlikely(!req->ki_filp))
+		return -EBADF;
+	req->ki_complete = aio_complete_rw;
+	req->ki_flags = 0;
+	req->ki_pos = iocb->aio_offset;
+	req->ki_flags = iocb_flags(req->ki_filp);
+	if (iocb->aio_flags & IOCB_FLAG_RESFD)
+		req->ki_flags |= IOCB_EVENTFD;
+	req->ki_hint = file_write_hint(req->ki_filp);
+	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+	if (unlikely(ret))
+		fput(req->ki_filp);
+	return ret;
+}
+
 static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
 		bool vectored, bool compat, struct iov_iter *iter)
 {
@@ -1449,7 +1470,7 @@ static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
 	return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
 }
 
-static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
+static inline ssize_t aio_rw_ret(struct kiocb *req, ssize_t ret)
 {
 	switch (ret) {
 	case -EIOCBQUEUED:
@@ -1465,7 +1486,7 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
 		ret = -EINTR;
 		/*FALLTHRU*/
 	default:
-		aio_complete(req, ret, 0);
+		aio_complete_rw(req, ret, 0);
 		return 0;
 	}
 }
@@ -1473,56 +1494,78 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
 static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
 		bool compat)
 {
-	struct file *file = req->ki_filp;
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
+	struct file *file;
 	ssize_t ret;
 
+	ret = aio_prep_rw(req, iocb);
+	if (ret)
+		return ret;
+	file = req->ki_filp;
+
+	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_READ)))
-		return -EBADF;
+		goto out_fput;
+	ret = -EINVAL;
 	if (unlikely(!file->f_op->read_iter))
-		return -EINVAL;
+		goto out_fput;
 
 	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
 	if (ret)
-		return ret;
+		goto out_fput;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret)
-		ret = aio_ret(req, call_read_iter(file, req, &iter));
+		ret = aio_rw_ret(req, call_read_iter(file, req, &iter));
 	kfree(iovec);
+out_fput:
+	if (unlikely(ret && ret != -EIOCBQUEUED))
+		fput(file);
 	return ret;
 }
 
 static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
 		bool compat)
 {
-	struct file *file = req->ki_filp;
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
+	struct file *file;
 	ssize_t ret;
 
+	ret = aio_prep_rw(req, iocb);
+	if (ret)
+		return ret;
+	file = req->ki_filp;
+
+	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		return -EBADF;
+		goto out_fput;
+	ret = -EINVAL;
 	if (unlikely(!file->f_op->write_iter))
-		return -EINVAL;
+		goto out_fput;
 
 	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
 	if (ret)
-		return ret;
+		goto out_fput;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
+		struct inode *inode = file_inode(file);
+
 		req->ki_flags |= IOCB_WRITE;
 		file_start_write(file);
-		ret = aio_ret(req, call_write_iter(file, req, &iter));
+		ret = aio_rw_ret(req, call_write_iter(file, req, &iter));
 		/*
-		 * We release freeze protection in aio_complete().  Fool lockdep
-		 * by telling it the lock got released so that it doesn't
-		 * complain about held lock when we return to userspace.
+		 * We release freeze protection in aio_complete_rw().  Fool
+		 * lockdep by telling it the lock got released so that it
+		 * doesn't complain about held lock when we return to userspace.
 		 */
-		if (S_ISREG(file_inode(file)->i_mode))
-			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+		if (S_ISREG(inode->i_mode))
+			__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
 	}
 	kfree(iovec);
+out_fput:
+	if (unlikely(ret && ret != -EIOCBQUEUED))
+		fput(file);
 	return ret;
 }
 
@@ -1530,7 +1573,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			 struct iocb *iocb, bool compat)
 {
 	struct aio_kiocb *req;
-	struct file *file;
 	ssize_t ret;
 
 	/* enforce forwards compatibility on users */
@@ -1553,16 +1595,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	if (unlikely(!req))
 		return -EAGAIN;
 
-	req->common.ki_filp = file = fget(iocb->aio_fildes);
-	if (unlikely(!req->common.ki_filp)) {
-		ret = -EBADF;
-		goto out_put_req;
-	}
-	req->common.ki_pos = iocb->aio_offset;
-	req->common.ki_complete = aio_complete;
-	req->common.ki_flags = iocb_flags(req->common.ki_filp);
-	req->common.ki_hint = file_write_hint(file);
-
 	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
 		/*
 		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -1576,14 +1608,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			req->ki_eventfd = NULL;
 			goto out_put_req;
 		}
-
-		req->common.ki_flags |= IOCB_EVENTFD;
-	}
-
-	ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
-	if (unlikely(ret)) {
-		pr_debug("EINVAL: aio_rw_flags\n");
-		goto out_put_req;
 	}
 
 	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
@@ -1595,26 +1619,24 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_user_iocb = user_iocb;
 	req->ki_user_data = iocb->aio_data;
 
-	get_file(file);
 	switch (iocb->aio_lio_opcode) {
 	case IOCB_CMD_PREAD:
-		ret = aio_read(&req->common, iocb, false, compat);
+		ret = aio_read(&req->rw, iocb, false, compat);
 		break;
 	case IOCB_CMD_PWRITE:
-		ret = aio_write(&req->common, iocb, false, compat);
+		ret = aio_write(&req->rw, iocb, false, compat);
 		break;
 	case IOCB_CMD_PREADV:
-		ret = aio_read(&req->common, iocb, true, compat);
+		ret = aio_read(&req->rw, iocb, true, compat);
 		break;
 	case IOCB_CMD_PWRITEV:
-		ret = aio_write(&req->common, iocb, true, compat);
+		ret = aio_write(&req->rw, iocb, true, compat);
 		break;
 	default:
 		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
 		ret = -EINVAL;
 		break;
 	}
-	fput(file);
 
 	if (ret && ret != -EIOCBQUEUED)
 		goto out_put_req;
@@ -1622,7 +1644,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 out_put_req:
 	put_reqs_available(ctx, 1);
 	percpu_ref_put(&ctx->reqs);
-	kiocb_free(req);
+	if (req->ki_eventfd)
+		eventfd_ctx_put(req->ki_eventfd);
+	kmem_cache_free(kiocb_cachep, req);
 	return ret;
 }
 
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 24/31] aio: remove an outdated comment in aio_complete
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

These days we don't treat sync iocbs special in the aio completion code as
they never use it.  Remove the old comment, and move the BUG_ON for a sync
iocb to the top of the function.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 03d59593912d..41fc8ce6bc7f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1088,6 +1088,8 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
 	unsigned tail, pos, head;
 	unsigned long	flags;
 
+	BUG_ON(is_sync_kiocb(kiocb));
+
 	if (kiocb->ki_flags & IOCB_WRITE) {
 		struct file *file = kiocb->ki_filp;
 
@@ -1100,15 +1102,6 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
 		file_end_write(file);
 	}
 
-	/*
-	 * Special case handling for sync iocbs:
-	 *  - events go directly into the iocb for fast handling
-	 *  - the sync task with the iocb in its stack holds the single iocb
-	 *    ref, no other paths have a way to get another ref
-	 *  - the sync task helpfully left a reference to itself in the iocb
-	 */
-	BUG_ON(is_sync_kiocb(kiocb));
-
 	if (iocb->ki_list.next) {
 		unsigned long flags;
 
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 23/31] aio: don't print the page size at boot time
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

The page size is in no way related to the aio code, and printing it in
the (debug) dmesg at every boot serves no purpose.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index a062d75109cb..03d59593912d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -264,9 +264,6 @@ static int __init aio_setup(void)
 
 	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
-
-	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
-
 	return 0;
 }
 __initcall(aio_setup);
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 22/31] timerfd: convert to ->poll_mask
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/timerfd.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/timerfd.c b/fs/timerfd.c
index 0510717f3a53..ee36bb5b2698 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -226,21 +226,20 @@ static int timerfd_release(struct inode *inode, struct file *file)
 	kfree_rcu(ctx, rcu);
 	return 0;
 }
-
-static __poll_t timerfd_poll(struct file *file, poll_table *wait)
+	
+static struct wait_queue_head *timerfd_get_poll_head(struct file *file,
+		__poll_t eventmask)
 {
 	struct timerfd_ctx *ctx = file->private_data;
-	__poll_t events = 0;
-	unsigned long flags;
 
-	poll_wait(file, &ctx->wqh, wait);
+	return &ctx->wqh;
+}
 
-	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	if (ctx->ticks)
-		events |= POLLIN;
-	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask)
+{
+	struct timerfd_ctx *ctx = file->private_data;
 
-	return events;
+	return ctx->ticks ? POLLIN : 0;
 }
 
 static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
@@ -364,7 +363,8 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 
 static const struct file_operations timerfd_fops = {
 	.release	= timerfd_release,
-	.poll		= timerfd_poll,
+	.get_poll_head	= timerfd_get_poll_head,
+	.poll_mask	= timerfd_poll_mask,
 	.read		= timerfd_read,
 	.llseek		= noop_llseek,
 	.show_fdinfo	= timerfd_show,
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 21/31] eventfd: switch to ->poll_mask
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/eventfd.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6318a9b57e53..2bbbf13cb816 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -114,14 +114,20 @@ static int eventfd_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static __poll_t eventfd_poll(struct file *file, poll_table *wait)
+static struct wait_queue_head *
+eventfd_get_poll_head(struct file *file, __poll_t events)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+
+	return &ctx->wqh;
+}
+
+static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
 {
 	struct eventfd_ctx *ctx = file->private_data;
 	__poll_t events = 0;
 	u64 count;
 
-	poll_wait(file, &ctx->wqh, wait);
-
 	/*
 	 * All writes to ctx->count occur within ctx->wqh.lock.  This read
 	 * can be done outside ctx->wqh.lock because we know that poll_wait
@@ -341,7 +347,8 @@ static const struct file_operations eventfd_fops = {
 	.show_fdinfo	= eventfd_show_fdinfo,
 #endif
 	.release	= eventfd_release,
-	.poll		= eventfd_poll,
+	.get_poll_head	= eventfd_get_poll_head,
+	.poll_mask	= eventfd_poll_mask,
 	.read		= eventfd_read,
 	.write		= eventfd_write,
 	.llseek		= noop_llseek,
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 20/31] pipe: convert to ->poll_mask
From: Christoph Hellwig @ 2018-01-04  8:00 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-kernel
In-Reply-To: <20180104080043.14506-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/pipe.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index a449ca0ec0c6..33a32d13b83d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -514,19 +514,22 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	}
 }
 
-/* No kernel lock held - fine */
-static __poll_t
-pipe_poll(struct file *filp, poll_table *wait)
+static struct wait_queue_head *
+pipe_get_poll_head(struct file *filp, __poll_t events)
 {
-	__poll_t mask;
 	struct pipe_inode_info *pipe = filp->private_data;
-	int nrbufs;
 
-	poll_wait(filp, &pipe->wait, wait);
+	return &pipe->wait;
+}
+
+/* No kernel lock held - fine */
+static __poll_t pipe_poll_mask(struct file *filp, __poll_t events)
+{
+	struct pipe_inode_info *pipe = filp->private_data;
+	int nrbufs = pipe->nrbufs;
+	__poll_t mask = 0;
 
 	/* Reading only -- no need for acquiring the semaphore.  */
-	nrbufs = pipe->nrbufs;
-	mask = 0;
 	if (filp->f_mode & FMODE_READ) {
 		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
 		if (!pipe->writers && filp->f_version != pipe->w_counter)
@@ -1010,7 +1013,8 @@ const struct file_operations pipefifo_fops = {
 	.llseek		= no_llseek,
 	.read_iter	= pipe_read,
 	.write_iter	= pipe_write,
-	.poll		= pipe_poll,
+	.get_poll_head	= pipe_get_poll_head,
+	.poll_mask	= pipe_poll_mask,
 	.unlocked_ioctl	= pipe_ioctl,
 	.release	= pipe_release,
 	.fasync		= pipe_fasync,
-- 
2.14.2

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox