Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next 08/13] tools/bpf: add test for bpf_map_update_batch()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Add tests for bpf_map_update_batch().
  $ ./test_maps
  ...
  test_map_update_batch:PASS
  ...
---
 .../bpf/map_tests/map_update_batch.c          | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_update_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_update_batch.c b/tools/testing/selftests/bpf/map_tests/map_update_batch.c
new file mode 100644
index 000000000000..67c1e11fc911
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_update_batch.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+void test_map_update_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, key, value;
+	const int max_entries = 10;
+	__u32 count, max_count;
+	int err, i;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values, "malloc()", "error:%s\n", strerror(errno));
+
+	/* do not fill in the whole hash table, so we could test
+	 * update with new elements.
+	 */
+	max_count = max_entries - 2;
+
+	for (i = 0; i < max_count; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	/* test 1: count == 0, expect success. */
+	count = 0;
+	err = bpf_map_update_batch(map_fd, keys, values, &count, 0, 0);
+	CHECK(err, "count = 0", "error:%s\n", strerror(errno));
+
+	/* test 2: update initial map with BPF_NOEXIST, expect success. */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	CHECK(err, "elem_flags = BPF_NOEXIST",
+	      "error:%s\n", strerror(errno));
+
+	/* use bpf_map_get_next_key to ensure all keys/values are indeed
+	 * covered.
+	 */
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+	err = bpf_map_lookup_elem(map_fd, &key, &value);
+	CHECK(err, "bpf_map_lookup_elem()", "error: %s\n", strerror(errno));
+	CHECK(key + 1 != value, "key/value checking",
+	      "error: key %d value %d\n", key, value);
+	i = 1;
+	while (!bpf_map_get_next_key(map_fd, &key, &key)) {
+		err = bpf_map_lookup_elem(map_fd, &key, &value);
+		CHECK(err, "bpf_map_lookup_elem()", "error: %s\n",
+		      strerror(errno));
+		CHECK(key + 1 != value,
+		      "key/value checking", "error: key %d value %d\n",
+		      key, value);
+		i++;
+	}
+	CHECK(i != max_count, "checking number of entries",
+	      "err: i %u max_count %u\n", i, max_count);
+
+	/* test 3: elem_flags = BPF_NOEXIST, already exists, expect failure */
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	/* failure to due to flag BPF_NOEXIST, count is set to 0 */
+	CHECK(!err || count, "elem_flags = BPF_NOEXIST again",
+	      "unexpected success\n");
+
+	/* test 4: elem_flags = 0, expect success */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "elem_flags = 0", "error %s\n", strerror(errno));
+
+	/* test 5: keys = NULL, expect failure */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, NULL, values,
+				   &count, 0, 0);
+	CHECK(!err, "keys = NULL", "unexpected success\n");
+
+	/* test 6: values = NULL, expect failure */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, NULL, &count, 0, 0);
+	CHECK(!err, "values = NULL", "unexpected success\n");
+
+	/* test 7: modify the first key to be max_count + 10,
+	 * elem_flags = BPF_NOEXIST,
+	 * expect failure, the return count = 1.
+	 */
+	count = max_count;
+	keys[0] = max_count + 10;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	CHECK(!err, "keys[0] = max_count + 10", "unexpected success\n");
+	CHECK(count != 1, "keys[0] = max_count + 10",
+	      "error: %s, incorrect count %u\n", strerror(errno), count);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 03/13] bpf: refactor map_delete_elem()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Refactor function map_delete_elem() with a new helper
bpf_map_delete_elem(), which will be used later
for batched lookup_and_delete and delete operations.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/syscall.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3caa0ab3d30d..b8bba499df11 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -992,6 +992,25 @@ static int map_update_elem(union bpf_attr *attr)
 	return err;
 }
 
+static int bpf_map_delete_elem(struct bpf_map *map, void *key)
+{
+	int err;
+
+	if (bpf_map_is_dev_bound(map))
+		return bpf_map_offload_delete_elem(map, key);
+
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	rcu_read_lock();
+	err = map->ops->map_delete_elem(map, key);
+	rcu_read_unlock();
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+	maybe_wait_bpf_programs(map);
+
+	return err;
+}
+
 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
 
 static int map_delete_elem(union bpf_attr *attr)
@@ -1021,20 +1040,8 @@ static int map_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_delete_elem(map, key);
-		goto out;
-	}
+	err = bpf_map_delete_elem(map, key);
 
-	preempt_disable();
-	__this_cpu_inc(bpf_prog_active);
-	rcu_read_lock();
-	err = map->ops->map_delete_elem(map, key);
-	rcu_read_unlock();
-	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
-	maybe_wait_bpf_programs(map);
-out:
 	kfree(key);
 err_put:
 	fdput(f);
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 10/13] tools/bpf: add test for bpf_map_lookup_and_delete_batch()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Test bpf_map_lookup_and_delete_batch() functionality.
  $ ./test_maps
  ...
  test_map_lookup_and_delete_batch:PASS
  ...

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../map_tests/map_lookup_and_delete_batch.c   | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c b/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
new file mode 100644
index 000000000000..b151513a0ab2
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     int *values)
+{
+	int i, err;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+static void map_batch_verify(int *visited, __u32 max_entries,
+			     int *keys, int *values)
+{
+	int i;
+
+	memset(visited, 0, max_entries * sizeof(*visited));
+	for (i = 0; i < max_entries; i++) {
+		CHECK(keys[i] + 1 != values[i], "key/value checking",
+		      "error: i %d key %d value %d\n", i, keys[i], values[i]);
+		visited[i] = 1;
+	}
+	for (i = 0; i < max_entries; i++) {
+		CHECK(visited[i] != 1, "visited checking",
+		      "error: keys array at index %d missing\n", i);
+	}
+}
+
+void test_map_lookup_and_delete_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, *visited, key;
+	const __u32 max_entries = 10;
+	void *p_skey, *p_next_skey;
+	__u32 count, total;
+	int err, i, step;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno));
+
+	/* test 1: lookup/delete an empty hash table, success */
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &p_next_skey, keys, values,
+					      &count, 0, 0);
+	CHECK(err, "empty map", "error: %s\n", strerror(errno));
+	CHECK(p_next_skey || count, "empty map",
+	      "p_next_skey = %p, count = %u\n", p_next_skey, count);
+
+	/* populate elements to the map */
+	map_batch_update(map_fd, max_entries, keys, values);
+
+	/* test 2: lookup/delete with count = 0, success */
+	count = 0;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, NULL, keys, values,
+					      &count, 0, 0);
+	CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+
+	/* test 3: lookup/delete with count = max_entries, skey && !nskey,
+	 * failure.
+	 */
+	count = max_entries;
+	key = 1;
+	err = bpf_map_lookup_and_delete_batch(map_fd, &key, NULL, keys, values,
+					      &count, 0, 0);
+	CHECK(!err, "skey && !nskey", "unexpected success\n");
+
+	/* test 4: lookup/delete with count = max_entries, success */
+	memset(keys, 0, max_entries * sizeof(*keys));
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &p_next_skey, keys,
+					      values, &count, 0, 0);
+	CHECK(err, "count = max_entries", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries || p_next_skey != NULL, "count = max_entries",
+	      "count = %u, max_entries = %u, p_next_skey = %p\n",
+	      count, max_entries, p_next_skey);
+	map_batch_verify(visited, max_entries, keys, values);
+
+	/* bpf_map_get_next_key() should return -ENOENT for an empty map. */
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+	/* test 5: lookup/delete in a loop with various steps. */
+	for (step = 1; step < max_entries; step++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		p_skey = NULL;
+		p_next_skey = &key;
+		total = 0;
+		i = 0;
+		/* iteratively lookup/delete elements with 'step' elements each */
+		count = step;
+		while (true) {
+			err = bpf_map_lookup_and_delete_batch(map_fd, p_skey,
+							      &p_next_skey,
+							      keys + i * step,
+							      values + i * step,
+							      &count, 0, 0);
+			CHECK(err, "lookup/delete with steps", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_skey)
+				break;
+
+			CHECK(count != step, "lookup/delete with steps",
+			      "i = %d, count = %u, step = %d\n",
+			      i, count, step);
+
+			if (!p_skey)
+				p_skey = p_next_skey;
+			i++;
+		}
+
+		CHECK(total != max_entries, "lookup/delete with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		map_batch_verify(visited, max_entries, keys, values);
+		err = bpf_map_get_next_key(map_fd, NULL, &key);
+		CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+	}
+
+	/* test 6: lookup/delete with keys in keys[]. */
+	map_batch_update(map_fd, max_entries, keys, values);
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, NULL, keys, values,
+					      &count, 0, 0);
+	CHECK(err, "lookup/delete key in keys[]", "error: %s\n", strerror(errno));
+	map_batch_verify(visited, max_entries, keys, values);
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 12/13] tools/bpf: add a multithreaded test for map batch operations
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

A multithreaded test is added. Three threads repeatedly did:
  - batch update
  - batch lookup_and_delete
  - batch delete

It is totally possible each batch element operation in kernel
may find that the key, retrieved from bpf_map_get_next_key(),
may fail lookup and/or delete as some other threads in parallel
operates on the same map.

The default mode for new batch APIs is to ignore -ENOENT errors
in case of lookup and delete and move to the next element.
The test would otherwise fail if the kernel reacts as -ENOENT
as a real error and propogates it back to user space.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/map_tests/map_batch_mt.c    | 126 ++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_mt.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_batch_mt.c b/tools/testing/selftests/bpf/map_tests/map_batch_mt.c
new file mode 100644
index 000000000000..a0e2591d0079
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_batch_mt.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+/* Create three threads. Each thread will iteratively do:
+ *   . update constantly
+ *   . lookup and delete constantly
+ *   . delete constantly
+ * So this will make lookup and delete operations
+ * may fail as the elements may be deleted by another
+ * thread.
+ *
+ * By default, we should not see a problem as
+ * -ENOENT for bpf_map_delete_elem() and bpf_map_lookup_elem()
+ * will be ignored. But with flag, BPF_F_ENFORCE_ENOENT
+ * we may see errors.
+ */
+
+static int map_fd;
+static const __u32 max_entries = 10;
+static volatile bool stop = false;
+
+static void do_batch_update()
+{
+	int i, err, keys[max_entries], values[max_entries];
+	__u32 count;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	while (!stop) {
+		count = max_entries;
+		err = bpf_map_update_batch(map_fd, keys, values, &count, 0, 0);
+		CHECK(err, "bpf_map_update_batch()", "error:%s\n",
+		      strerror(errno));
+	}
+}
+
+static void do_batch_delete()
+{
+	__u32 count;
+	int err;
+
+	while (!stop) {
+		count = 0;
+		err = bpf_map_delete_batch(map_fd, NULL, NULL, NULL, &count,
+					   0, 0);
+		CHECK(err, "bpf_map_delete_batch()", "error:%s\n",
+		      strerror(errno));
+	}
+}
+
+static void do_batch_lookup_and_delete()
+{
+	int err, key, keys[max_entries], values[max_entries];
+	__u32 count;
+	void *p_key;
+
+	while (!stop) {
+		p_key = &key;
+		count = max_entries;
+		err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &p_key,
+						      keys, values, &count,
+						      0, 0);
+		CHECK(err, "bpf_map_lookup_and_delete_batch()", "error:%s\n",
+		      strerror(errno));
+	}
+}
+
+static void *do_work(void *arg)
+{
+	int work_index = (int)(long)arg;
+
+	if (work_index == 0)
+		do_batch_update();
+	else if (work_index == 1)
+		do_batch_delete();
+	else
+		do_batch_lookup_and_delete();
+
+	return NULL;
+}
+
+void test_map_batch_mt(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	const int nr_threads = 3;
+	pthread_t threads[nr_threads];
+	int i, err;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	for (i = 0; i < nr_threads; i++) {
+		err = pthread_create(&threads[i], NULL, do_work,
+				     (void *)(long)i);
+		CHECK(err, "pthread_create", "error: %s\n", strerror(errno));
+	}
+
+	sleep(1);
+	stop = true;
+
+	for (i = 0; i < nr_threads; i++)
+		pthread_join(threads[i], NULL);
+
+	close(map_fd);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 13/13] tools/bpf: measure map batching perf
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

The test program run result:
  $ ./test_maps
  ...
  measure_lookup: max_entries 1000000, batch 10, time 342
  measure_lookup: max_entries 1000000, batch 1000, time 295
  measure_lookup: max_entries 1000000, batch 1000000, time 270
  measure_lookup: max_entries 1000000, no batching, time 1346
  measure_lookup_delete: max_entries 1000000, batch 10, time 433
  measure_lookup_delete: max_entries 1000000, batch 1000, time 363
  measure_lookup_delete: max_entries 1000000, batch 1000000, time 357
  measure_lookup_delete: max_entries 1000000, not batch, time 1894
  measure_delete: max_entries 1000000, batch, time 220
  measure_delete: max_entries 1000000, not batch, time 1289
  test_map_batch_perf:PASS
  ...

  The test is running on a qemu VM environment. The time
  unit is millisecond. A simple hash table with 1M elements
  is created.

  For lookup and lookup_and_deletion, since buffer allocation
  is needed to hold the lookup results, three different
  batch sizes (10, 1000, and 1M) are tried. The performance
  without batching is also measured. A batch size of 10
  can already improves performance dramatically, more than 70%,
  and increasing batch size may continue to improve performance,
  but with diminishing returns.

  For delete, the batch API provides a mechanism to delete all elements
  in the map, which is used here. The deletion of the whole map
  improves performance by 80% compared to non-batch mechanism.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/map_tests/map_batch_perf.c  | 242 ++++++++++++++++++
 1 file changed, 242 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_perf.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_batch_perf.c b/tools/testing/selftests/bpf/map_tests/map_batch_perf.c
new file mode 100644
index 000000000000..42d95651e1ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_batch_perf.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+/* Test map batch performance.
+ * Test three common scenarios:
+ *    - batched lookup
+ *    - batched lookup and delete
+ *    - batched deletion
+ */
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     int *values)
+{
+	int i, err;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+static unsigned long util_gettime(void)
+{
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+	return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+static void measure_lookup(int map_fd, __u32 max_entries, int *keys,
+			   int *values)
+{
+	__u32 batches[3] = {10, 1000};
+	int err, key, value, option;
+	unsigned long start, end;
+	void *p_key, *p_next_key;
+	__u32 count, total;
+
+	map_batch_update(map_fd, max_entries, keys, values);
+
+	/* batched */
+	batches[2] = max_entries;
+	for (option = 0; option < 3; option++) {
+		p_key = NULL;
+		p_next_key = &key;
+		count = batches[option];
+		start = util_gettime();
+		total = 0;
+
+		while (true) {
+			err = bpf_map_lookup_batch(map_fd, p_key, &p_next_key,
+						   keys, values, &count, 0, 0);
+			CHECK(err, "bpf_map_lookup_batch()", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_key)
+				break;
+
+			if (!p_key)
+				p_key = p_next_key;
+		}
+
+		end = util_gettime();
+		CHECK(total != max_entries,
+		      "checking total", "total %u, max_entries %u\n",
+		      total, max_entries);
+		printf("%s: max_entries %u, batch %u, time %ld\n", __func__,
+		       max_entries, batches[option], end - start);
+	}
+
+	/* not batched */
+	start = util_gettime();
+	p_key = NULL;
+	p_next_key = &key;
+	while (!bpf_map_get_next_key(map_fd, p_key, p_next_key)) {
+		err = bpf_map_lookup_elem(map_fd, p_next_key, &value);
+		CHECK(err, "bpf_map_lookup_elem()", "error: %s\n",
+		      strerror(errno));
+		p_key = p_next_key;
+	}
+	end = util_gettime();
+	printf("%s: max_entries %u, no batching, time %ld\n", __func__,
+	       max_entries, end - start);
+}
+
+static void measure_lookup_delete(int map_fd, __u32 max_entries, int *keys,
+				  int *values)
+{
+	int err, key, next_key, value, option;
+	__u32 batches[3] = {10, 1000};
+	unsigned long start, end;
+	void *p_key, *p_next_key;
+	__u32 count, total;
+
+	/* batched */
+	batches[2] = max_entries;
+	for (option = 0; option < 3; option++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		p_key = NULL;
+		p_next_key = &key;
+		count = batches[option];
+		start = util_gettime();
+		total = 0;
+
+		while (true) {
+			err = bpf_map_lookup_and_delete_batch(map_fd, p_key,
+				&p_next_key, keys, values, &count, 0, 0);
+			CHECK(err, "bpf_map_lookup_and_delete_batch()",
+			      "error: %s\n", strerror(errno));
+
+			total += count;
+			if (!p_next_key)
+				break;
+
+			if (!p_key)
+				p_key = p_next_key;
+		}
+
+		end = util_gettime();
+		CHECK(total != max_entries,
+		      "checking total", "total %u, max_entries %u\n",
+		      total, max_entries);
+		printf("%s: max_entries %u, batch %u, time %ld\n", __func__,
+		       max_entries, batches[option], end - start);
+	}
+
+	/* not batched */
+	map_batch_update(map_fd, max_entries, keys, values);
+	start = util_gettime();
+	p_key = NULL;
+	p_next_key = &key;
+	err = bpf_map_get_next_key(map_fd, p_key, p_next_key);
+	CHECK(err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+	err = bpf_map_lookup_elem(map_fd, p_next_key, &value);
+	CHECK(err, "bpf_map_lookup_elem()", "error: %s\n", strerror(errno));
+
+	p_key = p_next_key;
+	p_next_key = &next_key;
+	while (!bpf_map_get_next_key(map_fd, p_key, p_next_key)) {
+		err = bpf_map_delete_elem(map_fd, p_key);
+		CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+		      strerror(errno));
+
+		err = bpf_map_lookup_elem(map_fd, p_next_key, &value);
+		CHECK(err, "bpf_map_lookup_elem()", "error: %s\n",
+		      strerror(errno));
+
+		p_key = p_next_key;
+		p_next_key = (p_next_key == &key) ? &next_key : &key;
+	}
+	err = bpf_map_delete_elem(map_fd, p_key);
+	CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+	      strerror(errno));
+	end = util_gettime();
+	printf("%s: max_entries %u, not batch, time %ld\n", __func__,
+	       max_entries, end - start);
+}
+
+static void measure_delete(int map_fd, __u32 max_entries, int *keys,
+			   int *values)
+{
+	unsigned long start, end;
+	void *p_key, *p_next_key;
+	int err, key, next_key;
+	__u32 count;
+
+	/* batched */
+	map_batch_update(map_fd, max_entries, keys, values);
+	count = 0;
+	p_next_key = &key;
+	start = util_gettime();
+	err = bpf_map_delete_batch(map_fd, NULL, NULL, NULL, &count, 0, 0);
+	end = util_gettime();
+	CHECK(err, "bpf_map_delete_batch()", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries, "bpf_map_delete_batch()",
+	      "count = %u, max_entries = %u\n", count, max_entries);
+
+	printf("%s: max_entries %u, batch, time %ld\n", __func__,
+	       max_entries, end - start);
+
+	map_batch_update(map_fd, max_entries, keys, values);
+	p_key = NULL;
+	p_next_key = &key;
+	start = util_gettime();
+	err = bpf_map_get_next_key(map_fd, p_key, p_next_key);
+	CHECK(err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+	p_key = p_next_key;
+	p_next_key = &next_key;
+	while (!bpf_map_get_next_key(map_fd, p_key, p_next_key)) {
+		err = bpf_map_delete_elem(map_fd, p_key);
+		CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+		      strerror(errno));
+		p_key = p_next_key;
+		p_next_key = (p_next_key == &key) ? &next_key : &key;
+	}
+	err = bpf_map_delete_elem(map_fd, p_key);
+	CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+	      strerror(errno));
+	end = util_gettime();
+	printf("%s: max_entries %u, not batch, time %ld\n", __func__,
+	       max_entries, end - start);
+}
+
+void test_map_batch_perf(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	const __u32 max_entries = 1000000;  // 1M entries for the hash table
+	int map_fd, *keys, *values;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values, "malloc()", "error:%s\n", strerror(errno));
+
+	measure_lookup(map_fd, max_entries, keys, values);
+	measure_lookup_delete(map_fd, max_entries, keys, values);
+	measure_delete(map_fd, max_entries, keys, values);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 11/13] tools/bpf: add test for bpf_map_delete_batch()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Test bpf_map_delete_batch() functionality.
  $ ./test_maps
  ...
  test_map_delete_batch:PASS
  ...
---
 .../bpf/map_tests/map_delete_batch.c          | 139 ++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_delete_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_delete_batch.c b/tools/testing/selftests/bpf/map_tests/map_delete_batch.c
new file mode 100644
index 000000000000..459495a6d9fc
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_delete_batch.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     int *values)
+{
+	int i, err;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+void test_map_delete_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, *visited, key;
+	const __u32 max_entries = 10;
+	void *p_skey, *p_next_skey;
+	__u32 count, total;
+	int err, i, step;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno));
+
+	/* test 1: delete an empty hash table, success */
+	for (i = 0; i < 2; i++) {
+		if (i == 0) {
+			count = 0;
+			p_next_skey = NULL;
+		} else {
+			count = max_entries;
+			p_next_skey = &key;
+		}
+		err = bpf_map_delete_batch(map_fd, NULL, &p_next_skey, keys,
+					   &count, 0, 0);
+		CHECK(err, "empty map", "error: %s\n", strerror(errno));
+		CHECK(p_next_skey || count, "empty map",
+		      "i = %d, p_next_skey = %p, count = %u\n", i, p_next_skey,
+		      count);
+	}
+
+	/* test 2: delete with count = 0, success */
+	for (i = 0; i < 2; i++) {
+		/* populate elements to the map */
+		map_batch_update(map_fd, max_entries, keys, values);
+
+		count = 0;
+		if (i == 0) {
+			/* all elements in the map */
+			p_skey = NULL;
+		} else {
+			/* all elements starting from p_skey */
+			p_skey = &key;
+			err = bpf_map_get_next_key(map_fd, NULL, p_skey);
+			CHECK(err, "bpf_map_get_next_key()", "error: %s\n",
+			      strerror(errno));
+		}
+		err = bpf_map_delete_batch(map_fd, p_skey, NULL, NULL,
+				    &count, 0, 0);
+		CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+		/* all elements should be gone. */
+		err = bpf_map_get_next_key(map_fd, NULL, &key);
+		CHECK(!err, "bpf_map_get_next_key()", "unexpected success\n");
+	}
+
+	/* test 3: delete in a loop with various steps. */
+	for (step = 1; step < max_entries; step++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		p_skey = NULL;
+		p_next_skey = &key;
+		total = 0;
+		i = 0;
+		/* iteratively delete elements with 'step' elements each */
+		count = step;
+		while (true) {
+			err = bpf_map_delete_batch(map_fd, p_skey, &p_next_skey,
+					      keys + i * step, &count, 0, 0);
+			CHECK(err, "delete with steps", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_skey)
+				break;
+
+			CHECK(count != step, "delete with steps",
+			      "i = %d, count = %u, step = %d\n",
+			      i, count, step);
+
+			if (!p_skey)
+				p_skey = p_next_skey;
+			i++;
+		}
+
+		CHECK(total != max_entries, "delete with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		err = bpf_map_get_next_key(map_fd, NULL, &key);
+		CHECK(!err, "bpf_map_get_next_key()", "error: %s\n",
+		      strerror(errno));
+	}
+
+	/* test 4: delete with keys in keys[]. */
+	map_batch_update(map_fd, max_entries, keys, values);
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	err = bpf_map_delete_batch(map_fd, NULL, NULL, keys, &count, 0, 0);
+	CHECK(err, "delete key in keys[]", "error: %s\n", strerror(errno));
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 04/13] bpf: refactor map_get_next_key()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Refactor function map_get_next_key() with a new helper
bpf_map_get_next_key(), which will be used later
for batched map lookup/lookup_and_delete/delete operations.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/syscall.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b8bba499df11..06308f0206e5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1048,6 +1048,20 @@ static int map_delete_elem(union bpf_attr *attr)
 	return err;
 }
 
+static int bpf_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	int err;
+
+	if (bpf_map_is_dev_bound(map))
+		return bpf_map_offload_get_next_key(map, key, next_key);
+
+	rcu_read_lock();
+	err = map->ops->map_get_next_key(map, key, next_key);
+	rcu_read_unlock();
+
+	return err;
+}
+
 /* last field in 'union bpf_attr' used by this command */
 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
 
@@ -1088,15 +1102,7 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (!next_key)
 		goto free_key;
 
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_get_next_key(map, key, next_key);
-		goto out;
-	}
-
-	rcu_read_lock();
-	err = map->ops->map_get_next_key(map, key, next_key);
-	rcu_read_unlock();
-out:
+	err = bpf_map_get_next_key(map, key, next_key);
 	if (err)
 		goto free_next_key;
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 09/13] tools/bpf: add test for bpf_map_lookup_batch()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Test bpf_map_lookup_batch() functionality.
  $ ./test_maps
  ...
  test_map_lookup_batch:PASS
  ...

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../bpf/map_tests/map_lookup_batch.c          | 166 ++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_lookup_batch.c b/tools/testing/selftests/bpf/map_tests/map_lookup_batch.c
new file mode 100644
index 000000000000..9c88e8adc556
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_lookup_batch.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+static void map_batch_verify(int *visited, __u32 max_entries,
+			     int *keys, int *values)
+{
+	int i;
+
+	memset(visited, 0, max_entries * sizeof(*visited));
+	for (i = 0; i < max_entries; i++) {
+		CHECK(keys[i] + 1 != values[i], "key/value checking",
+		      "error: i %d key %d value %d\n", i, keys[i], values[i]);
+		visited[i] = 1;
+	}
+	for (i = 0; i < max_entries; i++) {
+		CHECK(visited[i] != 1, "visited checking",
+		      "error: keys array at index %d missing\n", i);
+	}
+}
+
+void test_map_lookup_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, *visited, key;
+	const __u32 max_entries = 10;
+	void *p_skey, *p_next_skey;
+	__u32 count, total;
+	int err, i, step;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno));
+
+	/* test 1: lookup an empty hash table, success */
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, NULL, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "empty map", "error: %s\n", strerror(errno));
+	CHECK(p_next_skey || count, "empty map",
+	      "p_next_skey = %p, count = %u\n", p_next_skey, count);
+
+	/* populate elements to the map */
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	count = max_entries;
+	err = bpf_map_update_batch(map_fd, keys, values, &count, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+
+	/* test 2: lookup with count = 0, success */
+	count = 0;
+	err = bpf_map_lookup_batch(map_fd, NULL, NULL, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+
+	/* test 3: lookup with count = max_entries, skey && !nskey, failure */
+	count = max_entries;
+	key = 1;
+	err = bpf_map_lookup_batch(map_fd, &key, NULL, keys, values,
+				   &count, 0, 0);
+	CHECK(!err, "skey && !nskey", "unexpected success\n");
+
+	/* test 4: lookup with count = max_entries, success */
+	memset(keys, 0, max_entries * sizeof(*keys));
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, NULL, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "count = max_entries", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries || p_next_skey != NULL, "count = max_entries",
+	      "count = %u, max_entries = %u, p_next_skey = %p\n",
+	      count, max_entries, p_next_skey);
+
+
+	/* test 5: lookup with count = max_entries, it should return
+	 * success with count = max_entries.
+	 */
+	memset(keys, 0, max_entries * sizeof(*keys));
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, NULL, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "count = max_entries", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries || p_next_skey != NULL, "count = max_entries",
+	      "count = %u, max_entries = %u, p_next_skey = %p\n",
+	      count, max_entries, p_next_skey);
+
+	/* test 6: lookup with an invalid start key, failure */
+	count = max_entries;
+	key = max_entries + 10;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, &key, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(!err, "invalid start_key", "unexpected success\n");
+
+	/* test 7: lookup in a loop with various steps. */
+	for (step = 1; step < max_entries; step++) {
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		p_skey = NULL;
+		p_next_skey = &key;
+		total = 0;
+		i = 0;
+		/* iteratively lookup elements with 'step' elements each */
+		count = step;
+		while (true) {
+			err = bpf_map_lookup_batch(map_fd, p_skey, &p_next_skey,
+						   keys + i * step,
+						   values + i * step,
+						   &count, 0, 0);
+			CHECK(err, "lookup with steps", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_skey)
+				break;
+
+			CHECK(count != step, "lookup with steps",
+			      "i = %d, count = %u, step = %d\n",
+			      i, count, step);
+
+			if (!p_skey)
+				p_skey = p_next_skey;
+			i++;
+		}
+
+		CHECK(total != max_entries, "lookup with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		map_batch_verify(visited, max_entries, keys, values);
+	}
+
+	/* test 8: lookup with keys in keys[]. */
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	err = bpf_map_lookup_batch(map_fd, NULL, NULL, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "lookup key in keys[]", "error: %s\n", strerror(errno));
+	map_batch_verify(visited, max_entries, keys, values);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 07/13] tools/bpf: implement libbpf API functions for map batch operations
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Added four libbpf API functions to support map batch operations:
  . int bpf_map_delete_batch( ... )
  . int bpf_map_lookup_batch( ... )
  . int bpf_map_lookup_and_delete_batch( ... )
  . int bpf_map_update_batch( ... )

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/lib/bpf/bpf.c      | 67 ++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/bpf.h      | 17 ++++++++++
 tools/lib/bpf/libbpf.map |  4 +++
 3 files changed, 88 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index cbb933532981..bdbd660aa2b8 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -438,6 +438,73 @@ int bpf_map_freeze(int fd)
 	return sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr));
 }
 
+static int bpf_map_batch_common(int cmd, int fd, const void *start_key,
+				void **next_start_key,
+				void *keys, void *values,
+				__u32 *count, __u64 elem_flags,
+				__u64 flags)
+{
+	union bpf_attr attr = {};
+	int ret;
+
+	attr.batch.map_fd = fd;
+	attr.batch.start_key = ptr_to_u64(start_key);
+	if (next_start_key)
+		attr.batch.next_start_key = ptr_to_u64(*next_start_key);
+	attr.batch.keys = ptr_to_u64(keys);
+	attr.batch.values = ptr_to_u64(values);
+	if (count)
+		attr.batch.count = *count;
+	attr.batch.elem_flags = elem_flags;
+	attr.batch.flags = flags;
+
+	ret = sys_bpf(cmd, &attr, sizeof(attr));
+	if (next_start_key)
+		*next_start_key = (void *)attr.batch.next_start_key;
+	if (count)
+		*count = attr.batch.count;
+
+	return ret;
+}
+
+int bpf_map_delete_batch(int fd, const void *start_key, void **next_start_key,
+			 void *keys, __u32 *count, __u64 elem_flags,
+			 __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, start_key,
+				    next_start_key, keys, (void *)NULL,
+				    count, elem_flags, flags);
+}
+
+int bpf_map_lookup_batch(int fd, const void *start_key, void **next_start_key,
+			 void *keys, void *values, __u32 *count,
+			 __u64 elem_flags, __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_LOOKUP_BATCH, fd, start_key,
+				    next_start_key, keys, values,
+				    count, elem_flags, flags);
+}
+
+int bpf_map_lookup_and_delete_batch(int fd, const void *start_key,
+				    void **next_start_key, void *keys,
+				    void *values, __u32 *count,
+				    __u64 elem_flags, __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+				    fd, start_key,
+				    next_start_key, keys, values,
+				    count, elem_flags, flags);
+}
+
+int bpf_map_update_batch(int fd, void *keys, void *values, __u32 *count,
+			 __u64 elem_flags, __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH,
+				    fd, (void *)NULL, (void *)NULL,
+				    keys, values,
+				    count, elem_flags, flags);
+}
+
 int bpf_obj_pin(int fd, const char *pathname)
 {
 	union bpf_attr attr;
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 0db01334740f..a36bfcbfe04f 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -120,6 +120,23 @@ LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key,
 LIBBPF_API int bpf_map_delete_elem(int fd, const void *key);
 LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key);
 LIBBPF_API int bpf_map_freeze(int fd);
+LIBBPF_API int bpf_map_delete_batch(int fd, const void *start_key,
+				    void **next_start_key, void *keys,
+				    __u32 *count, __u64 elem_flags,
+				    __u64 flags);
+LIBBPF_API int bpf_map_lookup_batch(int fd, const void *start_key,
+				    void **next_start_key, void *keys,
+				    void *values, __u32 *count,
+				    __u64 elem_flags, __u64 flags);
+LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, const void *start_key,
+					       void **next_start_key,
+					       void *keys, void *values,
+					       __u32 *count, __u64 elem_flags,
+					       __u64 flags);
+LIBBPF_API int bpf_map_update_batch(int fd, void *keys, void *values,
+				    __u32 *count, __u64 elem_flags,
+				    __u64 flags);
+
 LIBBPF_API int bpf_obj_pin(int fd, const char *pathname);
 LIBBPF_API int bpf_obj_get(const char *pathname);
 LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 664ce8e7a60e..7a00630f0ff1 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -188,4 +188,8 @@ LIBBPF_0.0.4 {
 LIBBPF_0.0.5 {
 	global:
 		bpf_btf_get_next_id;
+		bpf_map_delete_batch;
+		bpf_map_lookup_and_delete_batch;
+		bpf_map_lookup_batch;
+		bpf_map_update_batch;
 } LIBBPF_0.0.4;
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 05/13] bpf: adding map batch processing support
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
map entries per syscall.
  https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t

During discussion, we found more use cases can be supported in a similar
map operation batching framework. For example, batched map lookup and delete,
which can be really helpful for bcc.
  https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
  https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138

Also, in bcc, we have API to delete all entries in a map.
  https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264

For map update, batched operations also useful as sometimes applications need
to populate initial maps with more than one entry. For example, the below
example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
  https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550

This patch addresses all the above use cases. To make uapi stable, it also
covers other potential use cases. Four bpf syscall subcommands are introduced:
	BPF_MAP_LOOKUP_BATCH
	BPF_MAP_LOOKUP_AND_DELETE_BATCH
	BPF_MAP_UPDATE_BATCH
	BPF_MAP_DELETE_BATCH

The UAPI attribute structure looks like:
	struct { /* struct used by BPF_MAP_*_BATCH commands */
		__aligned_u64   start_key;      /* input: storing start key,
						 * if NULL, starting from the beginning.
						 */
		__aligned_u64   next_start_key; /* output: storing next batch start_key,
						 * if NULL, no next key.
						 */
		__aligned_u64	keys;		/* input/output: key buffer */
		__aligned_u64	values;		/* input/output: value buffer */
		__u32		count;		/* input: # of keys/values in
						 *   or fits in keys[]/values[].
						 * output: how many successful
						 *   lookup/lookup_and_delete
						 *   /delete/update operations.
						 */
		__u32		map_fd;
		__u64		elem_flags;	/* BPF_MAP_{UPDATE,LOOKUP}_ELEM flags */
		__u64		flags;		/* flags for batch operation */
	} batch;

The 'start_key' and 'next_start_key' are used to BPF_MAP_LOOKUP_BATCH,
BPF_MAP_LOOKUP_AND_DELETE_BATCH and BPF_MAP_DELETE_BATCH
to start the operation on 'start_key' and also set the
next batch start key in 'next_start_key'.

If 'count' is greater than 0 and the return code is 0,
  . the 'count' may be updated to be smaller if there is less
    elements than 'count' for the operation. In such cases,
    'next_start_key' will be set to NULL.
  . the 'count' remains the same. 'next_start_key' could be NULL
    or could point to next start_key for batch processing.

If 'count' is greater than 0 and the return code is an error
other than -EFAULT, the kernel will try to overwrite 'count'
to contain the number of successful element-level (lookup,
lookup_and_delete, update and delete) operations. The following
attributes can be checked:
  . if 'count' value is modified, the new value will be
    the number of successful element-level operations.
  . if 'count' value is modified, the keys[]/values[] will
    contain correct results for new 'count' number of
    operations for lookup[_and_delete] and update.

The implementation in this patch mimics what user space
did, e.g., for lookup_and_delete,
    while(bpf_get_next_key(map, keyp, next_keyp) == 0) {
       bpf_map_delete_elem(map, keyp);
       bpf_map_lookup_elem(map, next_keyp, &value, flags);
       keyp, next_keyp = next_keyp, keyp;
    }
The similar loop is implemented in the kernel, and
each operation, bpf_get_next_key(), bpf_map_delete_elem()
and bpf_map_lookup_elem(), uses existing kernel functions
each of which has its own rcu_read_lock region, bpf_prog_active
protection, etc.
Therefore, it is totally possible that after bpf_get_next_key(),
the bpf_map_delete_elem() or bpf_map_lookup_elem() may fail
as the key may be deleted concurrently by kernel or
other user space processes/threads.
By default, the current implementation permits the loop
to continue, just like typical user space handling. But
a flag, BPF_F_ENFORCE_ENOENT, can be specified, so kernel
will return an error if bpf_map_delete_elem() or
bpf_map_lookup_elem() failed.

The high-level algorithm for BPF_MAP_LOOKUP_BATCH and
BPF_MAP_LOOKUP_AND_DELETE_BATCH:
	if (start_key == NULL and next_start_key == NULL) {
		lookup up to 'count' keys in keys[] and fill
		corresponding values[], and delete those
		keys if BPF_MAP_LOOKUP_AND_DELETE_BATCH.
	} else if (start_key == NULL && next_start_key != NULL) {
		lookup up to 'count' keys from the beginning
		of the map and fill keys[]/values[], delete
		those keys if BPF_MAP_LOOKUP_AND_DELETE_BATCH.
		Set 'next_start_key' for next batch operation.
	} else if (start_key != NULL && next_start_key != NULL) {
		lookup to 'count' keys from 'start_key', inclusive,
		and fill keys[]/values[], delete those keys if
		BPF_MAP_LOOKUP_AND_DELETE_BATCH.
		Set 'next_start_key' for next batch operation.
	}

The high-level algorithm for BPF_MAP_UPDATE_BATCH:
	if (count != 0) {
		do 'count' number of updates on keys[]/values[].
	}

The high-level algorithm for BPF_MAP_DELETE_BATCH:
	if (count == 0) {
		if (start_key == NULL) {
			delete all elements from map.
		} else {
			delete all elements from start_key to the end of map.
		}
	} else {
		if (start_key == NULL and next_start_key == NULL) {
			delete 'count' number of keys in keys[].
		} else if (start_key == NULL and next_start_key != NULL) {
			delete 'count' number of keys from the
			beginning of the map and set 'next_start_key'
			properly.
		} else if (start_key != NULL and next_start_keeey != NULL) {
			delete 'count' number of keys from 'start_key',
			and set 'next_start_key' properly.
		}
	}

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/uapi/linux/bpf.h |  27 +++
 kernel/bpf/syscall.c     | 448 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 475 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5d2fb183ee2d..576688f13e8c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -107,6 +107,10 @@ enum bpf_cmd {
 	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 	BPF_MAP_FREEZE,
 	BPF_BTF_GET_NEXT_ID,
+	BPF_MAP_LOOKUP_BATCH,
+	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+	BPF_MAP_UPDATE_BATCH,
+	BPF_MAP_DELETE_BATCH,
 };
 
 enum bpf_map_type {
@@ -347,6 +351,9 @@ enum bpf_attach_type {
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
+/* flags for BPF_MAP_*_BATCH */
+#define BPF_F_ENFORCE_ENOENT	(1U << 0)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -396,6 +403,26 @@ union bpf_attr {
 		__u64		flags;
 	};
 
+	struct { /* struct used by BPF_MAP_*_BATCH commands */
+		__aligned_u64	start_key;	/* input: storing start key,
+						 * if NULL, starting from the beginning.
+						 */
+		__aligned_u64	next_start_key;	/* output: storing next batch start_key,
+						 * if NULL, no next key.
+						 */
+		__aligned_u64	keys;		/* input/output: key buffer */
+		__aligned_u64	values;		/* input/output: value buffer */
+		__u32		count;		/* input: # of keys/values in
+						 *   or fits in keys[]/values[].
+						 * output: how many successful
+						 *   lookup/lookup_and_delete
+						 *   update/delete operations.
+						 */
+		__u32		map_fd;
+		__u64		elem_flags;	/* BPF_MAP_*_ELEM flags */
+		__u64		flags;		/* flags for batch operation */
+	} batch;
+
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
 		__u32		prog_type;	/* one of enum bpf_prog_type */
 		__u32		insn_cnt;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 06308f0206e5..8746b55405f9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -33,6 +33,17 @@
 
 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
 
+#define BPF_MAP_BATCH_SWAP_KEYS(key1, key2, buf1, buf2)	\
+	do {						\
+		if (key1 == (buf1)) {			\
+			key1 = buf2;			\
+			key2 = buf1;			\
+		} else {				\
+			key1 = buf1;			\
+			key2 = buf2;			\
+		}					\
+	} while (0)					\
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -1183,6 +1194,431 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 	return err;
 }
 
+static void __map_batch_get_attrs(const union bpf_attr *attr,
+				  void __user **skey, void __user **nskey,
+				  void __user **keys, void __user **values,
+				  u32 *max_count, u64 *elem_flags, u64 *flags)
+{
+	*skey = u64_to_user_ptr(attr->batch.start_key);
+	*nskey = u64_to_user_ptr(attr->batch.next_start_key);
+	*keys = u64_to_user_ptr(attr->batch.keys);
+	*values = u64_to_user_ptr(attr->batch.values);
+	*max_count = attr->batch.count;
+	*elem_flags = attr->batch.elem_flags;
+	*flags = attr->batch.flags;
+}
+
+static int
+__map_lookup_delete_batch_key_in_keys(struct bpf_map *map, void *key, void *value,
+				      u32 max_count, u32 key_size, u32 value_size,
+				      u64 elem_flags, void __user *keys,
+				      void __user *values,
+				      union bpf_attr __user *uattr,
+				      bool ignore_enoent)
+{
+	u32 count, missed = 0;
+	int ret = 0, err;
+
+	for (count = 0; count < max_count; count++) {
+		if (copy_from_user(key, keys + count * key_size, key_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = bpf_map_copy_value(map, key, value, elem_flags);
+		if (ret) {
+			if (ret != -ENOENT || !ignore_enoent)
+				break;
+
+			missed++;
+			continue;
+		}
+
+
+		if (copy_to_user(values + count * value_size, value, value_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = bpf_map_delete_elem(map, key);
+		if (ret) {
+			if (ret != -ENOENT || !ignore_enoent)
+				break;
+
+			missed++;
+		}
+	}
+
+	count -= missed;
+	if ((!ret && missed) || (ret && ret != -EFAULT)) {
+		err = put_user(count, &uattr->batch.count);
+		ret = err ? : ret;
+	}
+
+	return ret;
+}
+
+static int map_lookup_and_delete_batch(struct bpf_map *map,
+				       const union bpf_attr *attr,
+				       union bpf_attr __user *uattr,
+				       bool do_delete)
+{
+	u32 max_count, count = 0, key_size, roundup_key_size, value_size;
+	bool ignore_enoent, nextkey_is_null, copied;
+	void *buf = NULL, *key, *value, *next_key;
+	void __user *skey, *nskey, *keys, *values;
+	u64 elem_flags, flags, zero = 0;
+	int err, ret = 0;
+
+	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+	    map->map_type == BPF_MAP_TYPE_STACK)
+		return -ENOTSUPP;
+
+	__map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
+			      &elem_flags, &flags);
+
+	if (elem_flags & ~BPF_F_LOCK || flags & ~BPF_F_ENFORCE_ENOENT)
+		return -EINVAL;
+
+	if (!max_count)
+		return 0;
+
+	/* The following max_count/skey/nskey combinations are supported:
+	 * max_count != 0 && !skey && !nskey: loop/delete max_count elements in keys[]/values[].
+	 * max_count != 0 && !skey && nskey: loop/delete max_count elements starting from map start.
+	 * max_count != 0 && skey && nskey: loop/delete max_count elements starting from skey.
+	 */
+	if (skey && !nskey)
+		return -EINVAL;
+
+	/* allocate space for two keys and one value. */
+	key_size = map->key_size;
+	roundup_key_size = round_up(map->key_size, 8);
+	value_size = bpf_map_value_size(map);
+	buf = kmalloc(roundup_key_size * 2 + value_size, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return -ENOMEM;
+
+	key = buf;
+	next_key = buf + roundup_key_size;
+	value = buf + roundup_key_size * 2;
+	ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
+
+	if (!skey && !nskey) {
+		/* handle cases where keys in keys[] */
+		ret = __map_lookup_delete_batch_key_in_keys(map, key, value, max_count,
+							    key_size, value_size,
+							    elem_flags, keys, values,
+							    uattr, ignore_enoent);
+		goto out;
+	}
+
+	/* Get the first key. */
+	if (!skey) {
+		ret = bpf_map_get_next_key(map, NULL, key);
+		if (ret) {
+			nextkey_is_null = true;
+			goto after_loop;
+		}
+	} else if (copy_from_user(key, skey, key_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Copy the first key/value pair */
+	ret = bpf_map_copy_value(map, key, value, elem_flags);
+	if (ret) {
+		if (skey)
+			goto out;
+
+		nextkey_is_null = true;
+		goto after_loop;
+	}
+
+	if (copy_to_user(keys, key, key_size) ||
+	    copy_to_user(values, value, value_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* We will always try to get next_key first
+	 * and then delete the current key.
+	 */
+	ret = bpf_map_get_next_key(map, key, next_key);
+	count = 0;
+	nextkey_is_null = false;
+	copied = true;
+
+again:
+	/* delete key */
+	if (do_delete) {
+		err = bpf_map_delete_elem(map, key);
+		if (err && (err != -ENOENT || !ignore_enoent)) {
+			/* failed to delete "key".
+			 * lookup_delete will considered failed.
+			 */
+			ret = err;
+			goto after_loop;
+		}
+	}
+
+	/* deletion in lookup_and_delete is successful. */
+	/* check bpf_map_get_next_key() result. */
+	count += !!copied;
+	nextkey_is_null = ret && ret == -ENOENT;
+	if (ret || count >= max_count)
+		goto after_loop;
+
+	/* copy value corresponding to next_key */
+	ret = bpf_map_copy_value(map, next_key, value, elem_flags);
+	copied = true;
+	if (ret) {
+		copied = false;
+		if (ret != -ENOENT || !ignore_enoent)
+			goto after_loop;
+	} else {
+		if (copy_to_user(keys + count * key_size, next_key, key_size) ||
+		    copy_to_user(values + count * value_size, value, value_size)) {
+			ret = -EFAULT;
+			goto after_loop;
+		}
+	}
+
+	BPF_MAP_BATCH_SWAP_KEYS(key, next_key, buf, buf + roundup_key_size);
+	ret = bpf_map_get_next_key(map, key, next_key);
+	goto again;
+
+after_loop:
+	if (!ret) {
+		if (put_user(count, &uattr->batch.count) ||
+		    copy_to_user(nskey, next_key, key_size))
+			ret = -EFAULT;
+	} else if (nextkey_is_null) {
+		ret = 0;
+		if (put_user(count, &uattr->batch.count) ||
+                    copy_to_user(&uattr->batch.next_start_key, &zero, sizeof(u64)))
+			ret = -EFAULT;
+	} else if (ret != -EFAULT) {
+		if (put_user(count, &uattr->batch.count))
+			ret = -EFAULT;
+	}
+
+out:
+	kfree(buf);
+	return ret;
+}
+
+static int map_update_batch(struct bpf_map *map,
+			    const union bpf_attr *attr,
+			    union bpf_attr __user *uattr,
+			    struct fd *f)
+{
+	u32 max_count, count, missed, key_size, roundup_key_size, value_size;
+	void __user *skey, *nskey, *keys, *values;
+	void *buf = NULL, *key, *value;
+	u64 elem_flags, flags;
+	bool ignore_enoent;
+	int ret, err;
+
+	__map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
+			      &elem_flags, &flags);
+
+	if (flags & ~BPF_F_ENFORCE_ENOENT)
+		return -EINVAL;
+
+	if (!max_count)
+		return 0;
+
+	/* The following max_count/skey/nskey combinations are supported:
+	 * max_count != 0 && !skey && !nskey: update max_count elements in keys[]/values[].
+	 */
+	if (nskey || skey)
+		return -EINVAL;
+
+	if ((elem_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))
+		return -EINVAL;
+
+	key_size = map->key_size;
+	roundup_key_size = round_up(key_size, 8);
+	value_size = bpf_map_value_size(map);
+	buf = kmalloc(roundup_key_size + value_size, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return -ENOMEM;
+
+	key = buf;
+	value = buf + roundup_key_size;
+	ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
+
+	missed = 0;
+	for (count = 0; count < max_count; count++) {
+		if (copy_from_user(key, keys + count * key_size, key_size) ||
+		    copy_from_user(value, values + count * value_size, value_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = bpf_map_update_elem(map, key, value, f, elem_flags);
+		if (ret && (ret != -ENOENT || !ignore_enoent))
+			break;
+
+		missed += !!ret;
+		ret = 0;
+	}
+	count -= missed;
+	if ((!ret && missed) || (ret && ret != -EFAULT)) {
+		err = put_user(count, &uattr->batch.count);
+		// Reset the error code if the above put_user failed.
+		ret = err ? : ret;
+	}
+
+	kfree(buf);
+	return ret;
+}
+
+static int map_delete_batch(struct bpf_map *map,
+			    const union bpf_attr *attr,
+			    union bpf_attr __user *uattr)
+{
+	u32 max_count, count, key_size, roundup_key_size;
+	void __user *skey, *nskey, *keys, *values;
+	bool ignore_enoent, nextkey_is_null;
+	void *buf = NULL, *key, *next_key;
+	u64 elem_flags, flags, zero = 0;
+	int ret, err;
+
+	__map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
+			      &elem_flags, &flags);
+
+	if (elem_flags || flags & ~BPF_F_ENFORCE_ENOENT)
+		return -EINVAL;
+
+	/* The following max_count/skey/nskey combinations are supported:
+	 * max_count == 0 && !skey && !nskey: delete all elements in the map
+	 * max_count == 0 && skey && !nskey: delete all elements starting from skey.
+	 * max_count != 0 && !skey && nskey: delete max_count elements starting from map start.
+	 * max_count != 0 && skey && nskey: delete max_count elements starting from skey.
+	 * max_count != 0 && !skey && !nskey: delete max_count elements in keys[].
+	 */
+	if ((max_count == 0 && nskey) || (max_count != 0 && skey && !nskey))
+		return -EINVAL;
+
+	ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
+	key_size = map->key_size;
+	roundup_key_size = round_up(map->key_size, 8);
+	buf = kmalloc(roundup_key_size * 2, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return -ENOMEM;
+
+	key = buf;
+	next_key = buf + roundup_key_size;
+
+	nextkey_is_null = false;
+
+	if (max_count == 0 || nskey) {
+		count = 0;
+		if (skey) {
+			if (copy_from_user(key, skey, key_size)) {
+				ret = -EFAULT;
+				goto out;
+			}
+		} else {
+			ret = bpf_map_get_next_key(map, NULL, key);
+			if (ret) {
+				nextkey_is_null = true;
+				goto after_loop;
+			}
+		}
+
+		while (max_count == 0 || count < max_count) {
+			ret = bpf_map_get_next_key(map, key, next_key);
+
+			err = bpf_map_delete_elem(map, key);
+			if (err && (err != -ENOENT || !ignore_enoent)) {
+				ret = err;
+				break;
+			}
+
+			count += !err;
+
+			/* check bpf_map_get_next_key() result */
+			if (ret) {
+				nextkey_is_null = ret == -ENOENT;
+				break;
+			}
+
+			BPF_MAP_BATCH_SWAP_KEYS(key, next_key, buf, buf + roundup_key_size);
+		}
+
+after_loop:
+		if (!ret) {
+			if (copy_to_user(nskey, key, key_size))
+				ret = -EFAULT;
+		} else if (nextkey_is_null) {
+			ret = 0;
+			if (copy_to_user(&uattr->batch.next_start_key, &zero, sizeof(u64)))
+				ret = -EFAULT;
+		}
+	} else {
+		for (count = 0; count < max_count;) {
+			if (copy_from_user(key, keys + count * key_size, key_size)) {
+				ret = -EFAULT;
+				break;
+			}
+
+			ret = bpf_map_delete_elem(map, key);
+			if (ret && (ret != -ENOENT || !ignore_enoent))
+				break;
+
+			count += !ret;
+		}
+	}
+
+	if (ret != -EFAULT && put_user(count, &uattr->batch.count))
+		ret = -EFAULT;
+
+out:
+	kfree(buf);
+	return ret;
+}
+
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+			    union bpf_attr __user *uattr,
+			    int cmd)
+{
+	struct bpf_map *map;
+	int err, ufd;
+	struct fd f;
+
+	if (CHECK_ATTR(BPF_MAP_BATCH))
+		return -EINVAL;
+
+	ufd = attr->batch.map_fd;
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	if (cmd == BPF_MAP_LOOKUP_BATCH)
+		err = map_lookup_and_delete_batch(map, attr, uattr, false);
+	else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
+		err = map_lookup_and_delete_batch(map, attr, uattr, true);
+	else if (cmd == BPF_MAP_UPDATE_BATCH)
+		err = map_update_batch(map, attr, uattr, &f);
+	else /* BPF_MAP_DELETE_BATCH */
+		err = map_delete_batch(map, attr, uattr);
+
+err_put:
+	fdput(f);
+	return err;
+
+}
+
 #define BPF_MAP_FREEZE_LAST_FIELD map_fd
 
 static int map_freeze(const union bpf_attr *attr)
@@ -2939,6 +3375,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
 		err = map_lookup_and_delete_elem(&attr);
 		break;
+	case BPF_MAP_LOOKUP_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+		break;
+	case BPF_MAP_UPDATE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+		break;
+	case BPF_MAP_DELETE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 02/13] bpf: refactor map_update_elem()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Refactor function map_update_elem() by creating a
helper function bpf_map_update_elem() which will be
used later by batched map update operation.

Also reuse function bpf_map_value_size()
in map_update_elem().

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/syscall.c | 113 ++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 56 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 211e0bc667bd..3caa0ab3d30d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -878,6 +878,61 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
 		synchronize_rcu();
 }
 
+static int bpf_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       struct fd *f, __u64 flags) {
+	int err;
+
+	/* Need to create a kthread, thus must support schedule */
+	if (bpf_map_is_dev_bound(map)) {
+		return bpf_map_offload_update_elem(map, key, value, flags);
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+		   map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+		return map->ops->map_update_elem(map, key, value, flags);
+	}
+
+	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+	 * inside bpf map update or delete otherwise deadlocks are possible
+	 */
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		err = bpf_percpu_hash_update(map, key, value, flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		err = bpf_percpu_array_update(map, key, value, flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_update(map, key, value,
+						       flags);
+	} else if (IS_FD_ARRAY(map)) {
+		rcu_read_lock();
+		err = bpf_fd_array_map_update_elem(map, f->file, key, value,
+						   flags);
+		rcu_read_unlock();
+	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+		rcu_read_lock();
+		err = bpf_fd_htab_map_update_elem(map, f->file, key, value,
+						  flags);
+		rcu_read_unlock();
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		/* rcu_read_lock() is not needed */
+		err = bpf_fd_reuseport_array_update_elem(map, key, value,
+							 flags);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_push_elem(map, value, flags);
+	} else {
+		rcu_read_lock();
+		err = map->ops->map_update_elem(map, key, value, flags);
+		rcu_read_unlock();
+	}
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+	maybe_wait_bpf_programs(map);
+
+	return err;
+}
+
 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
 static int map_update_elem(union bpf_attr *attr)
@@ -915,13 +970,7 @@ static int map_update_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
-		value_size = round_up(map->value_size, 8) * num_possible_cpus();
-	else
-		value_size = map->value_size;
+	value_size = bpf_map_value_size(map);
 
 	err = -ENOMEM;
 	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -932,56 +981,8 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
-	/* Need to create a kthread, thus must support schedule */
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_update_elem(map, key, value, attr->flags);
-		goto out;
-	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
-		   map->map_type == BPF_MAP_TYPE_SOCKHASH ||
-		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-		err = map->ops->map_update_elem(map, key, value, attr->flags);
-		goto out;
-	}
+	err = bpf_map_update_elem(map, key, value, &f, attr->flags);
 
-	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
-	 * inside bpf map update or delete otherwise deadlocks are possible
-	 */
-	preempt_disable();
-	__this_cpu_inc(bpf_prog_active);
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
-		err = bpf_percpu_hash_update(map, key, value, attr->flags);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
-		err = bpf_percpu_array_update(map, key, value, attr->flags);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
-		err = bpf_percpu_cgroup_storage_update(map, key, value,
-						       attr->flags);
-	} else if (IS_FD_ARRAY(map)) {
-		rcu_read_lock();
-		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
-						   attr->flags);
-		rcu_read_unlock();
-	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
-		rcu_read_lock();
-		err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
-						  attr->flags);
-		rcu_read_unlock();
-	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
-		/* rcu_read_lock() is not needed */
-		err = bpf_fd_reuseport_array_update_elem(map, key, value,
-							 attr->flags);
-	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
-		   map->map_type == BPF_MAP_TYPE_STACK) {
-		err = map->ops->map_push_elem(map, value, attr->flags);
-	} else {
-		rcu_read_lock();
-		err = map->ops->map_update_elem(map, key, value, attr->flags);
-		rcu_read_unlock();
-	}
-	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
-	maybe_wait_bpf_programs(map);
-out:
 free_value:
 	kfree(value);
 free_key:
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 01/13] bpf: add bpf_map_value_size and bp_map_copy_value helper functions
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

From: Brian Vazquez <brianvv@google.com>

Move reusable code from map_lookup_elem to helper functions to avoid code
duplication in kernel/bpf/syscall.c

Suggested-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Brian Vazquez <brianvv@google.com>
---
 kernel/bpf/syscall.c | 134 +++++++++++++++++++++++--------------------
 1 file changed, 73 insertions(+), 61 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ca60eafa6922..211e0bc667bd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -126,6 +126,76 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
+static u32 bpf_map_value_size(struct bpf_map *map)
+{
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+		return round_up(map->value_size, 8) * num_possible_cpus();
+	else if (IS_FD_MAP(map))
+		return sizeof(u32);
+	else
+		return  map->value_size;
+}
+
+static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
+			      __u64 flags)
+{
+	void *ptr;
+	int err;
+
+	if (bpf_map_is_dev_bound(map))
+		return  bpf_map_offload_lookup_elem(map, key, value);
+
+	preempt_disable();
+	this_cpu_inc(bpf_prog_active);
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		err = bpf_percpu_hash_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+		err = bpf_stackmap_copy(map, key, value);
+	} else if (IS_FD_ARRAY(map)) {
+		err = bpf_fd_array_map_lookup_elem(map, key, value);
+	} else if (IS_FD_HASH(map)) {
+		err = bpf_fd_htab_map_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_peek_elem(map, value);
+	} else {
+		rcu_read_lock();
+		if (map->ops->map_lookup_elem_sys_only)
+			ptr = map->ops->map_lookup_elem_sys_only(map, key);
+		else
+			ptr = map->ops->map_lookup_elem(map, key);
+		if (IS_ERR(ptr)) {
+			err = PTR_ERR(ptr);
+		} else if (!ptr) {
+			err = -ENOENT;
+		} else {
+			err = 0;
+			if (flags & BPF_F_LOCK)
+				/* lock 'ptr' and copy everything but lock */
+				copy_map_value_locked(map, value, ptr, true);
+			else
+				copy_map_value(map, value, ptr);
+			/* mask lock, since value wasn't zero inited */
+			check_and_init_map_lock(map, value);
+		}
+		rcu_read_unlock();
+	}
+	this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+
+	return err;
+}
+
 void *bpf_map_area_alloc(size_t size, int numa_node)
 {
 	/* We really just want to fail instead of triggering OOM killer
@@ -739,7 +809,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	void __user *uvalue = u64_to_user_ptr(attr->value);
 	int ufd = attr->map_fd;
 	struct bpf_map *map;
-	void *key, *value, *ptr;
+	void *key, *value;
 	u32 value_size;
 	struct fd f;
 	int err;
@@ -771,72 +841,14 @@ static int map_lookup_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
-		value_size = round_up(map->value_size, 8) * num_possible_cpus();
-	else if (IS_FD_MAP(map))
-		value_size = sizeof(u32);
-	else
-		value_size = map->value_size;
+	value_size = bpf_map_value_size(map);
 
 	err = -ENOMEM;
 	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
 	if (!value)
 		goto free_key;
 
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_lookup_elem(map, key, value);
-		goto done;
-	}
-
-	preempt_disable();
-	this_cpu_inc(bpf_prog_active);
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
-		err = bpf_percpu_hash_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
-		err = bpf_percpu_array_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
-		err = bpf_percpu_cgroup_storage_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
-		err = bpf_stackmap_copy(map, key, value);
-	} else if (IS_FD_ARRAY(map)) {
-		err = bpf_fd_array_map_lookup_elem(map, key, value);
-	} else if (IS_FD_HASH(map)) {
-		err = bpf_fd_htab_map_lookup_elem(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
-		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
-		   map->map_type == BPF_MAP_TYPE_STACK) {
-		err = map->ops->map_peek_elem(map, value);
-	} else {
-		rcu_read_lock();
-		if (map->ops->map_lookup_elem_sys_only)
-			ptr = map->ops->map_lookup_elem_sys_only(map, key);
-		else
-			ptr = map->ops->map_lookup_elem(map, key);
-		if (IS_ERR(ptr)) {
-			err = PTR_ERR(ptr);
-		} else if (!ptr) {
-			err = -ENOENT;
-		} else {
-			err = 0;
-			if (attr->flags & BPF_F_LOCK)
-				/* lock 'ptr' and copy everything but lock */
-				copy_map_value_locked(map, value, ptr, true);
-			else
-				copy_map_value(map, value, ptr);
-			/* mask lock, since value wasn't zero inited */
-			check_and_init_map_lock(map, value);
-		}
-		rcu_read_unlock();
-	}
-	this_cpu_dec(bpf_prog_active);
-	preempt_enable();
-
-done:
+	err = bpf_map_copy_value(map, key, value, attr->flags);
 	if (err)
 		goto free_value;
 
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH 3/3] Enable ptp_kvm for arm64
From: Jianyong Wu @ 2019-08-29  6:39 UTC (permalink / raw)
  To: netdev, pbonzini, sean.j.christopherson, maz, richardcochran,
	Mark.Rutland, Will.Deacon, suzuki.poulose
  Cc: linux-kernel, Steve.Capper, Kaly.Xin, justin.he, jianyong.wu
In-Reply-To: <20190829063952.18470-1-jianyong.wu@arm.com>

Currently in arm64 virtualization environment, there is no mechanism to
keep time sync between guest and host. Time in guest will drift compared
with host after boot up as they may both use third party time sources
to correct their time respectively. The time deviation will be in order
of milliseconds but some scenarios ask for higher time precision, like
in cloud envirenment, we want all the VMs running in the host aquire the
same level accuracy from host clock.

Use of kvm ptp clock, which choose the host clock source clock as a
reference clock to sync time clock between guest and host has been adopted
by x86 which makes the time sync order from milliseconds to nanoseconds.

This patch enable kvm ptp on arm64 and we get the similar clock drift as
found with x86 with kvm ptp.

Test result comparison between with kvm ptp and without it in arm64 are
as follows. This test derived from the result of command 'chronyc
sources'. we should take more cure of the last sample column which shows
the offset between the local clock and the source at the last measurement.

no kvm ptp in guest:
MS Name/IP address   Stratum Poll Reach LastRx Last sample
========================================================================
^* dns1.synet.edu.cn      2   6   377    13  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    21  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    29  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    37  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    45  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    53  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    61  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377     4   -130us[ +796us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    12   -130us[ +796us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    20   -130us[ +796us] +/-   21ms

in host:
MS Name/IP address   Stratum Poll Reach LastRx Last sample
========================================================================
^* 120.25.115.20          2   7   377    72   -470us[ -603us] +/-   18ms
^* 120.25.115.20          2   7   377    92   -470us[ -603us] +/-   18ms
^* 120.25.115.20          2   7   377   112   -470us[ -603us] +/-   18ms
^* 120.25.115.20          2   7   377     2   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377    22   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377    43   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377    63   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377    83   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377   103   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377   123   +872ns[-6808ns] +/-   17ms

The dns1.synet.edu.cn is the network reference clock for guest and
120.25.115.20 is the network reference clock for host. we can't get the
clock error between guest and host directly, but a roughly estimated value
will be in order of hundreds of us to ms.

with kvm ptp in guest:
chrony has been disabled in host to remove the disturb by network clock.

MS Name/IP address         Stratum Poll Reach LastRx Last sample
========================================================================
* PHC0                    0   3   377     8     -7ns[   +1ns] +/-    3ns
* PHC0                    0   3   377     8     +1ns[  +16ns] +/-    3ns
* PHC0                    0   3   377     6     -4ns[   -0ns] +/-    6ns
* PHC0                    0   3   377     6     -8ns[  -12ns] +/-    5ns
* PHC0                    0   3   377     5     +2ns[   +4ns] +/-    4ns
* PHC0                    0   3   377    13     +2ns[   +4ns] +/-    4ns
* PHC0                    0   3   377    12     -4ns[   -6ns] +/-    4ns
* PHC0                    0   3   377    11     -8ns[  -11ns] +/-    6ns
* PHC0                    0   3   377    10    -14ns[  -20ns] +/-    4ns
* PHC0                    0   3   377     8     +4ns[   +5ns] +/-    4ns

The PHC0 is the ptp clock which choose the host clock as its source
clock. So we can be sure to say that the clock error between host and guest
is in order of ns.

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
---
 arch/arm64/include/asm/arch_timer.h  |  3 ++
 arch/arm64/kvm/arch_ptp_kvm.c        | 76 ++++++++++++++++++++++++++++
 drivers/clocksource/arm_arch_timer.c |  6 ++-
 drivers/ptp/Kconfig                  |  2 +-
 include/linux/arm-smccc.h            | 14 +++++
 virt/kvm/arm/psci.c                  | 17 +++++++
 6 files changed, 115 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm64/kvm/arch_ptp_kvm.c

diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
index 6756178c27db..880576a814b6 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -229,4 +229,7 @@ static inline int arch_timer_arch_init(void)
 	return 0;
 }
 
+extern struct clocksource clocksource_counter;
+extern u64 arch_counter_read(struct clocksource *cs);
+
 #endif
diff --git a/arch/arm64/kvm/arch_ptp_kvm.c b/arch/arm64/kvm/arch_ptp_kvm.c
new file mode 100644
index 000000000000..6b2165ebce62
--- /dev/null
+++ b/arch/arm64/kvm/arch_ptp_kvm.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Virtual PTP 1588 clock for use with KVM guests
+ *  Copyright (C) 2019 ARM Ltd.
+ *  All Rights Reserved
+ */
+
+#include <asm/hypervisor.h>
+#include <linux/module.h>
+#include <linux/psci.h>
+#include <linux/arm-smccc.h>
+#include <linux/timecounter.h>
+#include <linux/sched/clock.h>
+#include <asm/arch_timer.h>
+
+/*
+ * as trap call cause delay, this function will return the delay in nanosecond
+ */
+static u64 arm_smccc_1_1_invoke_delay(u32 id, struct arm_smccc_res *res)
+{
+	u64 ns, t1, t2;
+
+	t1 = sched_clock();
+	arm_smccc_1_1_invoke(id, res);
+	t2 = sched_clock();
+	t2 -= t1;
+	ns = t2;
+	return ns;
+}
+
+int kvm_arch_ptp_init(void)
+{
+	return 0;
+}
+
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{
+	u64 ns;
+	struct arm_smccc_res hvc_res;
+
+	if (!kvm_arm_hyp_service_available(
+			ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID)) {
+		return -EOPNOTSUPP;
+	}
+	ns = arm_smccc_1_1_invoke_delay(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
+					&hvc_res);
+	ts->tv_sec = hvc_res.a0;
+	ts->tv_nsec = hvc_res.a1;
+	timespec64_add_ns(ts, ns);
+	return 0;
+}
+
+int kvm_arch_ptp_get_clock_fn(long *cycle, struct timespec64 *ts,
+			      struct clocksource **cs)
+{
+	u64 ns;
+	struct arm_smccc_res hvc_res;
+
+	if (!kvm_arm_hyp_service_available(
+			ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID)) {
+		return -EOPNOTSUPP;
+	}
+	ns = arm_smccc_1_1_invoke_delay(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
+					&hvc_res);
+	ts->tv_sec = hvc_res.a0;
+	ts->tv_nsec = hvc_res.a1;
+	timespec64_add_ns(ts, ns);
+	*cycle = hvc_res.a2;
+	*cs = &clocksource_counter;
+
+	return 0;
+}
+
+MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
+MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
+MODULE_LICENSE("GPL");
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 07e57a49d1e8..021e3f69364c 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -175,23 +175,25 @@ static notrace u64 arch_counter_get_cntvct(void)
 u64 (*arch_timer_read_counter)(void) = arch_counter_get_cntvct;
 EXPORT_SYMBOL_GPL(arch_timer_read_counter);
 
-static u64 arch_counter_read(struct clocksource *cs)
+u64 arch_counter_read(struct clocksource *cs)
 {
 	return arch_timer_read_counter();
 }
+EXPORT_SYMBOL(arch_counter_read);
 
 static u64 arch_counter_read_cc(const struct cyclecounter *cc)
 {
 	return arch_timer_read_counter();
 }
 
-static struct clocksource clocksource_counter = {
+struct clocksource clocksource_counter = {
 	.name	= "arch_sys_counter",
 	.rating	= 400,
 	.read	= arch_counter_read,
 	.mask	= CLOCKSOURCE_MASK(56),
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 };
+EXPORT_SYMBOL(clocksource_counter);
 
 static struct cyclecounter cyclecounter __ro_after_init = {
 	.read	= arch_counter_read_cc,
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 9b8fee5178e8..e032fafdafa7 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -110,7 +110,7 @@ config PTP_1588_CLOCK_PCH
 config PTP_1588_CLOCK_KVM
 	tristate "KVM virtual PTP clock"
 	depends on PTP_1588_CLOCK
-	depends on KVM_GUEST && X86
+	depends on KVM_GUEST && X86 || ARM64
 	default y
 	help
 	  This driver adds support for using kvm infrastructure as a PTP
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index a6e4d3e3d10a..2a222a1a8594 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -94,6 +94,7 @@
 
 /* KVM "vendor specific" services */
 #define ARM_SMCCC_KVM_FUNC_FEATURES		0
+#define ARM_SMCCC_KVM_PTP			1
 #define ARM_SMCCC_KVM_FUNC_FEATURES_2		127
 #define ARM_SMCCC_KVM_NUM_FUNCS			128
 
@@ -102,6 +103,16 @@
 			   ARM_SMCCC_SMC_32,				\
 			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
 			   ARM_SMCCC_KVM_FUNC_FEATURES)
+/*
+ * This ID used for virtual ptp kvm clock and it will pass second value
+ * and nanosecond value of host real time and system counter by vcpu
+ * register to guest.
+ */
+#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID				\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_32,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   ARM_SMCCC_KVM_PTP)
 
 #ifndef __ASSEMBLY__
 
@@ -373,5 +384,8 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 		method;							\
 	})
 
+#include <linux/psci.h>
+#include <linux/clocksource.h>
+
 #endif /*__ASSEMBLY__*/
 #endif /*__LINUX_ARM_SMCCC_H*/
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index 0debf49bf259..7fffdb25d32c 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -392,6 +392,8 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 	u32 func_id = smccc_get_function(vcpu);
 	u32 val[4] = {};
 	u32 option;
+	struct timespec *ts;
+	u64 cnt;
 
 	val[0] = SMCCC_RET_NOT_SUPPORTED;
 
@@ -431,6 +433,21 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 	case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
 		val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
 		break;
+	/*
+	 * This will used for virtual ptp kvm clock. three
+	 * values will be passed back.
+	 * reg0 stores seconds of host real time;
+	 * reg1 stores nanoseconds of host real time;
+	 * reg2 stotes system counter cycle value.
+	 */
+	case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+		getnstimeofday(ts);
+		cnt = arch_timer_read_counter();
+		val[0] = ts->tv_sec;
+		val[1] = ts->tv_nsec;
+		val[2] = cnt;
+		val[3] = 0;
+		break;
 	default:
 		return kvm_psci_call(vcpu);
 	}
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH 2/3] reorganize ptp_kvm modules to make it arch-independent.
From: Jianyong Wu @ 2019-08-29  6:39 UTC (permalink / raw)
  To: netdev, pbonzini, sean.j.christopherson, maz, richardcochran,
	Mark.Rutland, Will.Deacon, suzuki.poulose
  Cc: linux-kernel, Steve.Capper, Kaly.Xin, justin.he, jianyong.wu
In-Reply-To: <20190829063952.18470-1-jianyong.wu@arm.com>

Currently, ptp_kvm modules implementation is only for x86 which includs
large part of arch-specific code.  This patch move all of those code
into related arch directory.

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
---
 arch/x86/kvm/arch_ptp_kvm.c          | 92 ++++++++++++++++++++++++++++
 drivers/ptp/Makefile                 |  1 +
 drivers/ptp/{ptp_kvm.c => kvm_ptp.c} | 77 ++++++-----------------
 include/asm-generic/ptp_kvm.h        | 12 ++++
 4 files changed, 123 insertions(+), 59 deletions(-)
 create mode 100644 arch/x86/kvm/arch_ptp_kvm.c
 rename drivers/ptp/{ptp_kvm.c => kvm_ptp.c} (63%)
 create mode 100644 include/asm-generic/ptp_kvm.h

diff --git a/arch/x86/kvm/arch_ptp_kvm.c b/arch/x86/kvm/arch_ptp_kvm.c
new file mode 100644
index 000000000000..56ea84a86da2
--- /dev/null
+++ b/arch/x86/kvm/arch_ptp_kvm.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Virtual PTP 1588 clock for use with KVM guests
+ *
+ *  Copyright (C) 2019 ARM Ltd.
+ *  All Rights Reserved
+ */
+
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#include <linux/module.h>
+#include <uapi/asm/kvm_para.h>
+#include <uapi/linux/kvm_para.h>
+#include <linux/ptp_clock_kernel.h>
+
+phys_addr_t clock_pair_gpa;
+struct kvm_clock_pairing clock_pair;
+struct pvclock_vsyscall_time_info *hv_clock;
+
+int kvm_arch_ptp_init(void)
+{
+	int ret;
+
+	if (!kvm_para_available())
+		return -ENODEV;
+
+	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
+	hv_clock = pvclock_get_pvti_cpu0_va();
+	if (!hv_clock)
+		return -ENODEV;
+
+	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
+			     KVM_CLOCK_PAIRING_WALLCLOCK);
+	if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
+		return -ENODEV;
+
+	return 0;
+}
+
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{
+	long ret;
+
+	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+			     clock_pair_gpa,
+			     KVM_CLOCK_PAIRING_WALLCLOCK);
+	if (ret != 0)
+		return -EOPNOTSUPP;
+
+	ts->tv_sec = clock_pair.sec;
+	ts->tv_nsec = clock_pair.nsec;
+
+	return 0;
+}
+
+int kvm_arch_ptp_get_clock_fn(long *cycle, struct timespec64 *tspec,
+			      struct clocksource **cs)
+{
+	unsigned long ret;
+	unsigned int version;
+	int cpu;
+	struct pvclock_vcpu_time_info *src;
+
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+
+	do {
+		/*
+		 * We are using a TSC value read in the hosts
+		 * kvm_hc_clock_pairing handling.
+		 * So any changes to tsc_to_system_mul
+		 * and tsc_shift or any other pvclock
+		 * data invalidate that measurement.
+		 */
+		version = pvclock_read_begin(src);
+
+		ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+				     clock_pair_gpa,
+				     KVM_CLOCK_PAIRING_WALLCLOCK);
+		tspec->tv_sec = clock_pair.sec;
+		tspec->tv_nsec = clock_pair.nsec;
+		*cycle = __pvclock_read_cycles(src, clock_pair.tsc);
+	} while (pvclock_read_retry(src, version));
+
+	*cs = &kvm_clock;
+
+	return 0;
+}
+
+MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
+MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
+MODULE_LICENSE("GPL");
diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index 677d1d178a3e..5a8c6462fc0f 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -4,6 +4,7 @@
 #
 
 ptp-y					:= ptp_clock.o ptp_chardev.o ptp_sysfs.o
+ptp_kvm-y				:= ../../arch/$(ARCH)/kvm/arch_ptp_kvm.o kvm_ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK)		+= ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK_DTE)	+= ptp_dte.o
 obj-$(CONFIG_PTP_1588_CLOCK_IXP46X)	+= ptp_ixp46x.o
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/kvm_ptp.c
similarity index 63%
rename from drivers/ptp/ptp_kvm.c
rename to drivers/ptp/kvm_ptp.c
index fc7d0b77e118..9d07cf872be7 100644
--- a/drivers/ptp/ptp_kvm.c
+++ b/drivers/ptp/kvm_ptp.c
@@ -8,12 +8,12 @@
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <uapi/linux/kvm_para.h>
 #include <asm/kvm_para.h>
-#include <asm/pvclock.h>
-#include <asm/kvmclock.h>
 #include <uapi/asm/kvm_para.h>
+#include <asm-generic/ptp_kvm.h>
 
 #include <linux/ptp_clock_kernel.h>
 
@@ -24,56 +24,29 @@ struct kvm_ptp_clock {
 
 DEFINE_SPINLOCK(kvm_ptp_lock);
 
-static struct pvclock_vsyscall_time_info *hv_clock;
-
-static struct kvm_clock_pairing clock_pair;
-static phys_addr_t clock_pair_gpa;
-
 static int ptp_kvm_get_time_fn(ktime_t *device_time,
 			       struct system_counterval_t *system_counter,
 			       void *ctx)
 {
-	unsigned long ret;
+	unsigned long ret, cycle;
 	struct timespec64 tspec;
-	unsigned version;
-	int cpu;
-	struct pvclock_vcpu_time_info *src;
+	struct clocksource *cs;
 
 	spin_lock(&kvm_ptp_lock);
 
 	preempt_disable_notrace();
-	cpu = smp_processor_id();
-	src = &hv_clock[cpu].pvti;
-
-	do {
-		/*
-		 * We are using a TSC value read in the hosts
-		 * kvm_hc_clock_pairing handling.
-		 * So any changes to tsc_to_system_mul
-		 * and tsc_shift or any other pvclock
-		 * data invalidate that measurement.
-		 */
-		version = pvclock_read_begin(src);
-
-		ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-				     clock_pair_gpa,
-				     KVM_CLOCK_PAIRING_WALLCLOCK);
-		if (ret != 0) {
-			pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
-			spin_unlock(&kvm_ptp_lock);
-			preempt_enable_notrace();
-			return -EOPNOTSUPP;
-		}
-
-		tspec.tv_sec = clock_pair.sec;
-		tspec.tv_nsec = clock_pair.nsec;
-		ret = __pvclock_read_cycles(src, clock_pair.tsc);
-	} while (pvclock_read_retry(src, version));
+	ret = kvm_arch_ptp_get_clock_fn(&cycle, &tspec, &cs);
+	if (ret != 0) {
+		pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
+		spin_unlock(&kvm_ptp_lock);
+		preempt_enable_notrace();
+		return -EOPNOTSUPP;
+	}
 
 	preempt_enable_notrace();
 
-	system_counter->cycles = ret;
-	system_counter->cs = &kvm_clock;
+	system_counter->cycles = cycle;
+	system_counter->cs = cs;
 
 	*device_time = timespec64_to_ktime(tspec);
 
@@ -116,17 +89,13 @@ static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 
 	spin_lock(&kvm_ptp_lock);
 
-	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-			     clock_pair_gpa,
-			     KVM_CLOCK_PAIRING_WALLCLOCK);
+	ret = kvm_arch_ptp_get_clock(&tspec);
 	if (ret != 0) {
 		pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
 		spin_unlock(&kvm_ptp_lock);
 		return -EOPNOTSUPP;
 	}
 
-	tspec.tv_sec = clock_pair.sec;
-	tspec.tv_nsec = clock_pair.nsec;
 	spin_unlock(&kvm_ptp_lock);
 
 	memcpy(ts, &tspec, sizeof(struct timespec64));
@@ -166,21 +135,11 @@ static void __exit ptp_kvm_exit(void)
 
 static int __init ptp_kvm_init(void)
 {
-	long ret;
-
-	if (!kvm_para_available())
-		return -ENODEV;
-
-	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
-	hv_clock = pvclock_get_pvti_cpu0_va();
-
-	if (!hv_clock)
-		return -ENODEV;
+	int ret;
 
-	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
-			KVM_CLOCK_PAIRING_WALLCLOCK);
-	if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
-		return -ENODEV;
+	ret = kvm_arch_ptp_init();
+	if (IS_ERR(ret))
+		return ret;
 
 	kvm_ptp_clock.caps = ptp_kvm_caps;
 
diff --git a/include/asm-generic/ptp_kvm.h b/include/asm-generic/ptp_kvm.h
new file mode 100644
index 000000000000..128a9d7af161
--- /dev/null
+++ b/include/asm-generic/ptp_kvm.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *  Virtual PTP 1588 clock for use with KVM guests
+ *
+ *  Copyright (C) 2019 ARM Ltd.
+ *  All Rights Reserved
+ */
+
+static int kvm_arch_ptp_init(void);
+static int kvm_arch_ptp_get_clock(struct timespec64 *ts);
+static int kvm_arch_ptp_get_clock_fn(long *cycle,
+		struct timespec64 *tspec, void *cs);
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH 1/3] Export psci_ops.conduit symbol as modules will use it.
From: Jianyong Wu @ 2019-08-29  6:39 UTC (permalink / raw)
  To: netdev, pbonzini, sean.j.christopherson, maz, richardcochran,
	Mark.Rutland, Will.Deacon, suzuki.poulose
  Cc: linux-kernel, Steve.Capper, Kaly.Xin, justin.he, jianyong.wu
In-Reply-To: <20190829063952.18470-1-jianyong.wu@arm.com>

If arm_smccc_1_1_invoke used in modules, psci_ops.conduit should
be export.

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
---
 drivers/firmware/psci/psci.c | 6 ++++++
 include/linux/arm-smccc.h    | 2 +-
 include/linux/psci.h         | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
index f82ccd39a913..35c4eaab1451 100644
--- a/drivers/firmware/psci/psci.c
+++ b/drivers/firmware/psci/psci.c
@@ -212,6 +212,12 @@ static unsigned long psci_migrate_info_up_cpu(void)
 			      0, 0, 0);
 }
 
+enum psci_conduit psci_get_conduit(void)
+{
+	return psci_ops.conduit;
+}
+EXPORT_SYMBOL(psci_get_conduit);
+
 static void set_conduit(enum psci_conduit conduit)
 {
 	switch (conduit) {
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 552cbd49abe8..a6e4d3e3d10a 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -357,7 +357,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
  * The return value also provides the conduit that was used.
  */
 #define arm_smccc_1_1_invoke(...) ({					\
-		int method = psci_ops.conduit;				\
+		int method = psci_get_conduit();			\
 		switch (method) {					\
 		case PSCI_CONDUIT_HVC:					\
 			arm_smccc_1_1_hvc(__VA_ARGS__);			\
diff --git a/include/linux/psci.h b/include/linux/psci.h
index a8a15613c157..e5cedc986049 100644
--- a/include/linux/psci.h
+++ b/include/linux/psci.h
@@ -42,6 +42,7 @@ struct psci_operations {
 	enum smccc_version smccc_version;
 };
 
+extern enum psci_conduit psci_get_conduit(void);
 extern struct psci_operations psci_ops;
 
 #if defined(CONFIG_ARM_PSCI_FW)
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH 0/3] arm64: enable virtual kvm ptp for arm64
From: Jianyong Wu @ 2019-08-29  6:39 UTC (permalink / raw)
  To: netdev, pbonzini, sean.j.christopherson, maz, richardcochran,
	Mark.Rutland, Will.Deacon, suzuki.poulose
  Cc: linux-kernel, Steve.Capper, Kaly.Xin, justin.he, jianyong.wu

kvm ptp targets to provide high precision time sync between guest
and host in virtualization environment. This patch enable kvm ptp
for arm64.

This patch set base on [1][2][3]

[1]https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/
commit/?h=kvm/hvc&id=125ea89e4a21e2fc5235410f966a996a1a7148bf
[2]https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/
commit/?h=kvm/hvc&id=464f5a1741e5959c3e4d2be1966ae0093b4dce06
[3]https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/
commit/?h=kvm/hvc&id=6597490e005d0eeca8ed8c1c1d7b4318ee014681

Jianyong Wu (3):
  Export psci_ops.conduit symbol as modules will use it.
  reorganize ptp_kvm modules to make it arch-independent.
  Enable ptp_kvm for arm64

 arch/arm64/include/asm/arch_timer.h  |  3 +
 arch/arm64/kvm/arch_ptp_kvm.c        | 76 +++++++++++++++++++++++
 arch/x86/kvm/arch_ptp_kvm.c          | 92 ++++++++++++++++++++++++++++
 drivers/clocksource/arm_arch_timer.c |  6 +-
 drivers/firmware/psci/psci.c         |  6 ++
 drivers/ptp/Kconfig                  |  2 +-
 drivers/ptp/Makefile                 |  1 +
 drivers/ptp/{ptp_kvm.c => kvm_ptp.c} | 77 ++++++-----------------
 include/asm-generic/ptp_kvm.h        | 12 ++++
 include/linux/arm-smccc.h            | 16 ++++-
 include/linux/psci.h                 |  1 +
 virt/kvm/arm/psci.c                  | 17 +++++
 12 files changed, 246 insertions(+), 63 deletions(-)
 create mode 100644 arch/arm64/kvm/arch_ptp_kvm.c
 create mode 100644 arch/x86/kvm/arch_ptp_kvm.c
 rename drivers/ptp/{ptp_kvm.c => kvm_ptp.c} (63%)
 create mode 100644 include/asm-generic/ptp_kvm.h

-- 
2.17.1


^ permalink raw reply

* Re: [PATCH net] netdevsim: Restore per-network namespace accounting for fib entries
From: Jiri Pirko @ 2019-08-29  6:28 UTC (permalink / raw)
  To: David Ahern; +Cc: David Ahern, davem, netdev
In-Reply-To: <2c561928-1052-4c33-848d-ed7b81e920cf@gmail.com>

Wed, Aug 28, 2019 at 11:26:03PM CEST, dsahern@gmail.com wrote:
>On 8/28/19 4:37 AM, Jiri Pirko wrote:
>> Tue, Aug 06, 2019 at 09:15:17PM CEST, dsahern@kernel.org wrote:
>>> From: David Ahern <dsahern@gmail.com>
>>>
>>> Prior to the commit in the fixes tag, the resource controller in netdevsim
>>> tracked fib entries and rules per network namespace. Restore that behavior.
>> 
>> David, please help me understand. If the counters are per-device, not
>> per-netns, they are both the same. If we have device (devlink instance)
>> is in a netns and take only things happening in this netns into account,
>> it should count exactly the same amount of fib entries, doesn't it?
>
>if you are only changing where the counters are stored - net_generic vs
>devlink private - then yes, they should be equivalent.

Okay.

>
>> 
>> I re-thinked the devlink netns patchset and currently I'm going in
>> slightly different direction. I'm having netns as an attribute of
>> devlink reload. So all the port netdevices and everything gets
>> re-instantiated into new netns. Works fine with mlxsw. There we also
>> re-register the fib notifier.
>> 
>> I think that this can work for your usecase in netdevsim too:
>> 1) devlink instance is registering a fib notifier to track all fib
>>    entries in a namespace it belongs to. The counters are per-device -
>>    counting fib entries in a namespace the device is in.
>> 2) another devlink instance can do the same tracking in the same
>>    namespace. No problem, it's a separate counter, but the numbers are
>>    the same. One can set different limits to different devlink
>>    instances, but you can have only one. That is the bahaviour you have
>>    now.
>> 3) on devlink reload, netdevsim re-instantiates ports and re-registers
>>    fib notifier
>> 4) on devlink reload with netns change, all should be fine as the
>>    re-registered fib nofitier replays the entries. The ports are
>>    re-instatiated in new netns.
>> 
>> This way, we would get consistent behaviour between netdevsim and real
>> devices (mlxsw), correct devlink-netns implementation (you also
>> suggested to move ports to the namespace). Everyone should be happy.
>> 
>> What do you think?
>> 
>
>Right now, registering the fib notifier walks all namespaces. That is
>not a scalable solution. Are you changing that to replay only a given
>netns? Are you changing the notifiers to be per-namespace?

Eventually, that seems like good idea. Currently I want to do
if (net==nsim_dev->mynet)
	done
check at the beginning of the notifier.


>
>Also, you are still allowing devlink instances to be created within a
>namespace?

Yes, netdevsim is planned to be created directly in namespace.

^ permalink raw reply

* Re: [PATCH v2 bpf-next 3/3] perf: implement CAP_TRACING
From: Song Liu @ 2019-08-29  6:06 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, David S . Miller, peterz@infradead.org,
	rostedt@goodmis.org, netdev@vger.kernel.org, bpf@vger.kernel.org,
	Kernel Team, linux-api@vger.kernel.org
In-Reply-To: <20190829051253.1927291-3-ast@kernel.org>



> On Aug 28, 2019, at 10:12 PM, Alexei Starovoitov <ast@kernel.org> wrote:
> 
> Implement permissions as stated in uapi/linux/capability.h
> 
> Similar to CAP_BPF it's highly unlikely that s/CAP_SYS_ADMIN/CAP_TRACING/
> replacement will cause user breakage.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Song Liu <songliubraving@fb.com>

^ permalink raw reply

* Re: [PATCH v2 bpf-next 2/3] bpf: implement CAP_BPF
From: Song Liu @ 2019-08-29  6:04 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, davem@davemloft.net, peterz@infradead.org,
	rostedt@goodmis.org, netdev@vger.kernel.org, bpf@vger.kernel.org,
	Kernel Team, linux-api@vger.kernel.org
In-Reply-To: <20190829051253.1927291-2-ast@kernel.org>



> On Aug 28, 2019, at 10:12 PM, Alexei Starovoitov <ast@kernel.org> wrote:
> 

[...]

> diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
> index 44e2d640b088..91a7f25512ca 100644
> --- a/tools/testing/selftests/bpf/test_verifier.c
> +++ b/tools/testing/selftests/bpf/test_verifier.c
> @@ -805,10 +805,20 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
> 	}
> }
> 
> +struct libcap {
> +	struct __user_cap_header_struct hdr;
> +	struct __user_cap_data_struct data[2];
> +};
> +

I am confused by struct libcap. Why do we need it? 

> static int set_admin(bool admin)
> {
> 	cap_t caps;
> -	const cap_value_t cap_val = CAP_SYS_ADMIN;
> +	/* need CAP_BPF to load progs and CAP_NET_ADMIN to run networking progs,
> +	 * and CAP_TRACING to create stackmap
> +	 */
> +	const cap_value_t cap_net_admin = CAP_NET_ADMIN;
> +	const cap_value_t cap_sys_admin = CAP_SYS_ADMIN;
> +	struct libcap *cap;
> 	int ret = -1;
> 
> 	caps = cap_get_proc();
> @@ -816,11 +826,26 @@ static int set_admin(bool admin)
> 		perror("cap_get_proc");
> 		return -1;
> 	}
> -	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val,
> +	cap = (struct libcap *)caps;
> +	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) {
> +		perror("cap_set_flag clear admin");
> +		goto out;
> +	}
> +	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin,
> 				admin ? CAP_SET : CAP_CLEAR)) {
> -		perror("cap_set_flag");
> +		perror("cap_set_flag set_or_clear net");
> 		goto out;
> 	}
> +	/* libcap is likely old and simply ignores CAP_BPF and CAP_TRACING,
> +	 * so update effective bits manually
> +	 */
> +	if (admin) {
> +		cap->data[1].effective |= 1 << (38 /* CAP_BPF */ - 32);
> +		cap->data[1].effective |= 1 << (39 /* CAP_TRACING */ - 32);
> +	} else {
> +		cap->data[1].effective &= ~(1 << (38 - 32));
> +		cap->data[1].effective &= ~(1 << (39 - 32));
> +	}

And why we do not need cap->data[0]?

Thanks,
Song


^ permalink raw reply

* Re: [PATCH] net: Adding parameter detection in __ethtool_get_link_ksettings.
From: Michal Kubecek @ 2019-08-29  6:01 UTC (permalink / raw)
  To: netdev
  Cc: Saeed Mahameed, liudongxu3@huawei.com, eric.dumazet@gmail.com,
	davem@davemloft.net, linux-kernel@vger.kernel.org
In-Reply-To: <0db9e18a4dd81d9a6025a2d8cda343b585d91fc4.camel@mellanox.com>

On Tue, Aug 27, 2019 at 07:01:41PM +0000, Saeed Mahameed wrote:
> On Mon, 2019-08-26 at 17:47 +0800, Dongxu Liu wrote:
> 
> > Maybe "if (!dev->ethtool_ops)" is more accurate for this bug.
> > 
> 
> Also i am not sure about this, could be a bug in the device driver your
> enslaving.
> 
> alloc_netdev_mqs will assign &default_ethtool_ops to dev->ethtool_ops ,
> if user provided setup callback didn't assign the driver specific
> ethtool_ops.

Dongxu said he encountered the null pointer dereference in a 3.10
kernel, not current mainline. But commit 2c60db037034 ("net: provide a
default dev->ethtool_ops") which introduced default_ethtool_ops came in
3.7-rc1 so 3.10 should have it already. There is indeed something wrong.

I don't think we should add either check unless we positively know that
dev->ethtool_ops can be null with current mainline kernel. And even
then, it would probably be more appropriate to fix the code which caused
it.

Michal

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/3] capability: introduce CAP_BPF and CAP_TRACING
From: Song Liu @ 2019-08-29  6:00 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, davem@davemloft.net, peterz@infradead.org,
	rostedt@goodmis.org, netdev@vger.kernel.org, bpf@vger.kernel.org,
	Kernel Team, linux-api@vger.kernel.org
In-Reply-To: <20190829051253.1927291-1-ast@kernel.org>


> On Aug 28, 2019, at 10:12 PM, Alexei Starovoitov <ast@kernel.org> wrote:
> 

[...]

> - Creation of [ku][ret]probe
> - Accessing arbitrary kernel memory via kprobe + probe_kernel_read
> - Attach tracing bpf programs to perf events
> - Access to kallsyms
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Song Liu <songliubraving@fb.com>


^ permalink raw reply

* [PATCH v4 1/1] ixgbe: sync the first fragment unconditionally
From: Firo Yang @ 2019-08-29  5:39 UTC (permalink / raw)
  To: jeffrey.t.kirsher@intel.com, davem@davemloft.net
  Cc: jakub.kicinski@netronome.com, andrewx.bowers@intel.com,
	alexander.h.duyck@linux.intel.com, nhorman@redhat.com,
	sassmann@redhat.com, netdev@vger.kernel.org, Firo Yang

In Xen environment, if Xen-swiotlb is enabled, ixgbe driver
could possibly allocate a page, DMA memory buffer, for the first
fragment which is not suitable for Xen-swiotlb to do DMA operations.
Xen-swiotlb have to internally allocate another page for doing DMA
operations. This mechanism requires syncing the data from the internal
page to the page which ixgbe sends to upper network stack. However,
since commit f3213d932173 ("ixgbe: Update driver to make use of DMA
attributes in Rx path"), the unmap operation is performed with
DMA_ATTR_SKIP_CPU_SYNC. As a result, the sync is not performed.
Since the sync isn't performed, the upper network stack could receive
a incomplete network packet. By incomplete, it means the data from
skb->head to skb->end is invalid. So we have to copy the data from
the internal xen-swiotlb page to the page which ixgbe sends to
upper network stack through the sync operation.

More details from Alexander Duyck:
Specifically since we are mapping the frame with
DMA_ATTR_SKIP_CPU_SYNC we have to unmap with that as well. As a result
a sync is not performed on an unmap and must be done manually as we
skipped it for the first frag. As such we need to always sync before
possibly performing a page unmap operation.

Fixes: f3213d932173 ("ixgbe: Update driver to make use of DMA attributes in Rx path")
Reviewed-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: Firo Yang <firo.yang@suse.com>
---
Change since v3:
 * Fixed "wrapped Fixes: tag", noted by Jakub Kicinski <jakub.kicinski@netronome.com>

Changes since v2:
 * Added details on the problem caused by skipping the sync.
 * Added more explanation from Alexander Duyck.

Changes since v1:
 * Imporved the patch description.
 * Added Reviewed-by: and Fixes: as suggested by Alexander Duyck.
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 7882148abb43..4e8d5890a227 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1825,13 +1825,7 @@ static void ixgbe_pull_tail(struct ixgbe_ring *rx_ring,
 static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring,
 				struct sk_buff *skb)
 {
-	/* if the page was released unmap it, else just sync our portion */
-	if (unlikely(IXGBE_CB(skb)->page_released)) {
-		dma_unmap_page_attrs(rx_ring->dev, IXGBE_CB(skb)->dma,
-				     ixgbe_rx_pg_size(rx_ring),
-				     DMA_FROM_DEVICE,
-				     IXGBE_RX_DMA_ATTR);
-	} else if (ring_uses_build_skb(rx_ring)) {
+	if (ring_uses_build_skb(rx_ring)) {
 		unsigned long offset = (unsigned long)(skb->data) & ~PAGE_MASK;
 
 		dma_sync_single_range_for_cpu(rx_ring->dev,
@@ -1848,6 +1842,14 @@ static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring,
 					      skb_frag_size(frag),
 					      DMA_FROM_DEVICE);
 	}
+
+	/* If the page was released, just unmap it. */
+	if (unlikely(IXGBE_CB(skb)->page_released)) {
+		dma_unmap_page_attrs(rx_ring->dev, IXGBE_CB(skb)->dma,
+				     ixgbe_rx_pg_size(rx_ring),
+				     DMA_FROM_DEVICE,
+				     IXGBE_RX_DMA_ATTR);
+	}
 }
 
 /**
-- 
2.16.4


^ permalink raw reply related

* Re: [patch net-next rfc 3/7] net: rtnetlink: add commands to add and delete alternative ifnames
From: Michal Kubecek @ 2019-08-29  5:26 UTC (permalink / raw)
  To: netdev
  Cc: Roopa Prabhu, Jiri Pirko, David Miller, Jakub Kicinski,
	David Ahern, Stephen Hemminger, dcbw, Andrew Lunn, parav,
	Saeed Mahameed, mlxsw
In-Reply-To: <CAJieiUiipZY3A+04Po=WnvgkonfXZxFX2es=1Q5dq1Km869Obw@mail.gmail.com>

On Wed, Aug 28, 2019 at 09:36:41PM -0700, Roopa Prabhu wrote:
> 
> yes,  correct. I mentioned that because I was wondering if we can
> think along the same lines for this API.
> eg
> (a) RTM_NEWLINK always replaces the list attribute
> (b) RTM_SETLINK with NLM_F_APPEND always appends to the list attribute
> (c) RTM_DELLINK with NLM_F_APPEND updates the list attribute
> 
> (It could be NLM_F_UPDATE if NLM_F_APPEND sounds weird in the del
> case. I have not looked at the full dellink path if it will work
> neatly..its been a busy day )

AFAICS rtnl_dellink() calls nlmsg_parse_deprecated() so that even
current code would ignore any future attribute in RTM_DELLINK message
(any kernel before the strict validation was introduced definitely will)
and it does not seem to check NLM_F_APPEND or NLM_F_UPDATE either. So
unless I missed something, such message would result in deleting the
network device (if possible) with any kernel not implementing the
feature.

Michal

^ permalink raw reply

* Re: [PATCH] sky2: Disable MSI on yet another ASUS boards (P6Xxxx)
From: Takashi Iwai @ 2019-08-29  5:20 UTC (permalink / raw)
  To: David Miller; +Cc: mlindner, stephen, netdev, linux-kernel, swm
In-Reply-To: <20190828.160937.1788181909547435040.davem@davemloft.net>

On Thu, 29 Aug 2019 01:09:37 +0200,
David Miller wrote:
> 
> From: Takashi Iwai <tiwai@suse.de>
> Date: Wed, 28 Aug 2019 08:31:19 +0200
> 
> > A similar workaround for the suspend/resume problem is needed for yet
> > another ASUS machines, P6X models.  Like the previous fix, the BIOS
> > doesn't provide the standard DMI_SYS_* entry, so again DMI_BOARD_*
> > entries are used instead.
> > 
> > Reported-and-tested-by: SteveM <swm@swm1.com>
> > Signed-off-by: Takashi Iwai <tiwai@suse.de>
> 
> Applied, but this is getting suspicious.
> 
> It looks like MSI generally is not restored properly on resume on these
> boards, so maybe there simply needs to be a generic PCI quirk for that?

Yes, I wondered that, too.
But, e.g. HD-audio should use MSI on Intel platforms, and if the
problem were generic, it must suffer from the same issue, and I
haven't heard of such, so far.  So it's likely specific to some
limited devices, as it seems.


Thanks!

Takashi

^ permalink raw reply

* [PATCH v2 bpf-next 2/3] bpf: implement CAP_BPF
From: Alexei Starovoitov @ 2019-08-29  5:12 UTC (permalink / raw)
  To: luto; +Cc: davem, peterz, rostedt, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190829051253.1927291-1-ast@kernel.org>

Implement permissions as stated in uapi/linux/capability.h

Note that CAP_SYS_ADMIN is replaced with CAP_BPF.
All existing applications that use BPF do not drop all caps
and keep only CAP_SYS_ADMIN before doing bpf() syscall.
Hence it's highly unlikely that existing code will break.
If there will be reports of breakage then CAP_SYS_ADMIN
would be allowed as well with "it's usage is deprecated" message
similar to commit ee24aebffb75 ("cap_syslog: accept CAP_SYS_ADMIN for now")

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h                      |  1 +
 kernel/bpf/arraymap.c                       |  2 +-
 kernel/bpf/cgroup.c                         |  2 +-
 kernel/bpf/core.c                           |  9 +++-
 kernel/bpf/hashtab.c                        |  4 +-
 kernel/bpf/lpm_trie.c                       |  2 +-
 kernel/bpf/queue_stack_maps.c               |  2 +-
 kernel/bpf/reuseport_array.c                |  2 +-
 kernel/bpf/stackmap.c                       |  2 +-
 kernel/bpf/syscall.c                        | 32 ++++++++------
 kernel/bpf/verifier.c                       |  4 +-
 kernel/trace/bpf_trace.c                    |  2 +-
 net/core/bpf_sk_storage.c                   |  2 +-
 net/core/filter.c                           | 10 +++--
 tools/testing/selftests/bpf/test_verifier.c | 46 +++++++++++++++++----
 15 files changed, 83 insertions(+), 39 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 92c6e31fb008..16cea50af014 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -857,6 +857,7 @@ static inline bool bpf_dump_raw_ok(void)
 	return kallsyms_show_value() == 1;
 }
 
+bool cap_bpf_tracing(void);
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..045e30b7160d 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -73,7 +73,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int ret, numa_node = bpf_map_attr_numa_node(attr);
 	u32 elem_size, index_mask, max_entries;
-	bool unpriv = !capable(CAP_SYS_ADMIN);
+	bool unpriv = !capable(CAP_BPF);
 	u64 cost, array_size, mask64;
 	struct bpf_map_memory mem;
 	struct bpf_array *array;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6a6a154cfa7b..97f733354421 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -795,7 +795,7 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;
 	case BPF_FUNC_trace_printk:
-		if (capable(CAP_SYS_ADMIN))
+		if (cap_bpf_tracing())
 			return bpf_get_trace_printk_proto();
 		/* fall through */
 	default:
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8191a7db2777..16ed80835156 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 	if (!bpf_prog_kallsyms_candidate(fp) ||
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_BPF))
 		return;
 
 	spin_lock_bh(&bpf_lock);
@@ -768,7 +768,7 @@ static int bpf_jit_charge_modmem(u32 pages)
 {
 	if (atomic_long_add_return(pages, &bpf_jit_current) >
 	    (bpf_jit_limit >> PAGE_SHIFT)) {
-		if (!capable(CAP_SYS_ADMIN)) {
+		if (!capable(CAP_BPF)) {
 			atomic_long_sub(pages, &bpf_jit_current);
 			return -EPERM;
 		}
@@ -2104,6 +2104,11 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 EXPORT_SYMBOL(bpf_stats_enabled_key);
 
+bool cap_bpf_tracing(void)
+{
+	return capable(CAP_BPF) && capable(CAP_TRACING);
+}
+
 /* All definitions of tracepoints related to BPF. */
 #define CREATE_TRACE_POINTS
 #include <linux/bpf_trace.h>
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..f459315625ac 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -244,9 +244,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
 		     offsetof(struct htab_elem, hash_node.pprev));
 
-	if (lru && !capable(CAP_SYS_ADMIN))
+	if (lru && !capable(CAP_BPF))
 		/* LRU implementation is much complicated than other
-		 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
+		 * maps.  Hence, limit to CAP_BPF.
 		 */
 		return -EPERM;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 56e6c75d354d..a45fa5464d98 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	u64 cost = sizeof(*trie), cost_per_node;
 	int ret;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return ERR_PTR(-EPERM);
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f697647ceb54..ca0ba9edca86 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
 /* Called from syscall */
 static int queue_stack_map_alloc_check(union bpf_attr *attr)
 {
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 50c083ba978c..bfad7d41a061 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	struct bpf_map_memory mem;
 	u64 array_size;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return ERR_PTR(-EPERM);
 
 	array_size = sizeof(*array);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..beaff32fccc5 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -90,7 +90,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	u64 cost, n_buckets;
 	int err;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_TRACING))
 		return ERR_PTR(-EPERM);
 
 	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c0f62fd67c6b..ef7b06ca30e5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1176,7 +1176,7 @@ static int map_freeze(const union bpf_attr *attr)
 		err = -EBUSY;
 		goto err_put;
 	}
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!capable(CAP_BPF)) {
 		err = -EPERM;
 		goto err_put;
 	}
@@ -1634,7 +1634,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
 	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_BPF))
 		return -EPERM;
 
 	/* copy eBPF program license from user space */
@@ -1647,11 +1647,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	is_gpl = license_is_gpl_compatible(license);
 
 	if (attr->insn_cnt == 0 ||
-	    attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+	    attr->insn_cnt > (capable(CAP_BPF) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
 		return -E2BIG;
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_BPF))
 		return -EPERM;
 
 	bpf_prog_load_fixup_attach_type(attr);
@@ -1802,6 +1802,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 	char tp_name[128];
 	int tp_fd, err;
 
+	if (!cap_bpf_tracing())
+		return -EPERM;
+
 	if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
 			      sizeof(tp_name) - 1) < 0)
 		return -EFAULT;
@@ -2080,7 +2083,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
 	struct bpf_prog *prog;
 	int ret = -ENOTSUPP;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_NET_ADMIN) || !capable(CAP_BPF))
+		/* test_run callback is available for networking progs only.
+		 * Add cap_bpf_tracing() above when tracing progs become runable.
+		 */
 		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
 		return -EINVAL;
@@ -2117,7 +2123,7 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
 	if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	next_id++;
@@ -2143,7 +2149,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	spin_lock_bh(&prog_idr_lock);
@@ -2177,7 +2183,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	f_flags = bpf_get_file_flag(attr->open_flags);
@@ -2352,7 +2358,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	info.run_time_ns = stats.nsecs;
 	info.run_cnt = stats.cnt;
 
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!capable(CAP_BPF)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
 		info.nr_jited_ksyms = 0;
@@ -2670,7 +2676,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	return btf_new_fd(attr);
@@ -2683,7 +2689,7 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	return btf_get_fd_by_id(attr->btf_id);
@@ -2752,7 +2758,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
 	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!cap_bpf_tracing())
 		return -EPERM;
 
 	if (attr->task_fd_query.flags != 0)
@@ -2820,7 +2826,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	union bpf_attr attr = {};
 	int err;
 
-	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_BPF))
 		return -EPERM;
 
 	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 10c0ff93f52b..5810e8cc9342 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -987,7 +987,7 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
 	reg->umax_value = U64_MAX;
 
 	/* constant backtracking is enabled for root only for now */
-	reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
+	reg->precise = capable(CAP_BPF) ? false : true;
 }
 
 /* Mark a register as having a completely unknown (scalar) value. */
@@ -9233,7 +9233,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 		env->insn_aux_data[i].orig_idx = i;
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
-	is_priv = capable(CAP_SYS_ADMIN);
+	is_priv = capable(CAP_BPF);
 
 	/* grab the mutex to protect few globals used by verifier */
 	if (!is_priv)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d14576..2bf58ff5bf75 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1246,7 +1246,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 	u32 *ids, prog_cnt, ids_len;
 	int ret;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!cap_bpf_tracing())
 		return -EPERM;
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -EINVAL;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index da5639a5bd3b..0b29f6abbeba 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -616,7 +616,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
 	    !attr->btf_key_type_id || !attr->btf_value_type_id)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	if (attr->value_size >= KMALLOC_MAX_SIZE -
diff --git a/net/core/filter.c b/net/core/filter.c
index 0c1059cdad3d..986277abfde2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5990,7 +5990,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		break;
 	}
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return NULL;
 
 	switch (func_id) {
@@ -5999,7 +5999,9 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_spin_unlock:
 		return &bpf_spin_unlock_proto;
 	case BPF_FUNC_trace_printk:
-		return bpf_get_trace_printk_proto();
+		if (cap_bpf_tracing())
+			return bpf_get_trace_printk_proto();
+		/* fall through */
 	default:
 		return NULL;
 	}
@@ -6563,7 +6565,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
-		if (!capable(CAP_SYS_ADMIN))
+		if (!capable(CAP_BPF))
 			return false;
 		break;
 	}
@@ -6575,7 +6577,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		case bpf_ctx_range(struct __sk_buff, tstamp):
-			if (!capable(CAP_SYS_ADMIN))
+			if (!capable(CAP_BPF))
 				return false;
 			break;
 		default:
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 44e2d640b088..91a7f25512ca 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -805,10 +805,20 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
 	}
 }
 
+struct libcap {
+	struct __user_cap_header_struct hdr;
+	struct __user_cap_data_struct data[2];
+};
+
 static int set_admin(bool admin)
 {
 	cap_t caps;
-	const cap_value_t cap_val = CAP_SYS_ADMIN;
+	/* need CAP_BPF to load progs and CAP_NET_ADMIN to run networking progs,
+	 * and CAP_TRACING to create stackmap
+	 */
+	const cap_value_t cap_net_admin = CAP_NET_ADMIN;
+	const cap_value_t cap_sys_admin = CAP_SYS_ADMIN;
+	struct libcap *cap;
 	int ret = -1;
 
 	caps = cap_get_proc();
@@ -816,11 +826,26 @@ static int set_admin(bool admin)
 		perror("cap_get_proc");
 		return -1;
 	}
-	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val,
+	cap = (struct libcap *)caps;
+	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) {
+		perror("cap_set_flag clear admin");
+		goto out;
+	}
+	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin,
 				admin ? CAP_SET : CAP_CLEAR)) {
-		perror("cap_set_flag");
+		perror("cap_set_flag set_or_clear net");
 		goto out;
 	}
+	/* libcap is likely old and simply ignores CAP_BPF and CAP_TRACING,
+	 * so update effective bits manually
+	 */
+	if (admin) {
+		cap->data[1].effective |= 1 << (38 /* CAP_BPF */ - 32);
+		cap->data[1].effective |= 1 << (39 /* CAP_TRACING */ - 32);
+	} else {
+		cap->data[1].effective &= ~(1 << (38 - 32));
+		cap->data[1].effective &= ~(1 << (39 - 32));
+	}
 	if (cap_set_proc(caps)) {
 		perror("cap_set_proc");
 		goto out;
@@ -1012,9 +1037,11 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 
 static bool is_admin(void)
 {
+	cap_flag_value_t net_priv = CAP_CLEAR;
+	bool tracing_priv = false;
+	bool bpf_priv = false;
+	struct libcap *cap;
 	cap_t caps;
-	cap_flag_value_t sysadmin = CAP_CLEAR;
-	const cap_value_t cap_val = CAP_SYS_ADMIN;
 
 #ifdef CAP_IS_SUPPORTED
 	if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) {
@@ -1027,11 +1054,14 @@ static bool is_admin(void)
 		perror("cap_get_proc");
 		return false;
 	}
-	if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin))
-		perror("cap_get_flag");
+	cap = (struct libcap *)caps;
+	bpf_priv = cap->data[1].effective & (1 << (38/* CAP_BPF */ - 32));
+	tracing_priv = cap->data[1].effective & (1 << (39/* CAP_TRACING */ - 32));
+	if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv))
+		perror("cap_get_flag NET");
 	if (cap_free(caps))
 		perror("cap_free");
-	return (sysadmin == CAP_SET);
+	return bpf_priv && tracing_priv && net_priv == CAP_SET;
 }
 
 static void get_unpriv_disabled()
-- 
2.20.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox