Netdev List
 help / color / mirror / Atom feed
* [PATCH v2 5/6] ethtool: support per-queue sub command --coalesce
From: Jeff Kirsher @ 2019-02-06  0:01 UTC (permalink / raw)
  To: linville; +Cc: Nicholas Nunley, netdev, nhorman, sassmann, Jeff Kirsher
In-Reply-To: <20190206000106.24364-1-jeffrey.t.kirsher@intel.com>

From: Nicholas Nunley <nicholas.d.nunley@intel.com>

This patch adds the ability to configure the coalesce settings from
do_scoalesce on a per-queue basis.

For each masked queue the current settings are read, modified, and written
back to the kernel.

Example:

 $ sudo ./ethtool --set-perqueue-command eth5 queue_mask 0x1 --coalesce
 rx-usecs 10 tx-usecs 5
 $ sudo ./ethtool --set-perqueue-command eth5 queue_mask 0x1
 --show-coalesce

 Queue: 0
 Adaptive RX: on  TX: on
 stats-block-usecs: 0
 sample-interval: 0
 pkt-rate-low: 0
 pkt-rate-high: 0

 rx-usecs: 10
 rx-frames: 0
 rx-usecs-irq: 0
 rx-frames-irq: 256

 tx-usecs: 5
 tx-frames: 0
 tx-usecs-irq: 0
 tx-frames-irq: 256

 rx-usecs-low: 0
 rx-frame-low: 0
 tx-usecs-low: 0
 tx-frame-low: 0

 rx-usecs-high: 0
 rx-frame-high: 0
 tx-usecs-high: 0
 tx-frame-high: 0

Based on patch by Kan Liang <kan.liang@intel.com>

Signed-off-by: Nicholas Nunley <nicholas.d.nunley@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 ethtool.8.in |  3 ++-
 ethtool.c    | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/ethtool.8.in b/ethtool.8.in
index 71bb962..c7202e8 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -1153,7 +1153,8 @@ Sets the specific queues which the sub command is applied to.
 If queue_mask is not set, the sub command will be applied to all queues.
 .TP
 .B sub_command
-Sets the sub command. The supported sub commands include --show-coalesce.
+Sets the sub command. The supported sub commands include --show-coalesce and
+--coalesce.
 .RE
 .SH BUGS
 Not supported (in part or whole) on all network drivers.
diff --git a/ethtool.c b/ethtool.c
index 9a1b83b..01bdaf1 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -5269,7 +5269,7 @@ static const struct option {
 	{ "--set-fec", 1, do_sfec, "Set FEC settings",
 	  "		[ encoding auto|off|rs|baser [...]]\n"},
 	{ "--set-perqueue-command", 1, do_perqueue, "Set per queue command. "
-	  "The supported sub commands include --show-coalesce",
+	  "The supported sub commands include --show-coalesce, --coalesce",
 	  "             [queue_mask %x] SUB_COMMAND\n"},
 	{ "-h|--help", 0, show_usage, "Show this help" },
 	{ "--version", 0, do_version, "Show version number" },
@@ -5397,6 +5397,53 @@ get_per_queue_coalesce(struct cmd_context *ctx, __u32 *queue_mask, int n_queues)
 	return per_queue_opt;
 }
 
+static void set_per_queue_coalesce(struct cmd_context *ctx,
+				   struct ethtool_per_queue_op *per_queue_opt)
+{
+	struct ethtool_coalesce ecoal;
+	DECLARE_COALESCE_OPTION_VARS();
+	struct cmdline_info cmdline_coalesce[] = COALESCE_CMDLINE_INFO(ecoal);
+	__u32 *queue_mask = per_queue_opt->queue_mask;
+	char *addr = (char *)per_queue_opt + sizeof(*per_queue_opt);
+	int gcoalesce_changed = 0;
+	int i;
+
+	parse_generic_cmdline(ctx, &gcoalesce_changed,
+			      cmdline_coalesce, ARRAY_SIZE(cmdline_coalesce));
+
+	for (i = 0; i < __KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32); i++) {
+		int queue = i * 32;
+		__u32 mask = queue_mask[i];
+
+		while (mask > 0) {
+			if (mask & 0x1) {
+				int changed = 0;
+
+				memcpy(&ecoal, addr,
+				       sizeof(struct ethtool_coalesce));
+				do_generic_set(cmdline_coalesce,
+					       ARRAY_SIZE(cmdline_coalesce),
+					       &changed);
+				if (!changed)
+					fprintf(stderr,
+						"Queue %d, no coalesce parameters changed\n",
+						queue);
+				memcpy(addr, &ecoal,
+				       sizeof(struct ethtool_coalesce));
+				addr += sizeof(struct ethtool_coalesce);
+			}
+			mask = mask >> 1;
+			queue++;
+		}
+	}
+
+	per_queue_opt->cmd = ETHTOOL_PERQUEUE;
+	per_queue_opt->sub_command = ETHTOOL_SCOALESCE;
+
+	if (send_ioctl(ctx, per_queue_opt))
+		perror("Cannot set device per queue parameters");
+}
+
 static int do_perqueue(struct cmd_context *ctx)
 {
 	struct ethtool_per_queue_op *per_queue_opt;
@@ -5447,6 +5494,17 @@ static int do_perqueue(struct cmd_context *ctx)
 		}
 		dump_per_queue_coalesce(per_queue_opt, queue_mask);
 		free(per_queue_opt);
+	} else if (strstr(args[i].opts, "--coalesce") != NULL) {
+		ctx->argc--;
+		ctx->argp++;
+		per_queue_opt = get_per_queue_coalesce(ctx, queue_mask,
+						       n_queues);
+		if (per_queue_opt == NULL) {
+			perror("Cannot get device per queue parameters");
+			return -EFAULT;
+		}
+		set_per_queue_coalesce(ctx, per_queue_opt);
+		free(per_queue_opt);
 	} else {
 		perror("The subcommand is not supported yet");
 		return -EOPNOTSUPP;
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2 6/6] ethtool: fix up dump_coalesce output to match actual option names
From: Jeff Kirsher @ 2019-02-06  0:01 UTC (permalink / raw)
  To: linville; +Cc: Nicholas Nunley, netdev, nhorman, sassmann, Jeff Kirsher
In-Reply-To: <20190206000106.24364-1-jeffrey.t.kirsher@intel.com>

From: Nicholas Nunley <nicholas.d.nunley@intel.com>

When the coalesce settings are printed with --show-coalesce a few of the
option names lack the pluralization that is present in the man page and
usage info, but are otherwise identical.

This inconsistency could lead to some confusion if a user attempts to set
the coalesce settings by matching the output they see from --show-coalesce,
so fix this.

Signed-off-by: Nicholas Nunley <nicholas.d.nunley@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 ethtool.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ethtool.c b/ethtool.c
index 01bdaf1..03e5008 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -1372,14 +1372,14 @@ static int dump_coalesce(const struct ethtool_coalesce *ecoal)
 		"tx-frames-irq: %u\n"
 		"\n"
 		"rx-usecs-low: %u\n"
-		"rx-frame-low: %u\n"
+		"rx-frames-low: %u\n"
 		"tx-usecs-low: %u\n"
-		"tx-frame-low: %u\n"
+		"tx-frames-low: %u\n"
 		"\n"
 		"rx-usecs-high: %u\n"
-		"rx-frame-high: %u\n"
+		"rx-frames-high: %u\n"
 		"tx-usecs-high: %u\n"
-		"tx-frame-high: %u\n"
+		"tx-frames-high: %u\n"
 		"\n",
 		ecoal->stats_block_coalesce_usecs,
 		ecoal->rate_sample_interval,
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2 3/6] ethtool: introduce new ioctl for per-queue settings
From: Jeff Kirsher @ 2019-02-06  0:01 UTC (permalink / raw)
  To: linville; +Cc: Nicholas Nunley, netdev, nhorman, sassmann, Jeff Kirsher
In-Reply-To: <20190206000106.24364-1-jeffrey.t.kirsher@intel.com>

From: Nicholas Nunley <nicholas.d.nunley@intel.com>

Introduce a new ioctl for setting per-queue parameters.
Users can apply commands to specific queues by setting SUB_COMMAND and
queue_mask with the following ethtool command:

 ethtool --set-perqueue-command DEVNAME [queue_mask %x] SUB_COMMAND

If queue_mask is not set, the SUB_COMMAND will be applied to all queues.

SUB_COMMANDs for per-queue settings will be implemented in following
patches.

Based on patch by Kan Liang <kan.liang@intel.com>

Signed-off-by: Nicholas Nunley <nicholas.d.nunley@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 ethtool.8.in |  20 ++++++++++
 ethtool.c    | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)

diff --git a/ethtool.8.in b/ethtool.8.in
index 5a26cff..0aaca2c 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -391,6 +391,14 @@ ethtool \- query or control network driver and hardware settings
 .I devname
 .B encoding
 .BR auto | off | rs | baser \ [...]
+.HP
+.B ethtool \-\-set\-perqueue\-command
+.I devname
+.RB [ queue_mask
+.IR %x ]
+.I sub_command
+.RB ...
+ .
 .
 .\" Adjust lines (i.e. full justification) and hyphenate.
 .ad
@@ -1135,6 +1143,18 @@ RS	Force RS-FEC encoding
 BaseR	Force BaseR encoding
 .TE
 .RE
+.TP
+.B \-\-set\-perqueue\-command
+Sets sub command to specific queues.
+.RS 4
+.TP
+.B queue_mask %x
+Sets the specific queues which the sub command is applied to.
+If queue_mask is not set, the sub command will be applied to all queues.
+.TP
+.B sub_command
+Sets the sub command.
+.RE
 .SH BUGS
 Not supported (in part or whole) on all network drivers.
 .SH AUTHOR
diff --git a/ethtool.c b/ethtool.c
index af266c5..4dc725c 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -5048,6 +5048,8 @@ static int do_sfec(struct cmd_context *ctx)
 	return 0;
 }
 
+static int do_perqueue(struct cmd_context *ctx);
+
 #ifndef TEST_ETHTOOL
 int send_ioctl(struct cmd_context *ctx, void *cmd)
 {
@@ -5243,6 +5245,8 @@ static const struct option {
 	{ "--show-fec", 1, do_gfec, "Show FEC settings"},
 	{ "--set-fec", 1, do_sfec, "Set FEC settings",
 	  "		[ encoding auto|off|rs|baser [...]]\n"},
+	{ "--set-perqueue-command", 1, do_perqueue, "Set per queue command",
+	  "             [queue_mask %x] SUB_COMMAND\n"},
 	{ "-h|--help", 0, show_usage, "Show this help" },
 	{ "--version", 0, do_version, "Show version number" },
 	{}
@@ -5294,6 +5298,103 @@ static int find_option(int argc, char **argp)
 	return -1;
 }
 
+static int set_queue_mask(u32 *queue_mask, char *str)
+{
+	int len = strlen(str);
+	int index = __KERNEL_DIV_ROUND_UP(len * 4, 32);
+	char tmp[9];
+	char *end = str + len;
+	int i, num;
+	__u32 mask;
+	int n_queues = 0;
+
+	if (len > MAX_NUM_QUEUE)
+		return -EINVAL;
+
+	for (i = 0; i < index; i++) {
+		num = end - str;
+
+		if (num >= 8) {
+			end -= 8;
+			num = 8;
+		} else {
+			end = str;
+		}
+		strncpy(tmp, end, num);
+		tmp[num] = '\0';
+
+		queue_mask[i] = strtoul(tmp, NULL, 16);
+
+		mask = queue_mask[i];
+		while (mask > 0) {
+			if (mask & 0x1)
+				n_queues++;
+			mask = mask >> 1;
+		}
+	}
+
+	return n_queues;
+}
+
+#define MAX(x, y) (x > y ? x : y)
+
+static int find_max_num_queues(struct cmd_context *ctx)
+{
+	struct ethtool_channels echannels;
+
+	echannels.cmd = ETHTOOL_GCHANNELS;
+	if (send_ioctl(ctx, &echannels))
+		return -1;
+
+	return MAX(MAX(echannels.rx_count, echannels.tx_count),
+		   echannels.combined_count);
+}
+
+static int do_perqueue(struct cmd_context *ctx)
+{
+	__u32 queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)] = {0};
+	int i, n_queues = 0;
+
+	if (ctx->argc == 0)
+		exit_bad_args();
+
+	/*
+	 * The sub commands will be applied to
+	 * all queues if no queue_mask set
+	 */
+	if (strncmp(*ctx->argp, "queue_mask", 10)) {
+		n_queues = find_max_num_queues(ctx);
+		if (n_queues < 0) {
+			perror("Cannot get number of queues");
+			return -EFAULT;
+		}
+		for (i = 0; i < n_queues / 32; i++)
+			queue_mask[i] = ~0;
+		queue_mask[i] = (1 << (n_queues - i * 32)) - 1;
+		fprintf(stdout,
+			"The sub commands will be applied to all %d queues\n",
+			n_queues);
+	} else {
+		ctx->argc--;
+		ctx->argp++;
+		n_queues = set_queue_mask(queue_mask, *ctx->argp);
+		if (n_queues < 0) {
+			perror("Invalid queue mask");
+			return n_queues;
+		}
+		ctx->argc--;
+		ctx->argp++;
+	}
+
+	i = find_option(ctx->argc, ctx->argp);
+	if (i < 0)
+		exit_bad_args();
+
+	/* no sub_command support yet */
+
+	return 0;
+}
+
 int main(int argc, char **argp)
 {
 	int (*func)(struct cmd_context *);
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH net] virtio_net: Account for tx bytes and packets on sending xdp_frames
From: Saeed Mahameed @ 2019-02-06  0:06 UTC (permalink / raw)
  To: dsahern@gmail.com, brouer@redhat.com
  Cc: thoiland@redhat.com, hawk@kernel.org,
	virtualization@lists.linux-foundation.org, borkmann@iogearbox.net,
	Tariq Toukan, john.fastabend@gmail.com, mst@redhat.com,
	jakub.kicinski@netronome.com, netdev@vger.kernel.org,
	jasowang@redhat.com, davem@davemloft.net,
	makita.toshiaki@lab.ntt.co.jp
In-Reply-To: <bdcfedd6-465d-4485-e268-25c4ce6b9fcf@gmail.com>

On Mon, 2019-02-04 at 19:13 -0800, David Ahern wrote:
> On 2/4/19 3:53 AM, Jesper Dangaard Brouer wrote:
> > On Sat, 2 Feb 2019 14:27:26 -0700
> > David Ahern <dsahern@gmail.com> wrote:
> > 
> > > On 1/31/19 1:15 PM, Jesper Dangaard Brouer wrote:
> > > > > David, Jesper, care to chime in where we ended up in that
> > > > > last thread
> > > > > discussion this?  
> > > > 
> > > > IHMO packets RX and TX on a device need to be accounted, in
> > > > standard
> > > > counters, regardless of XDP.  For XDP RX the packet is counted
> > > > as RX,
> > > > regardless if XDP choose to XDP_DROP.  On XDP TX which is via
> > > > XDP_REDIRECT or XDP_TX, the driver that transmit the packet
> > > > need to
> > > > account the packet in a TX counter (this if often delayed to
> > > > DMA TX
> > > > completion handling).  We cannot break the expectation that RX
> > > > and TX
> > > > counter are visible to userspace stats tools. XDP should not
> > > > make these
> > > > packets invisible.  
> > > 
> > > Agreed. What I was pushing on that last thread was Rx, Tx and
> > > dropped
> > > are all accounted by the driver in standard stats. Basically if
> > > the
> > > driver touched it, the driver's counters should indicate that.
> > 
> > Sound like we all agree (except with the dropped counter, see
> > below).
> > 
> > Do notice that mlx5 driver doesn't do this.  It is actually rather
> > confusing to use XDP on mlx5, as when XDP "consume" which include
> > XDP_DROP, XDP_REDIRECT or XDP_TX, then the driver standard stats
> > are
> > not incremented... the packet is invisible to "ifconfig" stat based
> > tools.
> 
> mlx5 needs some work. As I recall it still has the bug/panic removing
> xdp programs - at least I don't recall seeing a patch for it.

Only when xdp_redirect to mlx5, and removing the program while redirect
is happening, this is actually due to a lack of synchronization means
between different drivers, we have some ideas to overcome this using a
standard XDP API, or just use a hack in mlx5 driver which i don't like:

https://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git/commit/?h=topic/xdp-redirect-fix&id=a3652d03cc35fd3ad62744986c8ccaca74c9f20c

I will be working on this towards the end of this week.

> 
> > 
> > > The push back was on dropped packets and whether that counter
> > > should be
> > > bumped on XDP_DROP.
> > 
> > My opinion is the XDP_DROP action should NOT increment the drivers
> > drop
> > counter.  First of all the "dropped" counter is also use for other
> > stuff, which will confuse that this counter express.  Second,
> > choosing
> > XDP_DROP is a policy choice, it still means it was RX-ed at the
> > driver
> > level.
> > 
> 
> Understood. Hopefully in March I will get some time to come back to
> this
> and propose an idea on what I would like to see - namely, the admin
> has
> a config option at load time to enable driver counters versus custom
> map
> counters. (meaning the operator of the node chooses standard stats
> over
> strict performance.) But of course that means the drivers have the
> code
> to collect those stats.

So bottom line:
1) Driver will count rx packets as rx-ed packets regardless of XDP
decision.

2) Driver should keep track of XDP decisions statistics, report them in
ethtool and in the new API suggested by David. track even (XDP_PASS) ?

Maybe instead of having all drivers track the statistics on their own,
we should move the responsibility to upper layer.

Idea: since we already have rxq_info structure per XDP ring (no false
sharing) and available per xdp_buff we can do:

+++ b/include/linux/filter.h
@@ -651,7 +651,9 @@ static __always_inline u32 bpf_prog_run_xdp(const
struct bpf_prog *prog,
         * already takes rcu_read_lock() when fetching the program, so
         * it's not necessary here anymore.
         */
-       return BPF_PROG_RUN(prog, xdp);
+       u32 ret = BPF_PROG_RUN(prog, xdp);
+       xdp->xdp_rxq_info.stats[ret]++
+       return ret;
 }

still we need a way (API) to report the rxq_info to whoever needs to
read current XDP stats 

3) Unrelated, In non XDP case, if skb allocation fails or driver fails
to pass the skb up to the stack for somereason, should the driver
increase rx packets ? IMHO the answer should be yes if we want to have
similar behavior between XDP and non XDP cases.

But this could result in netdev->stats.rx_packets + netdev-
>stats.rx_dropped to be more than the actual rx-ed packets, is this
acceptable ?






^ permalink raw reply

* Re: [RFC PATCH iproute2 2/5] act_ct: first import
From: Marcelo Ricardo Leitner @ 2019-02-06  0:09 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Guy Shattah, Aaron Conole, John Hurley, Simon Horman,
	Justin Pettit, Gregory Rose, Eelco Chaudron, Flavio Leitner,
	Florian Westphal, Jiri Pirko, Rashid Khan, Sushil Kulkarni,
	Andy Gospodarek, Roi Dayan, Yossi Kuperman, Or Gerlitz,
	Rony Efraim, davem@davemloft.net, netdev
In-Reply-To: <20190205145613.49996454@hermes.lan>

On Tue, Feb 05, 2019 at 02:56:13PM -0800, Stephen Hemminger wrote:
> On Fri, 25 Jan 2019 00:33:30 -0200
> Marcelo Ricardo Leitner <mleitner@redhat.com> wrote:
> 
> > +/*
> > + * m_ct.c	Connection Tracking target module
> > + *
> > + *		This program is free software; you can distribute it and/or
> > + *		modify it under the terms of the GNU General Public License
> > + *		as published by the Free Software Foundation; either version
> > + *		2 of the License, or (at your option) any later version.
> 
> Please just use SPDX for license info not GPL boilerplate.
> 

Will do.

  Marcelo

^ permalink raw reply

* [PATCH v2 bpf-next 2/2] btf: expose API to work with raw btf data
From: Andrii Nakryiko @ 2019-02-06  0:29 UTC (permalink / raw)
  To: songliubraving, yhs, ast, kafai, netdev, daniel, andrii.nakryiko
  Cc: Andrii Nakryiko
In-Reply-To: <20190206002949.1915237-1-andriin@fb.com>

This patch exposes two new APIs btf__get_raw_data_size() and
btf__get_raw_data() that allows to get a copy of raw BTF data out of
struct btf. This is useful for external programs that need to manipulate
raw data, e.g., pahole using btf__dedup() to deduplicate BTF type info
and then writing it back to file.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
---
 tools/lib/bpf/btf.c      | 10 ++++++++++
 tools/lib/bpf/btf.h      |  2 ++
 tools/lib/bpf/libbpf.map |  2 ++
 3 files changed, 14 insertions(+)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 1c2ba7182400..34bfb3641aac 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -437,6 +437,16 @@ int btf__fd(const struct btf *btf)
 	return btf->fd;
 }
 
+__u32 btf__get_raw_data_size(const struct btf *btf)
+{
+	return btf->data_size;
+}
+
+void btf__get_raw_data(const struct btf *btf, char *data)
+{
+	memcpy(data, btf->data, btf->data_size);
+}
+
 void btf__get_strings(const struct btf *btf, const char **strings,
 		      __u32 *str_len)
 {
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index e8410887f93a..d46f680b9416 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -66,6 +66,8 @@ LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf,
 LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
 LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id);
 LIBBPF_API int btf__fd(const struct btf *btf);
+LIBBPF_API __u32 btf__get_raw_data_size(const struct btf *btf);
+LIBBPF_API void btf__get_raw_data(const struct btf *btf, char *data);
 LIBBPF_API void btf__get_strings(const struct btf *btf, const char **strings,
 				 __u32 *str_len);
 LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index f5372df143f4..0ebbee13a3cd 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -136,6 +136,8 @@ LIBBPF_0.0.2 {
 		btf__dedup;
 		btf__get_map_kv_tids;
 		btf__get_nr_types;
+		btf__get_raw_data;
+		btf__get_raw_data_size;
 		btf__get_strings;
 		btf__load;
 		btf_ext__free;
-- 
2.17.1


^ permalink raw reply related

* [PATCH v2 bpf-next 0/2] tools/btf: extends libbpf APIs to work with btf w/o kernel
From: Andrii Nakryiko @ 2019-02-06  0:29 UTC (permalink / raw)
  To: songliubraving, yhs, ast, kafai, netdev, daniel, andrii.nakryiko
  Cc: Andrii Nakryiko

This patchset changes existing btf__new() API call to only load and initialize
struct btf, while exposing new btf__load() API to attempt to load and validate
BTF in kernel. It also adds ability to copy raw BTF data out of struct btf for
further processing by external applications.

This makes utilizing libbpf's APIs that don't require kernel facilities (e.g.,
btf_dedup) simpler and more natural from external application.

v1->v2:
- btf_load() returns just error, not fd
- fix ordering in libbpf.map

Andrii Nakryiko (2):
  btf: separate btf creation and loading
  btf: expose API to work with raw btf data

 tools/lib/bpf/btf.c      | 63 +++++++++++++++++++++++++---------------
 tools/lib/bpf/btf.h      |  3 ++
 tools/lib/bpf/libbpf.c   |  2 +-
 tools/lib/bpf/libbpf.map |  3 ++
 4 files changed, 46 insertions(+), 25 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH v2 bpf-next 1/2] btf: separate btf creation and loading
From: Andrii Nakryiko @ 2019-02-06  0:29 UTC (permalink / raw)
  To: songliubraving, yhs, ast, kafai, netdev, daniel, andrii.nakryiko
  Cc: Andrii Nakryiko
In-Reply-To: <20190206002949.1915237-1-andriin@fb.com>

This change splits out previous btf__new functionality of constructing
struct btf and loading it into kernel into two:
- btf__new() just creates and initializes struct btf
- btf__load() attempts to load existing struct btf into kernel

btf__free will still close BTF fd, if it was ever loaded successfully
into kernel.

This change allows users of libbpf to manipulate BTF using its API,
without the need to unnecessarily load it into kernel.

One of the intended use cases is pahole using libbpf to do DWARF to BTF
conversion and deduplication using libbpf, while handling ELF sections
overwrites and other concerns on its own.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
---
 tools/lib/bpf/btf.c      | 53 ++++++++++++++++++++++------------------
 tools/lib/bpf/btf.h      |  1 +
 tools/lib/bpf/libbpf.c   |  2 +-
 tools/lib/bpf/libbpf.map |  1 +
 4 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 4949f8840bda..1c2ba7182400 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -366,8 +366,6 @@ void btf__free(struct btf *btf)
 
 struct btf *btf__new(__u8 *data, __u32 size)
 {
-	__u32 log_buf_size = 0;
-	char *log_buf = NULL;
 	struct btf *btf;
 	int err;
 
@@ -377,15 +375,6 @@ struct btf *btf__new(__u8 *data, __u32 size)
 
 	btf->fd = -1;
 
-	log_buf = malloc(BPF_LOG_BUF_SIZE);
-	if (!log_buf) {
-		err = -ENOMEM;
-		goto done;
-	}
-
-	*log_buf = 0;
-	log_buf_size = BPF_LOG_BUF_SIZE;
-
 	btf->data = malloc(size);
 	if (!btf->data) {
 		err = -ENOMEM;
@@ -395,17 +384,6 @@ struct btf *btf__new(__u8 *data, __u32 size)
 	memcpy(btf->data, data, size);
 	btf->data_size = size;
 
-	btf->fd = bpf_load_btf(btf->data, btf->data_size,
-			       log_buf, log_buf_size, false);
-
-	if (btf->fd == -1) {
-		err = -errno;
-		pr_warning("Error loading BTF: %s(%d)\n", strerror(errno), errno);
-		if (log_buf && *log_buf)
-			pr_warning("%s\n", log_buf);
-		goto done;
-	}
-
 	err = btf_parse_hdr(btf);
 	if (err)
 		goto done;
@@ -417,8 +395,6 @@ struct btf *btf__new(__u8 *data, __u32 size)
 	err = btf_parse_type_sec(btf);
 
 done:
-	free(log_buf);
-
 	if (err) {
 		btf__free(btf);
 		return ERR_PTR(err);
@@ -427,6 +403,35 @@ struct btf *btf__new(__u8 *data, __u32 size)
 	return btf;
 }
 
+int btf__load(struct btf* btf) {
+	__u32 log_buf_size = BPF_LOG_BUF_SIZE;
+	char *log_buf = NULL;
+	int err = 0;
+
+	if (btf->fd >= 0)
+		return -EEXIST;
+
+	log_buf = malloc(log_buf_size);
+	if (!log_buf)
+		return -ENOMEM;
+
+	*log_buf = 0;
+
+	btf->fd = bpf_load_btf(btf->data, btf->data_size,
+			       log_buf, log_buf_size, false);
+	if (btf->fd < 0) {
+		err = -errno;
+		pr_warning("Error loading BTF: %s(%d)\n", strerror(errno), errno);
+		if (*log_buf)
+			pr_warning("%s\n", log_buf);
+		goto done;
+	}
+
+done:
+	free(log_buf);
+	return err;
+}
+
 int btf__fd(const struct btf *btf)
 {
 	return btf->fd;
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 25a9d2db035d..e8410887f93a 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -57,6 +57,7 @@ struct btf_ext_header {
 
 LIBBPF_API void btf__free(struct btf *btf);
 LIBBPF_API struct btf *btf__new(__u8 *data, __u32 size);
+LIBBPF_API int btf__load(struct btf* btf);
 LIBBPF_API __s32 btf__find_by_name(const struct btf *btf,
 				   const char *type_name);
 LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 47969aa0faf8..ff86a43a4336 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -835,7 +835,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj, int flags)
 			obj->efile.maps_shndx = idx;
 		else if (strcmp(name, BTF_ELF_SEC) == 0) {
 			obj->btf = btf__new(data->d_buf, data->d_size);
-			if (IS_ERR(obj->btf)) {
+			if (IS_ERR(obj->btf) || btf__load(obj->btf)) {
 				pr_warning("Error loading ELF section %s: %ld. Ignored and continue.\n",
 					   BTF_ELF_SEC, PTR_ERR(obj->btf));
 				obj->btf = NULL;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 89c1149e32ee..f5372df143f4 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -137,6 +137,7 @@ LIBBPF_0.0.2 {
 		btf__get_map_kv_tids;
 		btf__get_nr_types;
 		btf__get_strings;
+		btf__load;
 		btf_ext__free;
 		btf_ext__func_info_rec_size;
 		btf_ext__line_info_rec_size;
-- 
2.17.1


^ permalink raw reply related

* [Patch net-next] mlx5: use RCU lock in mlx5_eq_cq_get()
From: Cong Wang @ 2019-02-06  0:35 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Saeed Mahameed, Tariq Toukan

mlx5_eq_cq_get() is called in IRQ handler, the spinlock inside
gets a lot of contentions when we test some heavy workload
with 60 RX queues and 80 CPU's, and it is clearly shown in the
flame graph.

In fact, radix_tree_lookup() is perfectly fine with RCU read lock,
we don't have to take a spinlock on this hot path. It is pretty much
similar to commit 291c566a2891
("net/mlx4_core: Fix racy CQ (Completion Queue) free"). Slow paths
are still serialized with the spinlock, and with synchronize_irq()
it should be safe to just move the fast path to RCU read lock.

This patch itself reduces the latency by about 50% with our workload.

Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index ee04aab65a9f..7092457705a2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -114,11 +114,11 @@ static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn)
 	struct mlx5_cq_table *table = &eq->cq_table;
 	struct mlx5_core_cq *cq = NULL;
 
-	spin_lock(&table->lock);
+	rcu_read_lock();
 	cq = radix_tree_lookup(&table->tree, cqn);
 	if (likely(cq))
 		mlx5_cq_hold(cq);
-	spin_unlock(&table->lock);
+	rcu_read_unlock();
 
 	return cq;
 }
@@ -371,9 +371,9 @@ int mlx5_eq_add_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq)
 	struct mlx5_cq_table *table = &eq->cq_table;
 	int err;
 
-	spin_lock_irq(&table->lock);
+	spin_lock(&table->lock);
 	err = radix_tree_insert(&table->tree, cq->cqn, cq);
-	spin_unlock_irq(&table->lock);
+	spin_unlock(&table->lock);
 
 	return err;
 }
@@ -383,9 +383,9 @@ int mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq)
 	struct mlx5_cq_table *table = &eq->cq_table;
 	struct mlx5_core_cq *tmp;
 
-	spin_lock_irq(&table->lock);
+	spin_lock(&table->lock);
 	tmp = radix_tree_delete(&table->tree, cq->cqn);
-	spin_unlock_irq(&table->lock);
+	spin_unlock(&table->lock);
 
 	if (!tmp) {
 		mlx5_core_warn(eq->dev, "cq 0x%x not found in eq 0x%x tree\n", eq->eqn, cq->cqn);
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH]: net: hso: do not call unregister if not registered
From: David Miller @ 2019-02-06  0:36 UTC (permalink / raw)
  To: tuba; +Cc: netdev
In-Reply-To: <1549407687058.71019@ece.ufl.edu>

From: "Yavuz, Tuba" <tuba@ece.ufl.edu>
Date: Tue, 5 Feb 2019 23:01:25 +0000

> 
> 
> On an error path inside the hso_create_net_device function of the hso
> driver, hso_free_net_device gets called. This causes potentially a
> negative reference count in the net device if register_netdev has not
> been called yet as hso_free_net_device calls unregister_netdev
> regardless. I think the driver should distinguish these cases and call
> unregister_netdev only if register_netdev has been called.
> 
> Signed-off-by: Tuba Yavuz <tuba@ece.ufl.edu>
> ---
> 
> --- linux-stable/drivers/net/usb/hso.c.orig     2019-01-27 14:45:58.232683119 -0500
> +++ linux-stable/drivers/net/usb/hso.c  2019-02-05 17:54:17.056496019 -0500
> @@ -2377,7 +2377,9 @@ static void hso_free_net_device(struct h
>  
>         remove_net_device(hso_net->parent);
>  
> -       if (hso_net->net)
> +       if (hso_net->net &&
> +           hso_net->net->reg_state == NETREG_REGISTERED
> +          )

This is not formatted correctly, the final closing ')' should end the
previous line, like this:

       if (hso_net->net &&
           hso_net->net->reg_state == NETREG_REGISTERED)

^ permalink raw reply

* Re: [RFC PATCH 0/4] Initial support for allocating BPF JITs in vmalloc for x86
From: Alexei Starovoitov @ 2019-02-06  0:35 UTC (permalink / raw)
  To: Rick Edgecombe, daniel@iogearbox.net
  Cc: netdev@vger.kernel.org, ard.biesheuvel@linaro.org,
	dave.hansen@intel.com, kristen@linux.intel.com
In-Reply-To: <20190205225103.28296-1-rick.p.edgecombe@intel.com>

On 2/5/19 2:50 PM, Rick Edgecombe wrote:
> This introduces a new capability for BPF program JIT's to be located in vmalloc
> space on x86_64. This can serve as a backup area for CONFIG_BPF_JIT_ALWAYS_ON in
> case an unprivileged app uses all of the module space allowed by bpf_jit_limit.
> 
> In order to allow for calls from the increased distance of vmalloc from
> kernel/module space, relative calls are emitted as full indirect calls if the
> maximum relative call distance is exceeded. So the resulting performance of call
> BPF instructions in this case is similar to the BPF interpreter.

If I read this correctly the patches introduce retpoline overhead
to direct function call because JITed progs are more than 32-bit apart
and they're far away only because of dubious security concern ?
Nack.


^ permalink raw reply

* Re: [PATCH net] net: dsa: mv88e6xxx: Fix counting of ATU violations
From: David Miller @ 2019-02-06  0:38 UTC (permalink / raw)
  To: andrew; +Cc: netdev, f.fainelli
In-Reply-To: <20190205230258.20543-1-andrew@lunn.ch>

From: Andrew Lunn <andrew@lunn.ch>
Date: Wed,  6 Feb 2019 00:02:58 +0100

> The ATU port vector contains a bit per port of the switch. The code
> wrongly used it as a port number, and incremented a port counter. This
> resulted in the wrong interfaces counter being incremented, and
> potentially going off the end of the array of ports.
> 
> Fix this by using the source port ID for the violation, which really
> is a port number.
> 
> Reported-by: Chris Healy <Chris.Healy@zii.aero>
> Tested-by: Chris Healy <Chris.Healy@zii.aero>
> Fixes: 65f60e4582bd ("net: dsa: mv88e6xxx: Keep ATU/VTU violation statistics")
> Signed-off-by: Andrew Lunn <andrew@lunn.ch>

Applied and queued up for -stable, thanks Andrew.

^ permalink raw reply

* Re: [PATCH]: net: hso: do not call unregister if not registered
From: Yavuz, Tuba @ 2019-02-06  0:38 UTC (permalink / raw)
  To: David Miller; +Cc: netdev@vger.kernel.org
In-Reply-To: <20190205.163640.175589510190486514.davem@davemloft.net>

This is surprising as I got total: 0 errors, 0 warnings, 0 checks, 10 lines checked.

Anyhow, I'll submit as suggested.

Best,


Tuba Yavuz, Ph.D.
Assistant Professor
Electrical and Computer Engineering Department
University of Florida
Gainesville, FL 32611
Webpage: http://www.tuba.ece.ufl.edu/
Email: tuba@ece.ufl.edu
Phone: (352) 846 0202

________________________________________
From: David Miller <davem@davemloft.net>
Sent: Tuesday, February 5, 2019 7:36 PM
To: Yavuz, Tuba
Cc: netdev@vger.kernel.org
Subject: Re: [PATCH]: net: hso: do not call unregister if not registered

From: "Yavuz, Tuba" <tuba@ece.ufl.edu>
Date: Tue, 5 Feb 2019 23:01:25 +0000

>
>
> On an error path inside the hso_create_net_device function of the hso
> driver, hso_free_net_device gets called. This causes potentially a
> negative reference count in the net device if register_netdev has not
> been called yet as hso_free_net_device calls unregister_netdev
> regardless. I think the driver should distinguish these cases and call
> unregister_netdev only if register_netdev has been called.
>
> Signed-off-by: Tuba Yavuz <tuba@ece.ufl.edu>
> ---
>
> --- linux-stable/drivers/net/usb/hso.c.orig     2019-01-27 14:45:58.232683119 -0500
> +++ linux-stable/drivers/net/usb/hso.c  2019-02-05 17:54:17.056496019 -0500
> @@ -2377,7 +2377,9 @@ static void hso_free_net_device(struct h
>
>         remove_net_device(hso_net->parent);
>
> -       if (hso_net->net)
> +       if (hso_net->net &&
> +           hso_net->net->reg_state == NETREG_REGISTERED
> +          )

This is not formatted correctly, the final closing ')' should end the
previous line, like this:

       if (hso_net->net &&
           hso_net->net->reg_state == NETREG_REGISTERED)

^ permalink raw reply

* Re: [PATCH bpf-next] tools/bpf: fix a selftest test_btf failure
From: Andrii Nakryiko @ 2019-02-06  0:39 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Andrii Nakryiko, netdev, Alexei Starovoitov, Daniel Borkmann,
	kernel-team
In-Reply-To: <20190205222844.2425372-1-yhs@fb.com>

On Tue, Feb 5, 2019 at 2:30 PM Yonghong Song <yhs@fb.com> wrote:
>
> Commit 9c651127445c ("selftests/btf: add initial BTF dedup tests")
> added dedup tests in test_btf.c.
> It broke the raw test:
>  BTF raw test[71] (func proto (Bad arg name_off)):
>     btf_raw_create:2905:FAIL Error getting string #65535, strs_cnt:1

Argh.. Thanks for fixing this!

>
> The test itself encodes invalid func_proto parameter name
> offset 0xffffFFFF as a negative test for the kernel.
> The above commit changed the meaning of that offset and
> resulted in a user space error.
>   #define NAME_NTH(N) (0xffff0000 | N)
>   #define IS_NAME_NTH(X) ((X & 0xffff0000) == 0xffff0000)
>   #define GET_NAME_NTH_IDX(X) (X & 0x0000ffff)
>
> Currently, the kernel permits maximum name offset 0xffff.
> Set the test name off as 0x0fffFFFF to trigger the kernel
> verification failure.
>
> Cc: Andrii Nakryiko <andriin@fb.com>
> Fixes: 9c651127445c ("selftests/btf: add initial BTF dedup tests")
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  tools/testing/selftests/bpf/test_btf.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
> index 30c3edde7e07..447acc34db94 100644
> --- a/tools/testing/selftests/bpf/test_btf.c
> +++ b/tools/testing/selftests/bpf/test_btf.c
> @@ -1978,7 +1978,7 @@ static struct btf_raw_test raw_tests[] = {
>                 /* void (*)(int a, unsigned int <bad_name_off>) */
>                 BTF_FUNC_PROTO_ENC(0, 2),                       /* [3] */
>                         BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
> -                       BTF_FUNC_PROTO_ARG_ENC(0xffffffff, 2),
> +                       BTF_FUNC_PROTO_ARG_ENC(0x0fffffff, 2),
>                 BTF_END_RAW,
>         },
>         .str_sec = "\0a",
> --
> 2.17.1
>

Acked-by: Andrii Nakryiko <andriin@fb.com>

^ permalink raw reply

* Re: [PATCH net] mISDN: fix a race in dev_expire_timer()
From: David Miller @ 2019-02-06  0:39 UTC (permalink / raw)
  To: edumazet; +Cc: netdev, eric.dumazet, isdn, syzkaller
In-Reply-To: <20190205233844.226090-1-edumazet@google.com>

From: Eric Dumazet <edumazet@google.com>
Date: Tue,  5 Feb 2019 15:38:44 -0800

> Since mISDN_close() uses dev->pending to iterate over active
> timers, there is a chance that one timer got removed from the
> ->pending list in dev_expire_timer() but that the thread
> has not called yet wake_up_interruptible()
> 
> So mISDN_close() could miss this and free dev before
> completion of at least one dev_expire_timer()
> 
> syzbot was able to catch this race :
 ...
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Karsten Keil <isdn@linux-pingi.de>
> Reported-by: syzbot <syzkaller@googlegroups.com>

Applied.

^ permalink raw reply

* [PATCH] net: hso: do not unregister if not registered
From: Yavuz, Tuba @ 2019-02-06  0:40 UTC (permalink / raw)
  To: netdev@vger.kernel.org


On an error path inside the hso_create_net_device function of the hso
driver, hso_free_net_device gets called. This causes potentially a
negative reference count in the net device if register_netdev has not
been called yet as hso_free_net_device calls unregister_netdev
regardless. I think the driver should distinguish these cases and call
unregister_netdev only if register_netdev has been called.

Signed-off-by: Tuba Yavuz <tuba@ece.ufl.edu>
---

--- linux-stable/drivers/net/usb/hso.c.orig     2019-01-27 14:45:58.232683119 -0500
+++ linux-stable/drivers/net/usb/hso.c  2019-02-05 17:54:17.056496019 -0500
@@ -2377,7 +2377,9 @@ static void hso_free_net_device(struct h
 
        remove_net_device(hso_net->parent);
 
-       if (hso_net->net)
+       if (hso_net->net &&
+           hso_net->net->reg_state == NETREG_REGISTERED)
                unregister_netdev(hso_net->net);
 
        /* start freeing */



^ permalink raw reply

* Re: [RFC bpf-next 0/7] net: flow_dissector: trigger BPF hook when called from eth_get_headlen
From: Alexei Starovoitov @ 2019-02-06  0:47 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Willem de Bruijn, Stanislav Fomichev, Network Development,
	David Miller, Alexei Starovoitov, Daniel Borkmann, simon.horman,
	Willem de Bruijn
In-Reply-To: <20190205204003.GB10769@mini-arch>

On Tue, Feb 05, 2019 at 12:40:03PM -0800, Stanislav Fomichev wrote:
> On 02/05, Willem de Bruijn wrote:
> > On Tue, Feb 5, 2019 at 12:57 PM Stanislav Fomichev <sdf@google.com> wrote:
> > >
> > > Currently, when eth_get_headlen calls flow dissector, it doesn't pass any
> > > skb. Because we use passed skb to lookup associated networking namespace
> > > to find whether we have a BPF program attached or not, we always use
> > > C-based flow dissector in this case.
> > >
> > > The goal of this patch series is to add new networking namespace argument
> > > to the eth_get_headlen and make BPF flow dissector programs be able to
> > > work in the skb-less case.
> > >
> > > The series goes like this:
> > > 1. introduce __init_skb and __init_skb_shinfo; those will be used to
> > >    initialize temporary skb
> > > 2. introduce skb_net which can be used to get networking namespace
> > >    associated with an skb
> > > 3. add new optional network namespace argument to __skb_flow_dissect and
> > >    plumb through the callers
> > > 4. add new __flow_bpf_dissect which constructs temporary on-stack skb
> > >    (using __init_skb) and calls BPF flow dissector program
> > 
> > The main concern I see with this series is this cost of skb zeroing
> > for every packet in the device driver receive routine, *independent*
> > from the real skb allocation and zeroing which will likely happen
> > later.
> Yes, plus ~200 bytes on the stack for the callers.
> 
> Not sure how visible this zeroing though, I can probably try to get some
> numbers from BPF_PROG_TEST_RUN (running current version vs running with
> on-stack skb).

imo extra 256 byte memset for every packet is non starter.


^ permalink raw reply

* Re: [RFC bpf-next 0/7] net: flow_dissector: trigger BPF hook when called from eth_get_headlen
From: Stanislav Fomichev @ 2019-02-06  0:59 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Willem de Bruijn, Stanislav Fomichev, Network Development,
	David Miller, Alexei Starovoitov, Daniel Borkmann, simon.horman,
	Willem de Bruijn
In-Reply-To: <20190206004714.pz44evow5uwgvt4x@ast-mbp.dhcp.thefacebook.com>

On 02/05, Alexei Starovoitov wrote:
> On Tue, Feb 05, 2019 at 12:40:03PM -0800, Stanislav Fomichev wrote:
> > On 02/05, Willem de Bruijn wrote:
> > > On Tue, Feb 5, 2019 at 12:57 PM Stanislav Fomichev <sdf@google.com> wrote:
> > > >
> > > > Currently, when eth_get_headlen calls flow dissector, it doesn't pass any
> > > > skb. Because we use passed skb to lookup associated networking namespace
> > > > to find whether we have a BPF program attached or not, we always use
> > > > C-based flow dissector in this case.
> > > >
> > > > The goal of this patch series is to add new networking namespace argument
> > > > to the eth_get_headlen and make BPF flow dissector programs be able to
> > > > work in the skb-less case.
> > > >
> > > > The series goes like this:
> > > > 1. introduce __init_skb and __init_skb_shinfo; those will be used to
> > > >    initialize temporary skb
> > > > 2. introduce skb_net which can be used to get networking namespace
> > > >    associated with an skb
> > > > 3. add new optional network namespace argument to __skb_flow_dissect and
> > > >    plumb through the callers
> > > > 4. add new __flow_bpf_dissect which constructs temporary on-stack skb
> > > >    (using __init_skb) and calls BPF flow dissector program
> > > 
> > > The main concern I see with this series is this cost of skb zeroing
> > > for every packet in the device driver receive routine, *independent*
> > > from the real skb allocation and zeroing which will likely happen
> > > later.
> > Yes, plus ~200 bytes on the stack for the callers.
> > 
> > Not sure how visible this zeroing though, I can probably try to get some
> > numbers from BPF_PROG_TEST_RUN (running current version vs running with
> > on-stack skb).
> 
> imo extra 256 byte memset for every packet is non starter.
We can put pre-allocated/initialized skbs without data into percpu or even
use pcpu_freelist_pop/pcpu_freelist_push to make sure we don't have to think
about having multiple percpu for irq/softirq/process contexts.
Any concerns with that approach?
Any other possible concerns with the overall series?

^ permalink raw reply

* Re: [RFC PATCH 0/4] Initial support for allocating BPF JITs in vmalloc for x86
From: Edgecombe, Rick P @ 2019-02-06  1:11 UTC (permalink / raw)
  To: daniel@iogearbox.net, ast@fb.com
  Cc: kristen@linux.intel.com, netdev@vger.kernel.org,
	ard.biesheuvel@linaro.org, Hansen, Dave
In-Reply-To: <84060613-48b1-fa62-e784-104faa7fc665@fb.com>

On Wed, 2019-02-06 at 00:35 +0000, Alexei Starovoitov wrote:
> On 2/5/19 2:50 PM, Rick Edgecombe wrote:
> > This introduces a new capability for BPF program JIT's to be located in
> > vmalloc
> > space on x86_64. This can serve as a backup area for
> > CONFIG_BPF_JIT_ALWAYS_ON in
> > case an unprivileged app uses all of the module space allowed by
> > bpf_jit_limit.
> > 
> > In order to allow for calls from the increased distance of vmalloc from
> > kernel/module space, relative calls are emitted as full indirect calls if
> > the
> > maximum relative call distance is exceeded. So the resulting performance of
> > call
> > BPF instructions in this case is similar to the BPF interpreter.
> 
> If I read this correctly the patches introduce retpoline overhead
> to direct function call because JITed progs are more than 32-bit apart
> and they're far away only because of dubious security concern ?
> Nack.
> 
There really isn't any overhead, because they are only far away if the module
space is full, or the bpf_jit_limit is exceeded for non-admin. So cases today
when insertions would succeed it emits the same code, but cases where the
insertion would fail due to lack of space, it now at least works with the
described performance.


^ permalink raw reply

* Re: [RFC PATCH 0/4] Initial support for allocating BPF JITs in vmalloc for x86
From: Alexei Starovoitov @ 2019-02-06  1:40 UTC (permalink / raw)
  To: Edgecombe, Rick P, daniel@iogearbox.net
  Cc: kristen@linux.intel.com, netdev@vger.kernel.org,
	ard.biesheuvel@linaro.org, Hansen, Dave
In-Reply-To: <e3697ab2717d9440da72775a25b83d6193a42ea2.camel@intel.com>

On 2/5/19 5:11 PM, Edgecombe, Rick P wrote:
> On Wed, 2019-02-06 at 00:35 +0000, Alexei Starovoitov wrote:
>> On 2/5/19 2:50 PM, Rick Edgecombe wrote:
>>> This introduces a new capability for BPF program JIT's to be located in
>>> vmalloc
>>> space on x86_64. This can serve as a backup area for
>>> CONFIG_BPF_JIT_ALWAYS_ON in
>>> case an unprivileged app uses all of the module space allowed by
>>> bpf_jit_limit.
>>>
>>> In order to allow for calls from the increased distance of vmalloc from
>>> kernel/module space, relative calls are emitted as full indirect calls if
>>> the
>>> maximum relative call distance is exceeded. So the resulting performance of
>>> call
>>> BPF instructions in this case is similar to the BPF interpreter.
>>
>> If I read this correctly the patches introduce retpoline overhead
>> to direct function call because JITed progs are more than 32-bit apart
>> and they're far away only because of dubious security concern ?
>> Nack.
>>
> There really isn't any overhead, because they are only far away if the module
> space is full, or the bpf_jit_limit is exceeded for non-admin. So cases today
> when insertions would succeed it emits the same code, but cases where the
> insertion would fail due to lack of space, it now at least works with the
> described performance.

I disagree with the problem statement.
x86 classic BPF jit has been around forever and no one
complained that _unprivileged_ bpf progs exhaust module space.
With bpf_jit_limit we got an extra knob to close this
remote possibility of an attack.
It's more than enough.

^ permalink raw reply

* [PATCH bpf-next] tools: bpftool: doc, add text about feature-subcommand
From: Prashant Bhole @ 2019-02-06  1:45 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann; +Cc: Prashant Bhole, netdev

This patch adds missing information about feature-subcommand in
bpftool.rst

Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
---
 tools/bpf/bpftool/Documentation/bpftool.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 27153bb816ac..0685d5ada3ce 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -16,7 +16,7 @@ SYNOPSIS
 
 	**bpftool** **version**
 
-	*OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** }
+	*OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** | **feature** }
 
 	*OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** }
 	| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
@@ -34,6 +34,8 @@ SYNOPSIS
 
 	*NET-COMMANDS* := { **show** | **list** | **help** }
 
+        *FEATURE-COMMANDS* := { **probe** | **help** }
+
 DESCRIPTION
 ===========
 	*bpftool* allows for inspection and simple modification of BPF objects
-- 
2.20.1



^ permalink raw reply related

* [PATCH bpf-next] tools: bpftool: doc, fix incorrect text
From: Prashant Bhole @ 2019-02-06  1:47 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann; +Cc: Prashant Bhole, netdev

Documentation about cgroup, feature, prog uses wrong header
'MAP COMMANDS' while listing commands. This patch corrects the header
in respective doc files.

Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
---
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst  | 4 ++--
 tools/bpf/bpftool/Documentation/bpftool-feature.rst | 4 ++--
 tools/bpf/bpftool/Documentation/bpftool-prog.rst    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index d43fce568ef7..9bb9ace54ba8 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -17,8 +17,8 @@ SYNOPSIS
 	*COMMANDS* :=
 	{ **show** | **list** | **tree** | **attach** | **detach** | **help** }
 
-MAP COMMANDS
-=============
+CGROUP COMMANDS
+===============
 
 |	**bpftool** **cgroup { show | list }** *CGROUP*
 |	**bpftool** **cgroup tree** [*CGROUP_ROOT*]
diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
index 8d489a26e3c9..82de03dd8f52 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
@@ -16,8 +16,8 @@ SYNOPSIS
 
 	*COMMANDS* := { **probe** | **help** }
 
-MAP COMMANDS
-=============
+FEATURE COMMANDS
+================
 
 |	**bpftool** **feature probe** [*COMPONENT*] [**macros** [**prefix** *PREFIX*]]
 |	**bpftool** **feature help**
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 13b56102f528..7e59495cb028 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -18,7 +18,7 @@ SYNOPSIS
 	{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load**
 	| **loadall** | **help** }
 
-MAP COMMANDS
+PROG COMMANDS
 =============
 
 |	**bpftool** **prog { show | list }** [*PROG*]
-- 
2.20.1



^ permalink raw reply related

* Re: [PATCH 2/2] doc: add phylink documentation to the networking book
From: Andrew Lunn @ 2019-02-06  1:59 UTC (permalink / raw)
  To: Russell King; +Cc: linux-doc, netdev, David S. Miller, Jonathan Corbet
In-Reply-To: <E1gr376-0007ea-NV@rmk-PC.armlinux.org.uk>

On Tue, Feb 05, 2019 at 03:58:20PM +0000, Russell King wrote:
> Add some phylink documentation to the networking book detailing how
> to convert network drivers from phylib to phylink.
> 
> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* [net-next 08/14] igc: Remove unused code
From: Jeff Kirsher @ 2019-02-06  2:04 UTC (permalink / raw)
  To: davem; +Cc: Sasha Neftin, netdev, nhorman, sassmann, Aaron Brown,
	Jeff Kirsher
In-Reply-To: <20190206020424.12225-1-jeffrey.t.kirsher@intel.com>

From: Sasha Neftin <sasha.neftin@intel.com>

Remove unused igc_adv_data_desc definition from igc_base.h file.
Descriptors definition will be added per demand.

Signed-off-by: Sasha Neftin <sasha.neftin@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igc/igc_base.h | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc_base.h b/drivers/net/ethernet/intel/igc/igc_base.h
index a5d3f57274a8..76d4991d7284 100644
--- a/drivers/net/ethernet/intel/igc/igc_base.h
+++ b/drivers/net/ethernet/intel/igc/igc_base.h
@@ -36,28 +36,6 @@ union igc_adv_tx_desc {
 
 #define IGC_RAR_ENTRIES		16
 
-struct igc_adv_data_desc {
-	__le64 buffer_addr;    /* Address of the descriptor's data buffer */
-	union {
-		u32 data;
-		struct {
-			u32 datalen:16; /* Data buffer length */
-			u32 rsvd:4;
-			u32 dtyp:4;  /* Descriptor type */
-			u32 dcmd:8;  /* Descriptor command */
-		} config;
-	} lower;
-	union {
-		u32 data;
-		struct {
-			u32 status:4;  /* Descriptor status */
-			u32 idx:4;
-			u32 popts:6;  /* Packet Options */
-			u32 paylen:18; /* Payload length */
-		} options;
-	} upper;
-};
-
 /* Receive Descriptor - Advanced */
 union igc_adv_rx_desc {
 	struct {
-- 
2.20.1


^ permalink raw reply related

* [net-next 04/14] igc: Fix code redundancy
From: Jeff Kirsher @ 2019-02-06  2:04 UTC (permalink / raw)
  To: davem; +Cc: Sasha Neftin, netdev, nhorman, sassmann, Aaron Brown,
	Jeff Kirsher
In-Reply-To: <20190206020424.12225-1-jeffrey.t.kirsher@intel.com>

From: Sasha Neftin <sasha.neftin@intel.com>

Remove redundant igc_check_for_link_base code and replace it with
an igc_check_for_copper_link method.
Fix duplication of IGC_ADVTXD_PAYLEN_SHIFT mask declaration.
Remove obsolete IGC_SCVPC register definition.

Signed-off-by: Sasha Neftin <sasha.neftin@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igc/igc_base.c | 20 ++------------------
 drivers/net/ethernet/intel/igc/igc_base.h |  3 ---
 drivers/net/ethernet/intel/igc/igc_regs.h |  1 -
 3 files changed, 2 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc_base.c b/drivers/net/ethernet/intel/igc/igc_base.c
index df40af759542..19ff987922d2 100644
--- a/drivers/net/ethernet/intel/igc/igc_base.c
+++ b/drivers/net/ethernet/intel/igc/igc_base.c
@@ -53,22 +53,6 @@ static s32 igc_set_pcie_completion_timeout(struct igc_hw *hw)
 	return ret_val;
 }
 
-/**
- * igc_check_for_link_base - Check for link
- * @hw: pointer to the HW structure
- *
- * If sgmii is enabled, then use the pcs register to determine link, otherwise
- * use the generic interface for determining link.
- */
-static s32 igc_check_for_link_base(struct igc_hw *hw)
-{
-	s32 ret_val = 0;
-
-	ret_val = igc_check_for_copper_link(hw);
-
-	return ret_val;
-}
-
 /**
  * igc_reset_hw_base - Reset hardware
  * @hw: pointer to the HW structure
@@ -265,7 +249,7 @@ static s32 igc_init_phy_params_base(struct igc_hw *hw)
 	if (ret_val)
 		return ret_val;
 
-	igc_check_for_link_base(hw);
+	igc_check_for_copper_link(hw);
 
 	/* Verify phy id and set remaining function pointers */
 	switch (phy->id) {
@@ -512,7 +496,7 @@ void igc_rx_fifo_flush_base(struct igc_hw *hw)
 
 static struct igc_mac_operations igc_mac_ops_base = {
 	.init_hw		= igc_init_hw_base,
-	.check_for_link		= igc_check_for_link_base,
+	.check_for_link		= igc_check_for_copper_link,
 	.rar_set		= igc_rar_set,
 	.read_mac_addr		= igc_read_mac_addr_base,
 	.get_speed_and_duplex	= igc_get_link_up_info_base,
diff --git a/drivers/net/ethernet/intel/igc/igc_base.h b/drivers/net/ethernet/intel/igc/igc_base.h
index 35588fa7b8c5..a5d3f57274a8 100644
--- a/drivers/net/ethernet/intel/igc/igc_base.h
+++ b/drivers/net/ethernet/intel/igc/igc_base.h
@@ -90,9 +90,6 @@ union igc_adv_rx_desc {
 	} wb;  /* writeback */
 };
 
-/* Adv Transmit Descriptor Config Masks */
-#define IGC_ADVTXD_PAYLEN_SHIFT	14 /* Adv desc PAYLEN shift */
-
 /* Additional Transmit Descriptor Control definitions */
 #define IGC_TXDCTL_QUEUE_ENABLE	0x02000000 /* Ena specific Tx Queue */
 
diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h
index a1bd3216c906..f8c835283377 100644
--- a/drivers/net/ethernet/intel/igc/igc_regs.h
+++ b/drivers/net/ethernet/intel/igc/igc_regs.h
@@ -188,7 +188,6 @@
 #define IGC_HGOTCL	0x04130  /* Host Good Octets Transmit Count Low */
 #define IGC_HGOTCH	0x04134  /* Host Good Octets Transmit Count High */
 #define IGC_LENERRS	0x04138  /* Length Errors Count */
-#define IGC_SCVPC	0x04228  /* SerDes/SGMII Code Violation Pkt Count */
 #define IGC_HRMPC	0x0A018  /* Header Redirection Missed Packet Count */
 
 /* Management registers */
-- 
2.20.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox