Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next] ixgbe: introduce a helper to simplify code
From: YueHaibing @ 2018-05-23 12:08 UTC (permalink / raw)
  To: davem, jeffrey.t.kirsher; +Cc: netdev, intel-wired-lan, YueHaibing

ixgbe_dbg_reg_ops_read and ixgbe_dbg_netdev_ops_read copy-pasting
the same code except for ixgbe_dbg_netdev_ops_buf/ixgbe_dbg_reg_ops_buf,
so introduce a helper ixgbe_dbg_common_ops_read to remove redundant code.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c | 57 +++++++++---------------
 1 file changed, 21 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
index 55fe811..50dfb02 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
@@ -10,15 +10,9 @@ static struct dentry *ixgbe_dbg_root;
 
 static char ixgbe_dbg_reg_ops_buf[256] = "";
 
-/**
- * ixgbe_dbg_reg_ops_read - read for reg_ops datum
- * @filp: the opened file
- * @buffer: where to write the data for the user to read
- * @count: the size of the user's buffer
- * @ppos: file position offset
- **/
-static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
-				    size_t count, loff_t *ppos)
+static ssize_t ixgbe_dbg_common_ops_read(struct file *filp, char __user *buffer,
+					 size_t count, loff_t *ppos,
+					 char *dbg_buf)
 {
 	struct ixgbe_adapter *adapter = filp->private_data;
 	char *buf;
@@ -29,8 +23,7 @@ static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
 		return 0;
 
 	buf = kasprintf(GFP_KERNEL, "%s: %s\n",
-			adapter->netdev->name,
-			ixgbe_dbg_reg_ops_buf);
+			adapter->netdev->name, dbg_buf);
 	if (!buf)
 		return -ENOMEM;
 
@@ -46,6 +39,20 @@ static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
 }
 
 /**
+ * ixgbe_dbg_reg_ops_read - read for reg_ops datum
+ * @filp: the opened file
+ * @buffer: where to write the data for the user to read
+ * @count: the size of the user's buffer
+ * @ppos: file position offset
+ **/
+static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
+				      size_t count, loff_t *ppos)
+{
+	return ixgbe_dbg_common_ops_read(filp, buffer, count, ppos,
+					 ixgbe_dbg_reg_ops_buf);
+}
+
+/**
  * ixgbe_dbg_reg_ops_write - write into reg_ops datum
  * @filp: the opened file
  * @buffer: where to find the user's data
@@ -121,33 +128,11 @@ static char ixgbe_dbg_netdev_ops_buf[256] = "";
  * @count: the size of the user's buffer
  * @ppos: file position offset
  **/
-static ssize_t ixgbe_dbg_netdev_ops_read(struct file *filp,
-					 char __user *buffer,
+static ssize_t ixgbe_dbg_netdev_ops_read(struct file *filp, char __user *buffer,
 					 size_t count, loff_t *ppos)
 {
-	struct ixgbe_adapter *adapter = filp->private_data;
-	char *buf;
-	int len;
-
-	/* don't allow partial reads */
-	if (*ppos != 0)
-		return 0;
-
-	buf = kasprintf(GFP_KERNEL, "%s: %s\n",
-			adapter->netdev->name,
-			ixgbe_dbg_netdev_ops_buf);
-	if (!buf)
-		return -ENOMEM;
-
-	if (count < strlen(buf)) {
-		kfree(buf);
-		return -ENOSPC;
-	}
-
-	len = simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
-
-	kfree(buf);
-	return len;
+	return ixgbe_dbg_common_ops_read(filp, buffer, count, ppos,
+					 ixgbe_dbg_netdev_ops_buf);
 }
 
 /**
-- 
2.7.0

^ permalink raw reply related

* pull-request: mac80211-next 2018-05-23
From: Johannes Berg @ 2018-05-23 12:14 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-wireless

Hi Dave,

Here's a new version of the pull request for net-next, now
with the stack size fixes included, which were the reason I
withdrew my earlier one. Other things are also included all
over the map.

Please pull and let me know if there's any problem.

Thanks,
johannes



The following changes since commit 1fe8c06c4a0d3b589f076cd00c25082840f10423:

  Merge branch 'qed-firmware-TLV' (2018-05-22 23:29:55 -0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git tags/mac80211-next-for-davem-2018-05-23

for you to fetch changes up to bad2929733635f80f99930b252757c70372356fe:

  nl80211: Reject disconnect commands except from conn_owner (2018-05-23 11:56:26 +0200)

----------------------------------------------------------------
For this round, we have various things all over the place, notably
 * a fix for a race in aggregation, which I want to let
   bake for a bit longer before sending to stable
 * some new statistics (ACK RSSI, TXQ)
 * TXQ configuration
 * preparations for HE, particularly radiotap
 * replace confusing "country IE" by "country element" since it's
   not referring to Ireland

Note that I merged net-next to get a fix from mac80211 that got
there via net, to apply one patch that would otherwise conflict.

----------------------------------------------------------------
Alexander Wetzel (1):
      mac80211: fix TX aggregation stop race

Amar Singhal (1):
      cfg80211: Call reg_notifier for self managed hints conditionally

Andrew Zaborowski (1):
      nl80211: Reject disconnect commands except from conn_owner

Arend Van Spriel (2):
      cfg80211: use separate struct for FILS parameters
      nl80211: add FILS related parameters to ROAM event

Arend van Spriel (1):
      cfg80211: dynamically allocate per-tid stats for station info

Balaji Pothunoori (2):
      cfg80211: average ack rssi support for data frames
      mac80211: average ack rssi support for data frames

Bjoern Johansson (1):
      mac80211_hwsim: indicate support for powersave.

Colin Ian King (2):
      mac80211: ethtool: avoid 32 bit multiplication overflow
      cfg80211: fix spelling mistake: "uknown" -> "unknown"

Denis Kenzior (2):
      nl80211: Fix compilation
      nl80211: Optimize cfg80211_bss_expire invocations

Gregory Greenman (1):
      mac80211: add api to set CSA counter in mac80211

Haim Dreyfuss (1):
      nl80211: Add wmm rule attribute to NL80211_CMD_GET_WIPHY dump command

Ilan Peer (1):
      mac80211: Support adding duration for prepare_tx() callback

Johannes Berg (9):
      mac80211: rename rtap_vendor_space to rtap_space
      mac80211: clean up rate info bandwidth setting
      mac80211: ethtool: memset the whole sinfo struct to 0
      mac80211: remove pointless flags=0 assignment
      cfg80211/mac80211: revert to stack allocation for sinfo
      mac80211: allocate and fill tidstats only when needed
      cfg80211: release station info tidstats where needed
      cfg80211: add missing kernel-doc
      Merge remote-tracking branch 'net-next/master' into mac80211-next

João Paulo Rechi Vita (2):
      rfkill: Rename rfkill_any_led_trigger* functions
      rfkill: Create rfkill-none LED trigger

Toke Høiland-Jørgensen (3):
      regulatory: Rename confusing 'country IE' in log output
      cfg80211: Expose TXQ stats and parameters to userspace
      mac80211: Support the new cfg80211 TXQ stats API

Vidyullatha Kanchanapally (1):
      nl80211: Update ERP info using NL80211_CMD_UPDATE_CONNECT_PARAMS

 drivers/net/wireless/ath/ath9k/main.c             |   3 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c |   6 +-
 drivers/net/wireless/mac80211_hwsim.c             |   1 +
 include/net/cfg80211.h                            | 131 ++++++++--
 include/net/mac80211.h                            |  18 +-
 include/uapi/linux/nl80211.h                      |  99 ++++++-
 net/mac80211/cfg.c                                | 103 +++++++-
 net/mac80211/driver-ops.h                         |   8 +-
 net/mac80211/ethtool.c                            |  13 +-
 net/mac80211/ht.c                                 |  44 ++--
 net/mac80211/ieee80211_i.h                        |   3 +
 net/mac80211/main.c                               |   3 +
 net/mac80211/mlme.c                               |  17 +-
 net/mac80211/rx.c                                 |  40 ++-
 net/mac80211/sta_info.c                           |  38 ++-
 net/mac80211/sta_info.h                           |   5 +-
 net/mac80211/status.c                             |   2 +
 net/mac80211/trace.h                              |  25 +-
 net/mac80211/tx.c                                 |  45 ++++
 net/mac80211/util.c                               |   6 +-
 net/rfkill/core.c                                 |  66 +++--
 net/wireless/core.c                               |   4 +
 net/wireless/nl80211.c                            | 304 ++++++++++++++++++++--
 net/wireless/rdev-ops.h                           |  12 +
 net/wireless/reg.c                                |  39 ++-
 net/wireless/sme.c                                |  88 +++++--
 net/wireless/trace.h                              |  14 +
 net/wireless/util.c                               |  11 +
 28 files changed, 968 insertions(+), 180 deletions(-)

^ permalink raw reply

* [PATCH bpf-next v2 0/3] bpf: add boot parameters for sysctl knobs
From: Eugene Syromiatnikov @ 2018-05-23 12:18 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, linux-doc, Kees Cook, Kai-Heng Feng,
	Daniel Borkmann, Alexei Starovoitov, Jonathan Corbet, Jiri Olsa,
	Jesper Dangaard Brouer

Some BPF sysctl knobs affect the loading of BPF programs, and during
system boot/init stages these sysctls are not yet configured.
A concrete example is systemd, that has implemented loading of BPF
programs.

Thus, to allow controlling these setting at early boot, this patch set
adds the ability to change the default setting of these sysctl knobs
as well as option to override them via a boot-time kernel parameter
(in order to avoid rebuilding kernel each time a need of changing these
defaults arises).

The sysctl knobs in question are kernel.unprivileged_bpf_disable,
net.core.bpf_jit_harden, and net.core.bpf_jit_kallsyms.

Eugene Syromiatnikov (3):
  bpf: add ability to configure unprivileged BPF via boot-time parameter
  bpf: add ability to configure BPF JIT hardening via boot-time
    parameter
  bpf: add ability to configure BPF JIT kallsyms export at the boot time

 Documentation/admin-guide/kernel-parameters.txt | 28 ++++++++
 init/Kconfig                                    | 90 +++++++++++++++++++++++++
 kernel/bpf/core.c                               | 31 +++++++++
 kernel/bpf/syscall.c                            | 16 +++++
 4 files changed, 165 insertions(+)

-- 
2.1.4

^ permalink raw reply

* [PATCH bpf-next v2 1/3] bpf: add ability to configure unprivileged BPF via boot-time parameter
From: Eugene Syromiatnikov @ 2018-05-23 12:18 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, linux-doc, Kees Cook, Kai-Heng Feng,
	Daniel Borkmann, Alexei Starovoitov, Jonathan Corbet, Jiri Olsa,
	Jesper Dangaard Brouer

This patch introduces two configuration options,
UNPRIVILEGED_BPF_BOOTPARAM and UNPRIVILEGED_BPF_BOOTPARAM_VALUE, that
allow configuring the initial value of kernel.unprivileged_bpf_disabled
sysctl knob, which is useful for the cases when disabling unprivileged
bpf() access during the early boot is desirable.

Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  8 +++++++
 init/Kconfig                                    | 31 +++++++++++++++++++++++++
 kernel/bpf/syscall.c                            | 16 +++++++++++++
 3 files changed, 55 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 11fc28e..aa8e831 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4355,6 +4355,14 @@
 	unknown_nmi_panic
 			[X86] Cause panic on unknown NMI.
 
+	unprivileged_bpf_disabled=
+			Format: { "0" | "1" }
+			Sets initial value of kernel.unprivileged_bpf_disabled
+			sysctl knob.
+			0 - unprivileged bpf() syscall access enabled.
+			1 - unprivileged bpf() syscall access disabled.
+			Default value is set via kernel config option.
+
 	usbcore.authorized_default=
 			[USB] Default USB device authorization:
 			(default -1 = authorized except for wireless USB,
diff --git a/init/Kconfig b/init/Kconfig
index 480a4f2..1403a3e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1404,6 +1404,37 @@ config BPF_JIT_ALWAYS_ON
 	  Enables BPF JIT and removes BPF interpreter to avoid
 	  speculative execution of BPF instructions by the interpreter
 
+config UNPRIVILEGED_BPF_BOOTPARAM
+	bool "Unprivileged bpf() boot parameter"
+	depends on BPF_SYSCALL
+	default n
+	help
+	  This option adds a kernel parameter 'unprivileged_bpf_disabled'
+	  that allows configuring default state of the
+	  kernel.unprivileged_bpf_disabled sysctl knob.
+	  If this option is selected, unprivileged access to the bpf() syscall
+	  can be disabled with unprivileged_bpf_disabled=1 on the kernel command
+	  line.  The purpose of this option is to allow disabling unprivileged
+	  bpf() syscall access during the early boot.
+
+	  If you are unsure how to answer this question, answer N.
+
+config UNPRIVILEGED_BPF_BOOTPARAM_VALUE
+	int "Unprivileged bpf() boot parameter default value"
+	depends on UNPRIVILEGED_BPF_BOOTPARAM
+	range 0 1
+	default 0
+	help
+	  This option sets the default value for the kernel parameter
+	  'unprivileged_bpf_disabled', which allows disabling unprivileged bpf()
+	  syscall access at boot.  If this option is set to 0 (zero), the
+	  unprivileged bpf() boot kernel parameter will default to 0, allowing
+	  unprivileged bpf() syscall access at bootup.  If this option is
+	  set to 1 (one), the unprivileged bpf() kernel parameter will default
+	  to 1, disabling unprivileged bpf() syscall access at bootup.
+
+	  If you are unsure how to answer this question, answer 0.
+
 config USERFAULTFD
 	bool "Enable userfaultfd() system call"
 	select ANON_INODES
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bfcde94..fdc5fd9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -29,6 +29,7 @@
 #include <linux/ctype.h>
 #include <linux/btf.h>
 #include <linux/nospec.h>
+#include <linux/init.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -45,7 +46,22 @@ static DEFINE_SPINLOCK(prog_idr_lock);
 static DEFINE_IDR(map_idr);
 static DEFINE_SPINLOCK(map_idr_lock);
 
+#ifdef CONFIG_UNPRIVILEGED_BPF_BOOTPARAM
+int sysctl_unprivileged_bpf_disabled __read_mostly =
+	CONFIG_UNPRIVILEGED_BPF_BOOTPARAM_VALUE;
+
+static int __init unprivileged_bpf_setup(char *str)
+{
+	unsigned long disabled;
+
+	if (!kstrtoul(str, 0, &disabled))
+		sysctl_unprivileged_bpf_disabled = !!disabled;
+	return 1;
+}
+__setup("unprivileged_bpf_disabled=", unprivileged_bpf_setup);
+#else /* !CONFIG_UNPRIVILEGED_BPF_BOOTPARAM */
 int sysctl_unprivileged_bpf_disabled __read_mostly;
+#endif /* CONFIG_UNPRIVILEGED_BPF_BOOTPARAM */
 
 static const struct bpf_map_ops * const bpf_map_types[] = {
 #define BPF_PROG_TYPE(_id, _ops)
-- 
2.1.4

^ permalink raw reply related

* [PATCH bpf-next v2 2/3] bpf: add ability to configure BPF JIT hardening via boot-time parameter
From: Eugene Syromiatnikov @ 2018-05-23 12:18 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, linux-doc, Kees Cook, Kai-Heng Feng,
	Daniel Borkmann, Alexei Starovoitov, Jonathan Corbet, Jiri Olsa,
	Jesper Dangaard Brouer

This patch introduces two configuration options,
BPF_JIT_HARDEN_BOOTPARAM and BPF_JIT_HARDEN_BOOTPARAM_VALUE, that allow
configuring the initial value of net.core.bpf_jit_harden sysctl knob,
which is useful for enforcing JIT hardening during the early boot.

Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 10 +++++++++
 init/Kconfig                                    | 29 +++++++++++++++++++++++++
 kernel/bpf/core.c                               | 17 +++++++++++++++
 3 files changed, 56 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index aa8e831..5adc6d0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -442,6 +442,16 @@
 	bert_disable	[ACPI]
 			Disable BERT OS support on buggy BIOSes.
 
+	bpf_jit_harden=
+			Format: { "0" | "1" | "2" }
+			Sets initial value of net.core.bpf_jit_harden
+			sysctl knob.
+			0 - JIT hardening is disabled.
+			1 - JIT hardening is enabled for unprivileged users
+			    only.
+			2 - JIT hardening is enabled for all users.
+			Default value is set via kernel config option.
+
 	bttv.card=	[HW,V4L] bttv (bt848 + bt878 based grabber cards)
 	bttv.radio=	Most important insmod options are available as
 			kernel args too.
diff --git a/init/Kconfig b/init/Kconfig
index 1403a3e..b661497 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1435,6 +1435,35 @@ config UNPRIVILEGED_BPF_BOOTPARAM_VALUE
 
 	  If you are unsure how to answer this question, answer 0.
 
+config BPF_JIT_HARDEN_BOOTPARAM
+	bool "BPF JIT harden boot parameter"
+	default n
+	help
+	  This option adds a kernel parameter 'bpf_jit_harden' that allows
+	  configuring default state of the net.core.bpf_jit_harden sysctl knob.
+	  If this option is selected, the default value of the
+	  net.core.bpf_jit_harden sysctl knob can be set on the kernel command
+	  line.  The purpose of this option is to allow enabling BPF JIT
+	  hardening for the BPF programs created during the early boot.
+
+	  If you are unsure how to answer this question, answer N.
+
+config BPF_JIT_HARDEN_BOOTPARAM_VALUE
+	int "BPF JIT harden boot parameter default value"
+	depends on BPF_JIT_HARDEN_BOOTPARAM
+	range 0 2
+	default 0
+	help
+	  This option sets the default value for the kernel parameter
+	  'bpf_jit_enabled' that configures default value of the
+	  net.core.bpf_jit_harden sysctl knob at boot.  If this option is set to
+	  0 (zero), the net.core.bpf_jit_harden will default to 0, which will
+	  lead to no hardening at bootup.  If this option is set to 1 (one),
+	  hardening will be applied only to unprivileged users only.  If this
+	  option is set to 2 (two), JIT hardening will be enabled for all users.
+
+	  If you are unsure how to answer this question, answer 0.
+
 config USERFAULTFD
 	bool "Enable userfaultfd() system call"
 	select ANON_INODES
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2194c6a..9edb7a8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,7 @@
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
 #include <linux/perf_event.h>
+#include <linux/init.h>
 
 #include <asm/unaligned.h>
 
@@ -303,7 +304,23 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 #ifdef CONFIG_BPF_JIT
 /* All BPF JIT sysctl knobs here. */
 int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+
+#ifdef CONFIG_BPF_JIT_HARDEN_BOOTPARAM
+int bpf_jit_harden   __read_mostly = CONFIG_BPF_JIT_HARDEN_BOOTPARAM_VALUE;
+
+static int __init bpf_jit_harden_setup(char *str)
+{
+	unsigned long value;
+
+	if (!kstrtoul(str, 0, &value))
+		bpf_jit_harden = min(value, 2UL);
+	return 1;
+}
+__setup("bpf_jit_harden=", bpf_jit_harden_setup);
+#else /* !CONFIG_BPF_JIT_HARDEN_BOOTPARAM */
 int bpf_jit_harden   __read_mostly;
+#endif /* CONFIG_BPF_JIT_HARDEN_BOOTPARAM */
+
 int bpf_jit_kallsyms __read_mostly;
 
 static __always_inline void
-- 
2.1.4

^ permalink raw reply related

* [PATCH bpf-next v2 3/3] bpf: add ability to configure BPF JIT kallsyms export at the boot time
From: Eugene Syromiatnikov @ 2018-05-23 12:18 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, linux-doc, Kees Cook, Kai-Heng Feng,
	Daniel Borkmann, Alexei Starovoitov, Jonathan Corbet, Jiri Olsa,
	Jesper Dangaard Brouer

This patch introduces two configuration options,
BPF_JIT_KALLSYMS_BOOTPARAM and BPF_JIT_KALLSYMS_BOOTPARAM_VALUE, that
allow configuring the initial value of net.core.bpf_jit_kallsyms sysctl
knob. This enables export of addresses of JIT'ed BPF programs that
created during the early boot.

Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 10 +++++++++
 init/Kconfig                                    | 30 +++++++++++++++++++++++++
 kernel/bpf/core.c                               | 14 ++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 5adc6d0..10e7502 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -452,6 +452,16 @@
 			2 - JIT hardening is enabled for all users.
 			Default value is set via kernel config option.
 
+	bpf_jit_kallsyms=
+			Format: { "0" | "1" }
+			Sets initial value of net.core.bpf_jit_kallsyms
+			sysctl knob.
+			0 - Addresses of JIT'ed BPF programs are not exported
+			    to kallsyms.
+			1 - Export of addresses of JIT'ed BPF programs is
+			    enabled for privileged users.
+			Default value is set via kernel config option.
+
 	bttv.card=	[HW,V4L] bttv (bt848 + bt878 based grabber cards)
 	bttv.radio=	Most important insmod options are available as
 			kernel args too.
diff --git a/init/Kconfig b/init/Kconfig
index b661497..b5405ca 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1464,6 +1464,36 @@ config BPF_JIT_HARDEN_BOOTPARAM_VALUE
 
 	  If you are unsure how to answer this question, answer 0.
 
+config BPF_JIT_KALLSYMS_BOOTPARAM
+	bool "BPF JIT kallsyms export boot parameter"
+	default n
+	help
+	  This option adds a kernel parameter 'bpf_jit_kallsyms' that allows
+	  configuring default state of the net.core.bpf_jit_kallsyms sysctl
+	  knob.  If this option is selected, the default value of the
+	  net.core.bpf_jit_kallsyms sysctl knob can be set on the kernel command
+	  line.  The purpose of this option is to allow enabling BPF JIT
+	  kallsyms export for the BPF programs created during the early boot,
+	  so they can be traced later.
+
+	  If you are unsure how to answer this question, answer N.
+
+config BPF_JIT_KALLSYMS_BOOTPARAM_VALUE
+	int "BPF JIT kallsyms export boot parameter default value"
+	depends on BPF_JIT_HARDEN_BOOTPARAM
+	range 0 1
+	default 0
+	help
+	  This option sets the default value for the kernel parameter
+	  'bpf_jit_kallsyms' that configures default value of the
+	  net.core.bpf_jit_kallsyms sysctl knob at boot.  If this option is set
+	  to 0 (zero), the net.core.bpf_jit_kallsyms will default to 0, which
+	  will lead to disabling of exporting of addresses of JIT'ed BPF
+	  programs.  If this option is set to 1 (one), addresses of privileged
+	  BPF programs are exported to kallsyms.
+
+	  If you are unsure how to answer this question, answer 0.
+
 config USERFAULTFD
 	bool "Enable userfaultfd() system call"
 	select ANON_INODES
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9edb7a8..003d708 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -321,7 +321,21 @@ __setup("bpf_jit_harden=", bpf_jit_harden_setup);
 int bpf_jit_harden   __read_mostly;
 #endif /* CONFIG_BPF_JIT_HARDEN_BOOTPARAM */
 
+#ifdef CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM
+int bpf_jit_kallsyms __read_mostly = CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM_VALUE;
+
+static int __init bpf_jit_kallsyms_setup(char *str)
+{
+	unsigned long enabled;
+
+	if (!kstrtoul(str, 0, &enabled))
+		bpf_jit_kallsyms = !!enabled;
+	return 1;
+}
+__setup("bpf_jit_kallsyms=", bpf_jit_kallsyms_setup);
+#else /* !CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM */
 int bpf_jit_kallsyms __read_mostly;
+#endif /* CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM */
 
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH v4 0/3] IR decoding using BPF
From: Daniel Borkmann @ 2018-05-23 12:21 UTC (permalink / raw)
  To: Sean Young, linux-media, linux-kernel, Alexei Starovoitov,
	Mauro Carvalho Chehab, netdev, Matthias Reichl, Devin Heitmueller,
	Y Song, Quentin Monnet
In-Reply-To: <cover.1526651592.git.sean@mess.org>

On 05/18/2018 04:07 PM, Sean Young wrote:
> The kernel IR decoders (drivers/media/rc/ir-*-decoder.c) support the most
> widely used IR protocols, but there are many protocols which are not
> supported[1]. For example, the lirc-remotes[2] repo has over 2700 remotes,
> many of which are not supported by rc-core. There is a "long tail" of
> unsupported IR protocols, for which lircd is need to decode the IR .
> 
> IR encoding is done in such a way that some simple circuit can decode it;
> therefore, bpf is ideal.
> 
> In order to support all these protocols, here we have bpf based IR decoding.
> The idea is that user-space can define a decoder in bpf, attach it to
> the rc device through the lirc chardev.
> 
> Separate work is underway to extend ir-keytable to have an extensive library
> of bpf-based decoders, and a much expanded library of rc keymaps.
> 
> Another future application would be to compile IRP[3] to a IR BPF program, and
> so support virtually every remote without having to write a decoder for each.
> It might also be possible to support non-button devices such as analog
> directional pads or air conditioning remote controls and decode the target
> temperature in bpf, and pass that to an input device.

Mauro, are you fine with this series going via bpf-next? How ugly would this
get with regards to merge conflicts wrt drivers/media/rc/?

Thanks,
Daniel

> Thanks,
> 
> Sean Young
> 
> [1] http://www.hifi-remote.com/wiki/index.php?title=DecodeIR
> [2] https://sourceforge.net/p/lirc-remotes/code/ci/master/tree/remotes/
> [3] http://www.hifi-remote.com/wiki/index.php?title=IRP_Notation
> 
> Changes since v3:
>  - Implemented review comments from Quentin Monnet and Y Song (thanks!)
>  - More helpful and better formatted bpf helper documentation
>  - Changed back to bpf_prog_array rather than open-coded implementation
>  - scancodes can be 64 bit
>  - bpf gets passed values in microseconds, not nanoseconds.
>    microseconds is more than than enough (IR receivers support carriers upto
>    70kHz, at which point a single period is already 14 microseconds). Also,
>    this makes it much more consistent with lirc mode2.
>  - Since it looks much more like lirc mode2, rename the program type to
>    BPF_PROG_TYPE_LIRC_MODE2.
>  - Rebased on bpf-next
> 
> Changes since v2:
>  - Fixed locking issues
>  - Improved self-test to cover more cases
>  - Rebased on bpf-next again
> 
> Changes since v1:
>  - Code review comments from Y Song <ys114321@gmail.com> and
>    Randy Dunlap <rdunlap@infradead.org>
>  - Re-wrote sample bpf to be selftest
>  - Renamed RAWIR_DECODER -> RAWIR_EVENT (Kconfig, context, bpf prog type)
>  - Rebase on bpf-next
>  - Introduced bpf_rawir_event context structure with simpler access checking
> 
> Sean Young (3):
>   bpf: bpf_prog_array_copy() should return -ENOENT if exclude_prog not
>     found
>   media: rc: introduce BPF_PROG_LIRC_MODE2
>   bpf: add selftest for lirc_mode2 type program
> 
>  drivers/media/rc/Kconfig                      |  13 +
>  drivers/media/rc/Makefile                     |   1 +
>  drivers/media/rc/bpf-lirc.c                   | 308 ++++++++++++++++++
>  drivers/media/rc/lirc_dev.c                   |  30 ++
>  drivers/media/rc/rc-core-priv.h               |  22 ++
>  drivers/media/rc/rc-ir-raw.c                  |  12 +-
>  include/linux/bpf_rcdev.h                     |  30 ++
>  include/linux/bpf_types.h                     |   3 +
>  include/uapi/linux/bpf.h                      |  53 ++-
>  kernel/bpf/core.c                             |  11 +-
>  kernel/bpf/syscall.c                          |   7 +
>  kernel/trace/bpf_trace.c                      |   2 +
>  tools/bpf/bpftool/prog.c                      |   1 +
>  tools/include/uapi/linux/bpf.h                |  53 ++-
>  tools/include/uapi/linux/lirc.h               | 217 ++++++++++++
>  tools/lib/bpf/libbpf.c                        |   1 +
>  tools/testing/selftests/bpf/Makefile          |   8 +-
>  tools/testing/selftests/bpf/bpf_helpers.h     |   6 +
>  .../testing/selftests/bpf/test_lirc_mode2.sh  |  28 ++
>  .../selftests/bpf/test_lirc_mode2_kern.c      |  23 ++
>  .../selftests/bpf/test_lirc_mode2_user.c      | 154 +++++++++
>  21 files changed, 974 insertions(+), 9 deletions(-)
>  create mode 100644 drivers/media/rc/bpf-lirc.c
>  create mode 100644 include/linux/bpf_rcdev.h
>  create mode 100644 tools/include/uapi/linux/lirc.h
>  create mode 100755 tools/testing/selftests/bpf/test_lirc_mode2.sh
>  create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_kern.c
>  create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_user.c
> 

^ permalink raw reply

* Re: Expected result when racing listen(2) on two sockets bound to the same address
From: Kirill Tkhai @ 2018-05-23 12:44 UTC (permalink / raw)
  To: Alexander Kurtz, linux-kernel, netdev
In-Reply-To: <a3ecf04de4fded7c71284a9695b7146d284d1f22.camel@kurtz.be>

Hi,

On 23.05.2018 14:15, Alexander Kurtz wrote:
> [Please keep me CC'ed; I'm not subscribed to the list]
> 
> Hi!
> 
> The program shown below (also available at [0]) does the following:
> 
>  * Create two sockets
>  * Enable SO_REUSEADDR on both
>  * Bind both sockets to [::1]:12345
>  * Spawn two threads which both call listen(2) on one socket each
>  * Check that at least one thread succeeded
> 
> Unfortunately, when running this program on Linux 4.16, it is sometimes
> possible that neither thread succeeds in calling listen(2):
> 
>     $ uname -a
>     Linux shepard 4.16.0-1-amd64 #1 SMP Debian 4.16.5-1 (2018-04-29) x86_64 GNU/Linux
>     $ time make
>     cc -Wall -Wextra -pedantic -Werror -O3    listenrace.c  -lpthread -o listenrace
>     for i in `seq 10000`; do ./listenrace; done
>     listenrace: listenrace.c:58: main: Assertion `result1 == 0 || result2 == 0' failed.
>     Aborted
>     listenrace: listenrace.c:58: main: Assertion `result1 == 0 || result2 == 0' failed.
>     Aborted
>     listenrace: listenrace.c:58: main: Assertion `result1 == 0 || result2 == 0' failed.
>     Aborted
> 
>     real	0m8.201s
>     user	0m6.801s
>     sys	0m2.141s
>     $ 
> 
> As can be seen, on 3 runs (out of 10000) calling listen(2) failed in
> *both* threads. Is this to be expected (i.e. "don't do this then") or
> could this be some race condition in the Linux kernel?

At the first sight, it is in kernel and is expected:

inet_csk_listen_start()
{
	/* There is race window here: we announce ourselves listening,
	 * but this transition is still not validated by get_port().
	 * It is OK, because this socket enters to hash table only
	 * after validation is complete.
	 */
        inet_sk_state_store(sk, TCP_LISTEN);
        if (!sk->sk_prot->get_port(sk, inet->inet_num)) {

}

CC netdev.

Kirill

^ permalink raw reply

* Re: [PATCH 1/1] selftests/bpf: Makefile fix "missing" headers on build with -idirafter
From: Daniel Borkmann @ 2018-05-23 12:44 UTC (permalink / raw)
  To: Sirio Balmelli; +Cc: netdev
In-Reply-To: <20180521070001.tzyzdb3o2ma3hwdv@vm4>

On 05/21/2018 09:00 AM, Sirio Balmelli wrote:
> Selftests fail to build on several distros/architectures because of
> 	missing headers files.
> 
> On a Ubuntu/x86_64 some missing headers are:
> 	asm/byteorder.h, asm/socket.h, asm/sockios.h
> 
> On a Debian/arm32 build already fails at sys/cdefs.h
> 
> In both cases, these already exist in /usr/include/<arch-specific-dir>,
> but Clang does not include these when using '-target bpf' flag,
> since it is no longer compiling against the host architecture.
> 
> The solution is to:
> 
> - run Clang without '-target bpf' and extract the include chain for the
> current system
> 
> - add these to the bpf build with '-idirafter'
> 
> The choice of -idirafter is to catch this error without injecting
> unexpected include behavior: if an arch-specific tree is built
> for bpf in the future, this will be correctly found by Clang.
> 
> Signed-off-by: Sirio Balmelli <sirio@b-ad.ch>

Ok, I've tried this on arm64:

# clang -v -E - </dev/null 2>&1 | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }'
-idirafter /usr/local/include
-idirafter /usr/lib/llvm-3.8/bin/../lib/clang/3.8.0/include
-idirafter /usr/include/aarch64-linux-gnu
-idirafter /usr/include

# clang -target bpf -v -E - </dev/null 2>&1 | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }'
-idirafter /usr/local/include
-idirafter /usr/lib/llvm-3.8/bin/../lib/clang/3.8.0/include
-idirafter /usr/include

# clang -target aarch64 -v -E - </dev/null 2>&1 | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }'
-idirafter /usr/local/include
-idirafter /usr/lib/llvm-3.8/bin/../lib/clang/3.8.0/include
-idirafter /usr/include

# clang -target aarch64-unknown-linux-gnu -v -E - </dev/null 2>&1 | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }'
-idirafter /usr/local/include
-idirafter /usr/lib/llvm-3.8/bin/../lib/clang/3.8.0/include
-idirafter /usr/include/aarch64-linux-gnu
-idirafter /usr/include

# llc --version
LLVM (http://llvm.org/):
  LLVM version 3.8.0

  Optimized build.
  Built Jul  9 2016 (11:22:59).
  Default target: aarch64-unknown-linux-gnu
  Host CPU: (unknown)
[...]

So the default target adds additionally /usr/include/aarch64-linux-gnu which is
what you're after. Seems okay if it does the trick. Worst case we can always
revert and find a different solution. I've applied it to bpf-next instead of
bpf in order to give this some time for test exposure, thanks Sirio!

^ permalink raw reply

* Re: KASAN: use-after-free Read in remove_wait_queue (2)
From: Guillaume Nault @ 2018-05-23 13:26 UTC (permalink / raw)
  To: Eric Biggers
  Cc: linux-ppp, Paul Mackerras, netdev, linux-kernel, syzkaller-bugs,
	syzbot, viro
In-Reply-To: <20180523032958.GE658@sol.localdomain>

On Tue, May 22, 2018 at 08:29:58PM -0700, Eric Biggers wrote:
> On Fri, May 18, 2018 at 06:02:23PM +0200, Guillaume Nault wrote:
> > On Sun, May 13, 2018 at 11:11:55PM -0700, Eric Biggers wrote:
> > > [+ppp list and maintainer]
> > > 
> > > This is a bug in ppp_generic.c; it still happens on Linus' tree and it's easily
> > > reproducible, see program below.  The bug is that the PPPIOCDETACH ioctl doesn't
> > > consider that the file can still be attached to epoll instances even when
> > > ->f_count == 1.
> > 
> > Right. What would it take to remove the file for the epoll instances?
> > Sorry for the naive question, but I'm not familiar with VFS and didn't
> > find a helper function we could call.
> > 
> 
> There is eventpoll_release_file(), but it's not exported to modules.  It might
> work to call it, but it seems like a hack.
> 
> > > Also, the reproducer doesn't test this but I think ppp_poll(),
> > > ppp_read(), and ppp_write() can all race with PPPIOCDETACH, causing
> > > use-after-frees as well.
> > 
> > I also believe so. ppp_release() resets ->private_data, and even though
> > functions like ppp_read() test ->private_data before executing, there's
> > no synchronisation mechanism to ensure that the update is visible
> > before the unit or channel is destroyed. Is that the kind of race you
> > had in mind?
> 
> Yes, though after looking into it more I *think* these additional races aren't
> actually possible, due to the 'f_count < 2' check.  These races could only
> happen with a shared fd table, but in that case fdget() would increment f_count
> for the duration of each operation, resulting in 'f_count >= 2' if both ioctl()
> and something else is running on the same file concurrently.
> 
> Note that this also means PPPIOCDETACH doesn't work at all if called from a
> multithreaded application...
> 
> > 
> > > Any chance that PPPIOCDETACH can simply be removed,
> > > given that it's apparently been "deprecated" for 16 years?
> > > Does anyone use it?
> > 
> > The only users I'm aware of are pppd versions older than ppp-2.4.2
> > (released in November 2003). But even at that time, there were issues
> > with PPPIOCDETACH as pppd didn't seem to react properly when this call
> > failed. An Internet search on the "PPPIOCDETACH file->f_count=" kernel
> > log string, or on the "Couldn't release PPP unit: Invalid argument"
> > error message of pppd, returns several related bug reports.
> > 
> > Originally, PPPIOCDETACH never failed, but testing ->f_count was
> > later introduced to fix crashes when the file descriptor had been
> > duplicated. It seems that this was motivated by polling issues too.
> > 
> > Long story short, it looks like PPPIOCDETACH never has worked well
> > and we have at least two more bugs to fix. Given how it has proven
> > fragile, I wouldn't be surprised if there were even more lurking
> > around. I'd say that it's probably safer to drop it than to add more
> > workarounds and playing wack-a-mole with those bugs.
> 
> IMO, if we can get away with removing it without any users noticing, that would
> be much better than trying to fix it with a VFS-level hack, and probably missing
> some cases.  I'll send a patch to get things started...
> 
Yes, I fully agree. That looks much safer, and given the track record
of this ioctl I very much doubt anyone would depend on it.

^ permalink raw reply

* Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
From: Willem de Bruijn @ 2018-05-23 13:37 UTC (permalink / raw)
  To: Jon Rosen (jrosen)
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL], open list
In-Reply-To: <9cbf19b5765746b88af6dadc71b99d78@XCH-RTP-016.cisco.com>

On Wed, May 23, 2018 at 7:54 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
>> > For the ring, there is no requirement to allocate exactly the amount
>> > specified by the user request. Safer than relying on shared memory
>> > and simpler than the extra allocation in this patch would be to allocate
>> > extra shadow memory at the end of the ring (and not mmap that).
>> >
>> > That still leaves an extra cold cacheline vs using tp_padding.
>>
>> Given my lack of experience and knowledge in writing kernel code
>> it was easier for me to allocate the shadow ring as a separate
>> structure.  Of course it's not about me and my skills so if it's
>> more appropriate to allocate at the tail of the existing ring
>> then certainly I can look at doing that.
>
> The memory for the ring is not one contiguous block, it's an array of
> blocks of pages (or 'order' sized blocks of pages). I don't think
> increasing the size of each of the blocks to provided storage would be
> such a good idea as it will risk spilling over into the next order and
> wasting lots of memory. I suspect it's also more complex than a single
> shadow ring to do both the allocation and the access.
>
> It could be tacked onto the end of the pg_vec[] used to store the
> pointers to the blocks. The challenge with that is that a pg_vec[] is
> created for each of RX and TX rings so either it would have to
> allocate unnecessary storage for TX or the caller will have to say if
> extra space should be allocated or not.  E.g.:
>
> static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order, int scratch, void **scratch_p)
>
> I'm not sure avoiding the extra allocation and moving it to the
> pg_vec[] for the RX ring is going to get the simplification you were
> hoping for.  Is there another way of storing the shadow ring which
> I should consider?

I did indeed mean attaching extra pages to pg_vec[]. It should be
simpler than a separate structure, but I may be wrong.

Either way, I still would prefer to avoid the shadow buffer completely.
It incurs complexity and cycle cost on all users because of only the
rare (non-existent?) consumer that overwrites the padding bytes.

Perhaps we can use padding yet avoid deadlock by writing a
timed value. The simplest would be jiffies >> N. Then only a
process that writes this exact value would be subject to drops and
then still only for a limited period.

Instead of depending on wall clock time, like jiffies, another option
would be to keep a percpu array of values. Each cpu has a zero
entry if it is not writing, nonzero if it is. If a writer encounters a
number in padding that is > num_cpus, then the state is garbage
from userspace. If <= num_cpus, it is adhered to only until that cpu
clears its entry, which is guaranteed to happen eventually.

Just a quick thought. This might not fly at all upon closer scrutiny.

^ permalink raw reply

* Re: [PATCH net-next] tipc: eliminate complaint of KMSAN uninit-value in tipc_conn_rcv_sub
From: Ying Xue @ 2018-05-23 13:38 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, syzkaller-bugs, tipc-discussion, dvyukov
In-Reply-To: <20180519.230021.538446373514892322.davem@davemloft.net>

On 05/20/2018 11:00 AM, David Miller wrote:
> From: Ying Xue <ying.xue@windriver.com>
> Date: Fri, 18 May 2018 19:50:55 +0800
> 
>> As variable s of struct tipc_subscr type is not initialized
>> in tipc_conn_rcv_from_sock() before it is used in tipc_conn_rcv_sub(),
>> KMSAN reported the following uninit-value type complaint:
> 
> I agree with others that the short read is the bug.

In this error case, Dmitry is right. A short read happened in
tipc_recvmsg() especially when the size of skb received from a socket of
user space was smaller than the msg_iter.count of struct msghdr (ie,
tipc_subscr object size) passed by tipc_conn_rcv_from_sock() in kernel
space.

But when tipc_recvmsg() copied the data of skb to msg_iter.kvec of
struct msghdr with skb data length rather than msg_iter.count, it means
the part of space (ie, msg_iter.count - skb data length) of
msg_iter.kvec was not initialized. For the detailed info, please refer
to its relevant code:

tipc_recvmsg()
{
...
        if (likely(!err)) {
                copy = min_t(int, dlen, buflen);
                if (unlikely(copy != dlen))
                        m->msg_flags |= MSG_TRUNC;
                rc = skb_copy_datagram_msg(skb, hlen, m, copy);
...
}

If we receive a skb message with recvmsg() in user space, it seems no
problem even if the length of msg_iter.iov is bigger than skb data size.
But under the same situation, if we receive a message through
sock_recvmsg() in kernel space, it might be a problem.

I have checked the receive functions of other stacks like TCP and UDP,
as a result, when msg_iter.count is bigger than skb->len, they never use
memset() to initialize the remaining area of msg_iter.kvec (ie,
msg_iter.count - skb->len) no matter whether we receive a message
through sock_recvmsg() in user space or kernel space.

> 
> You need to decide what should happen if not a full tipc_subscr object
> is obtained from the sock_recvmsg() call.
> 
> Proceeding to pass it on to tipc_conn_rcv_sub() cannot possibly be
> correct.

If we do not conduct a full read for tipc_subscr object through
sock_recvmsg() call, some fields of tipc_subscr object might be
incorrect. But I can confirm that the incorrect fields of tipc_subscr
object any fatal error except that we might wrongly add one subscription.

> 
> You're not getting what you are expecting from the peer, the memset()
> you are adding doesn't change that.

Before tipc_subscr object address is passed to msg_iter.kvec pointer, we
have initialized the whole tipc_subscr object with memset(). Just in
this case, this method should kill the uninit-value complaint.

If we initialize tipc_subscr object in tipc_recvmsg(), we have to
conduct initialization behavior for the msg_iter.kvec/msg_iter.vio area
(msg_iter.count - skb->len) no matter whether a message is received from
user space or kernel space. Particularly when the caller of recvmsg() is
in user space, the initialization seems no necessary.

> 
> And once you get this badly sized read, what does that do to
> the stream of subsequent recvmsg calls here?
> 

Even if we get a bad sized read for tipc_subscr object in
tipc_conn_rcv_sub(), it doesn't cause any fatal impact on system or TIPC
stack.

------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot

^ permalink raw reply

* Re: [PATCH bpf-next v3 10/10] tools: bpftool: add delimiters to multi-function JITed dumps
From: Daniel Borkmann @ 2018-05-23 13:50 UTC (permalink / raw)
  To: Sandipan Das, Jakub Kicinski
  Cc: ast, netdev, linuxppc-dev, mpe, naveen.n.rao, Quentin Monnet
In-Reply-To: <7142939b-515e-50ac-bc0b-50444bf9cc97@linux.vnet.ibm.com>

On 05/23/2018 12:37 PM, Sandipan Das wrote:
[...]
> Other than that, for powerpc64, there is a problem with the way the
> binutils disassembler code (in "opcodes/ppc-dis.c") passes arguments
> to the callback fprintf_json().
> 
> In fprintf_json(), we always expect the va_list elements to resolve
> to strings (char *). But for powerpc64, the register or immediate
> operands are always passed as integers. So, when the code attempts
> to resolve these operands using va_arg(ap, char *), bpftool crashes.
> For now, I am using a workaround based on vsnprintf() but this does
> not get the semantics correct for memory operands. You can probably
> see that for the store instructions in the JSON dump above this.
> 
> Daniel,
> 
> Would it be okay if I send out a fix for this in a different series?

I'm fine either way with regards to the fix. Feels like a portability bug
in the binutils disassembler?

We could probably have a feature test like in test-disassembler-four-args
and select a workaround in bpftool based on that outcome.

Thanks Sandipan!

  [1] tools/build/feature/test-disassembler-four-args.c

^ permalink raw reply

* Re: [net-next 1/6] net/dcb: Add dcbnl buffer attribute
From: John Fastabend @ 2018-05-23 13:52 UTC (permalink / raw)
  To: Jiri Pirko, Jakub Kicinski
  Cc: Saeed Mahameed, David S. Miller, netdev, Huy Nguyen, Or Gerlitz
In-Reply-To: <20180523094331.GC3046@nanopsycho>

On 05/23/2018 02:43 AM, Jiri Pirko wrote:
> Tue, May 22, 2018 at 07:20:26AM CEST, jakub.kicinski@netronome.com wrote:
>> On Mon, 21 May 2018 14:04:57 -0700, Saeed Mahameed wrote:
>>> From: Huy Nguyen <huyn@mellanox.com>
>>>
>>> In this patch, we add dcbnl buffer attribute to allow user
>>> change the NIC's buffer configuration such as priority
>>> to buffer mapping and buffer size of individual buffer.
>>>
>>> This attribute combined with pfc attribute allows advance user to
>>> fine tune the qos setting for specific priority queue. For example,
>>> user can give dedicated buffer for one or more prirorities or user
>>> can give large buffer to certain priorities.
>>>
>>> We present an use case scenario where dcbnl buffer attribute configured
>>> by advance user helps reduce the latency of messages of different sizes.
>>>
>>> Scenarios description:
>>> On ConnectX-5, we run latency sensitive traffic with
>>> small/medium message sizes ranging from 64B to 256KB and bandwidth sensitive
>>> traffic with large messages sizes 512KB and 1MB. We group small, medium,
>>> and large message sizes to their own pfc enables priorities as follow.
>>>   Priorities 1 & 2 (64B, 256B and 1KB)
>>>   Priorities 3 & 4 (4KB, 8KB, 16KB, 64KB, 128KB and 256KB)
>>>   Priorities 5 & 6 (512KB and 1MB)
>>>
>>> By default, ConnectX-5 maps all pfc enabled priorities to a single
>>> lossless fixed buffer size of 50% of total available buffer space. The
>>> other 50% is assigned to lossy buffer. Using dcbnl buffer attribute,
>>> we create three equal size lossless buffers. Each buffer has 25% of total
>>> available buffer space. Thus, the lossy buffer size reduces to 25%. Priority
>>> to lossless  buffer mappings are set as follow.
>>>   Priorities 1 & 2 on lossless buffer #1
>>>   Priorities 3 & 4 on lossless buffer #2
>>>   Priorities 5 & 6 on lossless buffer #3
>>>
>>> We observe improvements in latency for small and medium message sizes
>>> as follows. Please note that the large message sizes bandwidth performance is
>>> reduced but the total bandwidth remains the same.
>>>   256B message size (42 % latency reduction)
>>>   4K message size (21% latency reduction)
>>>   64K message size (16% latency reduction)
>>>
>>> Signed-off-by: Huy Nguyen <huyn@mellanox.com>
>>> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
>>
>> On a cursory look this bares a lot of resemblance to devlink shared
>> buffer configuration ABI.  Did you look into using that?  
>>
>> Just to be clear devlink shared buffer ABIs don't require representors
>> and "switchdev mode".
> 
> If the CX5 buffer they are trying to utilize here is per port and not a
> shared one, it would seem ok for me to not have it in "devlink sb".
> 

+1 I think its probably reasonable to let devlink manage the global
(device layer) buffers and then have dcbnl partition the buffer up
further per netdev. Notice there is already a partitioning of the
buffers happening when DCB is enabled and/or parameters are changed.
So giving explicit control over this seems OK to me.

It would be nice though if the API gave us some hint on max/min/stride
of allowed values. Could the get API return these along with current
value? Presumably the allowed max size could change with devlink buffer
changes in how the global buffer is divided up as well.

The argument against allowing this API is it doesn't have anything to
do with the 802.1Q standard, but that is fine IMO.

.John

^ permalink raw reply

* Re: [PATCH] ppp: remove the PPPIOCDETACH ioctl
From: Guillaume Nault @ 2018-05-23 13:57 UTC (permalink / raw)
  To: Eric Biggers
  Cc: linux-ppp, Paul Mackerras, netdev, linux-fsdevel, linux-kernel,
	syzkaller-bugs, Eric Biggers
In-Reply-To: <20180523035952.25768-1-ebiggers3@gmail.com>

On Tue, May 22, 2018 at 08:59:52PM -0700, Eric Biggers wrote:
> From: Eric Biggers <ebiggers@google.com>
> 
> The PPPIOCDETACH ioctl effectively tries to "close" the given ppp file
> before f_count has reached 0, which is fundamentally a bad idea.  It
> does check 'f_count < 2', which excludes concurrent operations on the
> file since they would only be possible with a shared fd table, in which
> case each fdget() would take a file reference.  However, it fails to
> account for the fact that even with 'f_count == 1' the file can still be
> linked into epoll instances.  As reported by syzbot, this can trivially
> be used to cause a use-after-free.
> 
> Yet, the only known user of PPPIOCDETACH is pppd versions older than
> ppp-2.4.2, which was released almost 15 years ago (November 2003).
> Also, PPPIOCDETACH apparently stopped working reliably at around the
> same time, when the f_count check was added to the kernel, e.g. see
> https://lkml.org/lkml/2002/12/31/83.  Also, the current 'f_count < 2'
> check makes PPPIOCDETACH only work in single-threaded applications; it
> always fails if called from a multithreaded application.
> 
> All pppd versions released in the last 15 years just close() the file
> descriptor instead.
> 
> Therefore, instead of hacking around this bug by exporting epoll
> internals to modules, and probably missing other related bugs, just
> remove the PPPIOCDETACH ioctl and see if anyone actually notices.
> 
> Reported-by: syzbot+16363c99d4134717c05b@syzkaller.appspotmail.com
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Cc: stable@vger.kernel.org
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
>  Documentation/networking/ppp_generic.txt |  6 -----
>  drivers/net/ppp/ppp_generic.c            | 29 ------------------------
>  fs/compat_ioctl.c                        |  1 -
>  include/uapi/linux/ppp-ioctl.h           |  1 -
>  4 files changed, 37 deletions(-)
> 
> diff --git a/Documentation/networking/ppp_generic.txt b/Documentation/networking/ppp_generic.txt
> index 091d20273dcb..61daf4b39600 100644
> --- a/Documentation/networking/ppp_generic.txt
> +++ b/Documentation/networking/ppp_generic.txt
> @@ -300,12 +300,6 @@ unattached instance are:
>  The ioctl calls available on an instance of /dev/ppp attached to a
>  channel are:
>  
> -* PPPIOCDETACH detaches the instance from the channel.  This ioctl is
> -  deprecated since the same effect can be achieved by closing the
> -  instance.  In order to prevent possible races this ioctl will fail
> -  with an EINVAL error if more than one file descriptor refers to this
> -  instance (i.e. as a result of dup(), dup2() or fork()).
> -
>  * PPPIOCCONNECT connects this channel to a PPP interface.  The
>    argument should point to an int containing the interface unit
>    number.  It will return an EINVAL error if the channel is already
> diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
> index dc7c7ec43202..dce8812fe802 100644
> --- a/drivers/net/ppp/ppp_generic.c
> +++ b/drivers/net/ppp/ppp_generic.c
> @@ -603,35 +603,6 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>  		goto out;
>  	}
>  
> -	if (cmd == PPPIOCDETACH) {
> -		/*
> -		 * We have to be careful here... if the file descriptor
> -		 * has been dup'd, we could have another process in the
> -		 * middle of a poll using the same file *, so we had
> -		 * better not free the interface data structures -
> -		 * instead we fail the ioctl.  Even in this case, we
> -		 * shut down the interface if we are the owner of it.
> -		 * Actually, we should get rid of PPPIOCDETACH, userland
> -		 * (i.e. pppd) could achieve the same effect by closing
> -		 * this fd and reopening /dev/ppp.
> -		 */
> -		err = -EINVAL;
> -		if (pf->kind == INTERFACE) {
> -			ppp = PF_TO_PPP(pf);
> -			rtnl_lock();
> -			if (file == ppp->owner)
> -				unregister_netdevice(ppp->dev);
> -			rtnl_unlock();
> -		}
> -		if (atomic_long_read(&file->f_count) < 2) {
> -			ppp_release(NULL, file);
> -			err = 0;
> -		} else
> -			pr_warn("PPPIOCDETACH file->f_count=%ld\n",
> -				atomic_long_read(&file->f_count));
> -		goto out;
> -	}
> -

I'd rather add
+	if (cmd == PPPIOCDETACH) {
+		err = -EINVAL;
+		goto out;
+	}

Making PPPIOCDETACH unknown to ppp_generic means that the ioctl would
be handled by the underlying channel when pf->kind == CHANNEL (see the
chan->ops->ioctl() call further down). That shouldn't be a problem per
se, but even though PPPIOCDETACH is unsupported, I feel that it should
remain a ppp_generic thing. I don't really want its value to be reused
for other purposes in the future or have different behaviour depending
on the underlying channel.

Also PPPIOCDETACH can already fail with -EINVAL. Therefore, if ever
there really were programs out there using this call, they'd already
have to handle this case. Unconditionally returning -EINVAL would
further minimise possibilities for breakage.

^ permalink raw reply

* Re: [PATCH bpf-next v3 10/10] tools: bpftool: add delimiters to multi-function JITed dumps
From: Sandipan Das @ 2018-05-23 13:59 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Jakub Kicinski, ast, netdev, linuxppc-dev, mpe, naveen.n.rao,
	Quentin Monnet
In-Reply-To: <024d9f55-2382-8080-c0c2-e3acfbfb9590@iogearbox.net>


On 05/23/2018 07:20 PM, Daniel Borkmann wrote:
> On 05/23/2018 12:37 PM, Sandipan Das wrote:
> [...]
>> Other than that, for powerpc64, there is a problem with the way the
>> binutils disassembler code (in "opcodes/ppc-dis.c") passes arguments
>> to the callback fprintf_json().
>>
>> In fprintf_json(), we always expect the va_list elements to resolve
>> to strings (char *). But for powerpc64, the register or immediate
>> operands are always passed as integers. So, when the code attempts
>> to resolve these operands using va_arg(ap, char *), bpftool crashes.
>> For now, I am using a workaround based on vsnprintf() but this does
>> not get the semantics correct for memory operands. You can probably
>> see that for the store instructions in the JSON dump above this.
>>
>> Daniel,
>>
>> Would it be okay if I send out a fix for this in a different series?
> 
> I'm fine either way with regards to the fix. Feels like a portability bug
> in the binutils disassembler?
> 
> We could probably have a feature test like in test-disassembler-four-args
> and select a workaround in bpftool based on that outcome.
> 
> Thanks Sandipan!
> 
>   [1] tools/build/feature/test-disassembler-four-args.c
> 

Cool. Thanks for the tip!

- Sandipan

^ permalink raw reply

* Re: [bpf-next V4 PATCH 3/8] xdp: add tracepoint for devmap like cpumap have
From: John Fastabend @ 2018-05-23 14:24 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev, Daniel Borkmann,
	Alexei Starovoitov
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665048683.21055.2555532949856555388.stgit@firesoul>

On 05/18/2018 06:34 AM, Jesper Dangaard Brouer wrote:
> Notice how this allow us get XDP statistic without affecting the XDP
> performance, as tracepoint is no-longer activated on a per packet basis.
> 
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---


[...]

>  #include <trace/define_trace.h>
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index cab72c100bb5..6f84100723b0 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -50,6 +50,7 @@
>  #include <linux/bpf.h>
>  #include <net/xdp.h>
>  #include <linux/filter.h>
> +#include <trace/events/xdp.h>
>  
>  #define DEV_CREATE_FLAG_MASK \
>  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
> @@ -57,6 +58,7 @@
>  #define DEV_MAP_BULK_SIZE 16
>  struct xdp_bulk_queue {
>  	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
> +	struct net_device *dev_rx;
>  	unsigned int count;
>  };
>  
> @@ -219,8 +221,8 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
>  static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>  			 struct xdp_bulk_queue *bq)
>  {
> -	unsigned int processed = 0, drops = 0;
>  	struct net_device *dev = obj->dev;
> +	int sent = 0, drops = 0;
>  	int i;
>  
>  	if (unlikely(!bq->count))
> @@ -241,10 +243,13 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>  			drops++;
>  			xdp_return_frame(xdpf);
>  		}
> -		processed++;
> +		sent++;

Do 'dropped' frames also get counted as 'sent' frames? This seems a bit
counter-intuitive to me. Should it be 'drops+sent = total frames'
instead?

>  	}
>  	bq->count = 0;
>  
> +	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
> +			      sent, drops, bq->dev_rx, dev);
> +	bq->dev_rx = NULL;
>  	return 0;
>  }
>  
> @@ -301,18 +306,28 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
>  /* Runs under RCU-read-side, plus in softirq under NAPI protection.
>   * Thus, safe percpu variable access.
>   */
> -static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
> +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
> +		      struct net_device *dev_rx)
> +
>  {
>  	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
>  
>  	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
>  		bq_xmit_all(obj, bq);
>  
> +	/* Ingress dev_rx will be the same for all xdp_frame's in
> +	 * bulk_queue, because bq stored per-CPU and must be flushed
> +	 * from net_device drivers NAPI func end.
> +	 */
> +	if (!bq->dev_rx)
> +		bq->dev_rx = dev_rx;
> +
>  	bq->q[bq->count++] = xdpf;
>  	return 0;
>  }
>  
> -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
> +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> +		    struct net_device *dev_rx)
>  {
>  	struct net_device *dev = dst->dev;
>  	struct xdp_frame *xdpf;
> @@ -325,7 +340,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
>  	if (unlikely(!xdpf))
>  		return -EOVERFLOW;
>  
> -	err = bq_enqueue(dst, xdpf);
> +	err = bq_enqueue(dst, xdpf, dev_rx);
>  	if (err)
>  		return err;
>  
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1447ec94ef74..4a93423cc5ea 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3063,7 +3063,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>  	case BPF_MAP_TYPE_DEVMAP: {
>  		struct bpf_dtab_netdev *dst = fwd;
>  
> -		err = dev_map_enqueue(dst, xdp);
> +		err = dev_map_enqueue(dst, xdp, dev_rx);
>  		if (err)
>  			return err;
>  		__dev_map_insert_ctx(map, index);
> 

^ permalink raw reply

* [PATCH net-next 1/2] cxgb4: change the port capability bits definition
From: Ganesh Goudar @ 2018-05-23 14:32 UTC (permalink / raw)
  To: netdev, davem
  Cc: nirranjan, indranil, venkatesh, linux-scsi, varun, Ganesh Goudar,
	Casey Leedom

MDI Port Capabilities bit definitions were inconsistent with
regard to the MDI enum values. 2 bits used to define MDI in
the port capabilities are not really separable, it's a 2-bit
field with 4 different values. Change the port capability bit
definitions to be "AUTO" and "STRAIGHT" in order to get them
to line up with the enum's.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c     | 4 ++--
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h  | 8 ++++----
 drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c | 2 +-
 drivers/scsi/csiostor/csio_hw.c                | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index df5e7c7..537ed07 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -3941,8 +3941,8 @@ static fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16)
 	CAP16_TO_CAP32(FC_RX);
 	CAP16_TO_CAP32(FC_TX);
 	CAP16_TO_CAP32(ANEG);
-	CAP16_TO_CAP32(MDIX);
 	CAP16_TO_CAP32(MDIAUTO);
+	CAP16_TO_CAP32(MDISTRAIGHT);
 	CAP16_TO_CAP32(FEC_RS);
 	CAP16_TO_CAP32(FEC_BASER_RS);
 	CAP16_TO_CAP32(802_3_PAUSE);
@@ -3982,8 +3982,8 @@ static fw_port_cap16_t fwcaps32_to_caps16(fw_port_cap32_t caps32)
 	CAP32_TO_CAP16(802_3_PAUSE);
 	CAP32_TO_CAP16(802_3_ASM_DIR);
 	CAP32_TO_CAP16(ANEG);
-	CAP32_TO_CAP16(MDIX);
 	CAP32_TO_CAP16(MDIAUTO);
+	CAP32_TO_CAP16(MDISTRAIGHT);
 	CAP32_TO_CAP16(FEC_RS);
 	CAP32_TO_CAP16(FEC_BASER_RS);
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index e6b2e95..2d91480 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -2471,8 +2471,8 @@ enum fw_port_cap {
 	FW_PORT_CAP_FC_RX		= 0x0040,
 	FW_PORT_CAP_FC_TX		= 0x0080,
 	FW_PORT_CAP_ANEG		= 0x0100,
-	FW_PORT_CAP_MDIX		= 0x0200,
-	FW_PORT_CAP_MDIAUTO		= 0x0400,
+	FW_PORT_CAP_MDIAUTO		= 0x0200,
+	FW_PORT_CAP_MDISTRAIGHT		= 0x0400,
 	FW_PORT_CAP_FEC_RS		= 0x0800,
 	FW_PORT_CAP_FEC_BASER_RS	= 0x1000,
 	FW_PORT_CAP_FEC_RESERVED	= 0x2000,
@@ -2515,8 +2515,8 @@ enum fw_port_mdi {
 #define	FW_PORT_CAP32_802_3_PAUSE	0x00040000UL
 #define	FW_PORT_CAP32_802_3_ASM_DIR	0x00080000UL
 #define	FW_PORT_CAP32_ANEG		0x00100000UL
-#define	FW_PORT_CAP32_MDIX		0x00200000UL
-#define	FW_PORT_CAP32_MDIAUTO		0x00400000UL
+#define	FW_PORT_CAP32_MDIAUTO		0x00200000UL
+#define	FW_PORT_CAP32_MDISTRAIGHT	0x00400000UL
 #define	FW_PORT_CAP32_FEC_RS		0x00800000UL
 #define	FW_PORT_CAP32_FEC_BASER_RS	0x01000000UL
 #define	FW_PORT_CAP32_FEC_RESERVED1	0x02000000UL
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
index 798695b..3017f78 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
@@ -341,8 +341,8 @@ static fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16)
 	CAP16_TO_CAP32(FC_RX);
 	CAP16_TO_CAP32(FC_TX);
 	CAP16_TO_CAP32(ANEG);
-	CAP16_TO_CAP32(MDIX);
 	CAP16_TO_CAP32(MDIAUTO);
+	CAP16_TO_CAP32(MDISTRAIGHT);
 	CAP16_TO_CAP32(FEC_RS);
 	CAP16_TO_CAP32(FEC_BASER_RS);
 	CAP16_TO_CAP32(802_3_PAUSE);
diff --git a/drivers/scsi/csiostor/csio_hw.c b/drivers/scsi/csiostor/csio_hw.c
index 96bbb82..a10cf25 100644
--- a/drivers/scsi/csiostor/csio_hw.c
+++ b/drivers/scsi/csiostor/csio_hw.c
@@ -1500,8 +1500,8 @@ fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16)
 	CAP16_TO_CAP32(FC_RX);
 	CAP16_TO_CAP32(FC_TX);
 	CAP16_TO_CAP32(ANEG);
-	CAP16_TO_CAP32(MDIX);
 	CAP16_TO_CAP32(MDIAUTO);
+	CAP16_TO_CAP32(MDISTRAIGHT);
 	CAP16_TO_CAP32(FEC_RS);
 	CAP16_TO_CAP32(FEC_BASER_RS);
 	CAP16_TO_CAP32(802_3_PAUSE);
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 2/2] cxgb4: do L1 config when module is inserted
From: Ganesh Goudar @ 2018-05-23 14:33 UTC (permalink / raw)
  To: netdev, davem
  Cc: nirranjan, indranil, venkatesh, linux-scsi, varun, Ganesh Goudar,
	Casey Leedom

trigger an L1 configure operation when a transceiver module
is inserted in order to cause current "sticky" options like
Requested Forward Error Correction to be reapplied.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      | 26 ++++++++++++++--
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 11 +++++--
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c      | 40 +++++++++++++++++++++----
 3 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 211086b..0f305d9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -491,6 +491,9 @@ struct link_config {
 
 	unsigned char  link_ok;          /* link up? */
 	unsigned char  link_down_rc;     /* link down reason */
+
+	bool new_module;		 /* ->OS Transceiver Module inserted */
+	bool redo_l1cfg;		 /* ->CC redo current "sticky" L1 CFG */
 };
 
 #define FW_LEN16(fw_struct) FW_CMD_LEN16_V(sizeof(fw_struct) / 16)
@@ -1324,7 +1327,7 @@ static inline unsigned int qtimer_val(const struct adapter *adap,
 extern char cxgb4_driver_name[];
 extern const char cxgb4_driver_version[];
 
-void t4_os_portmod_changed(const struct adapter *adap, int port_id);
+void t4_os_portmod_changed(struct adapter *adap, int port_id);
 void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat);
 
 void t4_free_sge_resources(struct adapter *adap);
@@ -1505,8 +1508,25 @@ void t4_intr_disable(struct adapter *adapter);
 int t4_slow_intr_handler(struct adapter *adapter);
 
 int t4_wait_dev_ready(void __iomem *regs);
-int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port,
-		  struct link_config *lc);
+
+int t4_link_l1cfg_core(struct adapter *adap, unsigned int mbox,
+		       unsigned int port, struct link_config *lc,
+		       bool sleep_ok, int timeout);
+
+static inline int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox,
+				unsigned int port, struct link_config *lc)
+{
+	return t4_link_l1cfg_core(adapter, mbox, port, lc,
+				  true, FW_CMD_MAX_TIMEOUT);
+}
+
+static inline int t4_link_l1cfg_ns(struct adapter *adapter, unsigned int mbox,
+				   unsigned int port, struct link_config *lc)
+{
+	return t4_link_l1cfg_core(adapter, mbox, port, lc,
+				  false, FW_CMD_MAX_TIMEOUT);
+}
+
 int t4_restart_aneg(struct adapter *adap, unsigned int mbox, unsigned int port);
 
 u32 t4_read_pcie_cfg4(struct adapter *adap, int reg);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 130d1ee..513e1d3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -301,14 +301,14 @@ void t4_os_link_changed(struct adapter *adapter, int port_id, int link_stat)
 	}
 }
 
-void t4_os_portmod_changed(const struct adapter *adap, int port_id)
+void t4_os_portmod_changed(struct adapter *adap, int port_id)
 {
 	static const char *mod_str[] = {
 		NULL, "LR", "SR", "ER", "passive DA", "active DA", "LRM"
 	};
 
-	const struct net_device *dev = adap->port[port_id];
-	const struct port_info *pi = netdev_priv(dev);
+	struct net_device *dev = adap->port[port_id];
+	struct port_info *pi = netdev_priv(dev);
 
 	if (pi->mod_type == FW_PORT_MOD_TYPE_NONE)
 		netdev_info(dev, "port module unplugged\n");
@@ -325,6 +325,11 @@ void t4_os_portmod_changed(const struct adapter *adap, int port_id)
 	else
 		netdev_info(dev, "%s: unknown module type %d inserted\n",
 			    dev->name, pi->mod_type);
+
+	/* If the interface is running, then we'll need any "sticky" Link
+	 * Parameters redone with a new Transceiver Module.
+	 */
+	pi->link_cfg.redo_l1cfg = netif_running(dev);
 }
 
 int dbfifo_int_thresh = 10; /* 10 == 640 entry threshold */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 537ed07..704f696 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -4058,14 +4058,16 @@ static inline fw_port_cap32_t cc_to_fwcap_fec(enum cc_fec cc_fec)
  *	- If auto-negotiation is off set the MAC to the proper speed/duplex/FC,
  *	  otherwise do it later based on the outcome of auto-negotiation.
  */
-int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox,
-		  unsigned int port, struct link_config *lc)
+int t4_link_l1cfg_core(struct adapter *adapter, unsigned int mbox,
+		       unsigned int port, struct link_config *lc,
+		       bool sleep_ok, int timeout)
 {
 	unsigned int fw_caps = adapter->params.fw_caps_support;
-	struct fw_port_cmd cmd;
-	unsigned int fw_mdi = FW_PORT_CAP32_MDI_V(FW_PORT_CAP32_MDI_AUTO);
 	fw_port_cap32_t fw_fc, cc_fec, fw_fec, rcap;
+	struct fw_port_cmd cmd;
+	unsigned int fw_mdi;
 
+	fw_mdi = (FW_PORT_CAP32_MDI_V(FW_PORT_CAP32_MDI_AUTO) & lc->pcaps);
 	/* Convert driver coding of Pause Frame Flow Control settings into the
 	 * Firmware's API.
 	 */
@@ -4087,7 +4089,7 @@ int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox,
 	/* Figure out what our Requested Port Capabilities are going to be.
 	 */
 	if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) {
-		rcap = (lc->pcaps & ADVERT_MASK) | fw_fc | fw_fec;
+		rcap = lc->acaps | fw_fc | fw_fec;
 		lc->fc = lc->requested_fc & ~PAUSE_AUTONEG;
 		lc->fec = cc_fec;
 	} else if (lc->autoneg == AUTONEG_DISABLE) {
@@ -4113,7 +4115,8 @@ int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox,
 		cmd.u.l1cfg.rcap = cpu_to_be32(fwcaps32_to_caps16(rcap));
 	else
 		cmd.u.l1cfg32.rcap32 = cpu_to_be32(rcap);
-	return t4_wr_mbox(adapter, mbox, &cmd, sizeof(cmd), NULL);
+	return t4_wr_mbox_meat_timeout(adapter, mbox, &cmd, sizeof(cmd), NULL,
+				       sleep_ok, timeout);
 }
 
 /**
@@ -8335,6 +8338,9 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl)
 	fc = fwcap_to_cc_pause(linkattr);
 	speed = fwcap_to_speed(linkattr);
 
+	lc->new_module = false;
+	lc->redo_l1cfg = false;
+
 	if (mod_type != pi->mod_type) {
 		/* With the newer SFP28 and QSFP28 Transceiver Module Types,
 		 * various fundamental Port Capabilities which used to be
@@ -8369,6 +8375,8 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl)
 		pi->port_type = port_type;
 
 		pi->mod_type = mod_type;
+
+		lc->new_module = t4_is_inserted_mod_type(mod_type);
 		t4_os_portmod_changed(adapter, pi->port_id);
 	}
 
@@ -8401,6 +8409,26 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl)
 
 		t4_os_link_changed(adapter, pi->port_id, link_ok);
 	}
+
+	if (lc->new_module && lc->redo_l1cfg) {
+		struct link_config old_lc;
+		int ret;
+
+		/* Save the current L1 Configuration and restore it if an
+		 * error occurs.  We probably should fix the l1_cfg*()
+		 * routines not to change the link_config when an error
+		 * occurs ...
+		 */
+		old_lc = *lc;
+		ret = t4_link_l1cfg_ns(adapter, adapter->mbox, pi->lport, lc);
+		if (ret) {
+			*lc = old_lc;
+			dev_warn(adapter->pdev_dev,
+				 "Attempt to update new Transceiver Module settings failed\n");
+		}
+	}
+	lc->new_module = false;
+	lc->redo_l1cfg = false;
 }
 
 /**
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 0/4] patches 2018-05-23
From: Ursula Braun @ 2018-05-23 14:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun

Dave,

here are more smc-patches for net-next:

Patch 1 fixes an ioctl problem detected by syzbot.

Patch 2 improves smc_lgr_list locking in case of abnormal link
group termination. If you want to receive a version for the net-tree,
please let me know. It would look somewhat different, since the port
terminate code has been moved to smc_core.c on net-next.

Patch 3 enables SMC to deal with urgent data.

Patch 4 is a minor improvement to avoid out-of-sync linkgroups
between 2 peers.

Thanks, Ursula

Hans Wippel (1):
  net/smc: lock smc_lgr_list in port_terminate()

Stefan Raspl (1):
  net/smc: urgent data support

Ursula Braun (2):
  net/smc: return 0 for ioctl calls in states INIT and CLOSED
  net/smc: longer delay when freeing client link groups

 net/smc/af_smc.c   |  42 ++++++++++++++++---
 net/smc/smc.h      |  15 +++++++
 net/smc/smc_cdc.c  |  44 ++++++++++++++++++--
 net/smc/smc_cdc.h  |  13 ++++++
 net/smc/smc_core.c |  19 +++++++--
 net/smc/smc_rx.c   | 120 +++++++++++++++++++++++++++++++++++++++++++++++------
 net/smc/smc_tx.c   |  55 ++++++++++++++++--------
 net/smc/smc_tx.h   |   2 +-
 8 files changed, 267 insertions(+), 43 deletions(-)

-- 
2.16.3

^ permalink raw reply

* [PATCH net-next 1/4] net/smc: return 0 for ioctl calls in states INIT and CLOSED
From: Ursula Braun @ 2018-05-23 14:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun
In-Reply-To: <20180523143812.25824-1-ubraun@linux.ibm.com>

A connected SMC-socket contains addresses of descriptors for the
send buffer and the rmb (receive buffer). Fields of these descriptors
are used to determine the answer for certain ioctl requests.
Add extra handling for unconnected SMC socket states without valid
buffer descriptor addresses.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Reported-by: syzbot+e6714328fda813fc670f@syzkaller.appspotmail.com
---
 net/smc/af_smc.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 48530dab5c94..f2d925921d81 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1490,20 +1490,32 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 	case SIOCINQ: /* same as FIONREAD */
 		if (smc->sk.sk_state == SMC_LISTEN)
 			return -EINVAL;
-		answ = atomic_read(&smc->conn.bytes_to_rcv);
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED)
+			answ = 0;
+		else
+			answ = atomic_read(&smc->conn.bytes_to_rcv);
 		break;
 	case SIOCOUTQ:
 		/* output queue size (not send + not acked) */
 		if (smc->sk.sk_state == SMC_LISTEN)
 			return -EINVAL;
-		answ = smc->conn.sndbuf_desc->len -
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED)
+			answ = 0;
+		else
+			answ = smc->conn.sndbuf_desc->len -
 					atomic_read(&smc->conn.sndbuf_space);
 		break;
 	case SIOCOUTQNSD:
 		/* output queue size (not send only) */
 		if (smc->sk.sk_state == SMC_LISTEN)
 			return -EINVAL;
-		answ = smc_tx_prepared_sends(&smc->conn);
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED)
+			answ = 0;
+		else
+			answ = smc_tx_prepared_sends(&smc->conn);
 		break;
 	default:
 		return -ENOIOCTLCMD;
-- 
2.16.3

^ permalink raw reply related

* [PATCH net-next 2/4] net/smc: lock smc_lgr_list in port_terminate()
From: Ursula Braun @ 2018-05-23 14:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun
In-Reply-To: <20180523143812.25824-1-ubraun@linux.ibm.com>

From: Hans Wippel <hwippel@linux.ibm.com>

Currently, smc_port_terminate() is not holding the lock of the lgr list
while it is traversing the list. This patch adds locking to this
function and changes smc_lgr_terminate() accordingly.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
---
 net/smc/smc_core.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 1e5c0e90a706..21c244f53b0a 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -346,7 +346,7 @@ void smc_lgr_forget(struct smc_link_group *lgr)
 }
 
 /* terminate linkgroup abnormally */
-void smc_lgr_terminate(struct smc_link_group *lgr)
+static void __smc_lgr_terminate(struct smc_link_group *lgr)
 {
 	struct smc_connection *conn;
 	struct smc_sock *smc;
@@ -355,7 +355,8 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	if (lgr->terminating)
 		return;	/* lgr already terminating */
 	lgr->terminating = 1;
-	smc_lgr_forget(lgr);
+	if (!list_empty(&lgr->list)) /* forget lgr */
+		list_del_init(&lgr->list);
 	smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
 
 	write_lock_bh(&lgr->conns_lock);
@@ -377,16 +378,25 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	smc_lgr_schedule_free_work(lgr);
 }
 
+void smc_lgr_terminate(struct smc_link_group *lgr)
+{
+	spin_lock_bh(&smc_lgr_list.lock);
+	__smc_lgr_terminate(lgr);
+	spin_unlock_bh(&smc_lgr_list.lock);
+}
+
 /* Called when IB port is terminated */
 void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
 {
 	struct smc_link_group *lgr, *l;
 
+	spin_lock_bh(&smc_lgr_list.lock);
 	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
 		if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
 		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
-			smc_lgr_terminate(lgr);
+			__smc_lgr_terminate(lgr);
 	}
+	spin_unlock_bh(&smc_lgr_list.lock);
 }
 
 /* Determine vlan of internal TCP socket.
-- 
2.16.3

^ permalink raw reply related

* [PATCH net-next 3/4] net/smc: urgent data support
From: Ursula Braun @ 2018-05-23 14:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun
In-Reply-To: <20180523143812.25824-1-ubraun@linux.ibm.com>

From: Stefan Raspl <raspl@linux.ibm.com>

Add support for out of band data send and receive.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
---
 net/smc/af_smc.c   |  24 ++++++++++-
 net/smc/smc.h      |  15 +++++++
 net/smc/smc_cdc.c  |  44 ++++++++++++++++++--
 net/smc/smc_cdc.h  |  13 ++++++
 net/smc/smc_core.c |   1 +
 net/smc/smc_rx.c   | 120 +++++++++++++++++++++++++++++++++++++++++++++++------
 net/smc/smc_tx.c   |  55 ++++++++++++++++--------
 net/smc/smc_tx.h   |   2 +-
 8 files changed, 238 insertions(+), 36 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index f2d925921d81..2c369d4bb1c1 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -8,8 +8,6 @@
  *
  *  Initial restrictions:
  *    - support for alternate links postponed
- *    - partial support for non-blocking sockets only
- *    - support for urgent data postponed
  *
  *  Copyright IBM Corp. 2016, 2018
  *
@@ -1338,6 +1336,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 			if (sk->sk_state == SMC_APPCLOSEWAIT1)
 				mask |= EPOLLIN;
 		}
+		if (smc->conn.urg_state == SMC_URG_VALID)
+			mask |= EPOLLPRI;
 
 	}
 	release_sock(sk);
@@ -1477,10 +1477,13 @@ static int smc_getsockopt(struct socket *sock, int level, int optname,
 static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		     unsigned long arg)
 {
+	union smc_host_cursor cons, urg;
+	struct smc_connection *conn;
 	struct smc_sock *smc;
 	int answ;
 
 	smc = smc_sk(sock->sk);
+	conn = &smc->conn;
 	if (smc->use_fallback) {
 		if (!smc->clcsock)
 			return -EBADF;
@@ -1517,6 +1520,23 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		else
 			answ = smc_tx_prepared_sends(&smc->conn);
 		break;
+	case SIOCATMARK:
+		if (smc->sk.sk_state == SMC_LISTEN)
+			return -EINVAL;
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED) {
+			answ = 0;
+		} else {
+			smc_curs_write(&cons,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+				       conn);
+			smc_curs_write(&urg,
+				       smc_curs_read(&conn->urg_curs, conn),
+				       conn);
+			answ = smc_curs_diff(conn->rmb_desc->len,
+					     &cons, &urg) == 1;
+		}
+		break;
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/net/smc/smc.h b/net/smc/smc.h
index a1467e411645..51ae1f10d81a 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -114,6 +114,12 @@ struct smc_host_cdc_msg {		/* Connection Data Control message */
 	u8				reserved[18];
 } __aligned(8);
 
+enum smc_urg_state {
+	SMC_URG_VALID,			/* data present */
+	SMC_URG_NOTYET,			/* data pending */
+	SMC_URG_READ			/* data was already read */
+};
+
 struct smc_connection {
 	struct rb_node		alert_node;
 	struct smc_link_group	*lgr;		/* link group of connection */
@@ -160,6 +166,15 @@ struct smc_connection {
 	union smc_host_cursor	rx_curs_confirmed; /* confirmed to peer
 						    * source of snd_una ?
 						    */
+	union smc_host_cursor	urg_curs;	/* points at urgent byte */
+	enum smc_urg_state	urg_state;
+	bool			urg_tx_pend;	/* urgent data staged */
+	bool			urg_rx_skip_pend;
+						/* indicate urgent oob data
+						 * read, but previous regular
+						 * data still pending
+						 */
+	char			urg_rx_byte;	/* urgent byte */
 	atomic_t		bytes_to_rcv;	/* arrived data,
 						 * not yet received
 						 */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 8d2c079c87b0..a7e8d63fc8ae 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -164,6 +164,28 @@ static inline bool smc_cdc_before(u16 seq1, u16 seq2)
 	return (s16)(seq1 - seq2) < 0;
 }
 
+static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
+					    int *diff_prod)
+{
+	struct smc_connection *conn = &smc->conn;
+	char *base;
+
+	/* new data included urgent business */
+	smc_curs_write(&conn->urg_curs,
+		       smc_curs_read(&conn->local_rx_ctrl.prod, conn),
+		       conn);
+	conn->urg_state = SMC_URG_VALID;
+	if (!sock_flag(&smc->sk, SOCK_URGINLINE))
+		/* we'll skip the urgent byte, so don't account for it */
+		(*diff_prod)--;
+	base = (char *)conn->rmb_desc->cpu_addr;
+	if (conn->urg_curs.count)
+		conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
+	else
+		conn->urg_rx_byte = *(base + conn->rmb_desc->len - 1);
+	sk_send_sigurg(&smc->sk);
+}
+
 static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 				    struct smc_cdc_msg *cdc)
 {
@@ -194,15 +216,25 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 	diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
 				  &conn->local_rx_ctrl.prod);
 	if (diff_prod) {
+		if (conn->local_rx_ctrl.prod_flags.urg_data_present)
+			smc_cdc_handle_urg_data_arrival(smc, &diff_prod);
 		/* bytes_to_rcv is decreased in smc_recvmsg */
 		smp_mb__before_atomic();
 		atomic_add(diff_prod, &conn->bytes_to_rcv);
 		/* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
 		smp_mb__after_atomic();
 		smc->sk.sk_data_ready(&smc->sk);
-	} else if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
-		   (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) {
-		smc->sk.sk_data_ready(&smc->sk);
+	} else {
+		if (conn->local_rx_ctrl.prod_flags.write_blocked ||
+		    conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+		    conn->local_rx_ctrl.prod_flags.urg_data_pending) {
+			if (conn->local_rx_ctrl.prod_flags.urg_data_pending)
+				conn->urg_state = SMC_URG_NOTYET;
+			/* force immediate tx of current consumer cursor, but
+			 * under send_lock to guarantee arrival in seqno-order
+			 */
+			smc_tx_sndbuf_nonempty(conn);
+		}
 	}
 
 	/* piggy backed tx info */
@@ -212,6 +244,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		/* trigger socket release if connection closed */
 		smc_close_wake_tx_prepared(smc);
 	}
+	if (diff_cons && conn->urg_tx_pend &&
+	    atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
+		/* urg data confirmed by peer, indicate we're ready for more */
+		conn->urg_tx_pend = false;
+		smc->sk.sk_write_space(&smc->sk);
+	}
 
 	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
 		smc->sk.sk_err = ECONNRESET;
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index d2012fd22100..f60082fee5b8 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -146,6 +146,19 @@ static inline int smc_curs_diff(unsigned int size,
 	return max_t(int, 0, (new->count - old->count));
 }
 
+/* calculate cursor difference between old and new - returns negative
+ * value in case old > new
+ */
+static inline int smc_curs_comp(unsigned int size,
+				union smc_host_cursor *old,
+				union smc_host_cursor *new)
+{
+	if (old->wrap > new->wrap ||
+	    (old->wrap == new->wrap && old->count > new->count))
+		return -smc_curs_diff(size, new, old);
+	return smc_curs_diff(size, old, new);
+}
+
 static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
 					  union smc_host_cursor *local,
 					  struct smc_connection *conn)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 21c244f53b0a..2bf138e7d3ec 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -544,6 +544,7 @@ int smc_conn_create(struct smc_sock *smc,
 	}
 	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
 	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
+	conn->urg_state = SMC_URG_READ;
 #ifndef KERNEL_HAS_ATOMIC64
 	spin_lock_init(&conn->acurs_lock);
 #endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 290a434471d1..3d77b383cccd 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -47,16 +47,59 @@ static void smc_rx_wake_up(struct sock *sk)
  *   @conn   connection to update
  *   @cons   consumer cursor
  *   @len    number of Bytes consumed
+ *   Returns:
+ *   1 if we should end our receive, 0 otherwise
  */
-static void smc_rx_update_consumer(struct smc_connection *conn,
-				   union smc_host_cursor cons, size_t len)
+static int smc_rx_update_consumer(struct smc_sock *smc,
+				  union smc_host_cursor cons, size_t len)
 {
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	bool force = false;
+	int diff, rc = 0;
+
 	smc_curs_add(conn->rmb_desc->len, &cons, len);
+
+	/* did we process urgent data? */
+	if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) {
+		diff = smc_curs_comp(conn->rmb_desc->len, &cons,
+				     &conn->urg_curs);
+		if (sock_flag(sk, SOCK_URGINLINE)) {
+			if (diff == 0) {
+				force = true;
+				rc = 1;
+				conn->urg_state = SMC_URG_READ;
+			}
+		} else {
+			if (diff == 1) {
+				/* skip urgent byte */
+				force = true;
+				smc_curs_add(conn->rmb_desc->len, &cons, 1);
+				conn->urg_rx_skip_pend = false;
+			} else if (diff < -1)
+				/* we read past urgent byte */
+				conn->urg_state = SMC_URG_READ;
+		}
+	}
+
 	smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn),
 		       conn);
+
 	/* send consumer cursor update if required */
 	/* similar to advertising new TCP rcv_wnd if required */
-	smc_tx_consumer_update(conn);
+	smc_tx_consumer_update(conn, force);
+
+	return rc;
+}
+
+static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
+{
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor cons;
+
+	smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+		       conn);
+	smc_rx_update_consumer(smc, cons, len);
 }
 
 struct smc_spd_priv {
@@ -70,7 +113,6 @@ static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
 	struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private;
 	struct smc_sock *smc = priv->smc;
 	struct smc_connection *conn;
-	union smc_host_cursor cons;
 	struct sock *sk = &smc->sk;
 
 	if (sk->sk_state == SMC_CLOSED ||
@@ -79,9 +121,7 @@ static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
 		goto out;
 	conn = &smc->conn;
 	lock_sock(sk);
-	smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
-		       conn);
-	smc_rx_update_consumer(conn, cons, priv->len);
+	smc_rx_update_cons(smc, priv->len);
 	release_sock(sk);
 	if (atomic_sub_and_test(priv->len, &conn->splice_pending))
 		smc_rx_wake_up(sk);
@@ -184,6 +224,52 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo,
 	return rc;
 }
 
+static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
+			   int flags)
+{
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor cons;
+	struct sock *sk = &smc->sk;
+	int rc = 0;
+
+	if (sock_flag(sk, SOCK_URGINLINE) ||
+	    !(conn->urg_state == SMC_URG_VALID) ||
+	    conn->urg_state == SMC_URG_READ)
+		return -EINVAL;
+
+	if (conn->urg_state == SMC_URG_VALID) {
+		if (!(flags & MSG_PEEK))
+			smc->conn.urg_state = SMC_URG_READ;
+		msg->msg_flags |= MSG_OOB;
+		if (len > 0) {
+			if (!(flags & MSG_TRUNC))
+				rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
+			len = 1;
+			smc_curs_write(&cons,
+				       smc_curs_read(&conn->local_tx_ctrl.cons,
+						     conn),
+				       conn);
+			if (smc_curs_diff(conn->rmb_desc->len, &cons,
+					  &conn->urg_curs) > 1)
+				conn->urg_rx_skip_pend = true;
+			/* Urgent Byte was already accounted for, but trigger
+			 * skipping the urgent byte in non-inline case
+			 */
+			if (!(flags & MSG_PEEK))
+				smc_rx_update_consumer(smc, cons, 0);
+		} else {
+			msg->msg_flags |= MSG_TRUNC;
+		}
+
+		return rc ? -EFAULT : len;
+	}
+
+	if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN)
+		return 0;
+
+	return -EAGAIN;
+}
+
 /* smc_rx_recvmsg - receive data from RMBE
  * @msg:	copy data to receive buffer
  * @pipe:	copy data to pipe if set - indicates splice() call
@@ -209,12 +295,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return -EINVAL; /* future work for sk.sk_family == AF_SMC */
-	if (flags & MSG_OOB)
-		return -EINVAL; /* future work */
 
 	sk = &smc->sk;
 	if (sk->sk_state == SMC_LISTEN)
 		return -ENOTCONN;
+	if (flags & MSG_OOB)
+		return smc_rx_recv_urg(smc, msg, len, flags);
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
@@ -227,6 +313,9 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 
 		if (atomic_read(&conn->bytes_to_rcv))
 			goto copy;
+		else if (conn->urg_state == SMC_URG_VALID)
+			/* we received a single urgent Byte - skip */
+			smc_rx_update_cons(smc, 0);
 
 		if (sk->sk_shutdown & RCV_SHUTDOWN ||
 		    smc_cdc_rxed_any_close_or_senddone(conn) ||
@@ -281,14 +370,18 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 			continue;
 		}
 
-		/* not more than what user space asked for */
-		copylen = min_t(size_t, read_remaining, readable);
 		smc_curs_write(&cons,
 			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
 			       conn);
 		/* subsequent splice() calls pick up where previous left */
 		if (splbytes)
 			smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
+		if (conn->urg_state == SMC_URG_VALID &&
+		    sock_flag(&smc->sk, SOCK_URGINLINE) &&
+		    readable > 1)
+			readable--;	/* always stop at urgent Byte */
+		/* not more than what user space asked for */
+		copylen = min_t(size_t, read_remaining, readable);
 		/* determine chunks where to read from rcvbuf */
 		/* either unwrapped case, or 1st chunk of wrapped case */
 		chunk_len = min_t(size_t, copylen, conn->rmb_desc->len -
@@ -333,8 +426,8 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 			atomic_sub(copylen, &conn->bytes_to_rcv);
 			/* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
 			smp_mb__after_atomic();
-			if (msg)
-				smc_rx_update_consumer(conn, cons, copylen);
+			if (msg && smc_rx_update_consumer(smc, cons, copylen))
+				goto out;
 		}
 	} while (read_remaining);
 out:
@@ -346,4 +439,5 @@ void smc_rx_init(struct smc_sock *smc)
 {
 	smc->sk.sk_data_ready = smc_rx_wake_up;
 	atomic_set(&smc->conn.splice_pending, 0);
+	smc->conn.urg_state = SMC_URG_READ;
 }
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 1f4a38b857f0..cee666400752 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -32,7 +32,7 @@
 /***************************** sndbuf producer *******************************/
 
 /* callback implementation for sk.sk_write_space()
- * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * to wakeup sndbuf producers that blocked with smc_tx_wait().
  * called under sk_socket lock.
  */
 static void smc_tx_write_space(struct sock *sk)
@@ -56,7 +56,7 @@ static void smc_tx_write_space(struct sock *sk)
 	}
 }
 
-/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+/* Wakeup sndbuf producers that blocked with smc_tx_wait().
  * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
  */
 void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
@@ -66,8 +66,10 @@ void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
 		smc->sk.sk_write_space(&smc->sk);
 }
 
-/* blocks sndbuf producer until at least one byte of free space available */
-static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
+/* blocks sndbuf producer until at least one byte of free space available
+ * or urgent Byte was consumed
+ */
+static int smc_tx_wait(struct smc_sock *smc, int flags)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct smc_connection *conn = &smc->conn;
@@ -103,14 +105,15 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
 			break;
 		}
 		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-		if (atomic_read(&conn->sndbuf_space))
-			break; /* at least 1 byte of free space available */
+		if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
+			break; /* at least 1 byte of free & no urgent data */
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		sk_wait_event(sk, &timeo,
 			      sk->sk_err ||
 			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
 			      smc_cdc_rxed_any_close(conn) ||
-			      atomic_read(&conn->sndbuf_space),
+			      (atomic_read(&conn->sndbuf_space) &&
+			       !conn->urg_tx_pend),
 			      &wait);
 	}
 	remove_wait_queue(sk_sleep(sk), &wait);
@@ -157,8 +160,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		if (smc_cdc_rxed_any_close(conn))
 			return send_done ?: -ECONNRESET;
 
-		if (!atomic_read(&conn->sndbuf_space)) {
-			rc = smc_tx_wait_memory(smc, msg->msg_flags);
+		if (msg->msg_flags & MSG_OOB)
+			conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;
+
+		if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
+			rc = smc_tx_wait(smc, msg->msg_flags);
 			if (rc) {
 				if (send_done)
 					return send_done;
@@ -168,7 +174,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		}
 
 		/* initialize variables for 1st iteration of subsequent loop */
-		/* could be just 1 byte, even after smc_tx_wait_memory above */
+		/* could be just 1 byte, even after smc_tx_wait above */
 		writespace = atomic_read(&conn->sndbuf_space);
 		/* not more than what user space asked for */
 		copylen = min_t(size_t, send_remaining, writespace);
@@ -218,6 +224,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		/* since we just produced more new data into sndbuf,
 		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
 		 */
+		if ((msg->msg_flags & MSG_OOB) && !send_remaining)
+			conn->urg_tx_pend = true;
 		if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
 		    (atomic_read(&conn->sndbuf_space) >
 						(conn->sndbuf_desc->len >> 1)))
@@ -299,6 +307,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 	union smc_host_cursor sent, prep, prod, cons;
 	struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
 	struct smc_link_group *lgr = conn->lgr;
+	struct smc_cdc_producer_flags *pflags;
 	int to_send, rmbespace;
 	struct smc_link *link;
 	dma_addr_t dma_addr;
@@ -326,7 +335,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 		       conn);
 
 	/* if usable snd_wnd closes ask peer to advertise once it opens again */
-	conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
+	pflags = &conn->local_tx_ctrl.prod_flags;
+	pflags->write_blocked = (to_send >= rmbespace);
 	/* cf. usable snd_wnd */
 	len = min(to_send, rmbespace);
 
@@ -391,6 +401,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 		src_len_sum = src_len;
 	}
 
+	if (conn->urg_tx_pend && len == to_send)
+		pflags->urg_data_present = 1;
 	smc_tx_advance_cursors(conn, &prod, &sent, len);
 	/* update connection's cursors with advanced local cursors */
 	smc_curs_write(&conn->local_tx_ctrl.prod,
@@ -410,6 +422,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
  */
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 {
+	struct smc_cdc_producer_flags *pflags;
 	struct smc_cdc_tx_pend *pend;
 	struct smc_wr_buf *wr_buf;
 	int rc;
@@ -433,14 +446,21 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 		goto out_unlock;
 	}
 
-	rc = smc_tx_rdma_writes(conn);
-	if (rc) {
-		smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
-				   (struct smc_wr_tx_pend_priv *)pend);
-		goto out_unlock;
+	if (!conn->local_tx_ctrl.prod_flags.urg_data_present) {
+		rc = smc_tx_rdma_writes(conn);
+		if (rc) {
+			smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+					   (struct smc_wr_tx_pend_priv *)pend);
+			goto out_unlock;
+		}
 	}
 
 	rc = smc_cdc_msg_send(conn, wr_buf, pend);
+	pflags = &conn->local_tx_ctrl.prod_flags;
+	if (!rc && pflags->urg_data_present) {
+		pflags->urg_data_pending = 0;
+		pflags->urg_data_present = 0;
+	}
 
 out_unlock:
 	spin_unlock_bh(&conn->send_lock);
@@ -473,7 +493,7 @@ void smc_tx_work(struct work_struct *work)
 	release_sock(&smc->sk);
 }
 
-void smc_tx_consumer_update(struct smc_connection *conn)
+void smc_tx_consumer_update(struct smc_connection *conn, bool force)
 {
 	union smc_host_cursor cfed, cons;
 	int to_confirm;
@@ -487,6 +507,7 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 	to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
 
 	if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+	    force ||
 	    ((to_confirm > conn->rmbe_update_limit) &&
 	     ((to_confirm > (conn->rmb_desc->len / 2)) ||
 	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 44d077942976..9d2238909fa0 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -32,6 +32,6 @@ void smc_tx_init(struct smc_sock *smc);
 int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
 void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
-void smc_tx_consumer_update(struct smc_connection *conn);
+void smc_tx_consumer_update(struct smc_connection *conn, bool force);
 
 #endif /* SMC_TX_H */
-- 
2.16.3

^ permalink raw reply related

* [PATCH net-next 4/4] net/smc: longer delay when freeing client link groups
From: Ursula Braun @ 2018-05-23 14:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun
In-Reply-To: <20180523143812.25824-1-ubraun@linux.ibm.com>

Client link group creation always follows the server linkgroup creation.
If peer creates a new server link group, client has to create a new
client link group. If peer reuses a server link group for a new
connection, client has to reuse its client link group as well. To
avoid out-of-sync conditions for link groups a longer delay for
for client link group removal is defined to make sure this link group
still exists, once the peer decides to reuse a server link group.

Currently the client link group delay time is just 10 jiffies larger
than the server link group delay time. This patch increases the delay
difference to 10 seconds to have a better protection against
out-of-sync link groups.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
---
 net/smc/smc_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 2bf138e7d3ec..add82b0266f3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -28,7 +28,7 @@
 
 #define SMC_LGR_NUM_INCR		256
 #define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
-#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10)
+#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
 
 static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
 	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
-- 
2.16.3

^ permalink raw reply related

* Re: [bpf-next V4 PATCH 6/8] xdp: change ndo_xdp_xmit API to support bulking
From: John Fastabend @ 2018-05-23 14:42 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev, Daniel Borkmann,
	Alexei Starovoitov
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665050207.21055.17828855357297094854.stgit@firesoul>

On 05/18/2018 06:35 AM, Jesper Dangaard Brouer wrote:
> This patch change the API for ndo_xdp_xmit to support bulking
> xdp_frames.
> 
> When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown.
> Most of the slowdown is caused by DMA API indirect function calls, but
> also the net_device->ndo_xdp_xmit() call.
> 
> Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with
> single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed
> performance improved:
>  for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps
>  for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps
> 
> With frames avail as a bulk inside the driver ndo_xdp_xmit call,
> further optimizations are possible, like bulk DMA-mapping for TX.
> 
> Testing without CONFIG_RETPOLINE show the same performance for
> physical NIC drivers.
> 
> The virtual NIC driver tun sees a huge performance boost, as it can
> avoid doing per frame producer locking, but instead amortize the
> locking cost over the bulk.
> 
> V2: Fix compile errors reported by kbuild test robot <lkp@intel.com>
> V4: Isolated ndo, driver changes and callers.
> 
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

Couple suggestions for some optimizations/improvements but otherwise
looks good to me.

Thanks,
John

> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   26 +++++++---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h   |    2 -
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   21 ++++++--
>  drivers/net/tun.c                             |   37 +++++++++-----
>  drivers/net/virtio_net.c                      |   66 +++++++++++++++++++------
>  include/linux/netdevice.h                     |   14 +++--
>  kernel/bpf/devmap.c                           |   31 ++++++++----
>  net/core/filter.c                             |    8 ++-
>  8 files changed, 141 insertions(+), 64 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index 5efa68de935b..9b698c5acd05 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
>   * @dev: netdev
>   * @xdp: XDP buffer
>   *
> - * Returns Zero if sent, else an error code
> + * Returns number of frames successfully sent. Frames that fail are
> + * free'ed via XDP return API.
> + *
> + * For error cases, a negative errno code is returned and no-frames
> + * are transmitted (caller must handle freeing frames).
>   **/
> -int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
> +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
>  {
>  	struct i40e_netdev_priv *np = netdev_priv(dev);
>  	unsigned int queue_index = smp_processor_id();
>  	struct i40e_vsi *vsi = np->vsi;
> -	int err;
> +	int drops = 0;
> +	int i;
>  
>  	if (test_bit(__I40E_VSI_DOWN, vsi->state))
>  		return -ENETDOWN;
> @@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
>  	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
>  		return -ENXIO;
>  
> -	err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
> -	if (err != I40E_XDP_TX)
> -		return -ENOSPC;
> +	for (i = 0; i < n; i++) {
> +		struct xdp_frame *xdpf = frames[i];
> +		int err;
>  
> -	return 0;
> +		err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
> +		if (err != I40E_XDP_TX) {
> +			xdp_return_frame_rx_napi(xdpf);
> +			drops++;
> +		}
> +	}
> +

Perhaps not needed in this series, but I wonder how useful it is to hit
the ring with the remaining frames after the first error. The error will
typically be due to the TX queue being full. In this case it might make
more sense to just drop the entire train of frames vs beating the up a
full queue.

> +	return n - drops;
>  }
>  
>  /**
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index fdd2c55f03a6..eb8804b3d7b6 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
>  void i40e_detect_recover_hung(struct i40e_vsi *vsi);
>  int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
>  bool __i40e_chk_linearize(struct sk_buff *skb);
> -int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
> +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
>  void i40e_xdp_flush(struct net_device *dev);
>  
>  /**
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index 6652b201df5b..9645619f7729 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -10017,11 +10017,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>  	}
>  }
>  
> -static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
> +static int ixgbe_xdp_xmit(struct net_device *dev, int n,
> +			  struct xdp_frame **frames)
>  {
>  	struct ixgbe_adapter *adapter = netdev_priv(dev);
>  	struct ixgbe_ring *ring;
> -	int err;
> +	int drops = 0;
> +	int i;
>  
>  	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
>  		return -ENETDOWN;
> @@ -10033,11 +10035,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
>  	if (unlikely(!ring))
>  		return -ENXIO;
>  
> -	err = ixgbe_xmit_xdp_ring(adapter, xdpf);
> -	if (err != IXGBE_XDP_TX)
> -		return -ENOSPC;
> +	for (i = 0; i < n; i++) {
> +		struct xdp_frame *xdpf = frames[i];
> +		int err;
>  
> -	return 0;
> +		err = ixgbe_xmit_xdp_ring(adapter, xdpf);
> +		if (err != IXGBE_XDP_TX) {
> +			xdp_return_frame_rx_napi(xdpf);
> +			drops++;
> +		}
> +	}

Same as above, how about just aborting if we get an error?

> +
> +	return n - drops;
>  }
>  
>  static void ixgbe_xdp_flush(struct net_device *dev)
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 44d4f3d25350..d3dcfcb1c4b3 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -70,6 +70,7 @@
>  #include <net/netns/generic.h>
>  #include <net/rtnetlink.h>
>  #include <net/sock.h>
> +#include <net/xdp.h>
>  #include <linux/seq_file.h>
>  #include <linux/uio.h>
>  #include <linux/skb_array.h>
> @@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = {
>  	.ndo_get_stats64	= tun_net_get_stats64,
>  };
>  
> -static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
> +static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
>  {
>  	struct tun_struct *tun = netdev_priv(dev);
>  	struct tun_file *tfile;
>  	u32 numqueues;
> -	int ret = 0;
> +	int drops = 0;
> +	int cnt = n;
> +	int i;
>  
>  	rcu_read_lock();
>  
>  	numqueues = READ_ONCE(tun->numqueues);
>  	if (!numqueues) {
> -		ret = -ENOSPC;
> -		goto out;
> +		rcu_read_unlock();
> +		return -ENXIO; /* Caller will free/return all frames */
>  	}
>  
>  	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
>  					    numqueues]);
> -	/* Encode the XDP flag into lowest bit for consumer to differ
> -	 * XDP buffer from sk_buff.
> -	 */
> -	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
> -		this_cpu_inc(tun->pcpu_stats->tx_dropped);
> -		ret = -ENOSPC;
> +
> +	spin_lock(&tfile->tx_ring.producer_lock);
> +	for (i = 0; i < n; i++) {
> +		struct xdp_frame *xdp = frames[i];
> +		/* Encode the XDP flag into lowest bit for consumer to differ
> +		 * XDP buffer from sk_buff.
> +		 */
> +		void *frame = tun_xdp_to_ptr(xdp);
> +
> +		if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
> +			this_cpu_inc(tun->pcpu_stats->tx_dropped);
> +			xdp_return_frame_rx_napi(xdp);
> +			drops++;
> +		}

We have a ptr_ring_consume_batched() API it seems we should also have a
ptr_ring_produce_batched() now as well. Could be a follow up patch but
I think its probably worth considering.


>  	}
> +	spin_unlock(&tfile->tx_ring.producer_lock);
>  
> -out:
>  	rcu_read_unlock();
> -	return ret;
> +	return cnt - drops;
>  }
>  
>  static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
> @@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
>  	if (unlikely(!frame))
>  		return -EOVERFLOW;
>  
> -	return tun_xdp_xmit(dev, frame);
> +	return tun_xdp_xmit(dev, 1, &frame);
>  }
>  
>  static void tun_xdp_flush(struct net_device *dev)
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index f34794a76c4d..39a0783d1cde 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev)
>  	virtqueue_kick(sq->vq);
>  }
>  
> -static int __virtnet_xdp_xmit(struct virtnet_info *vi,
> -			       struct xdp_frame *xdpf)
> +static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> +				   struct send_queue *sq,
> +				   struct xdp_frame *xdpf)
>  {
>  	struct virtio_net_hdr_mrg_rxbuf *hdr;
> -	struct xdp_frame *xdpf_sent;
> -	struct send_queue *sq;
> -	unsigned int len;
> -	unsigned int qp;
>  	int err;
>  
> -	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
> -	sq = &vi->sq[qp];
> -
> -	/* Free up any pending old buffers before queueing new ones. */
> -	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
> -		xdp_return_frame(xdpf_sent);
> -
>  	/* virtqueue want to use data area in-front of packet */
>  	if (unlikely(xdpf->metasize > 0))
>  		return -EOPNOTSUPP;
> @@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
>  	return 0;
>  }
>  
> -static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
> +static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
> +				   struct xdp_frame *xdpf)
> +{
> +	struct xdp_frame *xdpf_sent;
> +	struct send_queue *sq;
> +	unsigned int len;
> +	unsigned int qp;
> +
> +	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
> +	sq = &vi->sq[qp];
> +
> +	/* Free up any pending old buffers before queueing new ones. */
> +	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
> +		xdp_return_frame(xdpf_sent);
> +
> +	return __virtnet_xdp_xmit_one(vi, sq, xdpf);
> +}
> +
> +static int virtnet_xdp_xmit(struct net_device *dev,
> +			    int n, struct xdp_frame **frames)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	struct receive_queue *rq = vi->rq;
> +	struct xdp_frame *xdpf_sent;
>  	struct bpf_prog *xdp_prog;
> +	struct send_queue *sq;
> +	unsigned int len;
> +	unsigned int qp;
> +	int drops = 0;
> +	int err;
> +	int i;
> +
> +	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
> +	sq = &vi->sq[qp];
>  
>  	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
>  	 * indicate XDP resources have been successfully allocated.
> @@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
>  	if (!xdp_prog)
>  		return -ENXIO;
>  
> -	return __virtnet_xdp_xmit(vi, xdpf);
> +	/* Free up any pending old buffers before queueing new ones. */
> +	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
> +		xdp_return_frame(xdpf_sent);
> +
> +	for (i = 0; i < n; i++) {
> +		struct xdp_frame *xdpf = frames[i];
> +
> +		err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
> +		if (err) {
> +			xdp_return_frame_rx_napi(xdpf);
> +			drops++;
> +		}
> +	}
> +	return n - drops;
>  }
>  
>  static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
> @@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
>  			xdpf = convert_to_xdp_frame(&xdp);
>  			if (unlikely(!xdpf))
>  				goto err_xdp;
> -			err = __virtnet_xdp_xmit(vi, xdpf);
> +			err = __virtnet_xdp_tx_xmit(vi, xdpf);
>  			if (unlikely(err)) {
>  				trace_xdp_exception(vi->dev, xdp_prog, act);
>  				goto err_xdp;
> @@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  			xdpf = convert_to_xdp_frame(&xdp);
>  			if (unlikely(!xdpf))
>  				goto err_xdp;
> -			err = __virtnet_xdp_xmit(vi, xdpf);
> +			err = __virtnet_xdp_tx_xmit(vi, xdpf);
>  			if (unlikely(err)) {
>  				trace_xdp_exception(vi->dev, xdp_prog, act);
>  				if (unlikely(xdp_page != page))
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 03ed492c4e14..debdb6286170 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1185,9 +1185,13 @@ struct dev_ifalias {
>   *	This function is used to set or query state related to XDP on the
>   *	netdevice and manage BPF offload. See definition of
>   *	enum bpf_netdev_command for details.
> - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
> - *	This function is used to submit a XDP packet for transmit on a
> - *	netdevice.
> + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
> + *	This function is used to submit @n XDP packets for transmit on a
> + *	netdevice. Returns number of frames successfully transmitted, frames
> + *	that got dropped are freed/returned via xdp_return_frame().
> + *	Returns negative number, means general error invoking ndo, meaning
> + *	no frames were xmit'ed and core-caller will free all frames.
> + *	TODO: Consider add flag to allow sending flush operation.
>   * void (*ndo_xdp_flush)(struct net_device *dev);
>   *	This function is used to inform the driver to flush a particular
>   *	xdp tx queue. Must be called on same CPU as xdp_xmit.
> @@ -1375,8 +1379,8 @@ struct net_device_ops {
>  						       int needed_headroom);
>  	int			(*ndo_bpf)(struct net_device *dev,
>  					   struct netdev_bpf *bpf);
> -	int			(*ndo_xdp_xmit)(struct net_device *dev,
> -						struct xdp_frame *xdp);
> +	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
> +						struct xdp_frame **xdp);
>  	void			(*ndo_xdp_flush)(struct net_device *dev);
>  };
>  
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 6f84100723b0..1317629662ae 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -222,7 +222,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>  			 struct xdp_bulk_queue *bq)
>  {
>  	struct net_device *dev = obj->dev;
> -	int sent = 0, drops = 0;
> +	int sent = 0, drops = 0, err = 0;
>  	int i;
>  
>  	if (unlikely(!bq->count))
> @@ -234,23 +234,32 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>  		prefetch(xdpf);
>  	}
>  
> -	for (i = 0; i < bq->count; i++) {
> -		struct xdp_frame *xdpf = bq->q[i];
> -		int err;
> -
> -		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> -		if (err) {
> -			drops++;
> -			xdp_return_frame(xdpf);
> -		}
> -		sent++;
> +	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
> +	if (sent < 0) {
> +		err = sent;
> +		sent = 0;
> +		goto error;
>  	}
> +	drops = bq->count - sent;
> +out:
>  	bq->count = 0;
>  
>  	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
>  			      sent, drops, bq->dev_rx, dev);
>  	bq->dev_rx = NULL;
>  	return 0;
> +error:
> +	/* If ndo_xdp_xmit fails with an errno, no frames have been
> +	 * xmit'ed and it's our responsibility to them free all.
> +	 */
> +	for (i = 0; i < bq->count; i++) {
> +		struct xdp_frame *xdpf = bq->q[i];
> +
> +		/* RX path under NAPI protection, can return frames faster */
> +		xdp_return_frame_rx_napi(xdpf);
> +		drops++;
> +	}
> +	goto out;
>  }
>  
>  /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 4a93423cc5ea..19504b7f4959 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3035,7 +3035,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
>  			u32 index)
>  {
>  	struct xdp_frame *xdpf;
> -	int err;
> +	int sent;
>  
>  	if (!dev->netdev_ops->ndo_xdp_xmit) {
>  		return -EOPNOTSUPP;
> @@ -3045,9 +3045,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
>  	if (unlikely(!xdpf))
>  		return -EOVERFLOW;
>  
> -	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> -	if (err)
> -		return err;
> +	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
> +	if (sent <= 0)
> +		return sent;
>  	dev->netdev_ops->ndo_xdp_flush(dev);
>  	return 0;
>  }
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox