Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 11/19] mlxsw: reg: Add Tunneling NVE Underlay Multicast Table Register
From: Ido Schimmel @ 2018-10-11  7:48 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

This register builds the linked list of underlay destination IPs used
for BUM traffic on the overlay.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 91 +++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index de97764a71b2..a8cc7a58c390 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -8476,6 +8476,96 @@ static inline void mlxsw_reg_tngcr_pack(char *payload,
 	mlxsw_reg_tngcr_nve_group_size_flood_set(payload, 1);
 }
 
+/* TNUMT - Tunneling NVE Underlay Multicast Table Register
+ * -------------------------------------------------------
+ * The TNUMT register is for building the underlay MC table. It is used
+ * for MC, flooding and BC traffic into the NVE tunnel.
+ */
+#define MLXSW_REG_TNUMT_ID 0xA003
+#define MLXSW_REG_TNUMT_LEN 0x20
+
+MLXSW_REG_DEFINE(tnumt, MLXSW_REG_TNUMT_ID, MLXSW_REG_TNUMT_LEN);
+
+enum mlxsw_reg_tnumt_record_type {
+	MLXSW_REG_TNUMT_RECORD_TYPE_IPV4,
+	MLXSW_REG_TNUMT_RECORD_TYPE_IPV6,
+	MLXSW_REG_TNUMT_RECORD_TYPE_LABEL,
+};
+
+/* reg_tnumt_record_type
+ * Record type.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tnumt, record_type, 0x00, 28, 4);
+
+enum mlxsw_reg_tnumt_tunnel_port {
+	MLXSW_REG_TNUMT_TUNNEL_PORT_NVE,
+	MLXSW_REG_TNUMT_TUNNEL_PORT_VPLS,
+	MLXSW_REG_TNUMT_TUNNEL_FLEX_TUNNEL0,
+	MLXSW_REG_TNUMT_TUNNEL_FLEX_TUNNEL1,
+};
+
+/* reg_tnumt_tunnel_port
+ * Tunnel port.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tnumt, tunnel_port, 0x00, 24, 4);
+
+/* reg_tnumt_underlay_mc_ptr
+ * Index to the underlay multicast table.
+ * For Spectrum the index is to the KVD linear.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, tnumt, underlay_mc_ptr, 0x00, 0, 24);
+
+/* reg_tnumt_vnext
+ * The next_underlay_mc_ptr is valid.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tnumt, vnext, 0x04, 31, 1);
+
+/* reg_tnumt_next_underlay_mc_ptr
+ * The next index to the underlay multicast table.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tnumt, next_underlay_mc_ptr, 0x04, 0, 24);
+
+/* reg_tnumt_record_size
+ * Number of IP addresses in the record.
+ * Range is 1..cap_max_nve_mc_entries_ipv{4,6}
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tnumt, record_size, 0x08, 0, 3);
+
+/* reg_tnumt_udip
+ * The underlay IPv4 addresses. udip[i] is reserved if i >= record_size.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, tnumt, udip, 0x0C, 0, 32, 0x04, 0x00, false);
+
+/* reg_tnumt_udip_ptr
+ * The pointer to the underlay IPv6 addresses. udip_ptr[i] is reserved if
+ * i >= record_size. The IPv6 addresses are configured by RIPS.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, tnumt, udip_ptr, 0x0C, 0, 24, 0x04, 0x00, false);
+
+static inline void mlxsw_reg_tnumt_pack(char *payload,
+					enum mlxsw_reg_tnumt_record_type type,
+					enum mlxsw_reg_tnumt_tunnel_port tport,
+					u32 underlay_mc_ptr, bool vnext,
+					u32 next_underlay_mc_ptr,
+					u8 record_size)
+{
+	MLXSW_REG_ZERO(tnumt, payload);
+	mlxsw_reg_tnumt_record_type_set(payload, type);
+	mlxsw_reg_tnumt_tunnel_port_set(payload, tport);
+	mlxsw_reg_tnumt_underlay_mc_ptr_set(payload, underlay_mc_ptr);
+	mlxsw_reg_tnumt_vnext_set(payload, vnext);
+	mlxsw_reg_tnumt_next_underlay_mc_ptr_set(payload, next_underlay_mc_ptr);
+	mlxsw_reg_tnumt_record_size_set(payload, record_size);
+}
+
 /* TNPC - Tunnel Port Configuration Register
  * -----------------------------------------
  * The TNPC register is used for tunnel port configuration.
@@ -9071,6 +9161,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
 	MLXSW_REG(mcda),
 	MLXSW_REG(mgpc),
 	MLXSW_REG(tngcr),
+	MLXSW_REG(tnumt),
 	MLXSW_REG(tnpc),
 	MLXSW_REG(tigcr),
 	MLXSW_REG(sbpr),
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 12/19] mlxsw: reg: Add Tunneling NVE Encapsulation ECN Mapping Register
From: Ido Schimmel @ 2018-10-11  7:48 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

This register performs mapping from overlay ECN to underlay ECN during
NVE encapsulation.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 31 +++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index a8cc7a58c390..aabba7360050 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -8566,6 +8566,36 @@ static inline void mlxsw_reg_tnumt_pack(char *payload,
 	mlxsw_reg_tnumt_record_size_set(payload, record_size);
 }
 
+/* TNEEM - Tunneling NVE Encapsulation ECN Mapping Register
+ * --------------------------------------------------------
+ * The TNEEM register maps the ECN of the overlay IP header, on ingress to
+ * the encapsulation, to the ECN of the underlay network.
+ */
+#define MLXSW_REG_TNEEM_ID 0xA012
+#define MLXSW_REG_TNEEM_LEN 0x0C
+
+MLXSW_REG_DEFINE(tneem, MLXSW_REG_TNEEM_ID, MLXSW_REG_TNEEM_LEN);
+
+/* reg_tneem_overlay_ecn
+ * ECN of the IP header in the overlay network.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, tneem, overlay_ecn, 0x04, 24, 2);
+
+/* reg_tneem_underlay_ecn
+ * ECN of the IP header in the underlay network.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tneem, underlay_ecn, 0x04, 16, 2);
+
+static inline void mlxsw_reg_tneem_pack(char *payload, u8 overlay_ecn,
+					u8 underlay_ecn)
+{
+	MLXSW_REG_ZERO(tneem, payload);
+	mlxsw_reg_tneem_overlay_ecn_set(payload, overlay_ecn);
+	mlxsw_reg_tneem_underlay_ecn_set(payload, underlay_ecn);
+}
+
 /* TNPC - Tunnel Port Configuration Register
  * -----------------------------------------
  * The TNPC register is used for tunnel port configuration.
@@ -9162,6 +9192,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
 	MLXSW_REG(mgpc),
 	MLXSW_REG(tngcr),
 	MLXSW_REG(tnumt),
+	MLXSW_REG(tneem),
 	MLXSW_REG(tnpc),
 	MLXSW_REG(tigcr),
 	MLXSW_REG(sbpr),
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 10/19] mlxsw: reg: Add Tunnel Port Configuration Register
From: Ido Schimmel @ 2018-10-11  7:48 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

This register enables / disables learning on different types of tunnel
ports (e.g., NVE, VPLS).

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 46 +++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 3b58f1013677..de97764a71b2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -8476,6 +8476,51 @@ static inline void mlxsw_reg_tngcr_pack(char *payload,
 	mlxsw_reg_tngcr_nve_group_size_flood_set(payload, 1);
 }
 
+/* TNPC - Tunnel Port Configuration Register
+ * -----------------------------------------
+ * The TNPC register is used for tunnel port configuration.
+ * Reserved when Spectrum.
+ */
+#define MLXSW_REG_TNPC_ID 0xA020
+#define MLXSW_REG_TNPC_LEN 0x18
+
+MLXSW_REG_DEFINE(tnpc, MLXSW_REG_TNPC_ID, MLXSW_REG_TNPC_LEN);
+
+enum mlxsw_reg_tnpc_tunnel_port {
+	MLXSW_REG_TNPC_TUNNEL_PORT_NVE,
+	MLXSW_REG_TNPC_TUNNEL_PORT_VPLS,
+	MLXSW_REG_TNPC_TUNNEL_FLEX_TUNNEL0,
+	MLXSW_REG_TNPC_TUNNEL_FLEX_TUNNEL1,
+};
+
+/* reg_tnpc_tunnel_port
+ * Tunnel port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, tnpc, tunnel_port, 0x00, 0, 4);
+
+/* reg_tnpc_learn_enable_v6
+ * During IPv6 underlay decapsulation, whether to learn from tunnel port.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tnpc, learn_enable_v6, 0x04, 1, 1);
+
+/* reg_tnpc_learn_enable_v4
+ * During IPv4 underlay decapsulation, whether to learn from tunnel port.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tnpc, learn_enable_v4, 0x04, 0, 1);
+
+static inline void mlxsw_reg_tnpc_pack(char *payload,
+				       enum mlxsw_reg_tnpc_tunnel_port tport,
+				       bool learn_enable)
+{
+	MLXSW_REG_ZERO(tnpc, payload);
+	mlxsw_reg_tnpc_tunnel_port_set(payload, tport);
+	mlxsw_reg_tnpc_learn_enable_v4_set(payload, learn_enable);
+	mlxsw_reg_tnpc_learn_enable_v6_set(payload, learn_enable);
+}
+
 /* TIGCR - Tunneling IPinIP General Configuration Register
  * -------------------------------------------------------
  * The TIGCR register is used for setting up the IPinIP Tunnel configuration.
@@ -9026,6 +9071,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
 	MLXSW_REG(mcda),
 	MLXSW_REG(mgpc),
 	MLXSW_REG(tngcr),
+	MLXSW_REG(tnpc),
 	MLXSW_REG(tigcr),
 	MLXSW_REG(sbpr),
 	MLXSW_REG(sbcm),
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 09/19] mlxsw: reg: Add Tunneling NVE General Configuration Register
From: Ido Schimmel @ 2018-10-11  7:47 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

This register configures global NVE configuration such as source IP of
the NVE tunnel and UDP source port calculation.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 185 ++++++++++++++++++++++
 1 file changed, 185 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 4988d24a628c..3b58f1013677 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -8292,6 +8292,190 @@ static inline void mlxsw_reg_mgpc_pack(char *payload, u32 counter_index,
 	mlxsw_reg_mgpc_opcode_set(payload, opcode);
 }
 
+/* TNGCR - Tunneling NVE General Configuration Register
+ * ----------------------------------------------------
+ * The TNGCR register is used for setting up the NVE Tunneling configuration.
+ */
+#define MLXSW_REG_TNGCR_ID 0xA001
+#define MLXSW_REG_TNGCR_LEN 0x44
+
+MLXSW_REG_DEFINE(tngcr, MLXSW_REG_TNGCR_ID, MLXSW_REG_TNGCR_LEN);
+
+enum mlxsw_reg_tngcr_type {
+	MLXSW_REG_TNGCR_TYPE_VXLAN,
+	MLXSW_REG_TNGCR_TYPE_VXLAN_GPE,
+	MLXSW_REG_TNGCR_TYPE_GENEVE,
+	MLXSW_REG_TNGCR_TYPE_NVGRE,
+};
+
+/* reg_tngcr_type
+ * Tunnel type for encapsulation and decapsulation. The types are mutually
+ * exclusive.
+ * Note: For Spectrum the NVE parsing must be enabled in MPRS.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, type, 0x00, 0, 4);
+
+/* reg_tngcr_nve_valid
+ * The VTEP is valid. Allows adding FDB entries for tunnel encapsulation.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_valid, 0x04, 31, 1);
+
+/* reg_tngcr_nve_ttl_uc
+ * The TTL for NVE tunnel encapsulation underlay unicast packets.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_ttl_uc, 0x04, 0, 8);
+
+/* reg_tngcr_nve_ttl_mc
+ * The TTL for NVE tunnel encapsulation underlay multicast packets.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_ttl_mc, 0x08, 0, 8);
+
+enum {
+	/* Do not copy flow label. Calculate flow label using nve_flh. */
+	MLXSW_REG_TNGCR_FL_NO_COPY,
+	/* Copy flow label from inner packet if packet is IPv6 and
+	 * encapsulation is by IPv6. Otherwise, calculate flow label using
+	 * nve_flh.
+	 */
+	MLXSW_REG_TNGCR_FL_COPY,
+};
+
+/* reg_tngcr_nve_flc
+ * For NVE tunnel encapsulation: Flow label copy from inner packet.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_flc, 0x0C, 25, 1);
+
+enum {
+	/* Flow label is static. In Spectrum this means '0'. Spectrum-2
+	 * uses {nve_fl_prefix, nve_fl_suffix}.
+	 */
+	MLXSW_REG_TNGCR_FL_NO_HASH,
+	/* 8 LSBs of the flow label are calculated from ECMP hash of the
+	 * inner packet. 12 MSBs are configured by nve_fl_prefix.
+	 */
+	MLXSW_REG_TNGCR_FL_HASH,
+};
+
+/* reg_tngcr_nve_flh
+ * NVE flow label hash.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_flh, 0x0C, 24, 1);
+
+/* reg_tngcr_nve_fl_prefix
+ * NVE flow label prefix. Constant 12 MSBs of the flow label.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_fl_prefix, 0x0C, 8, 12);
+
+/* reg_tngcr_nve_fl_suffix
+ * NVE flow label suffix. Constant 8 LSBs of the flow label.
+ * Reserved when nve_flh=1 and for Spectrum.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_fl_suffix, 0x0C, 0, 8);
+
+enum {
+	/* Source UDP port is fixed (default '0') */
+	MLXSW_REG_TNGCR_UDP_SPORT_NO_HASH,
+	/* Source UDP port is calculated based on hash */
+	MLXSW_REG_TNGCR_UDP_SPORT_HASH,
+};
+
+/* reg_tngcr_nve_udp_sport_type
+ * NVE UDP source port type.
+ * Spectrum uses LAG hash (SLCRv2). Spectrum-2 uses ECMP hash (RECRv2).
+ * When the source UDP port is calculated based on hash, the 8 LSBs are
+ * calculated from the hash and the 8 MSBs are configured by
+ * nve_udp_sport_prefix.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_udp_sport_type, 0x10, 24, 1);
+
+/* reg_tngcr_nve_udp_sport_prefix
+ * NVE UDP source port prefix. Constant 8 MSBs of the UDP source port.
+ * Reserved when NVE type is NVGRE.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_udp_sport_prefix, 0x10, 8, 8);
+
+/* reg_tngcr_nve_group_size_mc
+ * The amount of sequential linked lists of MC entries. The first linked
+ * list is configured by SFD.underlay_mc_ptr.
+ * Valid values: 1, 2, 4, 8, 16, 32, 64
+ * The linked lists are configured by TNUMT.
+ * The hash is set by LAG hash.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_group_size_mc, 0x18, 0, 8);
+
+/* reg_tngcr_nve_group_size_flood
+ * The amount of sequential linked lists of flooding entries. The first
+ * linked list is configured by SFMR.nve_tunnel_flood_ptr
+ * Valid values: 1, 2, 4, 8, 16, 32, 64
+ * The linked lists are configured by TNUMT.
+ * The hash is set by LAG hash.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, nve_group_size_flood, 0x1C, 0, 8);
+
+/* reg_tngcr_learn_enable
+ * During decapsulation, whether to learn from NVE port.
+ * Reserved when Spectrum-2. See TNPC.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, learn_enable, 0x20, 31, 1);
+
+/* reg_tngcr_underlay_virtual_router
+ * Underlay virtual router.
+ * Reserved when Spectrum-2.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, underlay_virtual_router, 0x20, 0, 16);
+
+/* reg_tngcr_underlay_rif
+ * Underlay ingress router interface. RIF type should be loopback generic.
+ * Reserved when Spectrum.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, underlay_rif, 0x24, 0, 16);
+
+/* reg_tngcr_usipv4
+ * Underlay source IPv4 address of the NVE.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tngcr, usipv4, 0x28, 0, 32);
+
+/* reg_tngcr_usipv6
+ * Underlay source IPv6 address of the NVE. For Spectrum, must not be
+ * modified under traffic of NVE tunneling encapsulation.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, tngcr, usipv6, 0x30, 16);
+
+static inline void mlxsw_reg_tngcr_pack(char *payload,
+					enum mlxsw_reg_tngcr_type type,
+					bool valid, u8 ttl)
+{
+	MLXSW_REG_ZERO(tngcr, payload);
+	mlxsw_reg_tngcr_type_set(payload, type);
+	mlxsw_reg_tngcr_nve_valid_set(payload, valid);
+	mlxsw_reg_tngcr_nve_ttl_uc_set(payload, ttl);
+	mlxsw_reg_tngcr_nve_ttl_mc_set(payload, ttl);
+	mlxsw_reg_tngcr_nve_flc_set(payload, MLXSW_REG_TNGCR_FL_NO_COPY);
+	mlxsw_reg_tngcr_nve_flh_set(payload, 0);
+	mlxsw_reg_tngcr_nve_udp_sport_type_set(payload,
+					       MLXSW_REG_TNGCR_UDP_SPORT_HASH);
+	mlxsw_reg_tngcr_nve_udp_sport_prefix_set(payload, 0);
+	mlxsw_reg_tngcr_nve_group_size_mc_set(payload, 1);
+	mlxsw_reg_tngcr_nve_group_size_flood_set(payload, 1);
+}
+
 /* TIGCR - Tunneling IPinIP General Configuration Register
  * -------------------------------------------------------
  * The TIGCR register is used for setting up the IPinIP Tunnel configuration.
@@ -8841,6 +9025,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
 	MLXSW_REG(mcc),
 	MLXSW_REG(mcda),
 	MLXSW_REG(mgpc),
+	MLXSW_REG(tngcr),
 	MLXSW_REG(tigcr),
 	MLXSW_REG(sbpr),
 	MLXSW_REG(sbcm),
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 08/19] mlxsw: spectrum: Seed LAG hash function
From: Ido Schimmel @ 2018-10-11  7:47 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

Currently, the seed of the LAG hash function is always set to 0, which
means it is identical across all switches. Instead, use a random number.

This is especially important now that VxLAN is supported, as the LAG
hash function is used to calculate the UDP source port of the
encapsulated packet.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h      | 9 ++++++++-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 5 ++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index a0441f3d4284..4988d24a628c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -1321,12 +1321,19 @@ MLXSW_ITEM32(reg, slcr, type, 0x00, 0, 4);
  */
 MLXSW_ITEM32(reg, slcr, lag_hash, 0x04, 0, 20);
 
-static inline void mlxsw_reg_slcr_pack(char *payload, u16 lag_hash)
+/* reg_slcr_seed
+ * LAG seed value. The seed is the same for all ports.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, slcr, seed, 0x08, 0, 32);
+
+static inline void mlxsw_reg_slcr_pack(char *payload, u16 lag_hash, u32 seed)
 {
 	MLXSW_REG_ZERO(slcr, payload);
 	mlxsw_reg_slcr_pp_set(payload, MLXSW_REG_SLCR_PP_GLOBAL);
 	mlxsw_reg_slcr_type_set(payload, MLXSW_REG_SLCR_TYPE_CRC);
 	mlxsw_reg_slcr_lag_hash_set(payload, lag_hash);
+	mlxsw_reg_slcr_seed_set(payload, seed);
 }
 
 /* SLCOR - Switch LAG Collector Register
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 2b14fd0dcc42..d42980df57b7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -21,6 +21,7 @@
 #include <linux/dcbnl.h>
 #include <linux/inetdevice.h>
 #include <linux/netlink.h>
+#include <linux/random.h>
 #include <net/switchdev.h>
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_mirred.h>
@@ -3666,8 +3667,10 @@ static void mlxsw_sp_traps_fini(struct mlxsw_sp *mlxsw_sp)
 static int mlxsw_sp_lag_init(struct mlxsw_sp *mlxsw_sp)
 {
 	char slcr_pl[MLXSW_REG_SLCR_LEN];
+	u32 seed;
 	int err;
 
+	get_random_bytes(&seed, sizeof(seed));
 	mlxsw_reg_slcr_pack(slcr_pl, MLXSW_REG_SLCR_LAG_HASH_SMAC |
 				     MLXSW_REG_SLCR_LAG_HASH_DMAC |
 				     MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE |
@@ -3676,7 +3679,7 @@ static int mlxsw_sp_lag_init(struct mlxsw_sp *mlxsw_sp)
 				     MLXSW_REG_SLCR_LAG_HASH_DIP |
 				     MLXSW_REG_SLCR_LAG_HASH_SPORT |
 				     MLXSW_REG_SLCR_LAG_HASH_DPORT |
-				     MLXSW_REG_SLCR_LAG_HASH_IPPROTO);
+				     MLXSW_REG_SLCR_LAG_HASH_IPPROTO, seed);
 	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcr), slcr_pl);
 	if (err)
 		return err;
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 07/19] mlxsw: reg: Extend FDB flush types for NVE
From: Ido Schimmel @ 2018-10-11  7:47 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

The device has the ability to flush all the FDB records that perform NVE
encapsulation or only a subset of these with a specific filtering
identifier (FID).

Expose these types so that they could be used by subsequent patches
where we need to flush the FDB records when an NVE device is unlinked
from a bridge (FID).

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index df81e0a1eb64..a0441f3d4284 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -1069,6 +1069,8 @@ enum mlxsw_reg_sfdf_flush_type {
 	MLXSW_REG_SFDF_FLUSH_PER_PORT_AND_FID,
 	MLXSW_REG_SFDF_FLUSH_PER_LAG,
 	MLXSW_REG_SFDF_FLUSH_PER_LAG_AND_FID,
+	MLXSW_REG_SFDF_FLUSH_PER_NVE,
+	MLXSW_REG_SFDF_FLUSH_PER_NVE_AND_FID,
 };
 
 /* reg_sfdf_flush_type
@@ -1079,6 +1081,10 @@ enum mlxsw_reg_sfdf_flush_type {
  * 3 - All FID dynamic entries pointing to port are flushed.
  * 4 - All dynamic entries pointing to LAG are flushed.
  * 5 - All FID dynamic entries pointing to LAG are flushed.
+ * 6 - All entries of type "Unicast Tunnel" or "Multicast Tunnel" are
+ *     flushed.
+ * 7 - All entries of type "Unicast Tunnel" or "Multicast Tunnel" are
+ *     flushed, per FID.
  * Access: RW
  */
 MLXSW_ITEM32(reg, sfdf, flush_type, 0x04, 28, 4);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 05/19] mlxsw: spectrum: Move L3 protocol and address definitions to global header file
From: Ido Schimmel @ 2018-10-11  7:47 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

The L3 protocol and address definitions are going to be used by the NVE
code, so move them to the global header file from the one private to the
router.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h        | 11 +++++++++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h | 11 -----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 206ecc69b51c..534a8be6a6db 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -383,6 +383,17 @@ static inline void mlxsw_sp_port_dcb_fini(struct mlxsw_sp_port *mlxsw_sp_port)
 #endif
 
 /* spectrum_router.c */
+enum mlxsw_sp_l3proto {
+	MLXSW_SP_L3_PROTO_IPV4,
+	MLXSW_SP_L3_PROTO_IPV6,
+#define MLXSW_SP_L3_PROTO_MAX	(MLXSW_SP_L3_PROTO_IPV6 + 1)
+};
+
+union mlxsw_sp_l3addr {
+	__be32 addr4;
+	struct in6_addr addr6;
+};
+
 int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp);
 void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp);
 int mlxsw_sp_netdevice_router_port_event(struct net_device *dev);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
index 354cb0834f3a..3dbafdeaab2b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
@@ -7,17 +7,6 @@
 #include "spectrum.h"
 #include "reg.h"
 
-enum mlxsw_sp_l3proto {
-	MLXSW_SP_L3_PROTO_IPV4,
-	MLXSW_SP_L3_PROTO_IPV6,
-#define MLXSW_SP_L3_PROTO_MAX	(MLXSW_SP_L3_PROTO_IPV6 + 1)
-};
-
-union mlxsw_sp_l3addr {
-	__be32 addr4;
-	struct in6_addr addr6;
-};
-
 struct mlxsw_sp_rif_ipip_lb;
 struct mlxsw_sp_rif_ipip_lb_config {
 	enum mlxsw_reg_ritr_loopback_ipip_type lb_ipipt;
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 06/19] mlxsw: spectrum: Add a new type of KVD linear record
From: Ido Schimmel @ 2018-10-11  7:47 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

When the device needs to flood an overlay packet to remote VTEPs it
retrieves a pointer to the head of a linked-list of records that store
the IP addresses of these VTEPs.

These records are stored in the KVD linear memory and configured via the
Tunneling NVE Underlay Multicast Table (TNUMT) register.

Add a new KVD linear entry type for these records, so that we will be
able to allocate and free them.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h       | 2 ++
 drivers/net/ethernet/mellanox/mlxsw/spectrum2_kvdl.c | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 534a8be6a6db..1f68ac2a20f4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -438,6 +438,7 @@ enum mlxsw_sp_kvdl_entry_type {
 	MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET,
 	MLXSW_SP_KVDL_ENTRY_TYPE_PBS,
 	MLXSW_SP_KVDL_ENTRY_TYPE_MCRIGR,
+	MLXSW_SP_KVDL_ENTRY_TYPE_TNUMT,
 };
 
 static inline unsigned int
@@ -448,6 +449,7 @@ mlxsw_sp_kvdl_entry_size(enum mlxsw_sp_kvdl_entry_type type)
 	case MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET: /* fall through */
 	case MLXSW_SP_KVDL_ENTRY_TYPE_PBS: /* fall through */
 	case MLXSW_SP_KVDL_ENTRY_TYPE_MCRIGR: /* fall through */
+	case MLXSW_SP_KVDL_ENTRY_TYPE_TNUMT: /* fall through */
 	default:
 		return 1;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_kvdl.c
index 68c8b148bef2..8d14770766b4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_kvdl.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_kvdl.c
@@ -35,6 +35,7 @@ static const struct mlxsw_sp2_kvdl_part_info mlxsw_sp2_kvdl_parts_info[] = {
 				 MAX_KVD_ACTION_SETS),
 	MLXSW_SP2_KVDL_PART_INFO(PBS, 0x24, KVD_SIZE, KVD_SIZE),
 	MLXSW_SP2_KVDL_PART_INFO(MCRIGR, 0x26, KVD_SIZE, KVD_SIZE),
+	MLXSW_SP2_KVDL_PART_INFO(TNUMT, 0x29, KVD_SIZE, KVD_SIZE),
 };
 
 #define MLXSW_SP2_KVDL_PARTS_INFO_LEN ARRAY_SIZE(mlxsw_sp2_kvdl_parts_info)
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 04/19] mlxsw: spectrum_switchdev: Do not assume notifier information type
From: Ido Schimmel @ 2018-10-11  7:47 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

VxLAN notifications are going to use a different notifier information
type, so cast to the correct type based on the received event.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 9edaec95ddd2..fa16ad2c6a50 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -2343,7 +2343,8 @@ static int mlxsw_sp_switchdev_event(struct notifier_block *unused,
 {
 	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
 	struct mlxsw_sp_switchdev_event_work *switchdev_work;
-	struct switchdev_notifier_fdb_info *fdb_info = ptr;
+	struct switchdev_notifier_fdb_info *fdb_info;
+	struct switchdev_notifier_info *info = ptr;
 	struct net_device *br_dev;
 
 	/* Tunnel devices are not our uppers, so check their master instead */
@@ -2367,6 +2368,9 @@ static int mlxsw_sp_switchdev_event(struct notifier_block *unused,
 	case SWITCHDEV_FDB_DEL_TO_DEVICE: /* fall through */
 	case SWITCHDEV_FDB_ADD_TO_BRIDGE: /* fall through */
 	case SWITCHDEV_FDB_DEL_TO_BRIDGE:
+		fdb_info = container_of(info,
+					struct switchdev_notifier_fdb_info,
+					info);
 		INIT_WORK(&switchdev_work->work,
 			  mlxsw_sp_switchdev_bridge_fdb_event_work);
 		memcpy(&switchdev_work->fdb_info, ptr,
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 01/19] mlxsw: spectrum: Remove misuses of private header file
From: Ido Schimmel @ 2018-10-11  7:47 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

The spectrum_router.h header file is private to the router block and
should only be included by direct consumers of it, such as dpipe and the
multicast routing code.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h           | 4 ++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h    | 4 ----
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 1 -
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 3cdb7aca90b7..206ecc69b51c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -416,6 +416,10 @@ mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan);
 void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif);
 void mlxsw_sp_rif_destroy_by_dev(struct mlxsw_sp *mlxsw_sp,
 				 struct net_device *dev);
+struct mlxsw_sp_rif *mlxsw_sp_rif_find_by_dev(const struct mlxsw_sp *mlxsw_sp,
+					      const struct net_device *dev);
+u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp);
+struct mlxsw_sp_fid *mlxsw_sp_rif_fid(const struct mlxsw_sp_rif *rif);
 
 /* spectrum_kvdl.c */
 enum mlxsw_sp_kvdl_entry_type {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
index 1a60391daafa..354cb0834f3a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
@@ -35,8 +35,6 @@ struct mlxsw_sp_neigh_entry;
 struct mlxsw_sp_nexthop;
 struct mlxsw_sp_ipip_entry;
 
-struct mlxsw_sp_rif *mlxsw_sp_rif_find_by_dev(const struct mlxsw_sp *mlxsw_sp,
-					      const struct net_device *dev);
 struct mlxsw_sp_rif *mlxsw_sp_rif_by_index(const struct mlxsw_sp *mlxsw_sp,
 					   u16 rif_index);
 u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif);
@@ -44,9 +42,7 @@ u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif);
 u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif);
 u32 mlxsw_sp_ipip_dev_ul_tb_id(const struct net_device *ol_dev);
 int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif);
-u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp);
 const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif);
-struct mlxsw_sp_fid *mlxsw_sp_rif_fid(const struct mlxsw_sp_rif *rif);
 int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp,
 				   struct mlxsw_sp_rif *rif,
 				   enum mlxsw_sp_rif_counter_dir dir,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index db715da7bab7..1a9370a46b0e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -17,7 +17,6 @@
 #include <net/switchdev.h>
 
 #include "spectrum_span.h"
-#include "spectrum_router.h"
 #include "spectrum_switchdev.h"
 #include "spectrum.h"
 #include "core.h"
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 03/19] mlxsw: spectrum_switchdev: Check notification relevance based on upper device
From: Ido Schimmel @ 2018-10-11  7:47 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

VxLAN FDB updates are sent with the VxLAN device which is not our upper
and will therefore be ignored by current code.

Solve this by checking whether the upper device (bridge) is our upper.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index c9c605fdce10..9edaec95ddd2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -2344,8 +2344,15 @@ static int mlxsw_sp_switchdev_event(struct notifier_block *unused,
 	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
 	struct mlxsw_sp_switchdev_event_work *switchdev_work;
 	struct switchdev_notifier_fdb_info *fdb_info = ptr;
+	struct net_device *br_dev;
 
-	if (!mlxsw_sp_port_dev_lower_find_rcu(dev))
+	/* Tunnel devices are not our uppers, so check their master instead */
+	br_dev = netdev_master_upper_dev_get_rcu(dev);
+	if (!br_dev)
+		return NOTIFY_DONE;
+	if (!netif_is_bridge_master(br_dev))
+		return NOTIFY_DONE;
+	if (!mlxsw_sp_port_dev_lower_find_rcu(br_dev))
 		return NOTIFY_DONE;
 
 	switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 02/19] mlxsw: spectrum_switchdev: Prepare for VxLAN FDB notifications
From: Ido Schimmel @ 2018-10-11  7:47 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel
In-Reply-To: <20181011074701.17983-1-idosch@mellanox.com>

VxLAN FDB notifications need to be handled differently than bridge FDB
notifications, so initialize the work item based on the received
notification and rename the invoked function accordingly.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 1a9370a46b0e..c9c605fdce10 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -2288,7 +2288,7 @@ struct mlxsw_sp_switchdev_event_work {
 	unsigned long event;
 };
 
-static void mlxsw_sp_switchdev_event_work(struct work_struct *work)
+static void mlxsw_sp_switchdev_bridge_fdb_event_work(struct work_struct *work)
 {
 	struct mlxsw_sp_switchdev_event_work *switchdev_work =
 		container_of(work, struct mlxsw_sp_switchdev_event_work, work);
@@ -2352,7 +2352,6 @@ static int mlxsw_sp_switchdev_event(struct notifier_block *unused,
 	if (!switchdev_work)
 		return NOTIFY_BAD;
 
-	INIT_WORK(&switchdev_work->work, mlxsw_sp_switchdev_event_work);
 	switchdev_work->dev = dev;
 	switchdev_work->event = event;
 
@@ -2361,6 +2360,8 @@ static int mlxsw_sp_switchdev_event(struct notifier_block *unused,
 	case SWITCHDEV_FDB_DEL_TO_DEVICE: /* fall through */
 	case SWITCHDEV_FDB_ADD_TO_BRIDGE: /* fall through */
 	case SWITCHDEV_FDB_DEL_TO_BRIDGE:
+		INIT_WORK(&switchdev_work->work,
+			  mlxsw_sp_switchdev_bridge_fdb_event_work);
 		memcpy(&switchdev_work->fdb_info, ptr,
 		       sizeof(switchdev_work->fdb_info));
 		switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 00/19] mlxsw: Preparations for VxLAN support
From: Ido Schimmel @ 2018-10-11  7:47 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Petr Machata, mlxsw,
	Ido Schimmel

This patchset prepares mlxsw for VxLAN support. It contains small and
mostly non-functional changes.

The first eight patches perform small changes in the code to make it
more receptive towards the actual VxLAN changes in the next patchset.

Patches 9-17 add the registers used to configure the device for VxLAN
offload.

Last two patches add the required resources and trap IDs.

The next patchset is available here [1].

1. https://github.com/idosch/linux/tree/vxlan

Ido Schimmel (19):
  mlxsw: spectrum: Remove misuses of private header file
  mlxsw: spectrum_switchdev: Prepare for VxLAN FDB notifications
  mlxsw: spectrum_switchdev: Check notification relevance based on upper
    device
  mlxsw: spectrum_switchdev: Do not assume notifier information type
  mlxsw: spectrum: Move L3 protocol and address definitions to global
    header file
  mlxsw: spectrum: Add a new type of KVD linear record
  mlxsw: reg: Extend FDB flush types for NVE
  mlxsw: spectrum: Seed LAG hash function
  mlxsw: reg: Add Tunneling NVE General Configuration Register
  mlxsw: reg: Add Tunnel Port Configuration Register
  mlxsw: reg: Add Tunneling NVE Underlay Multicast Table Register
  mlxsw: reg: Add Tunneling NVE Encapsulation ECN Mapping Register
  mlxsw: reg: Add Tunneling NVE Decapsulation ECN Mapping Register
  mlxsw: reg: Add Tunneling NVE QoS Configuration Register
  mlxsw: reg: Add Tunneling NVE QoS Default Register
  mlxsw: reg: Add definition of unicast tunnel record for SFD register
  mlxsw: reg: Add Monitoring Parsing State Register
  mlxsw: resources: Add NVE resources
  mlxsw: spectrum: Add NVE packet traps

 drivers/net/ethernet/mellanox/mlxsw/reg.h     | 581 +++++++++++++++++-
 .../net/ethernet/mellanox/mlxsw/resources.h   |   4 +
 .../net/ethernet/mellanox/mlxsw/spectrum.c    |   8 +-
 .../net/ethernet/mellanox/mlxsw/spectrum.h    |  17 +
 .../ethernet/mellanox/mlxsw/spectrum2_kvdl.c  |   1 +
 .../ethernet/mellanox/mlxsw/spectrum_router.h |  15 -
 .../mellanox/mlxsw/spectrum_switchdev.c       |  21 +-
 drivers/net/ethernet/mellanox/mlxsw/trap.h    |   2 +
 8 files changed, 627 insertions(+), 22 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [PATCH net-next] octeontx2-af: Remove set but not used variable 'dev'
From: YueHaibing @ 2018-10-11  7:37 UTC (permalink / raw)
  To: Sunil Goutham, Linu Cherian, Geetha sowjanya, Jerin Jacob, davem
  Cc: YueHaibing, netdev, kernel-janitors

Fixes gcc '-Wunused-but-set-variable' warning:

drivers/net/ethernet/marvell/octeontx2/af/cgx.c: In function 'cgx_fwi_event_handler':
drivers/net/ethernet/marvell/octeontx2/af/cgx.c:257:17: warning:
 variable 'dev' set but not used [-Wunused-but-set-variable]

It has never been used since its introduction in
commit 1463f382f58d ("octeontx2-af: Add support for CGX link management")

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
index f290b1d..2cf8e40 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -254,7 +254,6 @@ static inline bool cgx_event_is_linkevent(u64 event)
 static irqreturn_t cgx_fwi_event_handler(int irq, void *data)
 {
 	struct lmac *lmac = data;
-	struct device *dev;
 	struct cgx *cgx;
 	u64 event;
 
@@ -265,8 +264,6 @@ static irqreturn_t cgx_fwi_event_handler(int irq, void *data)
 	if (!FIELD_GET(EVTREG_ACK, event))
 		return IRQ_NONE;
 
-	dev = &cgx->pdev->dev;
-
 	switch (FIELD_GET(EVTREG_EVT_TYPE, event)) {
 	case CGX_EVT_CMD_RESP:
 		/* Copy the response. Since only one command is active at a

^ permalink raw reply related

* Re: Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Tiwei Bie @ 2018-10-11 14:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, virtualization, linux-kernel, netdev, virtio-dev,
	wexu, jfreimann, maxime.coquelin, zhihong.wang
In-Reply-To: <20181011101656-mutt-send-email-mst@kernel.org>

On Thu, Oct 11, 2018 at 10:17:15AM -0400, Michael S. Tsirkin wrote:
> On Thu, Oct 11, 2018 at 10:13:31PM +0800, Tiwei Bie wrote:
> > On Thu, Oct 11, 2018 at 09:48:48AM -0400, Michael S. Tsirkin wrote:
> > > On Thu, Oct 11, 2018 at 08:12:21PM +0800, Tiwei Bie wrote:
> > > > > > But if it's not too late, I second for a OUT_OF_ORDER feature.
> > > > > > Starting from in order can have much simpler code in driver.
> > > > > > 
> > > > > > Thanks
> > > > > 
> > > > > It's tricky to change the flag polarity because of compatibility
> > > > > with legacy interfaces. Why is this such a big deal?
> > > > > 
> > > > > Let's teach drivers about IN_ORDER, then if devices
> > > > > are in order it will get enabled by default.
> > > > 
> > > > Yeah, make sense.
> > > > 
> > > > Besides, I have done some further profiling and debugging
> > > > both in kernel driver and DPDK vhost. Previously I was misled
> > > > by a bug in vhost code. I will send a patch to fix that bug.
> > > > With that bug fixed, the performance of packed ring in the
> > > > test between kernel driver and DPDK vhost is better now.
> > > 
> > > OK, if we get a performance gain on the virtio side, we can finally
> > > upstream it. If you see that please re-post ASAP so we can
> > > put it in the next kernel release.
> > 
> > Got it, I will re-post ASAP.
> > 
> > Thanks!
> 
> 
> Pls remember to include data on performance gain in the cover letter.

Sure. I'll try to include some performance analyses.

^ permalink raw reply

* 9p/RDMA for syzkaller (Was: BUG: corrupted list in p9_read_work)
From: Dominique Martinet @ 2018-10-11 14:28 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Leon Romanovsky, syzbot, David Miller, Eric Van Hensbergen, LKML,
	Latchesar Ionkov, netdev, Ron Minnich, syzkaller-bugs,
	v9fs-developer
In-Reply-To: <CACT4Y+bM4DxysCzhWvhURgBVpnLR4uu+oxhLpdZ3x92cKwtAzA@mail.gmail.com>

Dmitry Vyukov wrote on Thu, Oct 11, 2018:
> > Now we are talking!
> > We generally assume that all modules are simply compiled into kernel.
> > At least that's what we have on syzbot. If somebody can't compile them in,
> > we can suggest to add modprobe into init.
> > So this boils down to just writing to /sys/module/rdma_rxe/parameters/add.
> 
> This fails for me:
> 
> root@syzkaller:~# echo -n syz1 > /sys/module/rdma_rxe/parameters/add
> [20992.905406] rdma_rxe: interface syz1 not found
> bash: echo: write error: Invalid argument

Works here, I just did:

[root@f2 ~]# modprobe rdma_rxe
[root@f2 ~]# echo -n ens3 > /sys/module/rdma_rxe/parameters/add 

dmesg says:
[   35.595534] rdma_rxe: set rxe0 active
[   35.595541] rdma_rxe: added rxe0 to ens3


Actually for a dummy interface if I try the echo directly the echo
works, and a verb device is created, and I just confirmed I can use
it... so not sure why rxe_cfg said EINVAL earlier...

[root@f2 ~]# ip link add dummy0 type dummy
[root@f2 ~]# ip link set dummy0 up
[root@f2 ~]# ip addr add 10.1.1.1/24 dev dummy0
[root@f2 ~]# modprobe rdma_rxe
[root@f2 ~]# echo -n dummy0 > /sys/module/rdma_rxe/parameters/add 


(then using my test client:
[root@f2 src]# ./rcat -s
INFO:  trans_rdma.c (879), msk_cma_event_handler: CONNECT_REQUEST
INFO:  trans_rdma.c (862), msk_cma_event_handler: ESTABLISHED
INFO:  trans_rdma.c (917), msk_cma_event_handler: DISCONNECT EVENT...

[root@f2 src]# ./rcat -c 10.1.1.1
INFO:  trans_rdma.c (862), msk_cma_event_handler: ESTABLISHED
^C
)


I assume your syz1 interface is a tap device as you were saying earlier?
Got anything in dmesg?

-- 
Dominique

^ permalink raw reply

* 9p/RDMA for syzkaller (Was: BUG: corrupted list in p9_read_work)
From: Dominique Martinet @ 2018-10-11 14:19 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Leon Romanovsky, syzbot, David Miller, Eric Van Hensbergen, LKML,
	Latchesar Ionkov, netdev, Ron Minnich, syzkaller-bugs,
	v9fs-developer
In-Reply-To: <CACT4Y+YvJ-KT4UxtOXpou1xpT3wBASDkt135zr2BwrRi0gv4Tw@mail.gmail.com>

Dmitry Vyukov wrote on Thu, Oct 11, 2018:
> But again we don't need to support all of the available hardware.

I agree with that, I just have no idea what the "librxe-rdmav16.so" lib
could be doing and described something I am slightly more familiar with
(e.g. libmlx5)
I talked about a common subset of the verb abi because I didn't want to
look into what it's doing, but if it's not enough there's always that
possibility.


> For example, we are testing net stack from external side using tun.
> tun is a very simple, virtual abstraction of a network card. It allows
> us to test all of generic net stack starting from L2 without messing
> with any real drivers and their differences entirely. I had impression
> that we are talking about something similar here too. Or not?

That sounds about right, rxe is a software implementation that should
work on most network interfaces ; at least from what I tried it worked
on a VM's virtio net down to my laptop's wifi interface so it's a good
start... I'm not saying all because I just tried a dummy interface and
that returned EINVAL.
The only point I disagree with is the 'very simple'; even getting that to
work will be a far cry from a socket() call... :)


> Also I am a bit missing context about rdma<->9p interface. Do we need
> to setup all these ring buffers to satisfy the parts that 9p needs? Is
> it that 9p actually reads data directly from these ring buffers? Or
> there is some higher-level rdma interface that 9p uses?

It needs an "RDMA_PS_TCP" connection established, that requires
everything I described unfortunately...
Once that's established we need to register some memory to the driver
and post some recv buffers (even if we won't read it, the client would
get errors if we aren't ready to receive anything - at least it does
with real hardware), and also use some registered memory to send data.

Thinking back though I think that my server implementation isn't very
far from the raw level in what I'm doing, I recall libibverbs fallback
implementation (e.g. if the driver lib doesn't implement it otherwise)
of the functions I looked at like ibv_post_send to mostly be just
serializing the arguments, slapping the command from an enum in front of
it and sending it to the kernel, so it might be enough to just
reimplement that shim in or figure a way to generate the binary commands
once and then use these values; now I'm comparing two runs of strace of
my test server I definitely see a pattern.

I'll give it a try but don't expect something fast, and it's probably
not going to be very pretty either...

To give a concrete example, here are all the read/write/fcntl calls
looking just at /dev/infiniband in a hello world program that just
establishes connection (server side), receive and send two messages and
quits:


This part apparently sets up the listening connection of the server:

1430  1539262699.126025 openat(AT_FDCWD, "/dev/infiniband/rdma_cm", O_RDWR|O_CLOEXEC) = 3
1430  1539262699.126155 write(3, "\0\0\0\0\30\0\4\0@m'\1\0\0\0\0\344\327\375\271\374\177\0\0?\1\2\0\0\0\0\0", 32) = 32
1430  1539262699.126192 write(3, "\24\0\0\0\210\0\0\0\0\0\0\0000\0\0\0\33\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\6\0\0\377\377\377\377\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 144) = 144
1430  1539262699.126223 write(3, "\23\0\0\0\20\0\20\1 \326\375\271\374\177\0\0\0\0\0\0\0\0\0\0", 24) = 24
1430  1539262699.126250 write(3, "\23\0\0\0\20\0\20\1 \326\375\271\374\177\0\0\0\0\0\0\2\0\0\0", 24) = 24
1430  1539262699.126274 write(3, "\1\0\0\0\20\0\4\0\324\327\375\271\374\177\0\0\0\0\0\0\0\0\0\0", 24) = 24
1430  1539262699.126303 close(3)        = 0
1430  1539262699.126360 openat(AT_FDCWD, "/dev/infiniband/rdma_cm", O_RDWR|O_CLOEXEC) = 3
1430  1539262699.126429 write(3, "\0\0\0\0\30\0\4\0\240\217'\1\0\0\0\0t\330\375\271\374\177\0\0\6\1\2\0\0\0\0\0", 32) = 32
1430  1539262699.126472 write(3, "\24\0\0\0\210\0\0\0\0\0\0\0\34\0\0\0\n\0\4\323\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 144) = 144
1430  1539262699.126501 write(3, "\23\0\0\0\20\0\20\1p\326\375\271\374\177\0\0\0\0\0\0\0\0\0\0", 24) = 24
1430  1539262699.126534 write(3, "\23\0\0\0\20\0\20\1p\326\375\271\374\177\0\0\0\0\0\0\2\0\0\0", 24) = 24
1430  1539262699.127119 write(3, "\7\0\0\0\10\0\0\0\0\0\0\0@\0\0\0", 16) = 16
1430  1539262699.127149 write(3, "\23\0\0\0\20\0\20\1`\327\375\271\374\177\0\0\0\0\0\0\0\0\0\0", 24) = 24
1430  1539262699.127319 fcntl(3, F_GETFL) = 0x8002 (flags O_RDWR|O_LARGEFILE)
1430  1539262699.127348 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK|O_LARGEFILE <unfinished ...>

Then the client connects (had some epoll on read on fd 3, but no read?!)

1446  1539262706.268685 write(3, "\f\0\0\0\10\0H\1\200\307\211\302G\177\0\0", 16) = 16
1446  1539262706.268718 write(3, "\23\0\0\0\20\0\20\1\240\304\211\302G\177\0\0\2\0\0\0\0\0\0\0", 24) = 24
1446  1539262706.269440 openat(AT_FDCWD, "/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 5
1446  1539262706.269474 write(5, "\0\0\0\0\4\0\2\0H\302\211\302G\177\0\0", 16) = 16
1446  1539262706.269503 write(5, "\1\0\0\0\4\0,\0\220\301\211\302G\177\0\0", 16) = 16
1446  1539262706.269545 write(5, "\2\0\0\0\6\0\n\0\20\302\211\302G\177\0\0\1\0\0\0\0\0\0\0", 24) = 24
1446  1539262706.269571 write(5, "\3\0\0\0\4\0\1\0\314\303\211\302G\177\0\0", 16) = 16
1446  1539262706.269596 write(3, "\23\0\0\0\20\0\20\1\240\304\211\302G\177\0\0\2\0\0\0\2\0\0\0", 24) = 24
1446  1539262706.269618 write(3, "\23\0\0\0\20\0\270\1\200\303\211\302G\177\0\0\2\0\0\0\1\0\0\0", 24) = 24
1430  1539262706.269801 write(5, "\3\0\0\0\4\0\1\0\354\330\375\271\374\177\0\0", 16) = 16
1430  1539262706.269944 write(5, "\21\0\0\0\4\0\1\0T\330\375\271\374\177\0\0", 16) = 16
1430  1539262706.270000 write(5, "\22\0\0\0\n\0\6\0 \330\375\271\374\177\0\0`\232'\1\0\0\0\0006\0\0\0\0\0\0\0\7\0\0\0\0\0\0\0", 40) = 40
1430  1539262706.270203 write(5, "\27\0\0\0\4\0\0\0\2\0\0\0\0\0\0\0", 16) = 16
1430  1539262706.270262 write(5, "\30\0\0\0\20\0\20\0000\327\375\271\374\177\0\0\20\233'\1\0\0\0\0\1\0\0\0\2\0\0\0\2\0\0\0\0\0\0\0002\0\0\0\4\0\0\0\1\0\0\0\1\0\0\0\0\0\0\0\1\2\0\0", 64) = 64
1430  1539262706.270482 write(3, "\v\0\0\0\20\0\220\0p\326\375\271\374\177\0\0\2\0\0\0\1\0\0\0", 24) = 24
1430  1539262706.270546 write(5, "\32\0\0\0\36\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\16\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0", 120) = 120
1430  1539262706.270677 write(5, "\t\0\0\0\f\0\3\0\224\330\375\271\374\177\0\0\20p)\302G\177\0\0\0\0@\0\0\0\0\0\20p)\302G\177\0\0\1\0\0\0\1\0\0\0", 48) = 48
1430  1539262706.271973 write(5, "\t\0\0\0\f\0\3\0D\330\375\271\374\177\0\0\210\362&\1\0\0\0\0\1\0\0\0\0\0\0\0\210\362&\1\0\0\0\0\1\0\0\0\1\0\0\0", 48) = 48
1430  1539262706.272060 write(3, "\v\0\0\0\20\0\220\0000\325\375\271\374\177\0\0\2\0\0\0\1\0\0\0", 24) = 24
1430  1539262706.272110 write(5, "\32\0\0\0\36\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\16\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0", 120) = 120
1430  1539262706.272159 write(3, "\v\0\0\0\20\0\220\0000\325\375\271\374\177\0\0\2\0\0\0\2\0\0\0", 24) = 24
1430  1539262706.272205 write(5, "\32\0\0\0\36\0\0\0\0\0\0\0\0\0\0\0\0\0\377\377\n*\21f\0\0\0\0\0\0\0\0\1@\0\0\0\7\1\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\201\221\22\0\0\0\0\0\340\t\351\0\0\0\0\0\23\0\0\0\0\0\0\0\0\0\0\0\2\0\3\0\0\0\1\0\0\0\0\0\0\0\0\0", 120) = 120
1430  1539262706.272439 write(3, "\v\0\0\0\20\0\220\0000\325\375\271\374\177\0\0\2\0\0\0\3\0\0\0", 24) = 24
1430  1539262706.272496 write(5, "\32\0\0\0\36\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\1.\1\0\0\0\0\0\0\0\0\0\364((\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\0\1\0\0\0\23\7\7\0\0\0\0", 120) = 120
1430  1539262706.272565 write(3, "\10\0\0\0 \1\0\0\220\f\0\274G\177\0\0\24\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\1\0\0\n\1\2\0\0\0\0\0\0\0", 296) = 296
1446  1539262706.272962 write(3, "\f\0\0\0\10\0H\1\200\307\211\302G\177\0\0", 16) = 16
1430  1539262706.274144 write(5, "\t\0\0\0\f\0\3\0D\330\375\271\374\177\0\0`\0\351\301G\177\0\0\0\0 \0\0\0\0\0`\0\351\301G\177\0\0\1\0\0\0\1\0\0\0", 48) = 48


Some data is exchanged (we don't see the data as it's in buffers whose
address was given earlier):

1464  1539262714.529679 write(5, "\27\0\0\0\4\0\0\0\2\0\0\0\0\0\0\0", 16) = 16
1464  1539262714.530059 write(5, "\34\0\0\0\10\0\1\0lT)\302G\177\0\0\3\0\0\0\0\0\0\0\0\0\0\0\200\0\0\0", 32) = 32
1464  1539262714.530634 write(5, "\27\0\0\0\4\0\0\0\2\0\0\0\0\0\0\0", 16) = 16
1430  1539262719.331307 write(5, "\34\0\0\0\10\0\1\0\374\327\375\271\374\177\0\0\3\0\0\0\0\0\0\0\0\0\0\0\200\0\0\0", 32) = 32
1464  1539262719.332113 write(5, "\27\0\0\0\4\0\0\0\2\0\0\0\0\0\0\0", 16) = 16

And disconnect:

1430  1539262721.192844 write(5, "\r\0\0\0\3\0\0\0\6\0\0\0", 12) = 12
1430  1539262721.193186 write(5, "\r\0\0\0\3\0\0\0\5\0\0\0", 12) = 12
1430  1539262721.193324 write(5, "\32\0\0\0\36\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\6\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 120) = 120
1430  1539262721.193567 write(3, "\n\0\0\0\4\0\0\0\2\0\0\0", 12) = 12
1446  1539262721.256556 write(3, "\f\0\0\0\10\0H\1\200\307\211\302G\177\0\0", 16) = 16
1430  1539262721.257618 write(3, "\1\0\0\0\20\0\4\0\204\327\375\271\374\177\0\0\2\0\0\0\0\0\0\0", 24) = 24
1430  1539262721.257769 write(5, "\4\0\0\0\3\0\0\0\0\0\0\0", 12) = 12
1430  1539262721.258369 write(5, "\27\0\0\0\4\0\0\0\2\0\0\0\0\0\0\0", 16) = 16
1430  1539262721.258667 write(5, "\33\0\0\0\6\0\1\0T\327\375\271\374\177\0\0\3\0\0\0\0\0\0\0", 24) = 24
1430  1539262721.259223 write(5, "\24\0\0\0\6\0\2\08\327\375\271\374\177\0\0\2\0\0\0\0\0\0\0", 24) = 24
1430  1539262721.260476 write(3, "\1\0\0\0\20\0\4\0D\330\375\271\374\177\0\0\0\0\0\0\0\0\0\0", 24) = 24
1430  1539262721.260726 close(3)        = 0
1430  1539262721.261082 write(5, "\4\0\0\0\3\0\0\0\1\0\0\0", 12) = -1 EBUSY (Device or resource busy)
1430  1539262721.358728 write(5, "\r\0\0\0\3\0\0\0\4\0\0\0", 12) = 12


I don't see any read on these fd despite epoll being set to wait for
read events on these so I'm not quite sure where ibverbs knows if the
commands worked or not, but hopefully that illustrates that it's slightly
more complex than just socket/bind/listen/accept/write/close! :) 

-- 
Dominique

^ permalink raw reply

* Re: Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Michael S. Tsirkin @ 2018-10-11 14:17 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: Jason Wang, virtualization, linux-kernel, netdev, virtio-dev,
	wexu, jfreimann, maxime.coquelin, zhihong.wang
In-Reply-To: <20181011141331.GA11650@debian>

On Thu, Oct 11, 2018 at 10:13:31PM +0800, Tiwei Bie wrote:
> On Thu, Oct 11, 2018 at 09:48:48AM -0400, Michael S. Tsirkin wrote:
> > On Thu, Oct 11, 2018 at 08:12:21PM +0800, Tiwei Bie wrote:
> > > > > But if it's not too late, I second for a OUT_OF_ORDER feature.
> > > > > Starting from in order can have much simpler code in driver.
> > > > > 
> > > > > Thanks
> > > > 
> > > > It's tricky to change the flag polarity because of compatibility
> > > > with legacy interfaces. Why is this such a big deal?
> > > > 
> > > > Let's teach drivers about IN_ORDER, then if devices
> > > > are in order it will get enabled by default.
> > > 
> > > Yeah, make sense.
> > > 
> > > Besides, I have done some further profiling and debugging
> > > both in kernel driver and DPDK vhost. Previously I was misled
> > > by a bug in vhost code. I will send a patch to fix that bug.
> > > With that bug fixed, the performance of packed ring in the
> > > test between kernel driver and DPDK vhost is better now.
> > 
> > OK, if we get a performance gain on the virtio side, we can finally
> > upstream it. If you see that please re-post ASAP so we can
> > put it in the next kernel release.
> 
> Got it, I will re-post ASAP.
> 
> Thanks!


Pls remember to include data on performance gain in the cover letter.


> 
> > 
> > -- 
> > MST

^ permalink raw reply

* Re: [PATCH stable 4.4 V2 0/6] fix SegmentSmack in stable branch (CVE-2018-5390)
From: Greg KH @ 2018-10-11 14:07 UTC (permalink / raw)
  To: maowenan; +Cc: netdev, dwmw2, eric.dumazet, davem, stable, linux-kernel
In-Reply-To: <20180926202121.GB23881@kroah.com>

On Wed, Sep 26, 2018 at 10:21:21PM +0200, Greg KH wrote:
> On Tue, Sep 25, 2018 at 10:10:15PM +0800, maowenan wrote:
> > Hi Greg:
> > 
> > can you review this patch set?
> 
> It is still in the queue, don't worry.  It will take some more time to
> properly review and test it.
> 
> Ideally you could get someone else to test this and provide a
> "tested-by:" tag for it?

All now queued up, let's see what breaks :)

thanks,

greg k-h

^ permalink raw reply

* [PATCH] r8169: set RX_MULTI_EN bit in RxConfig for 8168F-family chips
From: Maciej S. Szmigiero @ 2018-10-11 14:02 UTC (permalink / raw)
  To: David S. Miller, Chris Clayton
  Cc: Heiner Kallweit, Azat Khuzhin, Realtek linux nic maintainers,
	netdev@vger.kernel.org, linux-kernel

It has been reported that since
commit 05212ba8132b42 ("r8169: set RxConfig after tx/rx is enabled for RTL8169sb/8110sb devices")
at least RTL_GIGA_MAC_VER_38 NICs work erratically after a resume from
suspend.
The problem has been traced to a missing RX_MULTI_EN bit in the RxConfig
register.
We already set this bit for RTL_GIGA_MAC_VER_35 NICs of the same 8168F
chip family so let's do it also for its other siblings: RTL_GIGA_MAC_VER_36
and RTL_GIGA_MAC_VER_38.

Curiously, the NIC seems to work fine after a system boot without having
this bit set as long as the system isn't suspended and resumed.

Fixes: 05212ba8132b42 ("r8169: set RxConfig after tx/rx is enabled for RTL8169sb/8110sb devices")
Reported-by: Chris Clayton <chris2553@googlemail.com>
Signed-off-by: Maciej S. Szmigiero <mail@maciej.szmigiero.name>
---
 drivers/net/ethernet/realtek/r8169.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 7d3f671e1bb3..b68e32186d67 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4269,8 +4269,8 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp)
 		RTL_W32(tp, RxConfig, RX_FIFO_THRESH | RX_DMA_BURST);
 		break;
 	case RTL_GIGA_MAC_VER_18 ... RTL_GIGA_MAC_VER_24:
-	case RTL_GIGA_MAC_VER_34:
-	case RTL_GIGA_MAC_VER_35:
+	case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_36:
+	case RTL_GIGA_MAC_VER_38:
 		RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST);
 		break;
 	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_51:
-- 
2.17.0

^ permalink raw reply related

* Re: [virtio-dev] Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Michael S. Tsirkin @ 2018-10-11 13:48 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: Jason Wang, virtualization, linux-kernel, netdev, virtio-dev,
	wexu, jfreimann
In-Reply-To: <20181011121221.GA27106@debian>

On Thu, Oct 11, 2018 at 08:12:21PM +0800, Tiwei Bie wrote:
> > > But if it's not too late, I second for a OUT_OF_ORDER feature.
> > > Starting from in order can have much simpler code in driver.
> > > 
> > > Thanks
> > 
> > It's tricky to change the flag polarity because of compatibility
> > with legacy interfaces. Why is this such a big deal?
> > 
> > Let's teach drivers about IN_ORDER, then if devices
> > are in order it will get enabled by default.
> 
> Yeah, make sense.
> 
> Besides, I have done some further profiling and debugging
> both in kernel driver and DPDK vhost. Previously I was misled
> by a bug in vhost code. I will send a patch to fix that bug.
> With that bug fixed, the performance of packed ring in the
> test between kernel driver and DPDK vhost is better now.

OK, if we get a performance gain on the virtio side, we can finally
upstream it. If you see that please re-post ASAP so we can
put it in the next kernel release.

-- 
MST

^ permalink raw reply

* [PATCH net-next v6] net/ncsi: Extend NC-SI Netlink interface to allow user space to send NC-SI command
From: Justin.Lee1 @ 2018-10-11  6:21 UTC (permalink / raw)
  To: sam, joel; +Cc: linux-aspeed, netdev, openbmc, amithash, christian, vijaykhemka

The new command (NCSI_CMD_SEND_CMD) is added to allow a user space application
to send NC-SI commands to the network card.
Also, add a new attribute (NCSI_ATTR_DATA) for transferring the request and response.

The work flow is as below. 

Request:
User space application
	-> Netlink interface (msg)
	-> new Netlink handler - ncsi_send_cmd_nl()
	-> ncsi_xmit_cmd()

Response:
Response received - ncsi_rcv_rsp()
	-> internal response handler - ncsi_rsp_handler_xxx()
	-> ncsi_rsp_handler_netlink()
	-> ncsi_send_netlink_rsp ()
	-> Netlink interface (msg)
	-> user space application

Command timeout - ncsi_request_timeout()
	-> ncsi_send_netlink_timeout ()
	-> Netlink interface (msg with zero data length)
	-> user space application

Error:
Error detected
	-> ncsi_send_netlink_err ()
	-> Netlink interface (err msg)
	-> user space application


Signed-off-by: Justin Lee <justin.lee1@dell.com> 


---
V6: Add checking before accessing NCSI_ATTR_DATA attribute to avoid null-dereference. 
V5: Update comments and debug message.
V4: Update comments and remove some debug message.
V3: Based on http://patchwork.ozlabs.org/patch/979688/ to remove the duplicated code.
V2: Remove non-related debug message and clean up the code.

 include/uapi/linux/ncsi.h |   6 ++
 net/ncsi/internal.h       |   7 ++
 net/ncsi/ncsi-cmd.c       |   8 ++
 net/ncsi/ncsi-manage.c    |  16 ++++
 net/ncsi/ncsi-netlink.c   | 205 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ncsi/ncsi-netlink.h   |  12 +++
 net/ncsi/ncsi-rsp.c       |  67 +++++++++++++--
 7 files changed, 316 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h
index 4c292ec..0a26a55 100644
--- a/include/uapi/linux/ncsi.h
+++ b/include/uapi/linux/ncsi.h
@@ -23,6 +23,9 @@
  *	optionally the preferred NCSI_ATTR_CHANNEL_ID.
  * @NCSI_CMD_CLEAR_INTERFACE: clear any preferred package/channel combination.
  *	Requires NCSI_ATTR_IFINDEX.
+ * @NCSI_CMD_SEND_CMD: send NC-SI command to network card.
+ *	Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID
+ *	and NCSI_ATTR_CHANNEL_ID.
  * @NCSI_CMD_MAX: highest command number
  */
 enum ncsi_nl_commands {
@@ -30,6 +33,7 @@ enum ncsi_nl_commands {
 	NCSI_CMD_PKG_INFO,
 	NCSI_CMD_SET_INTERFACE,
 	NCSI_CMD_CLEAR_INTERFACE,
+	NCSI_CMD_SEND_CMD,
 
 	__NCSI_CMD_AFTER_LAST,
 	NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1
@@ -43,6 +47,7 @@ enum ncsi_nl_commands {
  * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes
  * @NCSI_ATTR_PACKAGE_ID: package ID
  * @NCSI_ATTR_CHANNEL_ID: channel ID
+ * @NCSI_ATTR_DATA: command payload
  * @NCSI_ATTR_MAX: highest attribute number
  */
 enum ncsi_nl_attrs {
@@ -51,6 +56,7 @@ enum ncsi_nl_attrs {
 	NCSI_ATTR_PACKAGE_LIST,
 	NCSI_ATTR_PACKAGE_ID,
 	NCSI_ATTR_CHANNEL_ID,
+	NCSI_ATTR_DATA,
 
 	__NCSI_ATTR_AFTER_LAST,
 	NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 3d0a33b..13c9b5e 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -175,6 +175,8 @@ struct ncsi_package;
 #define NCSI_RESERVED_CHANNEL	0x1f
 #define NCSI_CHANNEL_INDEX(c)	((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1))
 #define NCSI_TO_CHANNEL(p, c)	(((p) << NCSI_PACKAGE_SHIFT) | (c))
+#define NCSI_MAX_PACKAGE	8
+#define NCSI_MAX_CHANNEL	32
 
 struct ncsi_channel {
 	unsigned char               id;
@@ -220,11 +222,15 @@ struct ncsi_request {
 	bool                 used;    /* Request that has been assigned  */
 	unsigned int         flags;   /* NCSI request property           */
 #define NCSI_REQ_FLAG_EVENT_DRIVEN	1
+#define NCSI_REQ_FLAG_NETLINK_DRIVEN	2
 	struct ncsi_dev_priv *ndp;    /* Associated NCSI device          */
 	struct sk_buff       *cmd;    /* Associated NCSI command packet  */
 	struct sk_buff       *rsp;    /* Associated NCSI response packet */
 	struct timer_list    timer;   /* Timer on waiting for response   */
 	bool                 enabled; /* Time has been enabled or not    */
+	u32                  snd_seq;     /* netlink sending sequence number */
+	u32                  snd_portid;  /* netlink portid of sender        */
+	struct nlmsghdr      nlhdr;       /* netlink message header          */
 };
 
 enum {
@@ -310,6 +316,7 @@ struct ncsi_cmd_arg {
 		unsigned int   dwords[4];
 	};
 	unsigned char        *data;       /* NCSI OEM data                 */
+	struct genl_info     *info;       /* Netlink information           */
 };
 
 extern struct list_head ncsi_dev_list;
diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c
index 82b7d92..356af47 100644
--- a/net/ncsi/ncsi-cmd.c
+++ b/net/ncsi/ncsi-cmd.c
@@ -17,6 +17,7 @@
 #include <net/ncsi.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
+#include <net/genetlink.h>
 
 #include "internal.h"
 #include "ncsi-pkt.h"
@@ -346,6 +347,13 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca)
 	if (!nr)
 		return -ENOMEM;
 
+	/* track netlink information */
+	if (nca->req_flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+		nr->snd_seq = nca->info->snd_seq;
+		nr->snd_portid = nca->info->snd_portid;
+		nr->nlhdr = *nca->info->nlhdr;
+	}
+
 	/* Prepare the packet */
 	nca->id = nr->id;
 	ret = nch->handler(nr->cmd, nca);
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 0912847..76a4bcb 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -19,6 +19,7 @@
 #include <net/addrconf.h>
 #include <net/ipv6.h>
 #include <net/if_inet6.h>
+#include <net/genetlink.h>
 
 #include "internal.h"
 #include "ncsi-pkt.h"
@@ -406,6 +407,9 @@ static void ncsi_request_timeout(struct timer_list *t)
 {
 	struct ncsi_request *nr = from_timer(nr, t, timer);
 	struct ncsi_dev_priv *ndp = nr->ndp;
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+	struct ncsi_cmd_pkt *cmd;
 	unsigned long flags;
 
 	/* If the request already had associated response,
@@ -419,6 +423,18 @@ static void ncsi_request_timeout(struct timer_list *t)
 	}
 	spin_unlock_irqrestore(&ndp->lock, flags);
 
+	if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+		if (nr->cmd) {
+			/* Find the package */
+			cmd = (struct ncsi_cmd_pkt *)
+			      skb_network_header(nr->cmd);
+			ncsi_find_package_and_channel(ndp,
+						      cmd->cmd.common.channel,
+						      &np, &nc);
+			ncsi_send_netlink_timeout(nr, np, nc);
+		}
+	}
+
 	/* Release the request */
 	ncsi_free_request(nr);
 }
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 45f33d6..80ba212 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -20,6 +20,7 @@
 #include <uapi/linux/ncsi.h>
 
 #include "internal.h"
+#include "ncsi-pkt.h"
 #include "ncsi-netlink.h"
 
 static struct genl_family ncsi_genl_family;
@@ -29,6 +30,7 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = {
 	[NCSI_ATTR_PACKAGE_LIST] =	{ .type = NLA_NESTED },
 	[NCSI_ATTR_PACKAGE_ID] =	{ .type = NLA_U32 },
 	[NCSI_ATTR_CHANNEL_ID] =	{ .type = NLA_U32 },
+	[NCSI_ATTR_DATA] =		{ .type = NLA_BINARY, .len = 2048 },
 };
 
 static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex)
@@ -366,6 +368,203 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
 	return 0;
 }
 
+static int ncsi_send_cmd_nl(struct sk_buff *msg, struct genl_info *info)
+{
+	struct ncsi_dev_priv *ndp;
+
+	struct ncsi_cmd_arg nca;
+	struct ncsi_pkt_hdr *hdr;
+
+	u32 package_id, channel_id;
+	unsigned char *data;
+	int len, ret;
+
+	if (!info || !info->attrs) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!info->attrs[NCSI_ATTR_IFINDEX]) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!info->attrs[NCSI_ATTR_DATA]) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
+			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
+	if (!ndp) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
+	channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
+
+	if (package_id >= NCSI_MAX_PACKAGE || channel_id >= NCSI_MAX_CHANNEL) {
+		ret = -ERANGE;
+		goto out_netlink;
+	}
+
+	len = nla_len(info->attrs[NCSI_ATTR_DATA]);
+	if (len < sizeof(struct ncsi_pkt_hdr)) {
+		netdev_info(ndp->ndev.dev, "NCSI: no command to send %u\n",
+			    package_id);
+		ret = -EINVAL;
+		goto out_netlink;
+	} else {
+		data = (unsigned char *)nla_data(info->attrs[NCSI_ATTR_DATA]);
+	}
+
+	hdr = (struct ncsi_pkt_hdr *)data;
+
+	nca.ndp = ndp;
+	nca.package = (unsigned char)package_id;
+	nca.channel = (unsigned char)channel_id;
+	nca.type = hdr->type;
+	nca.req_flags = NCSI_REQ_FLAG_NETLINK_DRIVEN;
+	nca.info = info;
+	nca.payload = ntohs(hdr->length);
+	nca.data = data + sizeof(*hdr);
+
+	ret = ncsi_xmit_cmd(&nca);
+out_netlink:
+	if (ret != 0) {
+		netdev_err(ndp->ndev.dev,
+			   "NCSI: Error %d sending command\n",
+			   ret);
+		ncsi_send_netlink_err(ndp->ndev.dev,
+				      info->snd_seq,
+				      info->snd_portid,
+				      info->nlhdr,
+				      ret);
+	}
+out:
+	return ret;
+}
+
+int ncsi_send_netlink_rsp(struct ncsi_request *nr,
+			  struct ncsi_package *np,
+			  struct ncsi_channel *nc)
+{
+	struct sk_buff *skb;
+	struct net *net;
+	void *hdr;
+	int rc;
+
+	net = dev_net(nr->rsp->dev);
+
+	skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq,
+			  &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD);
+	if (!hdr) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->rsp->dev->ifindex);
+	if (np)
+		nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id);
+	if (nc)
+		nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id);
+	else
+		nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL);
+
+	rc = nla_put(skb, NCSI_ATTR_DATA, nr->rsp->len, (void *)nr->rsp->data);
+	if (rc)
+		goto err;
+
+	genlmsg_end(skb, hdr);
+	return genlmsg_unicast(net, skb, nr->snd_portid);
+
+err:
+	kfree_skb(skb);
+	return rc;
+}
+
+int ncsi_send_netlink_timeout(struct ncsi_request *nr,
+			      struct ncsi_package *np,
+			      struct ncsi_channel *nc)
+{
+	struct sk_buff *skb;
+	struct net *net;
+	void *hdr;
+
+	skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq,
+			  &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD);
+	if (!hdr) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	net = dev_net(nr->cmd->dev);
+
+	nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->cmd->dev->ifindex);
+
+	if (np)
+		nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id);
+	else
+		nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID,
+			    NCSI_PACKAGE_INDEX((((struct ncsi_pkt_hdr *)
+						 nr->cmd->data)->channel)));
+
+	if (nc)
+		nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id);
+	else
+		nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL);
+
+	genlmsg_end(skb, hdr);
+	return genlmsg_unicast(net, skb, nr->snd_portid);
+}
+
+int ncsi_send_netlink_err(struct net_device *dev,
+			  u32 snd_seq,
+			  u32 snd_portid,
+			  struct nlmsghdr *nlhdr,
+			  int err)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	struct nlmsgerr *nle;
+	struct net *net;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	net = dev_net(dev);
+
+	nlh = nlmsg_put(skb, snd_portid, snd_seq,
+			NLMSG_ERROR, sizeof(*nle), 0);
+	nle = (struct nlmsgerr *)nlmsg_data(nlh);
+	nle->error = err;
+	memcpy(&nle->msg, nlhdr, sizeof(*nlh));
+
+	nlmsg_end(skb, nlh);
+
+	return nlmsg_unicast(net->genl_sock, skb, snd_portid);
+}
+
 static const struct genl_ops ncsi_ops[] = {
 	{
 		.cmd = NCSI_CMD_PKG_INFO,
@@ -386,6 +585,12 @@ static const struct genl_ops ncsi_ops[] = {
 		.doit = ncsi_clear_interface_nl,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = NCSI_CMD_SEND_CMD,
+		.policy = ncsi_genl_policy,
+		.doit = ncsi_send_cmd_nl,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 static struct genl_family ncsi_genl_family __ro_after_init = {
diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h
index 91a5c25..c4a4688 100644
--- a/net/ncsi/ncsi-netlink.h
+++ b/net/ncsi/ncsi-netlink.h
@@ -14,6 +14,18 @@
 
 #include "internal.h"
 
+int ncsi_send_netlink_rsp(struct ncsi_request *nr,
+			  struct ncsi_package *np,
+			  struct ncsi_channel *nc);
+int ncsi_send_netlink_timeout(struct ncsi_request *nr,
+			      struct ncsi_package *np,
+			      struct ncsi_channel *nc);
+int ncsi_send_netlink_err(struct net_device *dev,
+			  u32 snd_seq,
+			  u32 snd_portid,
+			  struct nlmsghdr *nlhdr,
+			  int err);
+
 int ncsi_init_netlink(struct net_device *dev);
 int ncsi_unregister_netlink(struct net_device *dev);
 
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index d66b347..dd931d2 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -16,9 +16,11 @@
 #include <net/ncsi.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
+#include <net/genetlink.h>
 
 #include "internal.h"
 #include "ncsi-pkt.h"
+#include "ncsi-netlink.h"
 
 static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
 				 unsigned short payload)
@@ -32,15 +34,25 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
 	 * before calling this function.
 	 */
 	h = (struct ncsi_rsp_pkt_hdr *)skb_network_header(nr->rsp);
-	if (h->common.revision != NCSI_PKT_REVISION)
+
+	if (h->common.revision != NCSI_PKT_REVISION) {
+		netdev_dbg(nr->ndp->ndev.dev,
+			   "NCSI: unsupported header revision\n");
 		return -EINVAL;
-	if (ntohs(h->common.length) != payload)
+	}
+	if (ntohs(h->common.length) != payload) {
+		netdev_dbg(nr->ndp->ndev.dev,
+			   "NCSI: payload length mismatched\n");
 		return -EINVAL;
+	}
 
 	/* Check on code and reason */
 	if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED ||
-	    ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR)
-		return -EINVAL;
+	    ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) {
+		netdev_dbg(nr->ndp->ndev.dev,
+			   "NCSI: non zero response/reason code\n");
+		return -EPERM;
+	}
 
 	/* Validate checksum, which might be zeroes if the
 	 * sender doesn't support checksum according to NCSI
@@ -52,8 +64,11 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
 
 	checksum = ncsi_calculate_checksum((unsigned char *)h,
 					   sizeof(*h) + payload - 4);
-	if (*pchecksum != htonl(checksum))
+
+	if (*pchecksum != htonl(checksum)) {
+		netdev_dbg(nr->ndp->ndev.dev, "NCSI: checksum mismatched\n");
 		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -941,6 +956,26 @@ static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr)
 	return 0;
 }
 
+static int ncsi_rsp_handler_netlink(struct ncsi_request *nr)
+{
+	struct ncsi_rsp_pkt *rsp;
+	struct ncsi_dev_priv *ndp = nr->ndp;
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+	int ret;
+
+	/* Find the package */
+	rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+	ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+				      &np, &nc);
+	if (!np)
+		return -ENODEV;
+
+	ret = ncsi_send_netlink_rsp(nr, np, nc);
+
+	return ret;
+}
+
 static struct ncsi_rsp_handler {
 	unsigned char	type;
 	int             payload;
@@ -1043,6 +1078,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
 		netdev_warn(ndp->ndev.dev,
 			    "NCSI: 'bad' packet ignored for type 0x%x\n",
 			    hdr->type);
+
+		if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+			if (ret == -EPERM)
+				goto out_netlink;
+			else
+				ncsi_send_netlink_err(ndp->ndev.dev,
+						      nr->snd_seq,
+						      nr->snd_portid,
+						      &nr->nlhdr,
+						      ret);
+		}
 		goto out;
 	}
 
@@ -1052,6 +1098,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
 		netdev_err(ndp->ndev.dev,
 			   "NCSI: Handler for packet type 0x%x returned %d\n",
 			   hdr->type, ret);
+
+out_netlink:
+	if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+		ret = ncsi_rsp_handler_netlink(nr);
+		if (ret) {
+			netdev_err(ndp->ndev.dev,
+				   "NCSI: Netlink handler for packet type 0x%x returned %d\n",
+				   hdr->type, ret);
+		}
+	}
+
 out:
 	ncsi_free_request(nr);
 	return ret;
-- 
2.9.3



^ permalink raw reply related

* Re: [PATCH net-next 0/7] add support for VSC8584 and VSC8574 Microsemi quad-port PHYs
From: Allan W. Nielsen @ 2018-10-11  6:18 UTC (permalink / raw)
  To: Linus Walleij; +Cc: Alexandre Belloni, netdev
In-Reply-To: <CACRpkdZYpDJGzexRr7y4LO+hyJ92aEu_eRquK50iBWaaox5H5Q@mail.gmail.com>

Hi Linus,

I'm Allan, working for Microchip, who acquired Microsemi, who acquired Vitesse.

Alexandre pointed me to your comments on the VSC7384 switch done by Vitesse ~15
years ago. BTW: the chip is not being sold any more, and is unlikely to turn up
in new products.

I managed to find the complete datasheet of this device, and to convince people
that it can be "opened up", meaning that it is available without any NDA or
similar. It is a 1.2 MB/200-page PDF; it is not available on any web page (that I
know of), but if you are interested in having it then I can send it in a mail to
you.

The 09/20/2018 14:38, Linus Walleij wrote:
> Just as a drive-by comment this seems vaguely related to the Vitesse
> DSA switch I merged in drivers/net/dsa/vitesse-vsc73xx.c
> The VSC* product name handily gives away the origin in Vitesse's
> product line.
> 
> The VSC73xx also have the 8051 CPU and internal RAM, but are
> accessed (typically) over SPI, and AFAICT this thing is talking over
> MDIO.
> 
> The Vitesse 73xx however also supports a WAN port and VLANs
> which makes it significantly different, falling into switch class I
> guess.
That is right; here is the list of features from page 1 of the datasheet.

Features:
- 12 Gigabit Ethernet ports with nonblocking wire-speed performance
- IEEE802.1Q-in-Q nested VLAN support
- Tri-speed (10/100/1000 Mbps) RGMII interfaces
- Full duplex flow control (IEEE802.3x) and half duplex back pressure
- Support for both wire-speed automatic learning, and CPU-based learning
- Flexible link aggregation compliant with IEEE802.3ad
- 208 kB on-chip frame buffer
- Spanning Tree Protocol support (IEEE802.1D)
- Jumbo frame support
- Multiple Spanning Tree support (IEEE802.1s)
- Programmable classifier for QoS (Layer 4/Multimedia) into four classes of
  service
- Port-based Access Control (IEEE802.1X)
- IGMP, GARP, GMRP, and GVRP support
- 8192 MAC addresses and 4,096 VLAN support (IEEE802.1Q)
- Cost effective 4-pin serial CPU interface
- Per-port shaping, policing, and Broadcast and Multicast Storm Control
- Selection between on-chip 8051 CPU, or off-chip 8-bit or 16-bit CPU for SNMP
  and Web-based management

/Allan

^ permalink raw reply

* Re: BUG: corrupted list in p9_read_work
From: Dmitry Vyukov @ 2018-10-11 13:40 UTC (permalink / raw)
  To: Dominique Martinet
  Cc: Leon Romanovsky, syzbot, David Miller, Eric Van Hensbergen, LKML,
	Latchesar Ionkov, netdev, Ron Minnich, syzkaller-bugs,
	v9fs-developer
In-Reply-To: <CACT4Y+YvJ-KT4UxtOXpou1xpT3wBASDkt135zr2BwrRi0gv4Tw@mail.gmail.com>

On Thu, Oct 11, 2018 at 3:27 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
> On Thu, Oct 11, 2018 at 3:10 PM, Dominique Martinet
> <asmadeus@codewreck.org> wrote:
>> Dmitry Vyukov wrote on Thu, Oct 11, 2018:
>>> > That's still the tricky part, I'm afraid... Making a separate server
>>> > would have been easy because I could have reused some of my junk for the
>>> > actual connection handling (some rdma helper library I wrote ages
>>> > ago[1]), but if you're going to just embed C code you'll probably want
>>> > something lower level? I've never seen syzkaller use any library call
>>> > but I'm not even sure I would know how to create a qp without
>>> > libibverbs, would standard stuff be OK ?
>>>
>>> Raw syscalls preferably.
>>> What does 'rxe_cfg start ens3' do on syscall level? Some netlink?
>>
>> modprobe rdma_rxe (and a bunch of other rdma modules before that) then
>> writes the interface name in /sys/module/rdma_rxe/parameters/add
>> apparently; then checks it worked.
>> this part could be done in C directly without too much trouble, but as
>> long as the proper kernel configuration/modules are available
>
> Now we are talking!
> We generally assume that all modules are simply compiled into kernel.
> At least that's what we have on syzbot. If somebody can't compile them in,
> we can suggest to add modprobe into init.
> So this boils down to just writing to /sys/module/rdma_rxe/parameters/add.

This fails for me:

root@syzkaller:~# echo -n syz1 > /sys/module/rdma_rxe/parameters/add
[20992.905406] rdma_rxe: interface syz1 not found
bash: echo: write error: Invalid argument



>>> Any libraries and utilities are hell pain in linux world. Will it work
>>> in Android userspace? gVisor? Who will explain all syzkaller users
>>> where they get this for their who-knows-what distro, which is 10 years
>>> old because of corp policies, and debug how their version of the
>>> library has a slightly incompatible version?
>>> For example, after figuring out that rxe_cfg actually comes from
>>> rdma-core (which is a separate delight on linux), my debian
>>> distribution failed to install it because of some conflicts around
>>> /etc/modprobe.d/mlx4.conf, and my ubuntu distro does not know about
>>> such package. And we've just started :)
>>
>> The rdma ecosystem is a pain, I'll easily agree with that...
>>
>>> Syscalls tend to be simpler and more reliable. If it gives ENOSUPP,
>>> ok, that's it. If it works, great, we can use it.
>>
>> I'll have to look into it a bit more; libibverbs abstracts a lot of
>> stuff into per-nic userspace drivers (the files I cited in a previous
>> mail) and basically with the mellanox cards I'm familiar with the whole
>> user session looks like this:
>>  * common libibverbs/rdmacm code opens /dev/infiniband/rdma_cm and
>> /dev/infiniband/uverbs0 (plus a bunch of files to figure out abi
>> version, what user driver to load etc)
>>  * it and the userspace driver issue "commands" over these two files' fd
>> to setup the connection ; some commands are standard but some are
>> specific to the interface and defined in the driver.
>
> But we will use some kind of virtual/stub driver, right? We don't have
> real hardware. So all these commands should be fixed and known for the
> virtual/stub driver.
>
>> There are many facets to a connection in RDMA: a protection domain used
>> to register memory with the nic, a queue pair that is the actual tx/rx
>> connection, optionally a completion channel that will be another fd to
>> listen on for events that tell you something happened and finally some
>> memory regions to directly communicate with the nic from userspace
>> depending on the specific driver.
>>  * then there's the actual usage, more commands through the uverbs0 char
>> device to register the memory you'll use, and once that's done it's
>> entirely up to the driver - for example the mellanox lib can do
>> everything in userspace playing with the memory regions it registered,
>> but I'd wager the rxe driver does more calls through the uverbs0 fd...
>>
>> Honestly I'm not keen on reimplementing all of this; the interface
>> itself pretty much depends on your version of the kernel (there is a
>> common ABI defined, but as far as specific nics are concerned if your
>> kernel module doesn't match the user library version you can get some
>> nasty surprises), and it's far from the black or white of a good ol'
>> ENOSUPP error.
>>
>>
>> I'll look if I can figure out if there is a common subset of verbs
>> commands that are standard and sufficient to setup a listening
>> connection and exchange data that should be supported for all devices
>> and would let us reimplement just that, but while I hear your point
>> about android and ten years in the future I think it's more likely that
>> ten years in the future the verb abi will have changed but libibverbs
>> will just have the new version implemented and hide the change :P
>
> But again we don't need to support all of the available hardware.
> For example, we are testing net stack from external side using tun.
> tun is a very simple, virtual abstraction of a network card. It allows
> us to test all of generic net stack starting from L2 without messing
> with any real drivers and their differences entirely. I had impression
> that we are talking about something similar here too. Or not?
>
> Also I am a bit missing context about rdma<->9p interface. Do we need
> to setup all these ring buffers to satisfy the parts that 9p needs? Is
> it that 9p actually reads data directly from these ring buffers? Or
> there is some higher-level rdma interface that 9p uses?

^ permalink raw reply

* Re: BUG: corrupted list in p9_read_work
From: Dmitry Vyukov @ 2018-10-11 13:27 UTC (permalink / raw)
  To: Dominique Martinet
  Cc: Leon Romanovsky, syzbot, David Miller, Eric Van Hensbergen, LKML,
	Latchesar Ionkov, netdev, Ron Minnich, syzkaller-bugs,
	v9fs-developer
In-Reply-To: <20181011131045.GA32030@nautica>

On Thu, Oct 11, 2018 at 3:10 PM, Dominique Martinet
<asmadeus@codewreck.org> wrote:
> Dmitry Vyukov wrote on Thu, Oct 11, 2018:
>> > That's still the tricky part, I'm afraid... Making a separate server
>> > would have been easy because I could have reused some of my junk for the
>> > actual connection handling (some rdma helper library I wrote ages
>> > ago[1]), but if you're going to just embed C code you'll probably want
>> > something lower level? I've never seen syzkaller use any library call
>> > but I'm not even sure I would know how to create a qp without
>> > libibverbs, would standard stuff be OK ?
>>
>> Raw syscalls preferably.
>> What does 'rxe_cfg start ens3' do on syscall level? Some netlink?
>
> modprobe rdma_rxe (and a bunch of other rdma modules before that) then
> writes the interface name in /sys/module/rdma_rxe/parameters/add
> apparently; then checks it worked.
> this part could be done in C directly without too much trouble, but as
> long as the proper kernel configuration/modules are available

Now we are talking!
We generally assume that all modules are simply compiled into kernel.
At least that's what we have on syzbot. If somebody can't compile them in,
we can suggest to add modprobe into init.
So this boils down to just writing to /sys/module/rdma_rxe/parameters/add.



>> Any libraries and utilities are hell pain in linux world. Will it work
>> in Android userspace? gVisor? Who will explain all syzkaller users
>> where they get this for their who-knows-what distro, which is 10 years
>> old because of corp policies, and debug how their version of the
>> library has a slightly incompatible version?
>> For example, after figuring out that rxe_cfg actually comes from
>> rdma-core (which is a separate delight on linux), my debian
>> distribution failed to install it because of some conflicts around
>> /etc/modprobe.d/mlx4.conf, and my ubuntu distro does not know about
>> such package. And we've just started :)
>
> The rdma ecosystem is a pain, I'll easily agree with that...
>
>> Syscalls tend to be simpler and more reliable. If it gives ENOSUPP,
>> ok, that's it. If it works, great, we can use it.
>
> I'll have to look into it a bit more; libibverbs abstracts a lot of
> stuff into per-nic userspace drivers (the files I cited in a previous
> mail) and basically with the mellanox cards I'm familiar with the whole
> user session looks like this:
>  * common libibverbs/rdmacm code opens /dev/infiniband/rdma_cm and
> /dev/infiniband/uverbs0 (plus a bunch of files to figure out abi
> version, what user driver to load etc)
>  * it and the userspace driver issue "commands" over these two files' fd
> to setup the connection ; some commands are standard but some are
> specific to the interface and defined in the driver.

But we will use some kind of virtual/stub driver, right? We don't have
real hardware. So all these commands should be fixed and known for the
virtual/stub driver.

> There are many facets to a connection in RDMA: a protection domain used
> to register memory with the nic, a queue pair that is the actual tx/rx
> connection, optionally a completion channel that will be another fd to
> listen on for events that tell you something happened and finally some
> memory regions to directly communicate with the nic from userspace
> depending on the specific driver.
>  * then there's the actual usage, more commands through the uverbs0 char
> device to register the memory you'll use, and once that's done it's
> entirely up to the driver - for example the mellanox lib can do
> everything in userspace playing with the memory regions it registered,
> but I'd wager the rxe driver does more calls through the uverbs0 fd...
>
> Honestly I'm not keen on reimplementing all of this; the interface
> itself pretty much depends on your version of the kernel (there is a
> common ABI defined, but as far as specific nics are concerned if your
> kernel module doesn't match the user library version you can get some
> nasty surprises), and it's far from the black or white of a good ol'
> ENOSUPP error.
>
>
> I'll look if I can figure out if there is a common subset of verbs
> commands that are standard and sufficient to setup a listening
> connection and exchange data that should be supported for all devices
> and would let us reimplement just that, but while I hear your point
> about android and ten years in the future I think it's more likely that
> ten years in the future the verb abi will have changed but libibverbs
> will just have the new version implemented and hide the change :P

But again we don't need to support all of the available hardware.
For example, we are testing net stack from external side using tun.
tun is a very simple, virtual abstraction of a network card. It allows
us to test all of generic net stack starting from L2 without messing
with any real drivers and their differences entirely. I had impression
that we are talking about something similar here too. Or not?

Also I am a bit missing context about rdma<->9p interface. Do we need
to setup all these ring buffers to satisfy the parts that 9p needs? Is
it that 9p actually reads data directly from these ring buffers? Or
there is some higher-level rdma interface that 9p uses?

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox