Netdev List
 help / color / mirror / Atom feed
* [next-queue PATCH v7 08/10] igb: Add MAC address support for ethtool nftuple filters
From: Vinicius Costa Gomes @ 2018-04-10 17:49 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Vinicius Costa Gomes, jeffrey.t.kirsher, netdev,
	jesus.sanchez-palencia
In-Reply-To: <20180410174959.18757-1-vinicius.gomes@intel.com>

This adds the capability of configuring the queue steering of arriving
packets based on their source and destination MAC addresses.

Source address steering (i.e. driving traffic to a specific queue),
for the i210, does not work, but filtering does (i.e. accepting
traffic based on the source address). So, trying to add a filter
specifying only a source address will be an error.

In practical terms this adds support for the following use cases,
characterized by these examples:

$ ethtool -N eth0 flow-type ether dst aa:aa:aa:aa:aa:aa action 0
(this will direct packets with destination address "aa:aa:aa:aa:aa:aa"
to the RX queue 0)

$ ethtool -N eth0 flow-type ether src 44:44:44:44:44:44 \
  	     	  	    	  proto 0x22f0 action 3
(this will direct packets with source address "44:44:44:44:44:44" and
ethertype 0x22f0 to the RX queue 3)

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_ethtool.c | 43 ++++++++++++++++++--
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 31b2960a7869..6697c273ab59 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2495,6 +2495,23 @@ static int igb_get_ethtool_nfc_entry(struct igb_adapter *adapter,
 			fsp->h_ext.vlan_tci = rule->filter.vlan_tci;
 			fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK);
 		}
+		if (rule->filter.match_flags & IGB_FILTER_FLAG_DST_MAC_ADDR) {
+			ether_addr_copy(fsp->h_u.ether_spec.h_dest,
+					rule->filter.dst_addr);
+			/* As we only support matching by the full
+			 * mask, return the mask to userspace
+			 */
+			eth_broadcast_addr(fsp->m_u.ether_spec.h_dest);
+		}
+		if (rule->filter.match_flags & IGB_FILTER_FLAG_SRC_MAC_ADDR) {
+			ether_addr_copy(fsp->h_u.ether_spec.h_source,
+					rule->filter.src_addr);
+			/* As we only support matching by the full
+			 * mask, return the mask to userspace
+			 */
+			eth_broadcast_addr(fsp->m_u.ether_spec.h_source);
+		}
+
 		return 0;
 	}
 	return -EINVAL;
@@ -2768,8 +2785,16 @@ static int igb_rxnfc_write_vlan_prio_filter(struct igb_adapter *adapter,
 
 int igb_add_filter(struct igb_adapter *adapter, struct igb_nfc_filter *input)
 {
+	struct e1000_hw *hw = &adapter->hw;
 	int err = -EINVAL;
 
+	if (hw->mac.type == e1000_i210 &&
+	    !(input->filter.match_flags & ~IGB_FILTER_FLAG_SRC_MAC_ADDR)) {
+		dev_err(&adapter->pdev->dev,
+			"i210 doesn't support flow classification rules specifying only source addresses.\n");
+		return -EOPNOTSUPP;
+	}
+
 	if (input->filter.match_flags & IGB_FILTER_FLAG_ETHER_TYPE) {
 		err = igb_rxnfc_write_etype_filter(adapter, input);
 		if (err)
@@ -2933,10 +2958,6 @@ static int igb_add_ethtool_nfc_entry(struct igb_adapter *adapter,
 	if ((fsp->flow_type & ~FLOW_EXT) != ETHER_FLOW)
 		return -EINVAL;
 
-	if (fsp->m_u.ether_spec.h_proto != ETHER_TYPE_FULL_MASK &&
-	    fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK))
-		return -EINVAL;
-
 	input = kzalloc(sizeof(*input), GFP_KERNEL);
 	if (!input)
 		return -ENOMEM;
@@ -2946,6 +2967,20 @@ static int igb_add_ethtool_nfc_entry(struct igb_adapter *adapter,
 		input->filter.match_flags = IGB_FILTER_FLAG_ETHER_TYPE;
 	}
 
+	/* Only support matching addresses by the full mask */
+	if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_source)) {
+		input->filter.match_flags |= IGB_FILTER_FLAG_SRC_MAC_ADDR;
+		ether_addr_copy(input->filter.src_addr,
+				fsp->h_u.ether_spec.h_source);
+	}
+
+	/* Only support matching addresses by the full mask */
+	if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_dest)) {
+		input->filter.match_flags |= IGB_FILTER_FLAG_DST_MAC_ADDR;
+		ether_addr_copy(input->filter.dst_addr,
+				fsp->h_u.ether_spec.h_dest);
+	}
+
 	if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) {
 		if (fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK)) {
 			err = -EINVAL;
-- 
2.17.0

^ permalink raw reply related

* [next-queue PATCH v7 07/10] igb: Enable nfc filters to specify MAC addresses
From: Vinicius Costa Gomes @ 2018-04-10 17:49 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Vinicius Costa Gomes, jeffrey.t.kirsher, netdev,
	jesus.sanchez-palencia
In-Reply-To: <20180410174959.18757-1-vinicius.gomes@intel.com>

This allows igb_add_filter()/igb_erase_filter() to work on filters
that include MAC addresses (both source and destination).

For now, this only exposes the functionality, the next commit glues
ethtool into this. Later in this series, these APIs are used to allow
offloading of cls_flower filters.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h         |  4 +++
 drivers/net/ethernet/intel/igb/igb_ethtool.c | 28 ++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index f48ba090fd6a..b9b965921e9f 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -442,6 +442,8 @@ struct hwmon_buff {
 enum igb_filter_match_flags {
 	IGB_FILTER_FLAG_ETHER_TYPE = 0x1,
 	IGB_FILTER_FLAG_VLAN_TCI   = 0x2,
+	IGB_FILTER_FLAG_SRC_MAC_ADDR   = 0x4,
+	IGB_FILTER_FLAG_DST_MAC_ADDR   = 0x8,
 };
 
 #define IGB_MAX_RXNFC_FILTERS 16
@@ -456,6 +458,8 @@ struct igb_nfc_input {
 	u8 match_flags;
 	__be16 etype;
 	__be16 vlan_tci;
+	u8 src_addr[ETH_ALEN];
+	u8 dst_addr[ETH_ALEN];
 };
 
 struct igb_nfc_filter {
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 5975d432836f..31b2960a7869 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2776,6 +2776,25 @@ int igb_add_filter(struct igb_adapter *adapter, struct igb_nfc_filter *input)
 			return err;
 	}
 
+	if (input->filter.match_flags & IGB_FILTER_FLAG_DST_MAC_ADDR) {
+		err = igb_add_mac_steering_filter(adapter,
+						  input->filter.dst_addr,
+						  input->action, 0);
+		err = min_t(int, err, 0);
+		if (err)
+			return err;
+	}
+
+	if (input->filter.match_flags & IGB_FILTER_FLAG_SRC_MAC_ADDR) {
+		err = igb_add_mac_steering_filter(adapter,
+						  input->filter.src_addr,
+						  input->action,
+						  IGB_MAC_STATE_SRC_ADDR);
+		err = min_t(int, err, 0);
+		if (err)
+			return err;
+	}
+
 	if (input->filter.match_flags & IGB_FILTER_FLAG_VLAN_TCI)
 		err = igb_rxnfc_write_vlan_prio_filter(adapter, input);
 
@@ -2824,6 +2843,15 @@ int igb_erase_filter(struct igb_adapter *adapter, struct igb_nfc_filter *input)
 		igb_clear_vlan_prio_filter(adapter,
 					   ntohs(input->filter.vlan_tci));
 
+	if (input->filter.match_flags & IGB_FILTER_FLAG_SRC_MAC_ADDR)
+		igb_del_mac_steering_filter(adapter, input->filter.src_addr,
+					    input->action,
+					    IGB_MAC_STATE_SRC_ADDR);
+
+	if (input->filter.match_flags & IGB_FILTER_FLAG_DST_MAC_ADDR)
+		igb_del_mac_steering_filter(adapter, input->filter.dst_addr,
+					    input->action, 0);
+
 	return 0;
 }
 
-- 
2.17.0

^ permalink raw reply related

* [next-queue PATCH v7 05/10] igb: Add support for enabling queue steering in filters
From: Vinicius Costa Gomes @ 2018-04-10 17:49 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Vinicius Costa Gomes, jeffrey.t.kirsher, netdev,
	jesus.sanchez-palencia
In-Reply-To: <20180410174959.18757-1-vinicius.gomes@intel.com>

On some igb models (82575 and i210) the MAC address filters can
control to which queue the packet will be assigned.

This extends the 'state' with one more state to signify that queue
selection should be enabled for that filter.

As 82575 parts are no longer easily obtained (and this was developed
against i210), only support for the i210 model is enabled.

These functions are exported and will be used in the next patch.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 .../net/ethernet/intel/igb/e1000_defines.h    |  1 +
 drivers/net/ethernet/intel/igb/igb.h          |  6 +++++
 drivers/net/ethernet/intel/igb/igb_main.c     | 26 +++++++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 5417edbe3125..d3d1d868e7ba 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -492,6 +492,7 @@
  */
 #define E1000_RAH_AV  0x80000000        /* Receive descriptor valid */
 #define E1000_RAH_ASEL_SRC_ADDR 0x00010000
+#define E1000_RAH_QSEL_ENABLE 0x10000000
 #define E1000_RAL_MAC_ADDR_LEN 4
 #define E1000_RAH_MAC_ADDR_LEN 2
 #define E1000_RAH_POOL_MASK 0x03FC0000
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index f3ecda46f287..f48ba090fd6a 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -475,6 +475,7 @@ struct igb_mac_addr {
 #define IGB_MAC_STATE_DEFAULT	0x1
 #define IGB_MAC_STATE_IN_USE	0x2
 #define IGB_MAC_STATE_SRC_ADDR  0x4
+#define IGB_MAC_STATE_QUEUE_STEERING 0x8
 
 /* board specific private data structure */
 struct igb_adapter {
@@ -740,4 +741,9 @@ int igb_add_filter(struct igb_adapter *adapter,
 int igb_erase_filter(struct igb_adapter *adapter,
 		     struct igb_nfc_filter *input);
 
+int igb_add_mac_steering_filter(struct igb_adapter *adapter,
+				const u8 *addr, u8 queue, u8 flags);
+int igb_del_mac_steering_filter(struct igb_adapter *adapter,
+				const u8 *addr, u8 queue, u8 flags);
+
 #endif /* _IGB_H_ */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 2033ec3c9b27..e3da35cab786 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6935,6 +6935,28 @@ static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 	return igb_del_mac_filter_flags(adapter, addr, queue, 0);
 }
 
+int igb_add_mac_steering_filter(struct igb_adapter *adapter,
+				const u8 *addr, u8 queue, u8 flags)
+{
+	struct e1000_hw *hw = &adapter->hw;
+
+	/* In theory, this should be supported on 82575 as well, but
+	 * that part wasn't easily accessible during development.
+	 */
+	if (hw->mac.type != e1000_i210)
+		return -EOPNOTSUPP;
+
+	return igb_add_mac_filter_flags(adapter, addr, queue,
+					IGB_MAC_STATE_QUEUE_STEERING | flags);
+}
+
+int igb_del_mac_steering_filter(struct igb_adapter *adapter,
+				const u8 *addr, u8 queue, u8 flags)
+{
+	return igb_del_mac_filter_flags(adapter, addr, queue,
+					IGB_MAC_STATE_QUEUE_STEERING | flags);
+}
+
 static int igb_uc_sync(struct net_device *netdev, const unsigned char *addr)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
@@ -8784,6 +8806,10 @@ static void igb_rar_set_index(struct igb_adapter *adapter, u32 index)
 		switch (hw->mac.type) {
 		case e1000_82575:
 		case e1000_i210:
+			if (adapter->mac_table[index].state &
+			    IGB_MAC_STATE_QUEUE_STEERING)
+				rar_high |= E1000_RAH_QSEL_ENABLE;
+
 			rar_high |= E1000_RAH_POOL_1 *
 				      adapter->mac_table[index].queue;
 			break;
-- 
2.17.0

^ permalink raw reply related

* [next-queue PATCH v7 06/10] igb: Allow filters to be added for the local MAC address
From: Vinicius Costa Gomes @ 2018-04-10 17:49 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Vinicius Costa Gomes, jeffrey.t.kirsher, netdev,
	jesus.sanchez-palencia
In-Reply-To: <20180410174959.18757-1-vinicius.gomes@intel.com>

Users expect that when adding a steering filter for the local MAC
address, that all the traffic directed to that address will go to some
queue.

Currently, it's not possible to configure entries in the "in use"
state, which is the normal state of the local MAC address entry (it is
the default), this patch allows to override the steering configuration
of "in use" entries, if the filter to be added match the address and
address type (source or destination) of an existing entry.

There is a bit of a special handling for entries referring to the
local MAC address, when they are removed, only the steering
configuration is reset.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 40 ++++++++++++++++++++---
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index e3da35cab786..1b6fad88107a 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6844,6 +6844,27 @@ static void igb_set_default_mac_filter(struct igb_adapter *adapter)
 	igb_rar_set_index(adapter, 0);
 }
 
+/* If the filter to be added and an already existing filter express
+ * the same address and address type, it should be possible to only
+ * override the other configurations, for example the queue to steer
+ * traffic.
+ */
+static bool igb_mac_entry_can_be_used(const struct igb_mac_addr *entry,
+				      const u8 *addr, const u8 flags)
+{
+	if (!(entry->state & IGB_MAC_STATE_IN_USE))
+		return true;
+
+	if ((entry->state & IGB_MAC_STATE_SRC_ADDR) !=
+	    (flags & IGB_MAC_STATE_SRC_ADDR))
+		return false;
+
+	if (!ether_addr_equal(addr, entry->addr))
+		return false;
+
+	return true;
+}
+
 /* Add a MAC filter for 'addr' directing matching traffic to 'queue',
  * 'flags' is used to indicate what kind of match is made, match is by
  * default for the destination address, if matching by source address
@@ -6866,7 +6887,8 @@ static int igb_add_mac_filter_flags(struct igb_adapter *adapter,
 	 * addresses.
 	 */
 	for (i = 0; i < rar_entries; i++) {
-		if (adapter->mac_table[i].state & IGB_MAC_STATE_IN_USE)
+		if (!igb_mac_entry_can_be_used(&adapter->mac_table[i],
+					       addr, flags))
 			continue;
 
 		ether_addr_copy(adapter->mac_table[i].addr, addr);
@@ -6918,9 +6940,19 @@ static int igb_del_mac_filter_flags(struct igb_adapter *adapter,
 		if (!ether_addr_equal(adapter->mac_table[i].addr, addr))
 			continue;
 
-		adapter->mac_table[i].state = 0;
-		memset(adapter->mac_table[i].addr, 0, ETH_ALEN);
-		adapter->mac_table[i].queue = 0;
+		/* When a filter for the default address is "deleted",
+		 * we return it to its initial configuration
+		 */
+		if (adapter->mac_table[i].state & IGB_MAC_STATE_DEFAULT) {
+			adapter->mac_table[i].state =
+				IGB_MAC_STATE_DEFAULT | IGB_MAC_STATE_IN_USE;
+			adapter->mac_table[i].queue =
+				adapter->vfs_allocated_count;
+		} else {
+			adapter->mac_table[i].state = 0;
+			adapter->mac_table[i].queue = 0;
+			memset(adapter->mac_table[i].addr, 0, ETH_ALEN);
+		}
 
 		igb_rar_set_index(adapter, i);
 		return 0;
-- 
2.17.0

^ permalink raw reply related

* [next-queue PATCH v7 04/10] igb: Add support for MAC address filters specifying source addresses
From: Vinicius Costa Gomes @ 2018-04-10 17:49 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Vinicius Costa Gomes, jeffrey.t.kirsher, netdev,
	jesus.sanchez-palencia
In-Reply-To: <20180410174959.18757-1-vinicius.gomes@intel.com>

Makes it possible to direct packets to queues based on their source
address. Documents the expected usage of the 'flags' parameter.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 .../net/ethernet/intel/igb/e1000_defines.h    |  1 +
 drivers/net/ethernet/intel/igb/igb.h          |  1 +
 drivers/net/ethernet/intel/igb/igb_main.c     | 40 ++++++++++++++++---
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 98534f765e0e..5417edbe3125 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -491,6 +491,7 @@
  * manageability enabled, allowing us room for 15 multicast addresses.
  */
 #define E1000_RAH_AV  0x80000000        /* Receive descriptor valid */
+#define E1000_RAH_ASEL_SRC_ADDR 0x00010000
 #define E1000_RAL_MAC_ADDR_LEN 4
 #define E1000_RAH_MAC_ADDR_LEN 2
 #define E1000_RAH_POOL_MASK 0x03FC0000
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 8dbc399b345e..f3ecda46f287 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -474,6 +474,7 @@ struct igb_mac_addr {
 
 #define IGB_MAC_STATE_DEFAULT	0x1
 #define IGB_MAC_STATE_IN_USE	0x2
+#define IGB_MAC_STATE_SRC_ADDR  0x4
 
 /* board specific private data structure */
 struct igb_adapter {
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 976898d39d6e..2033ec3c9b27 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6844,8 +6844,14 @@ static void igb_set_default_mac_filter(struct igb_adapter *adapter)
 	igb_rar_set_index(adapter, 0);
 }
 
-static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr,
-			      const u8 queue)
+/* Add a MAC filter for 'addr' directing matching traffic to 'queue',
+ * 'flags' is used to indicate what kind of match is made, match is by
+ * default for the destination address, if matching by source address
+ * is desired the flag IGB_MAC_STATE_SRC_ADDR can be used.
+ */
+static int igb_add_mac_filter_flags(struct igb_adapter *adapter,
+				    const u8 *addr, const u8 queue,
+				    const u8 flags)
 {
 	struct e1000_hw *hw = &adapter->hw;
 	int rar_entries = hw->mac.rar_entry_count -
@@ -6865,7 +6871,7 @@ static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 
 		ether_addr_copy(adapter->mac_table[i].addr, addr);
 		adapter->mac_table[i].queue = queue;
-		adapter->mac_table[i].state |= IGB_MAC_STATE_IN_USE;
+		adapter->mac_table[i].state |= IGB_MAC_STATE_IN_USE | flags;
 
 		igb_rar_set_index(adapter, i);
 		return i;
@@ -6874,8 +6880,21 @@ static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 	return -ENOSPC;
 }
 
-static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
+static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 			      const u8 queue)
+{
+	return igb_add_mac_filter_flags(adapter, addr, queue, 0);
+}
+
+/* Remove a MAC filter for 'addr' directing matching traffic to
+ * 'queue', 'flags' is used to indicate what kind of match need to be
+ * removed, match is by default for the destination address, if
+ * matching by source address is to be removed the flag
+ * IGB_MAC_STATE_SRC_ADDR can be used.
+ */
+static int igb_del_mac_filter_flags(struct igb_adapter *adapter,
+				    const u8 *addr, const u8 queue,
+				    const u8 flags)
 {
 	struct e1000_hw *hw = &adapter->hw;
 	int rar_entries = hw->mac.rar_entry_count -
@@ -6892,12 +6911,14 @@ static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 	for (i = 0; i < rar_entries; i++) {
 		if (!(adapter->mac_table[i].state & IGB_MAC_STATE_IN_USE))
 			continue;
+		if ((adapter->mac_table[i].state & flags) != flags)
+			continue;
 		if (adapter->mac_table[i].queue != queue)
 			continue;
 		if (!ether_addr_equal(adapter->mac_table[i].addr, addr))
 			continue;
 
-		adapter->mac_table[i].state &= ~IGB_MAC_STATE_IN_USE;
+		adapter->mac_table[i].state = 0;
 		memset(adapter->mac_table[i].addr, 0, ETH_ALEN);
 		adapter->mac_table[i].queue = 0;
 
@@ -6908,6 +6929,12 @@ static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 	return -ENOENT;
 }
 
+static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
+			      const u8 queue)
+{
+	return igb_del_mac_filter_flags(adapter, addr, queue, 0);
+}
+
 static int igb_uc_sync(struct net_device *netdev, const unsigned char *addr)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
@@ -8751,6 +8778,9 @@ static void igb_rar_set_index(struct igb_adapter *adapter, u32 index)
 		if (is_valid_ether_addr(addr))
 			rar_high |= E1000_RAH_AV;
 
+		if (adapter->mac_table[index].state & IGB_MAC_STATE_SRC_ADDR)
+			rar_high |= E1000_RAH_ASEL_SRC_ADDR;
+
 		switch (hw->mac.type) {
 		case e1000_82575:
 		case e1000_i210:
-- 
2.17.0

^ permalink raw reply related

* [next-queue PATCH v7 03/10] igb: Enable the hardware traffic class feature bit for igb models
From: Vinicius Costa Gomes @ 2018-04-10 17:49 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Vinicius Costa Gomes, jeffrey.t.kirsher, netdev,
	jesus.sanchez-palencia
In-Reply-To: <20180410174959.18757-1-vinicius.gomes@intel.com>

This will allow functionality depending on the hardware being traffic
class aware to work. In particular the tc-flower offloading checks
verifies that this bit is set.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 0a79fef3c4fb..976898d39d6e 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2807,6 +2807,9 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (hw->mac.type >= e1000_82576)
 		netdev->features |= NETIF_F_SCTP_CRC;
 
+	if (hw->mac.type >= e1000_i350)
+		netdev->features |= NETIF_F_HW_TC;
+
 #define IGB_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
 				  NETIF_F_GSO_GRE_CSUM | \
 				  NETIF_F_GSO_IPXIP4 | \
-- 
2.17.0

^ permalink raw reply related

* [next-queue PATCH v7 01/10] igb: Fix not adding filter elements to the list
From: Vinicius Costa Gomes @ 2018-04-10 17:49 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Vinicius Costa Gomes, jeffrey.t.kirsher, netdev,
	jesus.sanchez-palencia
In-Reply-To: <20180410174959.18757-1-vinicius.gomes@intel.com>

Because the order of the parameters passes to 'hlist_add_behind()' was
inverted, the 'parent' node was added "behind" the 'input', as input
is not in the list, this causes the 'input' node to be lost.

Fixes: 0e71def25281 ("igb: add support of RX network flow classification")
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index e77ba0d5866d..5975d432836f 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2865,7 +2865,7 @@ static int igb_update_ethtool_nfc_entry(struct igb_adapter *adapter,
 
 	/* add filter to the list */
 	if (parent)
-		hlist_add_behind(&parent->nfc_node, &input->nfc_node);
+		hlist_add_behind(&input->nfc_node, &parent->nfc_node);
 	else
 		hlist_add_head(&input->nfc_node, &adapter->nfc_filter_list);
 
-- 
2.17.0

^ permalink raw reply related

* [next-queue PATCH v7 02/10] igb: Fix queue selection on MAC filters on i210
From: Vinicius Costa Gomes @ 2018-04-10 17:49 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Vinicius Costa Gomes, jeffrey.t.kirsher, netdev,
	jesus.sanchez-palencia
In-Reply-To: <20180410174959.18757-1-vinicius.gomes@intel.com>

On the RAH registers there are semantic differences on the meaning of
the "queue" parameter for traffic steering depending on the controller
model: there is the 82575 meaning, which "queue" means a RX Hardware
Queue, and the i350 meaning, where it is a reception pool.

The previous behaviour was having no effect for i210 based controllers
because the QSEL bit of the RAH register wasn't being set.

This patch separates the condition in discrete cases, so the different
handling is clearer.

Fixes: 83c21335c876 ("igb: improve MAC filter handling")
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index c1c0bc30a16d..0a79fef3c4fb 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -8748,12 +8748,17 @@ static void igb_rar_set_index(struct igb_adapter *adapter, u32 index)
 		if (is_valid_ether_addr(addr))
 			rar_high |= E1000_RAH_AV;
 
-		if (hw->mac.type == e1000_82575)
+		switch (hw->mac.type) {
+		case e1000_82575:
+		case e1000_i210:
 			rar_high |= E1000_RAH_POOL_1 *
-				    adapter->mac_table[index].queue;
-		else
+				      adapter->mac_table[index].queue;
+			break;
+		default:
 			rar_high |= E1000_RAH_POOL_1 <<
-				    adapter->mac_table[index].queue;
+				adapter->mac_table[index].queue;
+			break;
+		}
 	}
 
 	wr32(E1000_RAL(index), rar_low);
-- 
2.17.0

^ permalink raw reply related

* [next-queue PATCH v7 00/10] igb: offloading of receive filters
From: Vinicius Costa Gomes @ 2018-04-10 17:49 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Vinicius Costa Gomes, jeffrey.t.kirsher, netdev,
	jesus.sanchez-palencia

Hi,

Known issue:
 - It seems that the the QSEL bits in the RAH registers do not have
 any effect for source address (i.e. steering doesn't work for source
 address filters), everything is pointing to a hardware (or
 documentation) issue;

Changes from v6:
 - Because the i210 doesn't support steering traffic per source
   address (only filtering works), an error is emitted when only an
   source address is specified in a classification rule;

Changes from v5:
 - Filters can now be added for local MAC addresses, when removed,
   they return to their initial configuration (thanks for the testing
   Aaron);

Changes from v4:
 - Added a new bit to the MAC address filters internal
 representation to mean that some filters are steering filters (i.e.
 they direct traffic to queues);
 - And, this is only supported in i210;

Changes from v3:
 - Addressed review comments from Aaron F. Brown and
   Jakub Kicinski;

Changes from v2:
 - Addressed review comments from Jakub Kicinski, mostly about coding
   style adjustments and more consistent error reporting;

Changes from v1:
 - Addressed review comments from Alexander Duyck and Florian
   Fainelli;
 - Adding and removing cls_flower filters are now proposed in the same
   patch;
 - cls_flower filters are kept in a separated list from "ethtool"
   filters (so that section of the original cover letter is no longer
   valid);
 - The patch adding support for ethtool filters is now independent from
   the rest of the series;

Original cover letter:

This series enables some ethtool and tc-flower filters to be offloaded
to igb-based network controllers. This is useful when the system
configurator want to steer kinds of traffic to a specific hardware
queue.

The first two commits are bug fixes.

The basis of this series is to export the internal API used to
configure address filters, so they can be used by ethtool, and
extending the functionality so an source address can be handled.

Then, we enable the tc-flower offloading implementation to re-use the
same infrastructure as ethtool, and storing them in the per-adapter
"nfc" (Network Filter Config?) list. But for consistency, for
destructive access they are separated, i.e. an filter added by
tc-flower can only be removed by tc-flower, but ethtool can read them
all.

Only support for VLAN Prio, Source and Destination MAC Address, and
Ethertype is enabled for now.

Open question:
  - igb is initialized with the number of traffic classes as 1, if we
  want to use multiple traffic classes we need to increase this value,
  the only way I could find is to use mqprio (for example). Should igb
  be initialized with, say, the number of queues as its "num_tc"?

--

Vinicius Costa Gomes (10):
  igb: Fix not adding filter elements to the list
  igb: Fix queue selection on MAC filters on i210
  igb: Enable the hardware traffic class feature bit for igb models
  igb: Add support for MAC address filters specifying source addresses
  igb: Add support for enabling queue steering in filters
  igb: Allow filters to be added for the local MAC address
  igb: Enable nfc filters to specify MAC addresses
  igb: Add MAC address support for ethtool nftuple filters
  igb: Add the skeletons for tc-flower offloading
  igb: Add support for adding offloaded clsflower filters

 .../net/ethernet/intel/igb/e1000_defines.h    |   2 +
 drivers/net/ethernet/intel/igb/igb.h          |  13 +
 drivers/net/ethernet/intel/igb/igb_ethtool.c  |  73 +++-
 drivers/net/ethernet/intel/igb/igb_main.c     | 370 +++++++++++++++++-
 4 files changed, 441 insertions(+), 17 deletions(-)

--
2.17.0

^ permalink raw reply

* Re: [PATCH] ibmvnic: Disable irqs before exiting reset from closed state
From: David Miller @ 2018-04-10 17:39 UTC (permalink / raw)
  To: jallen; +Cc: netdev


John, if you want this to be submitted to stable you'll need to provide
backports for the various -stable trees and this patch doesn't even
come close to applying cleanly to v4.16, for example.

Thank you.

^ permalink raw reply

* Re: [PATCH] net: wireless: b43legacy: Replace GFP_ATOMIC with GFP_KERNEL in dma_tx_fragment
From: Michael Büsch @ 2018-04-10 16:57 UTC (permalink / raw)
  To: Jia-Ju Bai
  Cc: Larry.Finger, kvalo, netdev, linux-wireless, b43-dev,
	linux-kernel
In-Reply-To: <1523368459-32128-1-git-send-email-baijiaju1990@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2015 bytes --]

On Tue, 10 Apr 2018 21:54:19 +0800
Jia-Ju Bai <baijiaju1990@gmail.com> wrote:

> dma_tx_fragment() is never called in atomic context.
> 
> dma_tx_fragment() is only called by b43legacy_dma_tx(), which is 
> only called by b43legacy_tx_work().
> b43legacy_tx_work() is only set a parameter of INIT_WORK() in 
> b43legacy_wireless_init().
> 
> Despite never getting called from atomic context,
> dma_tx_fragment() calls alloc_skb() with GFP_ATOMIC,
> which does not sleep for allocation.
> GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL,
> which can sleep and improve the possibility of sucessful allocation.
> 
> This is found by a static analysis tool named DCNS written by myself.
> And I also manually check it.
> 
> Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
> ---
>  drivers/net/wireless/broadcom/b43legacy/dma.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/wireless/broadcom/b43legacy/dma.c b/drivers/net/wireless/broadcom/b43legacy/dma.c
> index cfa617d..2f0c64c 100644
> --- a/drivers/net/wireless/broadcom/b43legacy/dma.c
> +++ b/drivers/net/wireless/broadcom/b43legacy/dma.c
> @@ -1064,7 +1064,7 @@ static int dma_tx_fragment(struct b43legacy_dmaring *ring,
>  	meta->dmaaddr = map_descbuffer(ring, skb->data, skb->len, 1);
>  	/* create a bounce buffer in zone_dma on mapping failure. */
>  	if (b43legacy_dma_mapping_error(ring, meta->dmaaddr, skb->len, 1)) {
> -		bounce_skb = alloc_skb(skb->len, GFP_ATOMIC | GFP_DMA);
> +		bounce_skb = alloc_skb(skb->len, GFP_KERNEL | GFP_DMA);
>  		if (!bounce_skb) {
>  			ring->current_slot = old_top_slot;
>  			ring->used_slots = old_used_slots;


Ack.
I think the GFP_ATOMIC came from the days where we did DMA operations
under spinlock instead of mutex.

The same thing can be done in b43.

Also 
setup_rx_descbuffer(ring, desc, meta, GFP_ATOMIC)
could be GFP_KERNEL in dma_rx().
This function is called from IRQ thread context.

-- 
Michael

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH] net: bridge: add missing NULL checks
From: Laszlo Toth @ 2018-04-10 17:22 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: Laszlo Toth, Stephen Hemminger, David S. Miller, bridge, netdev
In-Reply-To: <3db78abd-bc13-528f-9da3-cb9a0ce0f617@cumulusnetworks.com>

On Mon, Apr 09, 2018 at 01:25:41AM +0300, Nikolay Aleksandrov wrote:
> On 08/04/18 20:49, Laszlo Toth wrote:
> >br_port_get_rtnl() can return NULL
> >
> >Signed-off-by: Laszlo Toth <laszlth@gmail.com>
> >---
> >  net/bridge/br_netlink.c | 12 ++++++++++--
> >  1 file changed, 10 insertions(+), 2 deletions(-)
> >
> 
> Nacked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> More below.
> 
> >diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
> >index 015f465c..cbec11f 100644
> >--- a/net/bridge/br_netlink.c
> >+++ b/net/bridge/br_netlink.c
> >@@ -939,14 +939,17 @@ static int br_port_slave_changelink(struct net_device *brdev,
> >  				    struct nlattr *data[],
> >  				    struct netlink_ext_ack *extack)
> >  {
> >+	struct net_bridge_port *port = br_port_get_rtnl(dev);
> >  	struct net_bridge *br = netdev_priv(brdev);
> >  	int ret;
> >  	if (!data)
> >  		return 0;
> >+	if (!port)
> >+		return -EINVAL;
> 
> If we're here, it means the master device of dev is a bridge => dev is a bridge port,
> since we're running with RTNL that cannot change, so this check is unnecessary.
> 
> Have you actually hit a bug with this code ?
> 
> >  	spin_lock_bh(&br->lock);
> >-	ret = br_setport(br_port_get_rtnl(dev), data);
> >+	ret = br_setport(port, data);
> >  	spin_unlock_bh(&br->lock);
> >  	return ret;
> >@@ -956,7 +959,12 @@ static int br_port_fill_slave_info(struct sk_buff *skb,
> >  				   const struct net_device *brdev,
> >  				   const struct net_device *dev)
> >  {
> >-	return br_port_fill_attrs(skb, br_port_get_rtnl(dev));
> >+	struct net_bridge_port *port = br_port_get_rtnl(dev);
> >+
> >+	if (!port)
> >+		return -EINVAL;
> >+
> >+	return br_port_fill_attrs(skb, port);
> 
> Same rationale here, fill_slave_info is called via a master device's ops
> under RTNL, which means dev is a bridge port and that also cannot change.
> 
> If you have hit a bug with this code, can we see the trace ?
> The problem might be elsewhere.

There was a NULL dereference in br_port_fill_attrs(), but on a much
older release w/ a probably buggy and custom driver,
so there is no real problem to trace.
Anyway I thought I'd make a quick patch from it, but you're right,
it's pointless to validate twice. 

Please just ignore the patch.

Laszlo

> 
> Thanks,
>  Nik
> 
> >  }
> >  static size_t br_port_get_slave_size(const struct net_device *brdev,
> >
> 

^ permalink raw reply

* Re: [RFC net-next 2/2] net: net-sysfs: Reduce netstat_show read_lock critical section
From: David Miller @ 2018-04-10 17:17 UTC (permalink / raw)
  To: saeedm; +Cc: netdev
In-Reply-To: <20180410170812.18905-2-saeedm@mellanox.com>

From: Saeed Mahameed <saeedm@mellanox.com>
Date: Tue, 10 Apr 2018 10:08:12 -0700

> Instead of holding the device chain read_lock also while calling
> dev_get_stats just hold it only to check dev_isalive, if the dev is alive,
> hold that dev via dev_hold then release the read_lock.
> 
> When done handling the device, dev_put it.
> 
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>

My feedback here is that same as for patch #1.

Two atomics for a shorter RCU lock hold time is not that great of
a tradeoff.

^ permalink raw reply

* Re: [RFC net-next 1/2] net: net-porcfs: Reduce rcu lock critical section
From: David Miller @ 2018-04-10 17:16 UTC (permalink / raw)
  To: saeedm; +Cc: netdev
In-Reply-To: <20180410170812.18905-1-saeedm@mellanox.com>

From: Saeed Mahameed <saeedm@mellanox.com>
Date: Tue, 10 Apr 2018 10:08:11 -0700

> The current net proc fs sequence file implementation to show current
> namespace netdevs list statistics and mc lists holds the rcu lock
> throughout the whole process, from dev seq start up to dev seq stop.
> 
> This is really greedy and demanding from device drivers since
> ndo_get_stats64 called from dev_seq_show while the rcu lock is held.
> 
> The rcu lock is needed to guarantee that device chain is not modified
> while the dev sequence file is walking through it and handling the
> netdev in the same time.
> 
> To minimize this critical section and drastically reduce the time rcu lock
> is being held, all we need is to grab the rcu lock only for the brief
> moment where we are looking for the next netdev to handle, if found,
> dev_hold it to guarantee it kept alive while accessed later in seq show
> callback and release the rcu lock immediately.
> 
> The current netdev being handled will be released "dev_put" when the seq next
> callback is called or dev seq stop is called.
> 
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>

The tradeoff here is that now you are doing two unnecessary atomic
operations per stats dump.

That is what the RCU lock allows us to avoid.

^ permalink raw reply

* [RFC net-next 2/2] net: net-sysfs: Reduce netstat_show read_lock critical section
From: Saeed Mahameed @ 2018-04-10 17:08 UTC (permalink / raw)
  To: netdev; +Cc: Saeed Mahameed
In-Reply-To: <20180410170812.18905-1-saeedm@mellanox.com>

Instead of holding the device chain read_lock also while calling
dev_get_stats just hold it only to check dev_isalive, if the dev is alive,
hold that dev via dev_hold then release the read_lock.

When done handling the device, dev_put it.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 net/core/net-sysfs.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c476f0794132..ee6f9fed43df 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -563,13 +563,20 @@ static ssize_t netstat_show(const struct device *d,
 		offset % sizeof(u64) != 0);
 
 	read_lock(&dev_base_lock);
-	if (dev_isalive(dev)) {
+	if (dev_isalive(dev))
+		dev_hold(dev);
+	else
+		dev = NULL;
+	read_unlock(&dev_base_lock);
+
+	if (dev) {
 		struct rtnl_link_stats64 temp;
 		const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 
+		dev_put(dev);
 		ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
 	}
-	read_unlock(&dev_base_lock);
+
 	return ret;
 }
 
-- 
2.14.3

^ permalink raw reply related

* [RFC net-next 1/2] net: net-porcfs: Reduce rcu lock critical section
From: Saeed Mahameed @ 2018-04-10 17:08 UTC (permalink / raw)
  To: netdev; +Cc: Saeed Mahameed

The current net proc fs sequence file implementation to show current
namespace netdevs list statistics and mc lists holds the rcu lock
throughout the whole process, from dev seq start up to dev seq stop.

This is really greedy and demanding from device drivers since
ndo_get_stats64 called from dev_seq_show while the rcu lock is held.

The rcu lock is needed to guarantee that device chain is not modified
while the dev sequence file is walking through it and handling the
netdev in the same time.

To minimize this critical section and drastically reduce the time rcu lock
is being held, all we need is to grab the rcu lock only for the brief
moment where we are looking for the next netdev to handle, if found,
dev_hold it to guarantee it kept alive while accessed later in seq show
callback and release the rcu lock immediately.

The current netdev being handled will be released "dev_put" when the seq next
callback is called or dev seq stop is called.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 net/core/net-procfs.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 9737302907b1..9d5ce6a203d2 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -31,19 +31,24 @@ static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff
 
 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
 {
-	struct net_device *dev;
+	struct net_device *dev = NULL;
 	unsigned int bucket;
 
+	rcu_read_lock();
 	do {
 		dev = dev_from_same_bucket(seq, pos);
-		if (dev)
-			return dev;
+		if (dev) {
+			dev_hold(dev);
+			goto unlock;
+		}
 
 		bucket = get_bucket(*pos) + 1;
 		*pos = set_bucket_offset(bucket, 1);
 	} while (bucket < NETDEV_HASHENTRIES);
 
-	return NULL;
+unlock:
+	rcu_read_unlock();
+	return dev;
 }
 
 /*
@@ -51,9 +56,7 @@ static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *p
  *	in detail.
  */
 static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(RCU)
 {
-	rcu_read_lock();
 	if (!*pos)
 		return SEQ_START_TOKEN;
 
@@ -66,13 +69,15 @@ static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
 static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	++*pos;
+	if (v && v != SEQ_START_TOKEN)
+		dev_put(v);
 	return dev_from_bucket(seq, pos);
 }
 
 static void dev_seq_stop(struct seq_file *seq, void *v)
-	__releases(RCU)
 {
-	rcu_read_unlock();
+	if (v && v != SEQ_START_TOKEN)
+		dev_put(v);
 }
 
 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
-- 
2.14.3

^ permalink raw reply related

* Re: [RFC bpf-next v2 7/8] bpf: add documentation for eBPF helpers (51-57)
From: Yonghong Song @ 2018-04-10 16:58 UTC (permalink / raw)
  To: Quentin Monnet, daniel, ast
  Cc: netdev, oss-drivers, linux-doc, linux-man, Lawrence Brakmo,
	Josef Bacik, Andrey Ignatov
In-Reply-To: <20180410144157.4831-8-quentin.monnet@netronome.com>



On 4/10/18 7:41 AM, Quentin Monnet wrote:
> Add documentation for eBPF helper functions to bpf.h user header file.
> This documentation can be parsed with the Python script provided in
> another commit of the patch series, in order to provide a RST document
> that can later be converted into a man page.
> 
> The objective is to make the documentation easily understandable and
> accessible to all eBPF developers, including beginners.
> 
> This patch contains descriptions for the following helper functions:
> 
> Helpers from Lawrence:
> - bpf_setsockopt()
> - bpf_getsockopt()
> - bpf_sock_ops_cb_flags_set()
> 
> Helpers from Yonghong:
> - bpf_perf_event_read_value()
> - bpf_perf_prog_read_value()
> 
> Helper from Josef:
> - bpf_override_return()
> 
> Helper from Andrey:
> - bpf_bind()
> 
> Cc: Lawrence Brakmo <brakmo@fb.com>
> Cc: Yonghong Song <yhs@fb.com>
> Cc: Josef Bacik <jbacik@fb.com>
> Cc: Andrey Ignatov <rdna@fb.com>
> Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
> ---
>   include/uapi/linux/bpf.h | 184 +++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 184 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 15d9ccafebbe..7343af4196c8 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1208,6 +1208,28 @@ union bpf_attr {
>    * 	Return
>    * 		0
>    *
> + * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
> + * 	Description
> + * 		Emulate a call to **setsockopt()** on the socket associated to
> + * 		*bpf_socket*, which must be a full socket. The *level* at
> + * 		which the option resides and the name *optname* of the option
> + * 		must be specified, see **setsockopt(2)** for more information.
> + * 		The option value of length *optlen* is pointed by *optval*.
> + *
> + * 		This helper actually implements a subset of **setsockopt()**.
> + * 		It supports the following *level*\ s:
> + *
> + * 		* **SOL_SOCKET**, which supports the following *optname*\ s:
> + * 		  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
> + * 		  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
> + * 		* **IPPROTO_TCP**, which supports the following *optname*\ s:
> + * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
> + * 		  **TCP_BPF_SNDCWND_CLAMP**.
> + * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
> + * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
> + *
>    * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
>    * 	Description
>    * 		Grow or shrink the room for data in the packet associated to
> @@ -1255,6 +1277,168 @@ union bpf_attr {
>    * 		performed again.
>    * 	Return
>    * 		0 on success, or a negative error in case of failure.
> + *
> + * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
> + * 	Description
> + * 		Read the value of a perf event counter, and store it into *buf*
> + * 		of size *buf_size*. This helper relies on a *map* of type
> + * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf
> + * 		event counter is selected at the creation of the *map*. The

The nature of the perf event counter is selected when *map* is updated 
with perf_event fd's.

> + * 		*map* is an array whose size is the number of available CPU
> + * 		cores, and each cell contains a value relative to one core. The

It is confusing to mix core/cpu here. Maybe just use perf_event 
convention, always using cpu?

> + * 		value to retrieve is indicated by *flags*, that contains the
> + * 		index of the core to look up, masked with
> + * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
> + * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
> + * 		current CPU core should be retrieved.
> + *
> + * 		This helper behaves in a way close to
> + * 		**bpf_perf_event_read**\ () helper, save that instead of
> + * 		just returning the value observed, it fills the *buf*
> + * 		structure. This allows for additional data to be retrieved: in
> + * 		particular, the enabled and running times (in *buf*\
> + * 		**->enabled** and *buf*\ **->running**, respectively) are
> + * 		copied.
> + *
> + * 		These values are interesting, because hardware PMU (Performance
> + * 		Monitoring Unit) counters are limited resources. When there are
> + * 		more PMU based perf events opened than available counters,
> + * 		kernel will multiplex these events so each event gets certain
> + * 		percentage (but not all) of the PMU time. In case that
> + * 		multiplexing happens, the number of samples or counter value
> + * 		will not reflect the case compared to when no multiplexing
> + * 		occurs. This makes comparison between different runs difficult.
> + * 		Typically, the counter value should be normalized before
> + * 		comparing to other experiments. The usual normalization is done
> + * 		as follows.
> + *
> + * 		::
> + *
> + * 			normalized_counter = counter * t_enabled / t_running
> + *
> + * 		Where t_enabled is the time enabled for event and t_running is
> + * 		the time running for event since last normalization. The
> + * 		enabled and running times are accumulated since the perf event
> + * 		open. To achieve scaling factor between two invocations of an
> + * 		eBPF program, users can can use CPU id as the key (which is
> + * 		typical for perf array usage model) to remember the previous
> + * 		value and do the calculation inside the eBPF program.
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
> + *
> + * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
> + * 	Description
> + * 		For en eBPF program attached to a perf event, retrieve the
> + * 		value of the event counter associated to *ctx* and store it in
> + * 		the structure pointed by *buf* and of size *buf_size*. Enabled
> + * 		and running times are also stored in the structure (see
> + * 		description of helper **bpf_perf_event_read_value**\ () for
> + * 		more details).
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
> + *
> + * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
> + * 	Description
> + * 		Emulate a call to **getsockopt()** on the socket associated to
> + * 		*bpf_socket*, which must be a full socket. The *level* at
> + * 		which the option resides and the name *optname* of the option
> + * 		must be specified, see **getsockopt(2)** for more information.
> + * 		The retrieved value is stored in the structure pointed by
> + * 		*opval* and of length *optlen*.
> + *
> + * 		This helper actually implements a subset of **getsockopt()**.
> + * 		It supports the following *level*\ s:
> + *
> + * 		* **IPPROTO_TCP**, which supports *optname*
> + * 		  **TCP_CONGESTION**.
> + * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
> + * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
> + *
> + * int bpf_override_return(struct pt_reg *regs, u64 rc)
> + * 	Description
> + * 		Used for error injection, this helper uses kprobes to override
> + * 		the return value of the probed function, and to set it to *rc*.
> + * 		The first argument is the context *regs* on which the kprobe
> + * 		works.
> + *
> + * 		This helper works by setting setting the PC (program counter)
> + * 		to an override function which is run in place of the original
> + * 		probed function. This means the probed function is not run at
> + * 		all. The replacement function just returns with the required
> + * 		value.
> + *
> + * 		This helper has security implications, and thus is subject to
> + * 		restrictions. It is only available if the kernel was compiled
> + * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
> + * 		option, and in this case it only works on functions tagged with
> + * 		**ALLOW_ERROR_INJECTION** in the kernel code.
> + *
> + * 		Also, the helper is only available for the architectures having
> + * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
> + * 		x86 architecture is the only one to support this feature.
> + * 	Return
> + * 		0
> + *
> + * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
> + * 	Description
> + * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
> + * 		for the full TCP socket associated to *bpf_sock_ops* to
> + * 		*argval*.
> + *
> + * 		The primary use of this field is to determine if there should
> + * 		be calls to eBPF programs of type
> + * 		**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
> + * 		code. A program of the same type can change its value, per
> + * 		connection and as necessary, when the connection is
> + * 		established. This field is directly accessible for reading, but
> + * 		this helper must be used for updates in order to return an
> + * 		error if an eBPF program tries to set a callback that is not
> + * 		supported in the current kernel.
> + *
> + * 		The supported callback values that *argval* can combine are:
> + *
> + * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
> + * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
> + * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
> + *
> + * 		Here are some examples of where one could call such eBPF
> + * 		program:
> + *
> + * 		* When RTO fires.
> + * 		* When a packet is retransmitted.
> + * 		* When the connection terminates.
> + * 		* When a packet is sent.
> + * 		* When a packet is received.
> + * 	Return
> + * 		Code **-EINVAL** if the socket is not a full TCP socket;
> + * 		otherwise, a positive number containing the bits that could not
> + * 		be set is returned (which comes down to 0 if all bits were set
> + * 		as required).
> + *
> + * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
> + * 	Description
> + * 		Bind the socket associated to *ctx* to the address pointed by
> + * 		*addr*, of length *addr_len*. This allows for making outgoing
> + * 		connection from the desired IP address, which can be useful for
> + * 		example when all processes inside a cgroup should use one
> + * 		single IP address on a host that has multiple IP configured.
> + *
> + * 		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
> + * 		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
> + * 		**AF_INET6**). Looking for a free port to bind to can be
> + * 		expensive, therefore binding to port is not permitted by the
> + * 		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
> + * 		must be set to zero.
> + *
> + * 		As for the remote end, both parts of it can be overridden,
> + * 		remote IP and remote port. This can be useful if an application
> + * 		inside a cgroup wants to connect to another application inside
> + * 		the same cgroup or to itself, but knows nothing about the IP
> + * 		address assigned to the cgroup.
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
>    */
>   #define __BPF_FUNC_MAPPER(FN)		\
>   	FN(unspec),			\
> 

^ permalink raw reply

* [PATCH bpf v4] bpf/tracing: fix a deadlock in perf_event_detach_bpf_prog
From: Yonghong Song @ 2018-04-10 16:37 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team

syzbot reported a possible deadlock in perf_event_detach_bpf_prog.
The error details:
  ======================================================
  WARNING: possible circular locking dependency detected
  4.16.0-rc7+ #3 Not tainted
  ------------------------------------------------------
  syz-executor7/24531 is trying to acquire lock:
   (bpf_event_mutex){+.+.}, at: [<000000008a849b07>] perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854

  but task is already holding lock:
   (&mm->mmap_sem){++++}, at: [<0000000038768f87>] vm_mmap_pgoff+0x198/0x280 mm/util.c:353

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #1 (&mm->mmap_sem){++++}:
       __might_fault+0x13a/0x1d0 mm/memory.c:4571
       _copy_to_user+0x2c/0xc0 lib/usercopy.c:25
       copy_to_user include/linux/uaccess.h:155 [inline]
       bpf_prog_array_copy_info+0xf2/0x1c0 kernel/bpf/core.c:1694
       perf_event_query_prog_array+0x1c7/0x2c0 kernel/trace/bpf_trace.c:891
       _perf_ioctl kernel/events/core.c:4750 [inline]
       perf_ioctl+0x3e1/0x1480 kernel/events/core.c:4770
       vfs_ioctl fs/ioctl.c:46 [inline]
       do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686
       SYSC_ioctl fs/ioctl.c:701 [inline]
       SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
       do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
       entry_SYSCALL_64_after_hwframe+0x42/0xb7

  -> #0 (bpf_event_mutex){+.+.}:
       lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
       __mutex_lock_common kernel/locking/mutex.c:756 [inline]
       __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
       mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
       perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854
       perf_event_free_bpf_prog kernel/events/core.c:8147 [inline]
       _free_event+0xbdb/0x10f0 kernel/events/core.c:4116
       put_event+0x24/0x30 kernel/events/core.c:4204
       perf_mmap_close+0x60d/0x1010 kernel/events/core.c:5172
       remove_vma+0xb4/0x1b0 mm/mmap.c:172
       remove_vma_list mm/mmap.c:2490 [inline]
       do_munmap+0x82a/0xdf0 mm/mmap.c:2731
       mmap_region+0x59e/0x15a0 mm/mmap.c:1646
       do_mmap+0x6c0/0xe00 mm/mmap.c:1483
       do_mmap_pgoff include/linux/mm.h:2223 [inline]
       vm_mmap_pgoff+0x1de/0x280 mm/util.c:355
       SYSC_mmap_pgoff mm/mmap.c:1533 [inline]
       SyS_mmap_pgoff+0x462/0x5f0 mm/mmap.c:1491
       SYSC_mmap arch/x86/kernel/sys_x86_64.c:100 [inline]
       SyS_mmap+0x16/0x20 arch/x86/kernel/sys_x86_64.c:91
       do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
       entry_SYSCALL_64_after_hwframe+0x42/0xb7

  other info that might help us debug this:

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(&mm->mmap_sem);
                                 lock(bpf_event_mutex);
                                 lock(&mm->mmap_sem);
    lock(bpf_event_mutex);

   *** DEADLOCK ***
  ======================================================

The bug is introduced by Commit f371b304f12e ("bpf/tracing: allow
user space to query prog array on the same tp") where copy_to_user,
which requires mm->mmap_sem, is called inside bpf_event_mutex lock.
At the same time, during perf_event file descriptor close,
mm->mmap_sem is held first and then subsequent
perf_event_detach_bpf_prog needs bpf_event_mutex lock.
Such a senario caused a deadlock.

As suggested by Daniel, moving copy_to_user out of the
bpf_event_mutex lock should fix the problem.

Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp")
Reported-by: syzbot+dc5ca0e4c9bfafaf2bae@syzkaller.appspotmail.com
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h      |  4 ++--
 kernel/bpf/core.c        | 45 +++++++++++++++++++++++++++++----------------
 kernel/trace/bpf_trace.c | 25 +++++++++++++++++++++----
 3 files changed, 52 insertions(+), 22 deletions(-)

Changelog:
  v3 -> v4:
    . Add comments to clarify that ZERO_SIZE_PTR is handled properly,
      suggested by Daniel.
  v2 -> v3:
    . Remove the redundant label in function perf_event_query_prog_array.
  v1 -> v2:
    . Use the common core function for prog_id copying for two
      different prog_array copy routines, suggested by Alexei.

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95a7abd..486e65e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -339,8 +339,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
 				struct bpf_prog *old_prog);
 int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
-			     __u32 __user *prog_ids, u32 request_cnt,
-			     __u32 __user *prog_cnt);
+			     u32 *prog_ids, u32 request_cnt,
+			     u32 *prog_cnt);
 int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 			struct bpf_prog *exclude_prog,
 			struct bpf_prog *include_prog,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d315b39..ba03ec3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1572,13 +1572,32 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
 	return cnt;
 }
 
+static bool bpf_prog_array_copy_core(struct bpf_prog **prog,
+				     u32 *prog_ids,
+				     u32 request_cnt)
+{
+	int i = 0;
+
+	for (; *prog; prog++) {
+		if (*prog == &dummy_bpf_prog.prog)
+			continue;
+		prog_ids[i] = (*prog)->aux->id;
+		if (++i == request_cnt) {
+			prog++;
+			break;
+		}
+	}
+
+	return !!(*prog);
+}
+
 int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 				__u32 __user *prog_ids, u32 cnt)
 {
 	struct bpf_prog **prog;
 	unsigned long err = 0;
-	u32 i = 0, *ids;
 	bool nospc;
+	u32 *ids;
 
 	/* users of this function are doing:
 	 * cnt = bpf_prog_array_length();
@@ -1595,16 +1614,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 		return -ENOMEM;
 	rcu_read_lock();
 	prog = rcu_dereference(progs)->progs;
-	for (; *prog; prog++) {
-		if (*prog == &dummy_bpf_prog.prog)
-			continue;
-		ids[i] = (*prog)->aux->id;
-		if (++i == cnt) {
-			prog++;
-			break;
-		}
-	}
-	nospc = !!(*prog);
+	nospc = bpf_prog_array_copy_core(prog, ids, cnt);
 	rcu_read_unlock();
 	err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
 	kfree(ids);
@@ -1683,22 +1693,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 }
 
 int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
-			     __u32 __user *prog_ids, u32 request_cnt,
-			     __u32 __user *prog_cnt)
+			     u32 *prog_ids, u32 request_cnt,
+			     u32 *prog_cnt)
 {
+	struct bpf_prog **prog;
 	u32 cnt = 0;
 
 	if (array)
 		cnt = bpf_prog_array_length(array);
 
-	if (copy_to_user(prog_cnt, &cnt, sizeof(cnt)))
-		return -EFAULT;
+	*prog_cnt = cnt;
 
 	/* return early if user requested only program count or nothing to copy */
 	if (!request_cnt || !cnt)
 		return 0;
 
-	return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt);
+	/* this function is called under trace/bpf_trace.c: bpf_event_mutex */
+	prog = rcu_dereference_check(array, 1)->progs;
+	return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC
+								     : 0;
 }
 
 static void bpf_prog_free_deferred(struct work_struct *work)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d88e96d..56ba0f2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -977,6 +977,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 {
 	struct perf_event_query_bpf __user *uquery = info;
 	struct perf_event_query_bpf query = {};
+	u32 *ids, prog_cnt, ids_len;
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -985,16 +986,32 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 		return -EINVAL;
 	if (copy_from_user(&query, uquery, sizeof(query)))
 		return -EFAULT;
-	if (query.ids_len > BPF_TRACE_MAX_PROGS)
+
+	ids_len = query.ids_len;
+	if (ids_len > BPF_TRACE_MAX_PROGS)
 		return -E2BIG;
+	ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN);
+	if (!ids)
+		return -ENOMEM;
+	/*
+	 * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which
+	 * is required when user only wants to check for uquery->prog_cnt.
+	 * There is no need to check for it since the case is handled
+	 * gracefully in bpf_prog_array_copy_info.
+	 */
 
 	mutex_lock(&bpf_event_mutex);
 	ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
-				       uquery->ids,
-				       query.ids_len,
-				       &uquery->prog_cnt);
+				       ids,
+				       ids_len,
+				       &prog_cnt);
 	mutex_unlock(&bpf_event_mutex);
 
+	if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
+	    copy_to_user(uquery->ids, ids, ids_len * sizeof(u32)))
+		ret = -EFAULT;
+
+	kfree(ids);
 	return ret;
 }
 
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH] make net_gso_ok return false when gso_type is zero(invalid)
From: Marcelo Ricardo Leitner @ 2018-04-10 16:32 UTC (permalink / raw)
  To: Wenhua Shi; +Cc: David Miller, netdev, linux-kernel
In-Reply-To: <CAN6D2npQd9AowLCz5CXGNPPib+10nABh1dOvLWV15r1z6FvF8w@mail.gmail.com>

On Sun, Apr 08, 2018 at 08:41:21PM +0200, Wenhua Shi wrote:
> 2018-04-08 18:51 GMT+02:00 David Miller <davem@davemloft.net>:
> >
> > From: Wenhua Shi <march511@gmail.com>
> > Date: Fri,  6 Apr 2018 03:43:39 +0200
> >
> > > Signed-off-by: Wenhua Shi <march511@gmail.com>
> >
> > This precondition should be made impossible instead of having to do
> > an extra check everywhere that this helper is invoked, many of which
> > are in fast paths.
>
> I believe the precondition you said is quite true. In my situation, I
> have to disable GSO for some packet and I notice that it leads to a
> worse performance (slower than 1Mbps, was almost 800Mbps).
>
> Here's the hook I use on debian 9.4, kernel version 4.9:

There is quite a distance between 4.9 and net/net-next. Did you test
on a more recent kernel too?

Note that TCP stack now works with GSO being always on.
0a6b2a1dc2a2 ("tcp: switch to GSO being always on")

^ permalink raw reply

* Re: [PATCH bpf v3] bpf/tracing: fix a deadlock in perf_event_detach_bpf_prog
From: Yonghong Song @ 2018-04-10 16:04 UTC (permalink / raw)
  To: Daniel Borkmann, ast, netdev; +Cc: kernel-team
In-Reply-To: <1469ff7e-4fe4-b25f-25f1-273fdd3d68c2@iogearbox.net>



On 4/10/18 1:54 AM, Daniel Borkmann wrote:
> On 04/10/2018 09:21 AM, Yonghong Song wrote:
>> syzbot reported a possible deadlock in perf_event_detach_bpf_prog.
>> The error details:
>>    ======================================================
>>    WARNING: possible circular locking dependency detected
>>    4.16.0-rc7+ #3 Not tainted
>>    ------------------------------------------------------
>>    syz-executor7/24531 is trying to acquire lock:
>>     (bpf_event_mutex){+.+.}, at: [<000000008a849b07>] perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854
>>
>>    but task is already holding lock:
>>     (&mm->mmap_sem){++++}, at: [<0000000038768f87>] vm_mmap_pgoff+0x198/0x280 mm/util.c:353
>>
>>    which lock already depends on the new lock.
>>
>>    the existing dependency chain (in reverse order) is:
>>
>>    -> #1 (&mm->mmap_sem){++++}:
>>         __might_fault+0x13a/0x1d0 mm/memory.c:4571
>>         _copy_to_user+0x2c/0xc0 lib/usercopy.c:25
>>         copy_to_user include/linux/uaccess.h:155 [inline]
>>         bpf_prog_array_copy_info+0xf2/0x1c0 kernel/bpf/core.c:1694
>>         perf_event_query_prog_array+0x1c7/0x2c0 kernel/trace/bpf_trace.c:891
>>         _perf_ioctl kernel/events/core.c:4750 [inline]
>>         perf_ioctl+0x3e1/0x1480 kernel/events/core.c:4770
>>         vfs_ioctl fs/ioctl.c:46 [inline]
>>         do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:686
>>         SYSC_ioctl fs/ioctl.c:701 [inline]
>>         SyS_ioctl+0x8f/0xc0 fs/ioctl.c:692
>>         do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
>>         entry_SYSCALL_64_after_hwframe+0x42/0xb7
>>
>>    -> #0 (bpf_event_mutex){+.+.}:
>>         lock_acquire+0x1d5/0x580 kernel/locking/lockdep.c:3920
>>         __mutex_lock_common kernel/locking/mutex.c:756 [inline]
>>         __mutex_lock+0x16f/0x1a80 kernel/locking/mutex.c:893
>>         mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908
>>         perf_event_detach_bpf_prog+0x92/0x3d0 kernel/trace/bpf_trace.c:854
>>         perf_event_free_bpf_prog kernel/events/core.c:8147 [inline]
>>         _free_event+0xbdb/0x10f0 kernel/events/core.c:4116
>>         put_event+0x24/0x30 kernel/events/core.c:4204
>>         perf_mmap_close+0x60d/0x1010 kernel/events/core.c:5172
>>         remove_vma+0xb4/0x1b0 mm/mmap.c:172
>>         remove_vma_list mm/mmap.c:2490 [inline]
>>         do_munmap+0x82a/0xdf0 mm/mmap.c:2731
>>         mmap_region+0x59e/0x15a0 mm/mmap.c:1646
>>         do_mmap+0x6c0/0xe00 mm/mmap.c:1483
>>         do_mmap_pgoff include/linux/mm.h:2223 [inline]
>>         vm_mmap_pgoff+0x1de/0x280 mm/util.c:355
>>         SYSC_mmap_pgoff mm/mmap.c:1533 [inline]
>>         SyS_mmap_pgoff+0x462/0x5f0 mm/mmap.c:1491
>>         SYSC_mmap arch/x86/kernel/sys_x86_64.c:100 [inline]
>>         SyS_mmap+0x16/0x20 arch/x86/kernel/sys_x86_64.c:91
>>         do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
>>         entry_SYSCALL_64_after_hwframe+0x42/0xb7
>>
>>    other info that might help us debug this:
>>
>>     Possible unsafe locking scenario:
>>
>>           CPU0                    CPU1
>>           ----                    ----
>>      lock(&mm->mmap_sem);
>>                                   lock(bpf_event_mutex);
>>                                   lock(&mm->mmap_sem);
>>      lock(bpf_event_mutex);
>>
>>     *** DEADLOCK ***
>>    ======================================================
>>
>> The bug is introduced by Commit f371b304f12e ("bpf/tracing: allow
>> user space to query prog array on the same tp") where copy_to_user,
>> which requires mm->mmap_sem, is called inside bpf_event_mutex lock.
>> At the same time, during perf_event file descriptor close,
>> mm->mmap_sem is held first and then subsequent
>> perf_event_detach_bpf_prog needs bpf_event_mutex lock.
>> Such a senario caused a deadlock.
>>
>> As suggested by Danial, moving copy_to_user out of the
> 
> Nit: typo :)
Oh, my bad. will correct.

> 
>> bpf_event_mutex lock should fix the problem.
>>
>> Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp")
>> Reported-by: syzbot+dc5ca0e4c9bfafaf2bae@syzkaller.appspotmail.com
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h      |  4 ++--
>>   kernel/bpf/core.c        | 45 +++++++++++++++++++++++++++++----------------
>>   kernel/trace/bpf_trace.c | 19 +++++++++++++++----
>>   3 files changed, 46 insertions(+), 22 deletions(-)
>>
>> Changelog:
>>    v2 -> v3:
>>      . Remove the redundant label in function perf_event_query_prog_array.
>>    v1 -> v2:
>>      . Use the common core function for prog_id copying for two
>>        different prog_array copy routines, suggested by Alexei.
>>
> [...]
>>   static void bpf_prog_free_deferred(struct work_struct *work)
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index d88e96d..f505d43 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -977,6 +977,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
>>   {
>>   	struct perf_event_query_bpf __user *uquery = info;
>>   	struct perf_event_query_bpf query = {};
>> +	u32 *ids, prog_cnt, ids_len;
>>   	int ret;
>>   
>>   	if (!capable(CAP_SYS_ADMIN))
>> @@ -985,16 +986,26 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
>>   		return -EINVAL;
>>   	if (copy_from_user(&query, uquery, sizeof(query)))
>>   		return -EFAULT;
>> -	if (query.ids_len > BPF_TRACE_MAX_PROGS)
>> +
>> +	ids_len = query.ids_len;
>> +	if (ids_len > BPF_TRACE_MAX_PROGS)
>>   		return -E2BIG;
>> +	ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN);
>> +	if (!ids)
>> +		return -ENOMEM;
> 
> Fix looks good to me, but could you still add a comment stating that we don't
> need to check for ZERO_SIZE_PTR above since we handle this gracefully in the
> bpf_prog_array_copy_info() plus it's also required for the case where the user
> only wants to check for the uquery->prog_cnt, but nothing else.

Will add the comment and send another version.

Thanks.

> 
>>   	mutex_lock(&bpf_event_mutex);
>>   	ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
>> -				       uquery->ids,
>> -				       query.ids_len,
>> -				       &uquery->prog_cnt);
>> +				       ids,
>> +				       ids_len,
>> +				       &prog_cnt);
>>   	mutex_unlock(&bpf_event_mutex);
>>   
>> +	if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
>> +	    copy_to_user(uquery->ids, ids, ids_len * sizeof(u32)))
>> +		ret = -EFAULT;
>> +
>> +	kfree(ids);
>>   	return ret;
>>   }
> 
> Thanks,
> Daniel
> 

^ permalink raw reply

* Re: [RFC] connector: add group_exit_code and signal_flags fields to exit_proc_event
From: Stefan Strogin @ 2018-04-10 16:01 UTC (permalink / raw)
  To: Evgeniy Polyakov, matthltc@us.ibm.com
  Cc: netdev@vger.kernel.org, xe-linux-external@cisco.com,
	Victor Kamensky, Taras Kondratiuk, Ruslan Bilovol
In-Reply-To: <8933031523231387@web29j.yandex.ru>

Hi Evgeniy,

On 09/04/18 02:49, Evgeniy Polyakov wrote:
> Hi everyone
> 
> Sorry for that late reply
> 
> 01.03.2018, 21:58, "Stefan Strogin" <sstrogin@cisco.com>:
>> So I was thinking to add these two fields to union event_data:
>> task->signal->group_exit_code
>> task->signal->flags
>> This won't increase size of struct proc_event (because of comm_proc_event)
>> and shouldn't break backward compatibility for the user-space. But it will
>> add some useful information about what caused the process death.
>> What do you think, is it an acceptable approach?
> 
> As I saw in other discussion, doesn't it break userspace API, or you are sure that no sizes has been increased?
> You are using the same structure as used for plain signals and add group status there, how will userspace react,
> if it was compiled with older headers? What if it uses zero-field alignment, i.e. allocating exactly the size of structure with byte precision?
> 

Please ignore this RFC, I was wrong about the fields I need for the problem.
I have sent this patch: https://lkml.org/lkml/2018/3/29/531,
would be grateful for a review.

As for breaking UAPI and structure sizes, look:

> struct proc_event {
...
> 	union { /* must be last field of proc_event struct */
...
> 		struct comm_proc_event {
> 			__kernel_pid_t process_pid;
> 			__kernel_pid_t process_tgid;
> 			char           comm[16];
> 		} comm;
__kernel_pid_t is int that is always 4 bytes in Linux, then
sizeof(event_data.comm) == 24 on all platforms.
> 
> 		struct coredump_proc_event {
> 			__kernel_pid_t process_pid;
> 			__kernel_pid_t process_tgid;
> +			__kernel_pid_t parent_pid;
> +			__kernel_pid_t parent_tgid;
> 		} coredump;
sizeof(event_data.coredump) == 16
> 
> 		struct exit_proc_event {
> 			__kernel_pid_t process_pid;
> 			__kernel_pid_t process_tgid;
> 			__u32 exit_code, exit_signal;
> +			__kernel_pid_t parent_pid;
> +			__kernel_pid_t parent_tgid;
> 		} exit;
sizeof(event_data.exit) == 24
> 
> 	} event_data;
> };

Therefore, sizeof(event_data) is always = 24 - with old headers and new ones.
sizeof(struct proc_event) is the same as well.
Hence user-space software with old and new headers will allocate the same size.

If the user-space program somehow allocates space only for an internal structure,
e.g. for event_data.exit, I still don't see any problems if it allocates and
handles only first 16 bytes of the structure using old headers.

^ permalink raw reply

* Re: [PATCH net] sctp: sctp_sockaddr_af must check minimal addr length for AF_INET6
From: Marcelo Ricardo Leitner @ 2018-04-10 16:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, netdev, Eric Dumazet, Vlad Yasevich,
	Neil Horman
In-Reply-To: <20180408145208.228542-1-edumazet@google.com>

On Sun, Apr 08, 2018 at 07:52:08AM -0700, Eric Dumazet wrote:
> Check must happen before call to ipv6_addr_v4mapped()

Please don't forget to also Cc linux-sctp@vger.kernel.org on sctp
patches.

Thanks!

^ permalink raw reply

* Re: [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-04-10 15:59 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh
In-Reply-To: <20180410154304.GG2028@nanopsycho>

On 4/10/2018 8:43 AM, Jiri Pirko wrote:
> Tue, Apr 10, 2018 at 05:27:48PM CEST, sridhar.samudrala@intel.com wrote:
>> On 4/10/2018 8:22 AM, Jiri Pirko wrote:
>>> Tue, Apr 10, 2018 at 05:13:40PM CEST, sridhar.samudrala@intel.com wrote:
>>>> On 4/10/2018 3:55 AM, Jiri Pirko wrote:
>>>>> Mon, Apr 09, 2018 at 08:47:06PM CEST, sridhar.samudrala@intel.com wrote:
>>>>>> On 4/9/2018 1:07 AM, Jiri Pirko wrote:
>>>>>>> Sat, Apr 07, 2018 at 12:59:14AM CEST, sridhar.samudrala@intel.com wrote:
>>>>>>>> On 4/6/2018 5:48 AM, Jiri Pirko wrote:
>>>>>>>>> Thu, Apr 05, 2018 at 11:08:22PM CEST, sridhar.samudrala@intel.com wrote:
>>>>>>> [...]
>>>>>>>
>>>>>>>>>> +static int virtnet_bypass_join_child(struct net_device *bypass_netdev,
>>>>>>>>>> +				     struct net_device *child_netdev)
>>>>>>>>>> +{
>>>>>>>>>> +	struct virtnet_bypass_info *vbi;
>>>>>>>>>> +	bool backup;
>>>>>>>>>> +
>>>>>>>>>> +	vbi = netdev_priv(bypass_netdev);
>>>>>>>>>> +	backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>>>>>>>>>> +	if (backup ? rtnl_dereference(vbi->backup_netdev) :
>>>>>>>>>> +			rtnl_dereference(vbi->active_netdev)) {
>>>>>>>>>> +		netdev_info(bypass_netdev,
>>>>>>>>>> +			    "%s attempting to join bypass dev when %s already present\n",
>>>>>>>>>> +			    child_netdev->name, backup ? "backup" : "active");
>>>>>>>>> Bypass module should check if there is already some other netdev
>>>>>>>>> enslaved and refuse right there.
>>>>>>>> This will work for virtio-net with 3 netdev model, but this check has to be done by netvsc
>>>>>>>> as its bypass_netdev is same as the backup_netdev.
>>>>>>>> Will add a flag while registering with the bypass module to indicate if the driver is doing
>>>>>>>> a 2 netdev or 3 netdev model and based on that flag this check can be done in bypass module
>>>>>>>> for 3 netdev scenario.
>>>>>>> Just let me undestand it clearly. What I expect the difference would be
>>>>>>> between 2netdev and3 netdev model is this:
>>>>>>> 2netdev:
>>>>>>>       bypass_master
>>>>>>>          /
>>>>>>>         /
>>>>>>> VF_slave
>>>>>>>
>>>>>>> 3netdev:
>>>>>>>       bypass_master
>>>>>>>          /     \
>>>>>>>         /       \
>>>>>>> VF_slave   backup_slave
>>>>>>>
>>>>>>> Is that correct? If not, how does it look like?
>>>>>>>
>>>>>>>
>>>>>> Looks correct.
>>>>>> VF_slave and backup_slave are the original netdevs and are present in both the models.
>>>>>> In the 3 netdev model, bypass_master netdev is created and VF_slave and backup_slave are
>>>>>> marked as the 2 slaves of this new netdev.
>>>>> You say it looks correct and in another sentence you provide completely
>>>>> different description. Could you please look again?
>>>>>
>>>> To be exact, 2 netdev model with netvsc looks like this.
>>>>
>>>>      netvsc_netdev
>>>>        /
>>>>       /
>>>> VF_slave
>>>>
>>>> With virtio_net, 3 netdev model
>>>>
>>>>    bypass_netdev
>>>>        /     \
>>>>       /       \
>>>> VF_slave   virtio_net netdev
>>> Could you also mark the original netdev which is there now? is it
>>> bypass_netdev or virtio_net_netdev ?
>> bypass_netdev
>>      /     \
>>     /       \
>> VF_slave   virtio_net netdev (original)
> That does not make sense.
> 1) You diverge from the behaviour of the netvsc, where the original
>     netdev is a master of the VF
> 2) If the original netdev is a slave, you cannot have any IP address
>     configured on it (well you could, but the rx_handler would eat every
>     incoming packet). So you will break the user bacause he would have to
>     move the configuration to the new master device.
> This only makes sense that the original netdev becomes the master for both
> netvsc and virtio_net.
Forgot to mention that bypass_netdev takes over the name of the original 
netdev and
virtio_net netdev will get the backup name.
So the userspace network configuration doesn't need to change.

^ permalink raw reply

* Re: [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Siwei Liu @ 2018-04-10 15:55 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Samudrala, Sridhar, Michael S. Tsirkin, Stephen Hemminger,
	David Miller, Netdev, virtualization, virtio-dev,
	Brandeburg, Jesse, Alexander Duyck, Jakub Kicinski, Jason Wang
In-Reply-To: <20180410154304.GG2028@nanopsycho>

On Tue, Apr 10, 2018 at 8:43 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Tue, Apr 10, 2018 at 05:27:48PM CEST, sridhar.samudrala@intel.com wrote:
>>On 4/10/2018 8:22 AM, Jiri Pirko wrote:
>>> Tue, Apr 10, 2018 at 05:13:40PM CEST, sridhar.samudrala@intel.com wrote:
>>> > On 4/10/2018 3:55 AM, Jiri Pirko wrote:
>>> > > Mon, Apr 09, 2018 at 08:47:06PM CEST, sridhar.samudrala@intel.com wrote:
>>> > > > On 4/9/2018 1:07 AM, Jiri Pirko wrote:
>>> > > > > Sat, Apr 07, 2018 at 12:59:14AM CEST, sridhar.samudrala@intel.com wrote:
>>> > > > > > On 4/6/2018 5:48 AM, Jiri Pirko wrote:
>>> > > > > > > Thu, Apr 05, 2018 at 11:08:22PM CEST, sridhar.samudrala@intel.com wrote:
>>> > > > > [...]
>>> > > > >
>>> > > > > > > > +static int virtnet_bypass_join_child(struct net_device *bypass_netdev,
>>> > > > > > > > +                                   struct net_device *child_netdev)
>>> > > > > > > > +{
>>> > > > > > > > +      struct virtnet_bypass_info *vbi;
>>> > > > > > > > +      bool backup;
>>> > > > > > > > +
>>> > > > > > > > +      vbi = netdev_priv(bypass_netdev);
>>> > > > > > > > +      backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>>> > > > > > > > +      if (backup ? rtnl_dereference(vbi->backup_netdev) :
>>> > > > > > > > +                      rtnl_dereference(vbi->active_netdev)) {
>>> > > > > > > > +              netdev_info(bypass_netdev,
>>> > > > > > > > +                          "%s attempting to join bypass dev when %s already present\n",
>>> > > > > > > > +                          child_netdev->name, backup ? "backup" : "active");
>>> > > > > > > Bypass module should check if there is already some other netdev
>>> > > > > > > enslaved and refuse right there.
>>> > > > > > This will work for virtio-net with 3 netdev model, but this check has to be done by netvsc
>>> > > > > > as its bypass_netdev is same as the backup_netdev.
>>> > > > > > Will add a flag while registering with the bypass module to indicate if the driver is doing
>>> > > > > > a 2 netdev or 3 netdev model and based on that flag this check can be done in bypass module
>>> > > > > > for 3 netdev scenario.
>>> > > > > Just let me undestand it clearly. What I expect the difference would be
>>> > > > > between 2netdev and3 netdev model is this:
>>> > > > > 2netdev:
>>> > > > >      bypass_master
>>> > > > >         /
>>> > > > >        /
>>> > > > > VF_slave
>>> > > > >
>>> > > > > 3netdev:
>>> > > > >      bypass_master
>>> > > > >         /     \
>>> > > > >        /       \
>>> > > > > VF_slave   backup_slave
>>> > > > >
>>> > > > > Is that correct? If not, how does it look like?
>>> > > > >
>>> > > > >
>>> > > > Looks correct.
>>> > > > VF_slave and backup_slave are the original netdevs and are present in both the models.
>>> > > > In the 3 netdev model, bypass_master netdev is created and VF_slave and backup_slave are
>>> > > > marked as the 2 slaves of this new netdev.
>>> > > You say it looks correct and in another sentence you provide completely
>>> > > different description. Could you please look again?
>>> > >
>>> > To be exact, 2 netdev model with netvsc looks like this.
>>> >
>>> >     netvsc_netdev
>>> >       /
>>> >      /
>>> > VF_slave
>>> >
>>> > With virtio_net, 3 netdev model
>>> >
>>> >   bypass_netdev
>>> >       /     \
>>> >      /       \
>>> > VF_slave   virtio_net netdev
>>> Could you also mark the original netdev which is there now? is it
>>> bypass_netdev or virtio_net_netdev ?
>>
>> bypass_netdev
>>     /     \
>>    /       \
>>VF_slave   virtio_net netdev (original)
>
> That does not make sense.
> 1) You diverge from the behaviour of the netvsc, where the original
>    netdev is a master of the VF
> 2) If the original netdev is a slave, you cannot have any IP address
>    configured on it (well you could, but the rx_handler would eat every
>    incoming packet). So you will break the user bacause he would have to
>    move the configuration to the new master device.

That's exactly the point why I need to hide the lower netdev slaves
and trying the align the naming of the bypass with where IP was
configured on the original netdev. The current 3-netdev model is not
"transparent" by any means.

-Siwei

> This only makes sense that the original netdev becomes the master for both
> netvsc and virtio_net.

^ permalink raw reply

* Re: [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Jiri Pirko @ 2018-04-10 15:43 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh
In-Reply-To: <d08ed1e6-678a-7b34-cd2e-52788ceec919@intel.com>

Tue, Apr 10, 2018 at 05:27:48PM CEST, sridhar.samudrala@intel.com wrote:
>On 4/10/2018 8:22 AM, Jiri Pirko wrote:
>> Tue, Apr 10, 2018 at 05:13:40PM CEST, sridhar.samudrala@intel.com wrote:
>> > On 4/10/2018 3:55 AM, Jiri Pirko wrote:
>> > > Mon, Apr 09, 2018 at 08:47:06PM CEST, sridhar.samudrala@intel.com wrote:
>> > > > On 4/9/2018 1:07 AM, Jiri Pirko wrote:
>> > > > > Sat, Apr 07, 2018 at 12:59:14AM CEST, sridhar.samudrala@intel.com wrote:
>> > > > > > On 4/6/2018 5:48 AM, Jiri Pirko wrote:
>> > > > > > > Thu, Apr 05, 2018 at 11:08:22PM CEST, sridhar.samudrala@intel.com wrote:
>> > > > > [...]
>> > > > > 
>> > > > > > > > +static int virtnet_bypass_join_child(struct net_device *bypass_netdev,
>> > > > > > > > +				     struct net_device *child_netdev)
>> > > > > > > > +{
>> > > > > > > > +	struct virtnet_bypass_info *vbi;
>> > > > > > > > +	bool backup;
>> > > > > > > > +
>> > > > > > > > +	vbi = netdev_priv(bypass_netdev);
>> > > > > > > > +	backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>> > > > > > > > +	if (backup ? rtnl_dereference(vbi->backup_netdev) :
>> > > > > > > > +			rtnl_dereference(vbi->active_netdev)) {
>> > > > > > > > +		netdev_info(bypass_netdev,
>> > > > > > > > +			    "%s attempting to join bypass dev when %s already present\n",
>> > > > > > > > +			    child_netdev->name, backup ? "backup" : "active");
>> > > > > > > Bypass module should check if there is already some other netdev
>> > > > > > > enslaved and refuse right there.
>> > > > > > This will work for virtio-net with 3 netdev model, but this check has to be done by netvsc
>> > > > > > as its bypass_netdev is same as the backup_netdev.
>> > > > > > Will add a flag while registering with the bypass module to indicate if the driver is doing
>> > > > > > a 2 netdev or 3 netdev model and based on that flag this check can be done in bypass module
>> > > > > > for 3 netdev scenario.
>> > > > > Just let me undestand it clearly. What I expect the difference would be
>> > > > > between 2netdev and3 netdev model is this:
>> > > > > 2netdev:
>> > > > >      bypass_master
>> > > > >         /
>> > > > >        /
>> > > > > VF_slave
>> > > > > 
>> > > > > 3netdev:
>> > > > >      bypass_master
>> > > > >         /     \
>> > > > >        /       \
>> > > > > VF_slave   backup_slave
>> > > > > 
>> > > > > Is that correct? If not, how does it look like?
>> > > > > 
>> > > > > 
>> > > > Looks correct.
>> > > > VF_slave and backup_slave are the original netdevs and are present in both the models.
>> > > > In the 3 netdev model, bypass_master netdev is created and VF_slave and backup_slave are
>> > > > marked as the 2 slaves of this new netdev.
>> > > You say it looks correct and in another sentence you provide completely
>> > > different description. Could you please look again?
>> > > 
>> > To be exact, 2 netdev model with netvsc looks like this.
>> > 
>> >     netvsc_netdev
>> >       /
>> >      /
>> > VF_slave
>> > 
>> > With virtio_net, 3 netdev model
>> > 
>> >   bypass_netdev
>> >       /     \
>> >      /       \
>> > VF_slave   virtio_net netdev
>> Could you also mark the original netdev which is there now? is it
>> bypass_netdev or virtio_net_netdev ?
>
> bypass_netdev
>     /     \
>    /       \
>VF_slave   virtio_net netdev (original)

That does not make sense.
1) You diverge from the behaviour of the netvsc, where the original
   netdev is a master of the VF
2) If the original netdev is a slave, you cannot have any IP address
   configured on it (well you could, but the rx_handler would eat every
   incoming packet). So you will break the user bacause he would have to
   move the configuration to the new master device.
This only makes sense that the original netdev becomes the master for both
netvsc and virtio_net.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox