Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 03/12] net: hns3: Add STRP_TAGP field support for hardware revision 0x21
From: Salil Mehta @ 2018-05-25 18:42 UTC (permalink / raw)
  To: davem
  Cc: salil.mehta, yisen.zhuang, lipeng321, mehta.salil, netdev,
	linux-kernel, linuxarm
In-Reply-To: <20180525184307.36288-1-salil.mehta@huawei.com>

From: Peng Li <lipeng321@huawei.com>

Hardware Revision(0x21) Buffer Descriptor adds a field STRP_TAGP
for vlan stripped processed indication. STRP_TAGP field has 2 bits,
bit 0 is stripped indication of the vlan tag in outer vlan tag
field, bit 1 is stripped indication of the vlan tag in inner vlan
tag field. For each bit, 0 indicates the tag is not stripped and
1 indicates the tag is stripped.

This patch adds STRP_TAGP support for revision(0x21), and does not
change the revision(0x20) action.

Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 42 ++++++++++++++++++++++---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h |  3 ++
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index ae8d749..1bcb676 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2066,6 +2066,39 @@ static void hns3_rx_skb(struct hns3_enet_ring *ring, struct sk_buff *skb)
 	napi_gro_receive(&ring->tqp_vector->napi, skb);
 }
 
+static u16 hns3_parse_vlan_tag(struct hns3_enet_ring *ring,
+			       struct hns3_desc *desc, u32 l234info)
+{
+	struct pci_dev *pdev = ring->tqp->handle->pdev;
+	u16 vlan_tag;
+
+	if (pdev->revision == 0x20) {
+		vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
+		if (!(vlan_tag & VLAN_VID_MASK))
+			vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+
+		return vlan_tag;
+	}
+
+#define HNS3_STRP_OUTER_VLAN	0x1
+#define HNS3_STRP_INNER_VLAN	0x2
+
+	switch (hnae_get_field(l234info, HNS3_RXD_STRP_TAGP_M,
+			       HNS3_RXD_STRP_TAGP_S)) {
+	case HNS3_STRP_OUTER_VLAN:
+		vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
+		break;
+	case HNS3_STRP_INNER_VLAN:
+		vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+		break;
+	default:
+		vlan_tag = 0;
+		break;
+	}
+
+	return vlan_tag;
+}
+
 static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 			     struct sk_buff **out_skb, int *out_bnum)
 {
@@ -2155,6 +2188,9 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 	}
 
 	*out_bnum = bnum;
+
+	l234info = le32_to_cpu(desc->rx.l234_info);
+
 	/* Based on hw strategy, the tag offloaded will be stored at
 	 * ot_vlan_tag in two layer tag case, and stored at vlan_tag
 	 * in one layer tag case.
@@ -2162,17 +2198,13 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 	if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) {
 		u16 vlan_tag;
 
-		vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
-		if (!(vlan_tag & VLAN_VID_MASK))
-			vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+		vlan_tag = hns3_parse_vlan_tag(ring, desc, l234info);
 		if (vlan_tag & VLAN_VID_MASK)
 			__vlan_hwaccel_put_tag(skb,
 					       htons(ETH_P_8021Q),
 					       vlan_tag);
 	}
 
-	l234info = le32_to_cpu(desc->rx.l234_info);
-
 	if (unlikely(!hnae_get_bit(bd_base_info, HNS3_RXD_VLD_B))) {
 		netdev_err(netdev, "no valid bd,%016llx,%016llx\n",
 			   ((u64 *)desc)[0], ((u64 *)desc)[1]);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 5b40f5a..38e91ca 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -104,6 +104,9 @@ enum hns3_nic_state {
 #define HNS3_RXD_L4ID_S				8
 #define HNS3_RXD_L4ID_M				(0xf << HNS3_RXD_L4ID_S)
 #define HNS3_RXD_FRAG_B				12
+#define HNS3_RXD_STRP_TAGP_S			13
+#define HNS3_RXD_STRP_TAGP_M			(0x3 << HNS3_RXD_STRP_TAGP_S)
+
 #define HNS3_RXD_L2E_B				16
 #define HNS3_RXD_L3E_B				17
 #define HNS3_RXD_L4E_B				18
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 02/12] net: hns3: Add support for tx_accept_tag2 and tx_accept_untag2 config
From: Salil Mehta @ 2018-05-25 18:42 UTC (permalink / raw)
  To: davem
  Cc: salil.mehta, yisen.zhuang, lipeng321, mehta.salil, netdev,
	linux-kernel, linuxarm
In-Reply-To: <20180525184307.36288-1-salil.mehta@huawei.com>

From: Peng Li <lipeng321@huawei.com>

HNS3 Hardware can support up to two VLAN tags in transmit leg, the PPP
module can handle the packets based on the tag1 and tag2 config. This
patch adds support for tag2 config for vlan handling.

Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |  7 ++++--
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 26 +++++++++++++++++-----
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |  6 +++--
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index ee3cbac..3fa08f7 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -704,11 +704,14 @@ struct hclge_vlan_filter_vf_cfg_cmd {
 	u8  vf_bitmap[16];
 };
 
-#define HCLGE_ACCEPT_TAG_B		0
-#define HCLGE_ACCEPT_UNTAG_B		1
+#define HCLGE_ACCEPT_TAG1_B		0
+#define HCLGE_ACCEPT_UNTAG1_B		1
 #define HCLGE_PORT_INS_TAG1_EN_B	2
 #define HCLGE_PORT_INS_TAG2_EN_B	3
 #define HCLGE_CFG_NIC_ROCE_SEL_B	4
+#define HCLGE_ACCEPT_TAG2_B		5
+#define HCLGE_ACCEPT_UNTAG2_B		6
+
 struct hclge_vport_vtag_tx_cfg_cmd {
 	u8 vport_vlan_cfg;
 	u8 vf_offset;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 2f0bbb6..c0b8d5a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -4687,10 +4687,14 @@ static int hclge_set_vlan_tx_offload_cfg(struct hclge_vport *vport)
 	req = (struct hclge_vport_vtag_tx_cfg_cmd *)desc.data;
 	req->def_vlan_tag1 = cpu_to_le16(vcfg->default_tag1);
 	req->def_vlan_tag2 = cpu_to_le16(vcfg->default_tag2);
-	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_TAG_B,
-		     vcfg->accept_tag ? 1 : 0);
-	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_UNTAG_B,
-		     vcfg->accept_untag ? 1 : 0);
+	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_TAG1_B,
+			vcfg->accept_tag1 ? 1 : 0);
+	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_UNTAG1_B,
+			vcfg->accept_untag1 ? 1 : 0);
+	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_TAG2_B,
+			vcfg->accept_tag2 ? 1 : 0);
+	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_UNTAG2_B,
+			vcfg->accept_untag2 ? 1 : 0);
 	hnae_set_bit(req->vport_vlan_cfg, HCLGE_PORT_INS_TAG1_EN_B,
 		     vcfg->insert_tag1_en ? 1 : 0);
 	hnae_set_bit(req->vport_vlan_cfg, HCLGE_PORT_INS_TAG2_EN_B,
@@ -4814,8 +4818,18 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev)
 
 	for (i = 0; i < hdev->num_alloc_vport; i++) {
 		vport = &hdev->vport[i];
-		vport->txvlan_cfg.accept_tag = true;
-		vport->txvlan_cfg.accept_untag = true;
+		vport->txvlan_cfg.accept_tag1 = true;
+		vport->txvlan_cfg.accept_untag1 = true;
+
+		/* accept_tag2 and accept_untag2 are not supported on
+		 * pdev revision(0x20); newer revisions support them. The
+		 * values of these two fields will not cause an error when the
+		 * driver sends the command to firmware in revision(0x20).
+		 * These two fields cannot be configured by the user.
+		 */
+		vport->txvlan_cfg.accept_tag2 = true;
+		vport->txvlan_cfg.accept_untag2 = true;
+
 		vport->txvlan_cfg.insert_tag1_en = false;
 		vport->txvlan_cfg.insert_tag2_en = false;
 		vport->txvlan_cfg.default_tag1 = 0;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 93177d9..677f1e4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -570,8 +570,10 @@ struct hclge_dev {
 
 /* VPort level vlan tag configuration for TX direction */
 struct hclge_tx_vtag_cfg {
-	bool accept_tag;	/* Whether accept tagged packet from host */
-	bool accept_untag;	/* Whether accept untagged packet from host */
+	bool accept_tag1;	/* Whether accept tag1 packet from host */
+	bool accept_untag1;	/* Whether accept untag1 packet from host */
+	bool accept_tag2;
+	bool accept_untag2;
 	bool insert_tag1_en;	/* Whether insert inner vlan tag */
 	bool insert_tag2_en;	/* Whether insert outer vlan tag */
 	u16  default_tag1;	/* The default inner vlan tag to insert */
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 01/12] net: hns3: Updates RX packet info fetch in case of multi BD
From: Salil Mehta @ 2018-05-25 18:42 UTC (permalink / raw)
  To: davem
  Cc: salil.mehta, yisen.zhuang, lipeng321, mehta.salil, netdev,
	linux-kernel, linuxarm
In-Reply-To: <20180525184307.36288-1-salil.mehta@huawei.com>

From: Peng Li <lipeng321@huawei.com>

In the latest revision of the hardware, if a packet is spanning
across multiple BDs then only VLD bit and current data size info
is valid in each BD, and rest of the information is only valid
in the last BD of the packet. In such case we should make sure
we are fetching RX packet size from the first descriptor and
information like VLAN should be fetched from last BD.

Signed-off-by: Peng Li <lipeng321@huawei.com>
Reviewed-by: Yisen Zhuang <yisen.zhuang@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 36 ++++++++++++-------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index cac5195..ae8d749 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2085,9 +2085,8 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 
 	prefetch(desc);
 
-	length = le16_to_cpu(desc->rx.pkt_len);
+	length = le16_to_cpu(desc->rx.size);
 	bd_base_info = le32_to_cpu(desc->rx.bd_base_info);
-	l234info = le32_to_cpu(desc->rx.l234_info);
 
 	/* Check valid BD */
 	if (!hnae_get_bit(bd_base_info, HNS3_RXD_VLD_B))
@@ -2121,22 +2120,6 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 
 	prefetchw(skb->data);
 
-	/* Based on hw strategy, the tag offloaded will be stored at
-	 * ot_vlan_tag in two layer tag case, and stored at vlan_tag
-	 * in one layer tag case.
-	 */
-	if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) {
-		u16 vlan_tag;
-
-		vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
-		if (!(vlan_tag & VLAN_VID_MASK))
-			vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
-		if (vlan_tag & VLAN_VID_MASK)
-			__vlan_hwaccel_put_tag(skb,
-					       htons(ETH_P_8021Q),
-					       vlan_tag);
-	}
-
 	bnum = 1;
 	if (length <= HNS3_RX_HEAD_SIZE) {
 		memcpy(__skb_put(skb, length), va, ALIGN(length, sizeof(long)));
@@ -2172,6 +2155,23 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 	}
 
 	*out_bnum = bnum;
+	/* Based on hw strategy, the tag offloaded will be stored at
+	 * ot_vlan_tag in two layer tag case, and stored at vlan_tag
+	 * in one layer tag case.
+	 */
+	if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) {
+		u16 vlan_tag;
+
+		vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
+		if (!(vlan_tag & VLAN_VID_MASK))
+			vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+		if (vlan_tag & VLAN_VID_MASK)
+			__vlan_hwaccel_put_tag(skb,
+					       htons(ETH_P_8021Q),
+					       vlan_tag);
+	}
+
+	l234info = le32_to_cpu(desc->rx.l234_info);
 
 	if (unlikely(!hnae_get_bit(bd_base_info, HNS3_RXD_VLD_B))) {
 		netdev_err(netdev, "no valid bd,%016llx,%016llx\n",
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 00/12] Misc. bug fixes & some minor additions to HNS3 driver
From: Salil Mehta @ 2018-05-25 18:42 UTC (permalink / raw)
  To: davem
  Cc: salil.mehta, yisen.zhuang, lipeng321, mehta.salil, netdev,
	linux-kernel, linuxarm

This patch-set provides some bug fixes figured out during testing
and review. It also provides some additions due to running of the
existing code on the new revision of the HNS3 hardware.

Fuyun Liang (3):
  net: hns3: Fixes the init of the VALID BD info in the descriptor
  net: hns3: Removes unnecessary check when clearing TX/RX rings
  net: hns3: Clear TX/RX rings when stopping port & un-initializing
    client

Jian Shen (2):
  net: hns3: Remove unused led control code
  net: hns3: Adds support for led locate command for copper port

Lijun Ou (1):
  net: hns3: Fixes initalization of RoCE handle and makes it conditional

Peng Li (5):
  net: hns3: Updates RX packet info fetch in case of multi BD
  net: hns3: Add support for tx_accept_tag2 and tx_accept_untag2 config
  net: hns3: Add STRP_TAGP field support for hardware revision 0x21
  net: hns3: Add support to enable TX/RX promisc mode for H/W rev(0x21)
  net: hns3: Fixes the state to indicate client-type initialization

Xi Wang (1):
  net: hns3: Fix for PF mailbox receving unknown message

 drivers/net/ethernet/hisilicon/hns3/hnae3.c        |  49 +++++-
 drivers/net/ethernet/hisilicon/hns3/hnae3.h        |   4 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c    | 189 +++++++++++++++++----
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h    |   4 +
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c |   4 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |  22 +--
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 179 ++++---------------
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |   8 +-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c |  23 ++-
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c  |   8 +-
 10 files changed, 283 insertions(+), 207 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH 2/8] batman-adv: Disable CONFIG_BATMAN_ADV_DEBUGFS by default
From: David Miller @ 2018-05-25 18:39 UTC (permalink / raw)
  To: sergei.shtylyov; +Cc: sven, sw, netdev, b.a.t.m.a.n, joe
In-Reply-To: <ffef0c24-37af-f58e-47b7-ed5ed41af70f@cogentembedded.com>

From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Fri, 25 May 2018 18:56:09 +0300

> On 05/25/2018 02:15 PM, Sven Eckelmann wrote:
> 
>>>> [...]
>>>>>> --- a/net/batman-adv/Kconfig
>>>>>> +++ b/net/batman-adv/Kconfig
>>>>>> @@ -94,13 +94,13 @@ config BATMAN_ADV_DEBUGFS
>>>>>> bool "batman-adv debugfs entries"
>>>>>> depends on BATMAN_ADV
>>>>>> depends on DEBUG_FS
>>>>>> -       default y
>>>>>> +       default n
>>>>>
>>>>> N is the default default. :-) You don't need this line.
>>>>
>>>> Hm, looks like this would have to be changed in a lot of places (~782
>>>> according to `git grep 'default n$'|wc -l` in my slightly outdated linux-
>>>> next). Do you want to fix it everywhere?
>>>
>>>     No, but we can at least not add the new ones...
>> 
>> But the patch was added to net-next yesterday.
> 
>    DaveM is still too fast for me. :-)

No worries, just let's get a patch to remove the line.

^ permalink raw reply

* Re: [PATCH net-next v5 1/2] openvswitch: Add conntrack limit netlink definition
From: Pravin Shelar @ 2018-05-25 18:39 UTC (permalink / raw)
  To: Yi-Hung Wei; +Cc: Linux Kernel Network Developers
In-Reply-To: <1527209803-48274-2-git-send-email-yihung.wei@gmail.com>

On Thu, May 24, 2018 at 5:56 PM, Yi-Hung Wei <yihung.wei@gmail.com> wrote:
> Define netlink messages and attributes to support user kernel
> communication that uses the conntrack limit feature.
>
> Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>

Acked-by: Pravin B Shelar <pshelar@ovn.org>

Thanks.

^ permalink raw reply

* Re: [PATCH net-next v5 2/2] openvswitch: Support conntrack zone limit
From: Pravin Shelar @ 2018-05-25 18:38 UTC (permalink / raw)
  To: Yi-Hung Wei; +Cc: Linux Kernel Network Developers
In-Reply-To: <1527209803-48274-3-git-send-email-yihung.wei@gmail.com>

On Thu, May 24, 2018 at 5:56 PM, Yi-Hung Wei <yihung.wei@gmail.com> wrote:
> Currently, nf_conntrack_max is used to limit the maximum number of
> conntrack entries in the conntrack table for every network namespace.
> For the VMs and containers that reside in the same namespace,
> they share the same conntrack table, and the total # of conntrack entries
> for all the VMs and containers are limited by nf_conntrack_max.  In this
> case, if one of the VM/container abuses the usage of the conntrack entries,
> it blocks the others from committing valid conntrack entries into the
> conntrack table.  Even if we can possibly put the VM in different network
> namespace, the current nf_conntrack_max configuration is kind of rigid
> that we cannot limit different VM/container to have different # conntrack
> entries.
>
> To address the aforementioned issue, this patch proposes to have a
> fine-grained mechanism that could further limit the # of conntrack entries
> per-zone.  For example, we can designate different zone to different VM,
> and set conntrack limit to each zone.  By providing this isolation, a
> mis-behaved VM only consumes the conntrack entries in its own zone, and
> it will not influence other well-behaved VMs.  Moreover, the users can
> set various conntrack limit to different zone based on their preference.
>
> The proposed implementation utilizes Netfilter's nf_conncount backend
> to count the number of connections in a particular zone.  If the number of
> connection is above a configured limitation, ovs will return ENOMEM to the
> userspace.  If userspace does not configure the zone limit, the limit
> defaults to zero that is no limitation, which is backward compatible to
> the behavior without this patch.
>
> The following high-level APIs are provided to the userspace:
>   - OVS_CT_LIMIT_CMD_SET:
>     * set default connection limit for all zones
>     * set the connection limit for a particular zone
>   - OVS_CT_LIMIT_CMD_DEL:
>     * remove the connection limit for a particular zone
>   - OVS_CT_LIMIT_CMD_GET:
>     * get the default connection limit for all zones
>     * get the connection limit for a particular zone
>
> Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>

Thanks for working on the feature.

Acked-by: Pravin B Shelar <pshelar@ovn.org>

^ permalink raw reply

* Re: [PATCH v2] ath6kl: mark expected switch fall-throughs
From: Steve deRosier @ 2018-05-25 18:27 UTC (permalink / raw)
  To: gustavo
  Cc: Kalle Valo, davem, sergei.shtylyov, linux-wireless,
	Network Development, LKML
In-Reply-To: <20180525182311.GA3000@embeddedor.com>

On Fri, May 25, 2018 at 11:23 AM Gustavo A. R. Silva
<gustavo@embeddedor.com>
wrote:

> In preparation to enabling -Wimplicit-fallthrough, mark switch cases
> where we are expecting to fall through.

> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> ---
> Changes in v2:
>    - Place code comments on a line of their own.

>    drivers/net/wireless/ath/ath6kl/cfg80211.c | 3 +++
>    1 file changed, 3 insertions(+)

> diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c
b/drivers/net/wireless/ath/ath6kl/cfg80211.c
> index 2ba8cf3..a16ee5d 100644
> --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
> +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
> @@ -3899,16 +3899,19 @@ int ath6kl_cfg80211_init(struct ath6kl *ar)
>           switch (ar->hw.cap) {
>           case WMI_11AN_CAP:
>                   ht = true;
> +               /* fall through */
>           case WMI_11A_CAP:
>                   band_5gig = true;
>                   break;
>           case WMI_11GN_CAP:
>                   ht = true;
> +               /* fall through */
>           case WMI_11G_CAP:
>                   band_2gig = true;
>                   break;
>           case WMI_11AGN_CAP:
>                   ht = true;
> +               /* fall through */
>           case WMI_11AG_CAP:
>                   band_2gig = true;
>                   band_5gig = true;
> --
> 2.7.4


Gustavo,

Thanks for the adjustment.  It now looks good to me.

Reviewed-by: Steve deRosier <derosier@cal-sierra.com>

^ permalink raw reply

* [PATCH v2] ath6kl: mark expected switch fall-throughs
From: Gustavo A. R. Silva @ 2018-05-25 18:23 UTC (permalink / raw)
  To: Kalle Valo, David S. Miller, Sergei Shtylyov
  Cc: linux-wireless, netdev, linux-kernel, Gustavo A. R. Silva

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
Changes in v2:
 - Place code comments on a line of their own.

 drivers/net/wireless/ath/ath6kl/cfg80211.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 2ba8cf3..a16ee5d 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -3899,16 +3899,19 @@ int ath6kl_cfg80211_init(struct ath6kl *ar)
 	switch (ar->hw.cap) {
 	case WMI_11AN_CAP:
 		ht = true;
+		/* fall through */
 	case WMI_11A_CAP:
 		band_5gig = true;
 		break;
 	case WMI_11GN_CAP:
 		ht = true;
+		/* fall through */
 	case WMI_11G_CAP:
 		band_2gig = true;
 		break;
 	case WMI_11AGN_CAP:
 		ht = true;
+		/* fall through */
 	case WMI_11AG_CAP:
 		band_2gig = true;
 		band_5gig = true;
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH 2/8] batman-adv: Disable CONFIG_BATMAN_ADV_DEBUGFS by default
From: Joe Perches @ 2018-05-25 18:20 UTC (permalink / raw)
  To: Sergei Shtylyov, Sven Eckelmann, Andrew Morton
  Cc: Simon Wunderlich, davem, netdev, b.a.t.m.a.n
In-Reply-To: <6fd70d33-11f7-33dd-3b11-e5031fe46466@cogentembedded.com>

On Fri, 2018-05-25 at 14:13 +0300, Sergei Shtylyov wrote:
> On 5/25/2018 1:50 PM, Sven Eckelmann wrote:
> 
> > [...]
> > > > --- a/net/batman-adv/Kconfig
> > > > +++ b/net/batman-adv/Kconfig
> > > > @@ -94,13 +94,13 @@ config BATMAN_ADV_DEBUGFS
> > > > bool "batman-adv debugfs entries"
> > > > depends on BATMAN_ADV
> > > > depends on DEBUG_FS
> > > > -       default y
> > > > +       default n
> > > 
> > >      N is the default default. :-) You don't need this line.
> > 
> > Hm, looks like this would have to be changed in a lot of places (~782
> > according to `git grep 'default n$'|wc -l` in my slightly outdated linux-
> > next). Do you want to fix it everywhere?
> 
>     No, but we can at least not add the new ones...
> 
> > Might be good to get this integrated
> > in checkpatch.pl when this will become a new policy.
> 
>     Adding Joe Perches. Joe, can you add a check for "default n"?

OK.

This also improves the Kconfig boolean->bool test above
as it's broken for multiple section Kconfig files with
help texts.
---
 scripts/checkpatch.pl | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index baddac9379f0..1f980be4950b 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2888,9 +2888,22 @@ sub process {
 
 # discourage the use of boolean for type definition attributes of Kconfig options
 		if ($realfile =~ /Kconfig/ &&
-		    $line =~ /^\+\s*\bboolean\b/) {
-			WARN("CONFIG_TYPE_BOOLEAN",
-			     "Use of boolean is deprecated, please use bool instead.\n" . $herecurr);
+		    $rawline =~ /^\+\s*\bboolean\b/) {
+			if (WARN("CONFIG_TYPE_BOOLEAN",
+				 "Use of boolean is deprecated, please use bool instead.\n" . $herecurr) &&
+			    $fix) {
+				$fixed[$fixlinenr] =~ s/\bboolean\b/bool/;
+			}
+		}
+
+# discourage the use of 'default n' in Kconfig files as that's the default
+		if ($realfile =~ /Kconfig/ &&
+		    $rawline =~ /^\+\s*default\s+n\s*$/) {
+			if (WARN("CONFIG_DEFAULT_N",
+				 "Unnecessary Use of 'default n'\n" . $herecurr) &&
+			    $fix) {
+				fix_delete_line($fixlinenr, $rawline);
+			}
 		}
 
 		if (($realfile =~ /Makefile.*/ || $realfile =~ /Kbuild.*/) &&

^ permalink raw reply related

* [PATCH] rtnetlink: Add more well known protocol values
From: Donald Sharp @ 2018-05-25 18:20 UTC (permalink / raw)
  To: netdev, dsahern

FRRouting installs routes into the kernel associated with
the originating protocol.  Add these values to the well
known values in rtnetlink.h.

Signed-off-by: Donald Sharp <sharpd@cumulusnetworks.com>
---
 include/uapi/linux/rtnetlink.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index cabb210c93af..81b33826f818 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -254,6 +254,11 @@ enum {
 #define RTPROT_DHCP	16      /* DHCP client */
 #define RTPROT_MROUTED	17      /* Multicast daemon */
 #define RTPROT_BABEL	42      /* Babel daemon */
+#define RTPROT_BGP      186     /* BGP Routes */
+#define RTPROT_ISIS     187     /* ISIS Routes */
+#define RTPROT_OSPF     188     /* OSPF Routes */
+#define RTPROT_RIP      189     /* RIP Routes */
+#define RTPROT_EIGRP    192     /* EIGRP Routes */
 
 /* rtm_scope
 
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH] ath6kl: mark expected switch fall-throughs
From: Gustavo A. R. Silva @ 2018-05-25 18:14 UTC (permalink / raw)
  To: Kalle Valo
  Cc: Sergei Shtylyov, David S. Miller, linux-wireless, netdev,
	linux-kernel
In-Reply-To: <87k1rr8v3l.fsf@kamboji.qca.qualcomm.com>



On 05/25/2018 01:10 PM, Kalle Valo wrote:
>>> Yeah, I was wondering the same. Was there a particular reason for this?
>>>
>>
>> Sometimes people use this style for a one-line code block.
>>
>> I can change it to the traditional style. No problem.
> 
> I would prefer that. So if you can send v2 that would be great.
> 

Yep. No problem. I'll send it shortly.

Thanks
--
Gustavo

^ permalink raw reply

* [PATCH net] net: sched: check netif_xmit_frozen_or_stopped() in sch_direct_xmit()
From: Song Liu @ 2018-05-25 18:11 UTC (permalink / raw)
  To: netdev; +Cc: Song Liu, kernel-team, John Fastabend, David S . Miller

Summary:

At the end of sch_direct_xmit(), we are in the else path of
!dev_xmit_complete(ret), which means ret == NETDEV_TX_OK. The following
condition will always fail and netif_xmit_frozen_or_stopped() is not
checked at all.

    if (ret && netif_xmit_frozen_or_stopped(txq))
         return false;

In this patch, this condition is fixed as:

    if (netif_xmit_frozen_or_stopped(txq))
         return false;

and the code is further simplified to:

    return !netif_xmit_frozen_or_stopped(txq);

Fixes: 29b86cdac00a ("net: sched: remove remaining uses for qdisc_qlen in xmit path")
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 net/sched/sch_generic.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 39c144b..8261d48 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -346,10 +346,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 		return false;
 	}
 
-	if (ret && netif_xmit_frozen_or_stopped(txq))
-		return false;
-
-	return true;
+	return !netif_xmit_frozen_or_stopped(txq);
 }
 
 /*
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH] ath6kl: mark expected switch fall-throughs
From: Kalle Valo @ 2018-05-25 18:10 UTC (permalink / raw)
  To: Gustavo A. R. Silva
  Cc: Sergei Shtylyov, David S. Miller, linux-wireless, netdev,
	linux-kernel
In-Reply-To: <e777af5c-8d56-aef7-a4b6-f93f12378049@embeddedor.com>

"Gustavo A. R. Silva" <gustavo@embeddedor.com> writes:

> On 05/25/2018 08:30 AM, Kalle Valo wrote:
>> Sergei Shtylyov <sergei.shtylyov@cogentembedded.com> writes:
>>
>>> On 5/25/2018 2:13 AM, Gustavo A. R. Silva wrote:
>>>
>>>> In preparation to enabling -Wimplicit-fallthrough, mark switch cases
>>>> where we are expecting to fall through.
>>>>
>>>> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
>>>> ---
>>>>    drivers/net/wireless/ath/ath6kl/cfg80211.c | 6 +++---
>>>>    1 file changed, 3 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
>>>> index 2ba8cf3..29e32cd 100644
>>>> --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
>>>> +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
>>>> @@ -3898,17 +3898,17 @@ int ath6kl_cfg80211_init(struct ath6kl *ar)
>>>>    	wiphy->max_scan_ie_len = 1000; /* FIX: what is correct limit? */
>>>>    	switch (ar->hw.cap) {
>>>>    	case WMI_11AN_CAP:
>>>> -		ht = true;
>>>> +		ht = true; /* fall through */
>>>>    	case WMI_11A_CAP:
>>>>    		band_5gig = true;
>>>>    		break;
>>>>    	case WMI_11GN_CAP:
>>>> -		ht = true;
>>>> +		ht = true; /* fall through */
>>>>    	case WMI_11G_CAP:
>>>>    		band_2gig = true;
>>>>    		break;
>>>>    	case WMI_11AGN_CAP:
>>>> -		ht = true;
>>>> +		ht = true; /* fall through */
>>>>    	case WMI_11AG_CAP:
>>>>    		band_2gig = true;
>>>>    		band_5gig = true;
>>>
>>>     Hm, typically such comments are done on a line of their own, have
>>> never seen this style...
>>
>> Yeah, I was wondering the same. Was there a particular reason for this?
>>
>
> Sometimes people use this style for a one-line code block.
>
> I can change it to the traditional style. No problem.

I would prefer that. So if you can send v2 that would be great.

-- 
Kalle Valo

^ permalink raw reply

* Re: [RFC PATCH net-next 00/12] XDP batching for TUN/vhost_net
From: Michael S. Tsirkin @ 2018-05-25 17:53 UTC (permalink / raw)
  To: Jason Wang; +Cc: kvm, virtualization, netdev, linux-kernel
In-Reply-To: <1526893473-20128-1-git-send-email-jasowang@redhat.com>

On Mon, May 21, 2018 at 05:04:21PM +0800, Jason Wang wrote:
> Hi all:
> 
> We do not support XDP batching for TUN since it can only receive one
> packet at a time from vhost_net. This series tries to remove this
> limitation by:
> 
> - introduce a TUN specific msg_control that can hold a pointer to an
>   array of XDP buffs
> - try copy and build XDP buff in vhost_net
> - store XDP buffs in an array and submit them once for every N packets
>   from vhost_net
> - since TUN can only do native XDP for datacopy packet, to simplify
>   the logic, split datacopy out logic and only do batching for
>   datacopy.

I like how this rework looks. Pls go ahead and repost as
non-RFC.

> With this series, TX PPS can improve about 34% from 2.9Mpps to
> 3.9Mpps when doing xdp_redirect_map between TAP and ixgbe.
> 
> Thanks
> 
> Jason Wang (12):
>   vhost_net: introduce helper to initialize tx iov iter
>   vhost_net: introduce vhost_exceeds_weight()
>   vhost_net: introduce vhost_has_more_pkts()
>   vhost_net: split out datacopy logic
>   vhost_net: batch update used ring for datacopy TX
>   tuntap: enable premmption early
>   tuntap: simplify error handling in tun_build_skb()
>   tuntap: tweak on the path of non-xdp case in tun_build_skb()
>   tuntap: split out XDP logic
>   vhost_net: build xdp buff
>   vhost_net: passing raw xdp buff to tun
>   vhost_net: batch submitting XDP buffers to underlayer sockets
> 
>  drivers/net/tun.c      | 226 +++++++++++++++++++++++++++----------
>  drivers/vhost/net.c    | 297 ++++++++++++++++++++++++++++++++++++++++++++-----
>  include/linux/if_tun.h |   7 ++
>  3 files changed, 444 insertions(+), 86 deletions(-)
> 
> -- 
> 2.7.4

^ permalink raw reply

* Re: [PATCH] ath6kl: mark expected switch fall-throughs
From: Gustavo A. R. Silva @ 2018-05-25 17:50 UTC (permalink / raw)
  To: Kalle Valo, Sergei Shtylyov
  Cc: David S. Miller, linux-wireless, netdev, linux-kernel
In-Reply-To: <871sdzc16l.fsf@kamboji.qca.qualcomm.com>



On 05/25/2018 08:30 AM, Kalle Valo wrote:
> Sergei Shtylyov <sergei.shtylyov@cogentembedded.com> writes:
> 
>> On 5/25/2018 2:13 AM, Gustavo A. R. Silva wrote:
>>
>>> In preparation to enabling -Wimplicit-fallthrough, mark switch cases
>>> where we are expecting to fall through.
>>>
>>> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
>>> ---
>>>    drivers/net/wireless/ath/ath6kl/cfg80211.c | 6 +++---
>>>    1 file changed, 3 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
>>> index 2ba8cf3..29e32cd 100644
>>> --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
>>> +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
>>> @@ -3898,17 +3898,17 @@ int ath6kl_cfg80211_init(struct ath6kl *ar)
>>>    	wiphy->max_scan_ie_len = 1000; /* FIX: what is correct limit? */
>>>    	switch (ar->hw.cap) {
>>>    	case WMI_11AN_CAP:
>>> -		ht = true;
>>> +		ht = true; /* fall through */
>>>    	case WMI_11A_CAP:
>>>    		band_5gig = true;
>>>    		break;
>>>    	case WMI_11GN_CAP:
>>> -		ht = true;
>>> +		ht = true; /* fall through */
>>>    	case WMI_11G_CAP:
>>>    		band_2gig = true;
>>>    		break;
>>>    	case WMI_11AGN_CAP:
>>> -		ht = true;
>>> +		ht = true; /* fall through */
>>>    	case WMI_11AG_CAP:
>>>    		band_2gig = true;
>>>    		band_5gig = true;
>>
>>     Hm, typically such comments are done on a line of their own, have
>> never seen this style...
> 
> Yeah, I was wondering the same. Was there a particular reason for this?
> 

Sometimes people use this style for a one-line code block.

I can change it to the traditional style. No problem.

Thanks
--
Gustavo

^ permalink raw reply

* [PATCH] atm: zatm: fix memcmp casting
From: Ivan Bornyakov @ 2018-05-25 17:49 UTC (permalink / raw)
  To: 3chas3; +Cc: linux-atm-general, netdev, linux-kernel, Ivan Bornyakov

memcmp() returns int, but eprom_try_esi() casts it to unsigned char. One
can lose significant bits and get 0 from non-0 value returned by the
memcmp().

Signed-off-by: Ivan Bornyakov <brnkv.i1@gmail.com>
---
 drivers/atm/zatm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c
index 9c9a22958717..a8d2eb0ceb8d 100644
--- a/drivers/atm/zatm.c
+++ b/drivers/atm/zatm.c
@@ -1151,8 +1151,8 @@ static void eprom_get_byte(struct zatm_dev *zatm_dev, unsigned char *byte,
 }
 
 
-static unsigned char eprom_try_esi(struct atm_dev *dev, unsigned short cmd,
-				   int offset, int swap)
+static int eprom_try_esi(struct atm_dev *dev, unsigned short cmd, int offset,
+			 int swap)
 {
 	unsigned char buf[ZEPROM_SIZE];
 	struct zatm_dev *zatm_dev;
-- 
2.16.1

^ permalink raw reply related

* RE: [PATCH] PCI: allow drivers to limit the number of VFs to 0
From: Keller, Jacob E @ 2018-05-25 17:46 UTC (permalink / raw)
  To: Bjorn Helgaas, Jakub Kicinski
  Cc: Bjorn Helgaas, linux-pci@vger.kernel.org, netdev@vger.kernel.org,
	Sathya Perla, Felix Manlunas, alexander.duyck@gmail.com,
	john.fastabend@gmail.com, Donald Dutile,
	oss-drivers@netronome.com, Christoph Hellwig, Derek Chickles,
	Satanand Burla, Raghu Vatsavayi, Ajit Khaparde,
	Sriharsha Basavapatna, Somnath Kotur <somnath.kotur
In-Reply-To: <20180525170122.GA63280@bhelgaas-glaptop.roam.corp.google.com>

> -----Original Message-----
> From: Bjorn Helgaas [mailto:helgaas@kernel.org]
> Sent: Friday, May 25, 2018 10:01 AM
> To: Jakub Kicinski <jakub.kicinski@netronome.com>
> Cc: Bjorn Helgaas <bhelgaas@google.com>; linux-pci@vger.kernel.org;
> netdev@vger.kernel.org; Sathya Perla <sathya.perla@broadcom.com>; Felix
> Manlunas <felix.manlunas@caviumnetworks.com>;
> alexander.duyck@gmail.com; john.fastabend@gmail.com; Keller, Jacob E
> <jacob.e.keller@intel.com>; Donald Dutile <ddutile@redhat.com>; oss-
> drivers@netronome.com; Christoph Hellwig <hch@infradead.org>; Derek
> Chickles <derek.chickles@caviumnetworks.com>; Satanand Burla
> <satananda.burla@caviumnetworks.com>; Raghu Vatsavayi
> <raghu.vatsavayi@caviumnetworks.com>; Ajit Khaparde
> <ajit.khaparde@broadcom.com>; Sriharsha Basavapatna
> <sriharsha.basavapatna@broadcom.com>; Somnath Kotur
> <somnath.kotur@broadcom.com>; Kirsher, Jeffrey T
> <jeffrey.t.kirsher@intel.com>; intel-wired-lan@lists.osuosl.org
> Subject: Re: [PATCH] PCI: allow drivers to limit the number of VFs to 0
> 
> [+cc liquidio, benet, fm10k maintainers:
> 
>   The patch below will affect you if your driver calls
>     pci_sriov_set_totalvfs(dev, 0);
> 
>   Previously that caused a subsequent pci_sriov_get_totalvfs() to return
>   the totalVFs value from the SR-IOV capability.  After this patch, it will
>   return 0, which has implications for VF enablement via the sysfs
>   "sriov_numvfs" file.]
> 

Thanks. I don't foresee any issues with fm10k regarding this.

Thanks,
Jake

^ permalink raw reply

* [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs
From: Xin Long @ 2018-05-25 17:41 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: davem, David Ahern, Eric Dumazet, Marcelo Ricardo Leitner,
	Neil Horman, syzkaller

syzbot reported a rcu_sched self-detected stall on CPU which is caused
by too small value set on rto_min with SCTP_RTOINFO sockopt. With this
value, hb_timer will get stuck there, as in its timer handler it starts
this timer again with this value, then goes to the timer handler again.

This problem has been there since the very beginning, and thanks to Eric
for the reproducer shared from a syzbot mail.

This patch fixes it by not allowing to set rto_min with a value below
200 msecs, which is based on TCP's, by either setsockopt or sysctl.

Reported-by: syzbot+3dcd59a1f907245f891f@syzkaller.appspotmail.com
Suggested-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/net/sctp/constants.h |  1 +
 net/sctp/socket.c            | 10 +++++++---
 net/sctp/sysctl.c            |  3 ++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 20ff237..2ee7a7b 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -277,6 +277,7 @@ enum { SCTP_MAX_GABS = 16 };
 #define SCTP_RTO_INITIAL	(3 * 1000)
 #define SCTP_RTO_MIN		(1 * 1000)
 #define SCTP_RTO_MAX		(60 * 1000)
+#define SCTP_RTO_HARD_MIN	200
 
 #define SCTP_RTO_ALPHA          3   /* 1/8 when converted to right shifts. */
 #define SCTP_RTO_BETA           2   /* 1/4 when converted to right shifts. */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ae7e7c6..6ef12c7 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3029,7 +3029,8 @@ static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval,
  * be changed.
  *
  */
-static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigned int optlen)
+static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval,
+				   unsigned int optlen)
 {
 	struct sctp_rtoinfo rtoinfo;
 	struct sctp_association *asoc;
@@ -3056,10 +3057,13 @@ static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigne
 	else
 		rto_max = asoc ? asoc->rto_max : sp->rtoinfo.srto_max;
 
-	if (rto_min)
+	if (rto_min) {
+		if (rto_min < SCTP_RTO_HARD_MIN)
+			return -EINVAL;
 		rto_min = asoc ? msecs_to_jiffies(rto_min) : rto_min;
-	else
+	} else {
 		rto_min = asoc ? asoc->rto_min : sp->rtoinfo.srto_min;
+	}
 
 	if (rto_min > rto_max)
 		return -EINVAL;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 33ca5b7..7ec854a 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -52,6 +52,7 @@ static int rto_alpha_min = 0;
 static int rto_beta_min = 0;
 static int rto_alpha_max = 1000;
 static int rto_beta_max = 1000;
+static int rto_hard_min = SCTP_RTO_HARD_MIN;
 
 static unsigned long max_autoclose_min = 0;
 static unsigned long max_autoclose_max =
@@ -116,7 +117,7 @@ static struct ctl_table sctp_net_table[] = {
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_sctp_do_rto_min,
-		.extra1         = &one,
+		.extra1         = &rto_hard_min,
 		.extra2         = &init_net.sctp.rto_max
 	},
 	{
-- 
2.1.0

^ permalink raw reply related

* [bpf-next PATCH] bpf: sockhash fix race with bpf_tcp_close and map delete
From: John Fastabend @ 2018-05-25 17:37 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev

syzbot reported two related splats, a use after free and null
pointer dereference, when a TCP socket is closed while the map is
also being removed.

The psock keeps a reference to all map slots that have a reference
to the sock so that when the sock is closed we can clean up any
outstanding sock{map|hash} entries. This avoids pinning a sock
forever if the map owner fails to do proper cleanup. However, the
result is we have two paths that can free an entry in the map. Even
the comment in the sock{map|hash} tear down function, sock_hash_free()
notes this:

 At this point no update, lookup or delete operations can happen.
 However, be aware we can still get a socket state event updates,
 and data ready callbacks that reference the psock from sk_user_data.

Both removal paths omitted taking the hash bucket lock resulting
in the case where we have two references that are in the process
of being free'd.

Reported-by: syzbot+a761b81c211794fa1072@syzkaller.appspotmail.com
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 kernel/bpf/sockmap.c |   33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 52a91d8..b508141f 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -225,6 +225,16 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	kfree_rcu(l, rcu);
 }
 
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &__select_bucket(htab, hash)->head;
+}
+
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
 	void (*close_fun)(struct sock *sk, long timeout);
@@ -268,9 +278,15 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 				smap_release_sock(psock, sk);
 			}
 		} else {
+			u32 hash = e->hash_link->hash;
+			struct bucket *b;
+
+			b = __select_bucket(e->htab, hash);
+			raw_spin_lock_bh(&b->lock);
 			hlist_del_rcu(&e->hash_link->hash_node);
 			smap_release_sock(psock, e->hash_link->sk);
 			free_htab_elem(e->htab, e->hash_link);
+			raw_spin_unlock_bh(&b->lock);
 		}
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
@@ -2043,16 +2059,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 	return ERR_PTR(err);
 }
 
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
-{
-	return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
-
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
-	return &__select_bucket(htab, hash)->head;
-}
-
 static void sock_hash_free(struct bpf_map *map)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@@ -2069,10 +2075,12 @@ static void sock_hash_free(struct bpf_map *map)
 	 */
 	rcu_read_lock();
 	for (i = 0; i < htab->n_buckets; i++) {
-		struct hlist_head *head = select_bucket(htab, i);
+		struct bucket *b = __select_bucket(htab, i);
+		struct hlist_head *head = &b->head;
 		struct hlist_node *n;
 		struct htab_elem *l;
 
+		raw_spin_lock_bh(&b->lock);
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			struct sock *sock = l->sk;
 			struct smap_psock *psock;
@@ -2090,8 +2098,9 @@ static void sock_hash_free(struct bpf_map *map)
 				smap_release_sock(psock, sock);
 			}
 			write_unlock_bh(&sock->sk_callback_lock);
-			kfree(l);
+			free_htab_elem(htab, l);
 		}
+		raw_spin_unlock_bh(&b->lock);
 	}
 	rcu_read_unlock();
 	bpf_map_area_free(htab->buckets);

^ permalink raw reply related

* Re: [PATCH bpf-next] libbpf: Install btf.h with libbpf
From: Martin KaFai Lau @ 2018-05-25 17:33 UTC (permalink / raw)
  To: Andrey Ignatov; +Cc: netdev, ast, daniel, kernel-team
In-Reply-To: <20180525172313.1043567-1-rdna@fb.com>

On Fri, May 25, 2018 at 10:23:13AM -0700, Andrey Ignatov wrote:
> install_headers target should contain all headers that are part of
> libbpf. Add missing btf.h
> 
> Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>

^ permalink raw reply

* [PATCH bpf-next] libbpf: Install btf.h with libbpf
From: Andrey Ignatov @ 2018-05-25 17:23 UTC (permalink / raw)
  To: netdev; +Cc: Andrey Ignatov, kafai, ast, daniel, kernel-team

install_headers target should contain all headers that are part of
libbpf. Add missing btf.h

Signed-off-by: Andrey Ignatov <rdna@fb.com>
---
 tools/lib/bpf/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index f3fab4a..5390e77 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -189,6 +189,7 @@ install_headers:
 	$(call QUIET_INSTALL, headers) \
 		$(call do_install,bpf.h,$(prefix)/include/bpf,644); \
 		$(call do_install,libbpf.h,$(prefix)/include/bpf,644);
+		$(call do_install,btf.h,$(prefix)/include/bpf,644);
 
 install: install_lib
 
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH net-next v12 0/5] Enable virtio_net to act as a standby for a passthru device
From: Michael S. Tsirkin @ 2018-05-25 17:19 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh, jiri, aaron.f.brown, anjali.singhai
In-Reply-To: <1527180917-39737-1-git-send-email-sridhar.samudrala@intel.com>

On Thu, May 24, 2018 at 09:55:12AM -0700, Sridhar Samudrala wrote:
> The main motivation for this patch is to enable cloud service providers
> to provide an accelerated datapath to virtio-net enabled VMs in a 
> transparent manner with no/minimal guest userspace changes. This also
> enables hypervisor controlled live migration to be supported with VMs that
> have direct attached SR-IOV VF devices.
> 
> Patch 1 introduces a failover module that provides a generic interface for 
> paravirtual drivers to listen for netdev register/unregister/link change
> events from pci ethernet devices with the same MAC and takeover their
> datapath. The notifier and event handling code is based on the existing
> netvsc implementation. 
> 
> Patch 2 refactors netvsc to use the registration/notification framework
> introduced by failover module.
> 
> Patch 3 introduces a net_failover driver that provides an automated
> failover mechanism to paravirtual drivers via APIs to create and destroy
> a failover master netdev and manages primary and standby slave netdevs
> that get registered via the generic failover infrastructure.
> 
> Patch 4 introduces a new feature bit VIRTIO_NET_F_STANDBY to virtio-net
> that can be used by hypervisor to indicate that virtio_net interface
> should act as a standby for another device with the same MAC address.
> 
> Patch 5 extends virtio_net to use alternate datapath when available and
> registered. When STANDBY feature is enabled, virtio_net driver uses the
> net_failover API to create an additional 'failover' netdev that acts as
> a master device and controls 2 slave devices.  The original virtio_net
> netdev is registered as 'standby' netdev and a passthru/vf device with
> the same MAC gets registered as 'primary' netdev. Both 'standby' and
> 'failover' netdevs are associated with the same 'pci' device.  The user
> accesses the network interface via 'failover' netdev. The 'failover'
> netdev chooses 'primary' netdev as default for transmits when it is
> available with link up and running.
> 
> As this patch series is initially focusing on usecases where hypervisor 
> fully controls the VM networking and the guest is not expected to directly 
> configure any hardware settings, it doesn't expose all the ndo/ethtool ops
> that are supported by virtio_net at this time. To support additional usecases,
> it should be possible to enable additional ops later by caching the state
> in failover netdev and replaying when the 'primary' netdev gets registered. 
>  
> At the time of live migration, the hypervisor needs to unplug the VF device
> from the guest on the source host and reset the MAC filter of the VF to
> initiate failover of datapath to virtio before starting the migration. After
> the migration is completed, the destination hypervisor sets the MAC filter
> on the VF and plugs it back to the guest to switch over to VF datapath.
> 
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

Series:

Acked-by: Michael S. Tsirkin <mst@redhat.com>


> v12:
> - Tested live migration with virtio-net/AVF(i40evf) configured in failover
>   mode while running iperf in background. Tried static ip and dhcp
>   configurations using 'network' scripts and Network Manager.
> - Build tested netvsc module.
> Updates:
> - Extended generic failover module to do common functions like setting
>   FAILOVER_SLAVE flag, registering rx-handler and linking to upper dev in
>   the generic register/unregister handlers.
>   This required adding 3 additional failover ops pre_register, pre_unregister
>   and handle_frame.  netvsc and net_failover drivers are updated to support
>   these ops.
> 
> v11:
> - Split net_failover module into 2 components.
>   1. 'failover' module that provides generic failover infrastructure
>   to register a failover instance and listen for slave events.
>   2. 'net_failover' driver that provides APIs to create/destroy upper
>   netdev and supports 3-netdev model used by virtio-net.
> - Added documentation
> 
> v10:
> - fix net_failover_open() to update failover CARRIER correctly based on
>   standby and primary states. 
> - fix net_failover_handle_frame() to handle frames received on standby
>   when primary is present.
> - replace netdev_upper_dev_link with netdev_master_upper_dev_link and
>   handle lower dev state changes.
> - fix net_failover_create() and net_failover_register() interfaces to
>   use ERR_PTR and avoid arg **
> - disable setting mac address when virtio-net in STANDBY mode
> - document exported symbols
> - added entry to MAINTAINERS file
> 
> v9:
> Select NET_FAILOVER automatically when VIRTIO_NET/HYPERV_NET 
> are enabled. (stephen)
> 
> v8:
> - Made the failover management routines more robust by updating the feature 
>   bits/other fields in the failover netdev when slave netdevs are 
>   registered/unregistered. (mst)
> - added support for handling vlans.
> - Limited the changes in netvsc to only use the notifier/event/lookups
>   from the failover module. The slave register/unregister/link-change 
>   handlers are only updated to use the getbymac routine to get the 
>   upper netdev. There is no change in their functionality. (stephen)
> - renamed structs/function/file names to use net_failover prefix. (mst)
> 
> v7
> - Rename 'bypass/active/backup' terminology with 'failover/primary/standby'
>   (jiri, mst)
> - re-arranged dev_open() and dev_set_mtu() calls in the register routines
>   so that they don't get called for 2-netdev model. (stephen)
> - fixed select_queue() routine to do queue selection based on VF if it is
>   registered as primary. (stephen)
> -  minor bugfixes
> 
> v6 RFC:
>   Simplified virtio_net changes by moving all the ndo_ops of the 
>   bypass_netdev and create/destroy of bypass_netdev to 'bypass' module.
>   avoided 2 phase registration(driver + instances).
>   introduced IFF_BYPASS/IFF_BYPASS_SLAVE dev->priv_flags 
>   replaced mutex with a spinlock
> 
> v5 RFC:
>   Based on Jiri's comments, moved the common functionality to a 'bypass'
>   module so that the same notifier and event handlers to handle child
>   register/unregister/link change events can be shared between virtio_net
>   and netvsc.
>   Improved error handling based on Siwei's comments.
> v4:
> - Based on the review comments on the v3 version of the RFC patch and
>   Jakub's suggestion for the naming issue with 3 netdev solution,
>   proposed 3 netdev in-driver bonding solution for virtio-net.
> v3 RFC:
> - Introduced 3 netdev model and pointed out a couple of issues with
>   that model and proposed 2 netdev model to avoid these issues.
> - Removed broadcast/multicast optimization and only use virtio as
>   backup path when VF is unplugged.
> v2 RFC:
> - Changed VIRTIO_NET_F_MASTER to VIRTIO_NET_F_BACKUP (mst)
> - made a small change to the virtio-net xmit path to only use VF datapath
>   for unicasts. Broadcasts/multicasts use virtio datapath. This avoids
>   east-west broadcasts to go over the PCI link.
> - added support for the feature bit in qemu
> 
> Sridhar Samudrala (5):
>   net: Introduce generic failover module
>   netvsc: refactor notifier/event handling code to use the failover
>     framework
>   net: Introduce net_failover driver
>   virtio_net: Introduce VIRTIO_NET_F_STANDBY feature bit
>   virtio_net: Extend virtio to use VF datapath when available
> 
>  Documentation/networking/failover.rst     |  18 +
>  Documentation/networking/net_failover.rst | 116 +++++
>  MAINTAINERS                               |  16 +
>  drivers/net/Kconfig                       |  13 +
>  drivers/net/Makefile                      |   1 +
>  drivers/net/hyperv/Kconfig                |   1 +
>  drivers/net/hyperv/hyperv_net.h           |   2 +
>  drivers/net/hyperv/netvsc_drv.c           | 222 ++------
>  drivers/net/net_failover.c                | 836 ++++++++++++++++++++++++++++++
>  drivers/net/virtio_net.c                  |  40 +-
>  include/linux/netdevice.h                 |  16 +
>  include/net/failover.h                    |  36 ++
>  include/net/net_failover.h                |  40 ++
>  include/uapi/linux/virtio_net.h           |   3 +
>  net/Kconfig                               |  13 +
>  net/core/Makefile                         |   1 +
>  net/core/failover.c                       | 315 +++++++++++
>  17 files changed, 1522 insertions(+), 167 deletions(-)
>  create mode 100644 Documentation/networking/failover.rst
>  create mode 100644 Documentation/networking/net_failover.rst
>  create mode 100644 drivers/net/net_failover.c
>  create mode 100644 include/net/failover.h
>  create mode 100644 include/net/net_failover.h
>  create mode 100644 net/core/failover.c
> 
> -- 
> 2.14.3

^ permalink raw reply

* Re: [PATCH v2 net-next] net: stmmac: Add PPS and Flexible PPS support
From: Richard Cochran @ 2018-05-25 17:02 UTC (permalink / raw)
  To: Jose Abreu
  Cc: netdev, David S. Miller, Joao Pinto, Vitor Soares,
	Giuseppe Cavallaro, Alexandre Torgue
In-Reply-To: <fb25946a2fee8708876ff9a56d5b20f8b9224a5a.1527262324.git.joabreu@synopsys.com>

On Fri, May 25, 2018 at 04:32:52PM +0100, Jose Abreu wrote:
> +int dwmac5_pps_config(void __iomem *ioaddr, bool enable)
> +{
> +	u32 val = readl(ioaddr + MAC_PPS_CONTROL);
> +
> +	/* There is no way to disable fixed PPS output so we just reset
> +	 * the values to make sure its in fixed PPS mode */

In that case, don't try to make this appear to be programmable.  Just
reset the registers in your probe or setup function unconditionally.

> +	val &= ~PPSx_MASK(0);
> +	val |= TRGTMODSELx(0, 0x2);
> +
> +	writel(val, ioaddr + MAC_PPS_CONTROL);
> +	return 0;
> +}
> +
> +int dwmac5_flex_pps_config(void __iomem *ioaddr, int index,
> +			   struct stmmac_pps_cfg *cfg, bool enable,
> +			   u32 sub_second_inc, u32 systime_flags)
> +{
> +	u32 tnsec = readl(ioaddr + MAC_PPSx_TARGET_TIME_NSEC(index));
> +	u32 val = readl(ioaddr + MAC_PPS_CONTROL);
> +	u64 period;
> +
> +	if (!cfg->available)
> +		return -EINVAL;
> +	if (tnsec & TRGTBUSY0)
> +		return -EBUSY;
> +	if (!sub_second_inc || !systime_flags)
> +		return -EINVAL;

Don't add tests on the arguments like this.  Instead, make sure the
caller always provides correct values.

> +
> +	val &= ~PPSx_MASK(index);
> +
> +	if (!enable) {
> +		val |= PPSCMDx(index, 0x5);
> +		writel(val, ioaddr + MAC_PPS_CONTROL);
> +		return 0;
> +	}
> +
> +	val |= PPSCMDx(index, 0x2);
> +	val |= TRGTMODSELx(index, 0x2);
> +	val |= PPSEN0;
> +
> +	writel(cfg->start.tv_sec, ioaddr + MAC_PPSx_TARGET_TIME_SEC(index));
> +
> +	if (!(systime_flags & PTP_TCR_TSCTRLSSR))
> +		cfg->start.tv_nsec = (cfg->start.tv_nsec * 1000) / 465;
> +	writel(cfg->start.tv_nsec, ioaddr + MAC_PPSx_TARGET_TIME_NSEC(index));
> +
> +	period = cfg->period.tv_sec * 1000000000;
> +	period += cfg->period.tv_nsec;

> +	struct timespec64 period;

> +
> +	do_div(period, sub_second_inc);
> +
> +	if (period <= 1)
> +		return -EINVAL;
> +
> +	writel(period - 1, ioaddr + MAC_PPSx_INTERVAL(index));
> +
> +	period >>= 1;
> +	if (period <= 1)
> +		return -EINVAL;
> +
> +	writel(period - 1, ioaddr + MAC_PPSx_WIDTH(index));
> +
> +	/* Finally, activate it */
> +	writel(val, ioaddr + MAC_PPS_CONTROL);
> +	return 0;
> +}

> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
> index 7d3a5c7..35c6d0c 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
> @@ -140,19 +140,50 @@ static int stmmac_set_time(struct ptp_clock_info *ptp,
>  static int stmmac_enable(struct ptp_clock_info *ptp,
>  			 struct ptp_clock_request *rq, int on)
>  {
> -	return -EOPNOTSUPP;
> +	struct stmmac_priv *priv =
> +	    container_of(ptp, struct stmmac_priv, ptp_clock_ops);
> +	struct stmmac_pps_cfg *cfg;
> +	int ret = -EOPNOTSUPP;
> +	unsigned long flags;
> +
> +	switch (rq->type) {
> +	case PTP_CLK_REQ_PEROUT:
> +		cfg = &priv->pps[rq->perout.index];
> +
> +		cfg->start.tv_sec = rq->perout.start.sec;
> +		cfg->start.tv_nsec = rq->perout.start.nsec;
> +		cfg->period.tv_sec = rq->perout.period.sec;
> +		cfg->period.tv_nsec = rq->perout.period.nsec;
> +
> +		spin_lock_irqsave(&priv->ptp_lock, flags);
> +		ret = stmmac_flex_pps_config(priv, priv->ioaddr,
> +					     rq->perout.index, cfg, on,
> +					     priv->sub_second_inc,
> +					     priv->systime_flags);
> +		spin_unlock_irqrestore(&priv->ptp_lock, flags);
> +		break;
> +	case PTP_CLK_REQ_PPS:
> +		spin_lock_irqsave(&priv->ptp_lock, flags);
> +		ret = stmmac_pps_config(priv, priv->ioaddr, on);

This is not what PTP_CLK_REQ_PPS is for.  It is only to arrange for a
trigger into the kernel's PPS sub-system.

[ Sorry that the ptp header files don't explain this in the comments.
  I should really fix that. ]

> +		spin_unlock_irqrestore(&priv->ptp_lock, flags);
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	return ret;
>  }
>  
>  /* structure describing a PTP hardware clock */
> -static const struct ptp_clock_info stmmac_ptp_clock_ops = {
> +static struct ptp_clock_info stmmac_ptp_clock_ops = {
>  	.owner = THIS_MODULE,
>  	.name = "stmmac_ptp_clock",
>  	.max_adj = 62500000,
>  	.n_alarm = 0,
>  	.n_ext_ts = 0,
> -	.n_per_out = 0,
> +	.n_per_out = 0, /* will be overwritten in stmmac_ptp_register */
>  	.n_pins = 0,
> -	.pps = 0,
> +	.pps = 0, /* will be overwritten in stmmac_ptp_register */

Again, this is for inputting a PPS event into the kernel's PPS subsystem.

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH] PCI: allow drivers to limit the number of VFs to 0
From: Bjorn Helgaas @ 2018-05-25 17:01 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Bjorn Helgaas, linux-pci, netdev, Sathya Perla, Felix Manlunas,
	alexander.duyck, john.fastabend, Jacob Keller, Donald Dutile,
	oss-drivers, Christoph Hellwig, Derek Chickles, Satanand Burla,
	Raghu Vatsavayi, Ajit Khaparde, Sriharsha Basavapatna,
	Somnath Kotur, Jeff Kirsher, intel-wired-lan
In-Reply-To: <20180525140223.GA45098@bhelgaas-glaptop.roam.corp.google.com>

[+cc liquidio, benet, fm10k maintainers:

  The patch below will affect you if your driver calls
    pci_sriov_set_totalvfs(dev, 0);

  Previously that caused a subsequent pci_sriov_get_totalvfs() to return
  the totalVFs value from the SR-IOV capability.  After this patch, it will
  return 0, which has implications for VF enablement via the sysfs
  "sriov_numvfs" file.]

On Fri, May 25, 2018 at 09:02:23AM -0500, Bjorn Helgaas wrote:
> On Thu, May 24, 2018 at 06:20:15PM -0700, Jakub Kicinski wrote:
> > Hi Bjorn!
> > 
> > On Thu, 24 May 2018 18:57:48 -0500, Bjorn Helgaas wrote:
> > > On Mon, Apr 02, 2018 at 03:46:52PM -0700, Jakub Kicinski wrote:
> > > > Some user space depends on enabling sriov_totalvfs number of VFs
> > > > to not fail, e.g.:
> > > > 
> > > > $ cat .../sriov_totalvfs > .../sriov_numvfs
> > > > 
> > > > > For devices whose VF support depends on loaded FW we have the
> > > > pci_sriov_{g,s}et_totalvfs() API.  However, this API uses 0 as
> > > > a special "unset" value, meaning drivers can't limit sriov_totalvfs
> > > > to 0.  Remove the special values completely and simply initialize
> > > > driver_max_VFs to total_VFs.  Then always use driver_max_VFs.
> > > > Add a helper for drivers to reset the VF limit back to total.  
> > > 
> > > I still can't really make sense out of the changelog.
> > >
> > > I think part of the reason it's confusing is because there are two
> > > things going on:
> > > 
> > >   1) You want this:
> > >   
> > >        pci_sriov_set_totalvfs(dev, 0);
> > >        x = pci_sriov_get_totalvfs(dev) 
> > > 
> > >      to return 0 instead of total_VFs.  That seems to connect with
> > >      your subject line.  It means "sriov_totalvfs" in sysfs could be
> > >      0, but I don't know how that is useful (I'm sure it is; just
> > >      educate me :))
> > 
> > Let me just quote the bug report that got filed on our internal bug
> > tracker :)
> > 
> >   When testing Juju Openstack with Ubuntu 18.04, enabling SR-IOV causes
> >   errors because Juju gets the sriov_totalvfs for SR-IOV-capable device
> >   then tries to set that as the sriov_numvfs parameter.
> > 
> >   For SR-IOV incapable FW, the sriov_totalvfs parameter should be 0, 
> >   but it's set to max.  When FW is switched to flower*, the correct 
> >   sriov_totalvfs value is presented.
> > 
> > * flower is a project name
> 
> From the point of view of the PCI core (which knows nothing about
> device firmware and relies on the architected config space described
> by the PCIe spec), this sounds like an erratum: with some firmware
> installed, the device is not capable of SR-IOV, but still advertises
> an SR-IOV capability with "TotalVFs > 0".
> 
> Regardless of whether that's an erratum, we do allow PF drivers to use
> pci_sriov_set_totalvfs() to limit the number of VFs that may be
> enabled by writing to the PF's "sriov_numvfs" sysfs file.
> 
> But the current implementation does not allow a PF driver to limit VFs
> to 0, and that does seem nonsensical.
> 
> > My understanding is OpenStack uses sriov_totalvfs to determine how many
> > VFs can be enabled, looks like this is the code:
> > 
> > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n464
> > 
> > >   2) You're adding the pci_sriov_reset_totalvfs() interface.  I'm not
> > >      sure what you intend for this.  Is *every* driver supposed to
> > >      call it in .remove()?  Could/should this be done in the core
> > >      somehow instead of depending on every driver?
> > 
> > Good question, I was just thinking yesterday we may want to call it
> > from the core, but I don't think it's strictly necessary nor always
> > sufficient (we may reload FW without re-probing).
> > 
> > We have a device which supports different number of VFs based on the FW
> > > loaded.  Some legacy FWs do not inform the driver how many VFs they can
> > > support, because they support max.  So the flow in our driver is this:
> > 
> > load_fw(dev);
> > ...
> > max_vfs = ask_fw_for_max_vfs(dev);
> > if (max_vfs >= 0)
> > 	return pci_sriov_set_totalvfs(dev, max_vfs);
> > else /* FW didn't tell us, assume max */
> > 	return pci_sriov_reset_totalvfs(dev); 
> > 
> > We also reset the max on device remove, but that's not strictly
> > necessary.
> > 
> > Other users of pci_sriov_set_totalvfs() always know the value to set
> > the total to (either always get it from FW or it's a constant).
> > 
> > If you prefer we can work out the correct max for those legacy cases in
> > the driver as well, although it seemed cleaner to just ask the core,
> > since it already has total_VFs value handy :)
> > 
> > > I'm also having a hard time connecting your user-space command example
> > > with the rest of this.  Maybe it will make more sense to me tomorrow
> > > after some coffee.
> > 
> > OpenStack assumes it will always be able to set sriov_numvfs to
> > sriov_totalvfs, see this 'if':
> > 
> > http://git.openstack.org/cgit/openstack/charm-neutron-openvswitch/tree/hooks/neutron_ovs_utils.py#n512
> 
> Thanks for educating me.  I think there are two issues here that we
> can separate.  I extracted the patch below for the first.
> 
> The second is the question of resetting driver_max_VFs.  I think we
> currently have a general issue in the core:
> 
>   - load PF driver 1
>   - driver calls pci_sriov_set_totalvfs() to reduce driver_max_VFs
>   - unload PF driver 1
>   - load PF driver 2
> 
> Now driver_max_VFs is still stuck at the lower value set by driver 1.
> I don't think that's the way this should work.
> 
> I guess this is partly a consequence of setting driver_max_VFs in
> sriov_init(), which is called before driver attach and should only
> depend on hardware characteristics, so it is related to the patch
> below.  But I think we should fix it in general, not just for
> netronome.
> 
> 
> commit 4a338bc6f94b9ad824ac944f5dfc249d6838719c
> Author: Jakub Kicinski <jakub.kicinski@netronome.com>
> Date:   Fri May 25 08:18:34 2018 -0500
> 
>     PCI/IOV: Allow PF drivers to limit total_VFs to 0
>     
>     Some SR-IOV PF drivers implement .sriov_configure(), which allows
>     user-space to enable VFs by writing the desired number of VFs to the sysfs
>     "sriov_numvfs" file (see sriov_numvfs_store()).
>     
>     The PCI core limits the number of VFs to the TotalVFs advertised by the
>     device in its SR-IOV capability.  The PF driver can limit the number of VFs
>     to even fewer (it may have pre-allocated data structures or knowledge of
>     device limitations) by calling pci_sriov_set_totalvfs(), but previously it
>     could not limit the VFs to 0.
>     
>     Change pci_sriov_get_totalvfs() so it always respects the VF limit imposed
>     by the PF driver, even if the limit is 0.
>     
>     This sequence:
>     
>       pci_sriov_set_totalvfs(dev, 0);
>       x = pci_sriov_get_totalvfs(dev);
>     
>     previously set "x" to TotalVFs from the SR-IOV capability.  Now it will set
>     "x" to 0.
>     
>     Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>     Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 192b82898a38..d0d73dbbd5ca 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -469,6 +469,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	iov->nres = nres;
>  	iov->ctrl = ctrl;
>  	iov->total_VFs = total;
> +	iov->driver_max_VFs = total;
>  	pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
>  	iov->pgsz = pgsz;
>  	iov->self = dev;
> @@ -827,10 +828,7 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
>  	if (!dev->is_physfn)
>  		return 0;
>  
> -	if (dev->sriov->driver_max_VFs)
> -		return dev->sriov->driver_max_VFs;
> -
> -	return dev->sriov->total_VFs;
> +	return dev->sriov->driver_max_VFs;
>  }
>  EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
>  

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox