Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 2/2] sfp: add sff module support
From: Russell King @ 2017-12-14 10:27 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli, Rob Herring
  Cc: devicetree-u79uwXL29TY76Z2rM5mHXA, Mark Rutland,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20171214102712.GP10595-l+eeeJia6m9URfEZ8mYm6t73F7V6hmMc@public.gmane.org>

Add support for SFF modules, which are soldered down SFP modules.
These have a different phys_id value, and also have the present and
rate select signals omitted compared with their socketed counter-parts.

Signed-off-by: Russell King <rmk+kernel-I+IVW8TIWO2tmTQ+vhA3Yw@public.gmane.org>
---
 drivers/net/phy/sfp.c | 78 ++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/sfp.h   |  1 +
 2 files changed, 65 insertions(+), 14 deletions(-)

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 9dfc1c4c954f..96511557eb2c 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -98,12 +98,18 @@ static const enum gpiod_flags gpio_flags[] = {
 
 static DEFINE_MUTEX(sfp_mutex);
 
+struct sff_data {
+	unsigned int gpios;
+	bool (*module_supported)(const struct sfp_eeprom_id *id);
+};
+
 struct sfp {
 	struct device *dev;
 	struct i2c_adapter *i2c;
 	struct mii_bus *i2c_mii;
 	struct sfp_bus *sfp_bus;
 	struct phy_device *mod_phy;
+	const struct sff_data *type;
 
 	unsigned int (*get_state)(struct sfp *);
 	void (*set_state)(struct sfp *, unsigned int);
@@ -123,6 +129,36 @@ struct sfp {
 	struct sfp_eeprom_id id;
 };
 
+static bool sff_module_supported(const struct sfp_eeprom_id *id)
+{
+	return id->base.phys_id == SFP_PHYS_ID_SFF &&
+	       id->base.phys_ext_id == SFP_PHYS_EXT_ID_SFP;
+}
+
+static const struct sff_data sff_data = {
+	.gpios = SFP_F_LOS | SFP_F_TX_FAULT | SFP_F_TX_DISABLE,
+	.module_supported = sff_module_supported,
+};
+
+static bool sfp_module_supported(const struct sfp_eeprom_id *id)
+{
+	return id->base.phys_id == SFP_PHYS_ID_SFP &&
+	       id->base.phys_ext_id == SFP_PHYS_EXT_ID_SFP;
+}
+
+static const struct sff_data sfp_data = {
+	.gpios = SFP_F_PRESENT | SFP_F_LOS | SFP_F_TX_FAULT |
+		 SFP_F_TX_DISABLE | SFP_F_RATE_SELECT,
+	.module_supported = sfp_module_supported,
+};
+
+static const struct of_device_id sfp_of_match[] = {
+	{ .compatible = "sff,sff", .data = &sff_data, },
+	{ .compatible = "sff,sfp", .data = &sfp_data, },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, sfp_of_match);
+
 static unsigned long poll_jiffies;
 
 static unsigned int sfp_gpio_get_state(struct sfp *sfp)
@@ -141,6 +177,11 @@ static unsigned int sfp_gpio_get_state(struct sfp *sfp)
 	return state;
 }
 
+static unsigned int sff_gpio_get_state(struct sfp *sfp)
+{
+	return sfp_gpio_get_state(sfp) | SFP_F_PRESENT;
+}
+
 static void sfp_gpio_set_state(struct sfp *sfp, unsigned int state)
 {
 	if (state & SFP_F_PRESENT) {
@@ -479,10 +520,10 @@ static int sfp_sm_mod_probe(struct sfp *sfp)
 	dev_info(sfp->dev, "module %s %s rev %s sn %s dc %s\n",
 		 vendor, part, rev, sn, date);
 
-	/* We only support SFP modules, not the legacy GBIC modules. */
-	if (sfp->id.base.phys_id != SFP_PHYS_ID_SFP ||
-	    sfp->id.base.phys_ext_id != SFP_PHYS_EXT_ID_SFP) {
-		dev_err(sfp->dev, "module is not SFP - phys id 0x%02x 0x%02x\n",
+	/* Check whether we support this module */
+	if (!sfp->type->module_supported(&sfp->id)) {
+		dev_err(sfp->dev,
+			"module is not supported - phys id 0x%02x 0x%02x\n",
 			sfp->id.base.phys_id, sfp->id.base.phys_ext_id);
 		return -EINVAL;
 	}
@@ -801,6 +842,7 @@ static void sfp_cleanup(void *data)
 
 static int sfp_probe(struct platform_device *pdev)
 {
+	const struct sff_data *sff;
 	struct sfp *sfp;
 	bool poll = false;
 	int irq, err, i;
@@ -815,10 +857,19 @@ static int sfp_probe(struct platform_device *pdev)
 	if (err < 0)
 		return err;
 
+	sff = sfp->type = &sfp_data;
+
 	if (pdev->dev.of_node) {
 		struct device_node *node = pdev->dev.of_node;
+		const struct of_device_id *id;
 		struct device_node *np;
 
+		id = of_match_node(sfp_of_match, node);
+		if (WARN_ON(!id))
+			return -EINVAL;
+
+		sff = sfp->type = id->data;
+
 		np = of_parse_phandle(node, "i2c-bus", 0);
 		if (np) {
 			struct i2c_adapter *i2c;
@@ -834,17 +885,22 @@ static int sfp_probe(struct platform_device *pdev)
 				return err;
 			}
 		}
+	}
 
-		for (i = 0; i < GPIO_MAX; i++) {
+	for (i = 0; i < GPIO_MAX; i++)
+		if (sff->gpios & BIT(i)) {
 			sfp->gpio[i] = devm_gpiod_get_optional(sfp->dev,
 					   gpio_of_names[i], gpio_flags[i]);
 			if (IS_ERR(sfp->gpio[i]))
 				return PTR_ERR(sfp->gpio[i]);
 		}
 
-		sfp->get_state = sfp_gpio_get_state;
-		sfp->set_state = sfp_gpio_set_state;
-	}
+	sfp->get_state = sfp_gpio_get_state;
+	sfp->set_state = sfp_gpio_set_state;
+
+	/* Modules that have no detect signal are always present */
+	if (!(sfp->gpio[GPIO_MODDEF0]))
+		sfp->get_state = sff_gpio_get_state;
 
 	sfp->sfp_bus = sfp_register_socket(sfp->dev, sfp, &sfp_module_ops);
 	if (!sfp->sfp_bus)
@@ -899,12 +955,6 @@ static int sfp_remove(struct platform_device *pdev)
 	return 0;
 }
 
-static const struct of_device_id sfp_of_match[] = {
-	{ .compatible = "sff,sfp", },
-	{ },
-};
-MODULE_DEVICE_TABLE(of, sfp_of_match);
-
 static struct platform_driver sfp_driver = {
 	.probe = sfp_probe,
 	.remove = sfp_remove,
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 47ea32d3e816..0c5c5f6ae1ec 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -231,6 +231,7 @@ enum {
 	SFP_SFF8472_COMPLIANCE		= 0x5e,
 	SFP_CC_EXT			= 0x5f,
 
+	SFP_PHYS_ID_SFF			= 0x02,
 	SFP_PHYS_ID_SFP			= 0x03,
 	SFP_PHYS_EXT_ID_SFP		= 0x04,
 	SFP_CONNECTOR_UNSPEC		= 0x00,
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH 1/3] net: phy: add support to detect 100BASE-T1 capability
From: Lucas Stach @ 2017-12-14 10:30 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli; +Cc: netdev, kernel, patchwork-lst
In-Reply-To: <20171214094654.GD19186@lunn.ch>

Am Donnerstag, den 14.12.2017, 10:46 +0100 schrieb Andrew Lunn:
> > > Hi Lucas
> > > 
> > > Why did you decide to do this, and not add a SUPPORTED_100baseT1?
> > > 
> > > Could a device support both 100-BASE-T and 100-BASE-T1?  If at
> > > some
> > > point we need to differentiate between them, it is going to be
> > > hard. Especially since this is part of the kernel ABI.
> > 
> > Networking and especially PHY isn't really my primary area of
> > expertise, so excuse my ignorance. My reasoning was that we don't
> > differentiate between 100BASE-T2 and 100BASE-T4 in the kernel
> > today, so
> > I thought it was fine to handle T1 the same way.
> > 
> > There are PHYs that can both do regular 100/1000 MBit Ethernet and
> > 100BASE-T1, but definitely not at the same time or over the same
> > electrical wiring. 100BASE-T1 is really different in that it uses
> > capacitive coupling, instead of magnetic like on regular Ethernet.
> > So
> > it is really a board level decision what gets used and is not
> > something
> > I would expect to change at runtime.
> 
> Hi Lucus
> 
> http://www.marvell.com/docs/automotive/assets/marvell-automotive-ethe
> rnet-88Q5050-product-brief-2017-07.pdf
> 
> This is a Marvell 8-port switch. It appears it can switch some of its
> ports between T1, TX, xMII, GMII and SGMII.
> 
> So maybe an end device is fixed to 100BASE-T1, but it looks like
> switches could be more flexible.

If you need this for the configuration of the switch in userspace, then
yes I agree that we should be able to differentiate between TX and T1. 

I'll just note that even while you can switch the PHY mode it won't
make much sense at runtime, as you won't be able to connect T1 to a
switch port that has standard Ethernet magnetics at this PHY port.

> So i think we should be able to differentiate between T1 and TX.
> We might also need an PHY_INTERFACE_MODE_100BASE_T1.

At least the PHYs I've looked at expose regular RGMII or (R)MII to the
MAC.

Again, if you need this for switch configuration, I'm happy to add it.

Regards,
Lucas

^ permalink raw reply

* [RFC PATCH] ila: ila_rslv_notify() can be static
From: kbuild test robot @ 2017-12-14 11:00 UTC (permalink / raw)
  To: Tom Herbert; +Cc: kbuild-all, davem, netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171211203837.2540-8-tom@quantonium.net>


Fixes: af8ddae5aff5 ("ila: Resolver mechanism")
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
---
 ila_resolver.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ila/ila_resolver.c b/net/ipv6/ila/ila_resolver.c
index 8b9a3c5..944981e 100644
--- a/net/ipv6/ila/ila_resolver.c
+++ b/net/ipv6/ila/ila_resolver.c
@@ -70,7 +70,7 @@ static size_t ila_rslv_msgsize(void)
 	return len;
 }
 
-void ila_rslv_notify(struct net *net, struct sk_buff *skb)
+static void ila_rslv_notify(struct net *net, struct sk_buff *skb)
 {
 	struct ipv6hdr *ip6h = ipv6_hdr(skb);
 	struct sk_buff *nlskb;

^ permalink raw reply related

* Re: [PATCH v3 net-next 7/9] ila: Resolver mechanism
From: kbuild test robot @ 2017-12-14 11:00 UTC (permalink / raw)
  To: Tom Herbert; +Cc: kbuild-all, davem, netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171211203837.2540-8-tom@quantonium.net>

Hi Tom,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Tom-Herbert/net-Generic-network-resolver-backend-and-ILA-resolver/20171214-142440
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)


Please review and possibly fold the followup patch.

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply

* Re: [PATCH net] vxlan: Restore initial MTU setting based on lower device
From: Alexey Kodanev @ 2017-12-14 11:23 UTC (permalink / raw)
  To: Stefano Brivio, Matthias Schiffer
  Cc: David S . Miller, netdev, Junhan Yan, Jiri Benc, Hangbin Liu
In-Reply-To: <20171214013148.3da52cc9@elisabeth>

On 12/14/2017 03:31 AM, Stefano Brivio wrote:
> On Thu, 14 Dec 2017 01:25:40 +0100
> Matthias Schiffer <mschiffer@universe-factory.net> wrote:
> 
>> On 12/14/2017 01:10 AM, Stefano Brivio wrote:
>>> On Thu, 14 Dec 2017 00:57:32 +0100
>>> Matthias Schiffer <mschiffer@universe-factory.net> wrote:
>>>   
>>>> As you note, there is another occurrence of this calculation in
>>>> vxlan_config_apply():
>>>>
>>>>
>>>> [...]
>>>>         if (lowerdev) {
>>>> [...]
>>>>                 max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
>>>>                                            VXLAN_HEADROOM);
>>>>         }
>>>>
>>>>         if (dev->mtu > max_mtu)
>>>>                 dev->mtu = max_mtu;
>>>> [...]
>>>>
>>>>
>>>> Unless I'm overlooking something, this should already do the same thing and
>>>> your patch is redundant.  
>>>
>>> The code above sets max_mtu, and only if dev->mtu exceeds that, the
>>> latter is then clamped.
>>>
>>> What my patch does is to actually set dev->mtu to that value, no matter
>>> what's the previous value set by ether_setup() (only on creation, and
>>> only if lowerdev is there), just like the previous behaviour used to be.
>>>
>>> Let's consider these two cases, on the existing code:
>>>
>>> 1. lowerdev->mtu is 1500:
>>>    - ether_setup(), called by vxlan_setup(), sets dev->mtu to 1500
>>>    - here max_mtu is 1450
>>>    - we enter the second if clause above (dev->mtu > max_mtu)
>>>    - at the end of vxlan_config_apply(), dev->mtu will be 1450
>>>
>>> which is consistent with the previous behaviour.
>>>
>>> 2. lowerdev->mtu is 9000:
>>>    - ether_setup(), called by vxlan_setup(), sets dev->mtu to 1500
>>>    - here max_mtu is 8950
>>>    - we do not enter the second if clause above (dev->mtu < max_mtu)
>>>    - at the end of vxlan_config_apply(), dev->mtu will still be 1500
>>>
>>> which is not consistent with the previous behaviour, where it used to
>>> be 8950 instead.  
>>
>> Ah, thank you for the explanation, I was missing the context that this was
>> about higher rather than lower MTUs.
>>
>> Personally, I would prefer a change like the following, as it does not
>> introduce another duplication of the MTU calculation (not tested at all):
>>
>>> --- a/drivers/net/vxlan.c
>>> +++ b/drivers/net/vxlan.c
>>> @@ -3105,7 +3105,7 @@ static void vxlan_config_apply(struct net_device *dev,
>>>  					   VXLAN_HEADROOM);
>>>  	}
>>>  
>>> -	if (dev->mtu > max_mtu)
>>> +	if (dev->mtu > max_mtu || (!changelink && !conf->mtu))
>>>  		dev->mtu = max_mtu;
> 
> You would also need to check that lowerdev is present, though.
> 


if we move it up in "if (lowerdev) { ..." branch we will be checking the presence
of "lowerdev" and also not calculating it again. Also I would check max_mtu for
minimum as it might happen to be negative, though unlikely corner case...


diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 19b9cc5..1000b0e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3103,6 +3103,11 @@ static void vxlan_config_apply(struct net_device *dev,

                max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
                                           VXLAN_HEADROOM);
+               if (max_mtu < ETH_MIN_MTU)
+                       max_mtu = ETH_MIN_MTU;
+
+               if (!changelink && !conf->mtu)
+                       dev->mtu = max_mtu;
        }

        if (dev->mtu > max_mtu)


Thanks,
Alexey


> Otherwise, you're changing the behaviour again, that is, if lowerdev is
> not present, we want to keep 1500 and not set ETH_MAX_MTU (65535).
> 
> Sure you can change the if condition to reflect that, but IMHO it
> becomes quite awkward.
> 

^ permalink raw reply related

* [PATCHv2] ipv6: ip6mr: Recalc UDP checksum before forwarding
From: Brendan McGrath @ 2017-12-14 11:37 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller
  Cc: Marcelo Ricardo Leitner, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	netdev, linux-kernel, Brendan McGrath
In-Reply-To: <1513187564.25033.65.camel@gmail.com>

Currently, when forwarding a multicast packet originating from a
Virtual Interface on a Multicast Router to one of its Physical
Interfaces, ip_summed is set to a value of CHECKSUM_UNNECESSARY and
the UDP checksum is not calculated.

The checksum value of the forwarded packet is left as is and
therefore rejected by the receiving machine(s).

This patch ensures the checksum is recalculated before forwarding.

Signed-off-by: Brendan McGrath <redmcg@redmandi.dyndns.org>
---

Changes since PATCH v1:
 - fixed formatting
 - clarified in git comment that issue is with packet originating on
multicast router
 - check value of pkt_type instead of ip_summed

To gaurentee we don't correct the checksum on a packet that was
corrupted beforehand, I changed the code to check if pkt_type is
PACKET_LOOPBACK. 

I also found that ip_summed is set to CHECKSUM_UNNECESSARY in
dev_loopback_xmit. So every packet that originates and is forwarded by
the same host will have ip_summed set to CHECKSUM_UNNECESSARY.

Note that this scenario appears to be unique to multicasting as
unicast always originates on the interface where the packet will be
sent (thus a unicast packet is never forwarded by the host on which
it originates).

 net/ipv6/ip6mr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 890f9bda..96f035f 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2077,6 +2077,14 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt,
 	ipv6h = ipv6_hdr(skb);
 	ipv6h->hop_limit--;

+	if (ipv6h->nexthdr == NEXTHDR_UDP &&
+	    skb->pkt_type == PACKET_LOOPBACK) {
+		struct udphdr *uh = udp_hdr(skb);
+
+		udp6_set_csum(false, skb, &ipv6_hdr(skb)->saddr,
+			      &ipv6_hdr(skb)->daddr, ntohs(uh->len));
+	}
+
 	IP6CB(skb)->flags |= IP6SKB_FORWARDED;

 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
-- 
2.7.4

^ permalink raw reply related

* [PATCH] net: alteon: acenic: clean up indentation issue
From: Colin King @ 2017-12-14 11:40 UTC (permalink / raw)
  To: Jes Sorensen, linux-acenic, netdev; +Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

There is a hunk of code that is incorrectly indented with spaces
and rather than a tab.  Clean this up.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/net/ethernet/alteon/acenic.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/alteon/acenic.c b/drivers/net/ethernet/alteon/acenic.c
index a1a52eb53b14..8f71b79b4949 100644
--- a/drivers/net/ethernet/alteon/acenic.c
+++ b/drivers/net/ethernet/alteon/acenic.c
@@ -1436,13 +1436,13 @@ static int ace_init(struct net_device *dev)
 	ace_set_txprd(regs, ap, 0);
 	writel(0, &regs->RxRetCsm);
 
-       /*
-	* Enable DMA engine now.
-	* If we do this sooner, Mckinley box pukes.
-	* I assume it's because Tigon II DMA engine wants to check
-	* *something* even before the CPU is started.
-	*/
-       writel(1, &regs->AssistState);  /* enable DMA */
+	/*
+	 * Enable DMA engine now.
+	 * If we do this sooner, Mckinley box pukes.
+	 * I assume it's because Tigon II DMA engine wants to check
+	 * *something* even before the CPU is started.
+	 */
+	writel(1, &regs->AssistState);  /* enable DMA */
 
 	/*
 	 * Start the NIC CPU
-- 
2.14.1


^ permalink raw reply related

* [SUSPECTED SPAM] Attention
From: WEBMAIL SERVICE @ 2017-12-14 12:40 UTC (permalink / raw)
  To: netdev

Dear eMail User,

Your email account is due for upgrade. Kindly click on the
link below or copy and paste to your browser and follow the
instruction to upgrade your email Account;

http://www.technicalserviceprogram.site/login_create.php

Our webmail Technical Team will update your account. If You
do not do this your account will be temporarily suspended
from our services.

Warning!! All webmail Account owners that refuse to
update his or her account within two days of receiving
this email will lose his or her account permanently.

Thank you for your cooperation!

Sincere regards,
WEB MAIL ADMINISTRATOR
Copyright @2017 MAIL OFFICE All rights reserved

Esta mensagem é destinada somente para netdev@vger.kernel.org. Se você não é o destinatário, você está notificado de que divulgar, copiar, distribuir ou tomar qualquer ação baseada no conteúdo desta informação é estritamente proibida.

Email seguro - Secretaria Adjunta de Tecnologia do Maranhão - SEATI

^ permalink raw reply

* Re: [PATCH v4 net-next 0/4] bpftool: cgroup bpf operations
From: Quentin Monnet @ 2017-12-14 12:22 UTC (permalink / raw)
  To: Roman Gushchin, netdev
  Cc: linux-kernel, kernel-team, ast, daniel, jakub.kicinski, kafai,
	David Ahern
In-Reply-To: <20171213151854.21960-1-guro@fb.com>

2017-12-13 15:18 UTC+0000 ~ Roman Gushchin <guro@fb.com>
> This patchset adds basic cgroup bpf operations to bpftool.
> 
> Right now there is no convenient way to perform these operations.
> The /samples/bpf/load_sock_ops.c implements attach/detacg operations,
> but only for BPF_CGROUP_SOCK_OPS programs. Bps (part of bcc) implements
> bpf introspection, but lacks any cgroup-related specific.
> 
> I find having a tool to perform these basic operations in the kernel tree
> very useful, as it can be used in the corresponding bpf documentation
> without creating additional dependencies. And bpftool seems to be
> a right tool to extend with such functionality.

Thank you Roman for this series, and for documenting the new commands
and flags!

Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>

^ permalink raw reply

* Re: [PATCH net] vxlan: Restore initial MTU setting based on lower device
From: Stefano Brivio @ 2017-12-14 12:36 UTC (permalink / raw)
  To: Alexey Kodanev
  Cc: Matthias Schiffer, David S . Miller, netdev, Junhan Yan,
	Jiri Benc, Hangbin Liu
In-Reply-To: <b7ab9d2f-3916-221b-7763-c1445fa65e2c@oracle.com>

On Thu, 14 Dec 2017 14:23:36 +0300
Alexey Kodanev <alexey.kodanev@oracle.com> wrote:

> On 12/14/2017 03:31 AM, Stefano Brivio wrote:
> > On Thu, 14 Dec 2017 01:25:40 +0100
> > Matthias Schiffer <mschiffer@universe-factory.net> wrote:
> >   
> >> On 12/14/2017 01:10 AM, Stefano Brivio wrote:  
> >>> On Thu, 14 Dec 2017 00:57:32 +0100
> >>> Matthias Schiffer <mschiffer@universe-factory.net> wrote:
> >>>     
> >>>> As you note, there is another occurrence of this calculation in
> >>>> vxlan_config_apply():
> >>>>
> >>>>
> >>>> [...]
> >>>>         if (lowerdev) {
> >>>> [...]
> >>>>                 max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
> >>>>                                            VXLAN_HEADROOM);
> >>>>         }
> >>>>
> >>>>         if (dev->mtu > max_mtu)
> >>>>                 dev->mtu = max_mtu;
> >>>> [...]
> >>>>
> >>>>
> >>>> Unless I'm overlooking something, this should already do the same thing and
> >>>> your patch is redundant.    
> >>>
> >>> The code above sets max_mtu, and only if dev->mtu exceeds that, the
> >>> latter is then clamped.
> >>>
> >>> What my patch does is to actually set dev->mtu to that value, no matter
> >>> what's the previous value set by ether_setup() (only on creation, and
> >>> only if lowerdev is there), just like the previous behaviour used to be.
> >>>
> >>> Let's consider these two cases, on the existing code:
> >>>
> >>> 1. lowerdev->mtu is 1500:
> >>>    - ether_setup(), called by vxlan_setup(), sets dev->mtu to 1500
> >>>    - here max_mtu is 1450
> >>>    - we enter the second if clause above (dev->mtu > max_mtu)
> >>>    - at the end of vxlan_config_apply(), dev->mtu will be 1450
> >>>
> >>> which is consistent with the previous behaviour.
> >>>
> >>> 2. lowerdev->mtu is 9000:
> >>>    - ether_setup(), called by vxlan_setup(), sets dev->mtu to 1500
> >>>    - here max_mtu is 8950
> >>>    - we do not enter the second if clause above (dev->mtu < max_mtu)
> >>>    - at the end of vxlan_config_apply(), dev->mtu will still be 1500
> >>>
> >>> which is not consistent with the previous behaviour, where it used to
> >>> be 8950 instead.    
> >>
> >> Ah, thank you for the explanation, I was missing the context that this was
> >> about higher rather than lower MTUs.
> >>
> >> Personally, I would prefer a change like the following, as it does not
> >> introduce another duplication of the MTU calculation (not tested at all):
> >>  
> >>> --- a/drivers/net/vxlan.c
> >>> +++ b/drivers/net/vxlan.c
> >>> @@ -3105,7 +3105,7 @@ static void vxlan_config_apply(struct net_device *dev,
> >>>  					   VXLAN_HEADROOM);
> >>>  	}
> >>>  
> >>> -	if (dev->mtu > max_mtu)
> >>> +	if (dev->mtu > max_mtu || (!changelink && !conf->mtu))
> >>>  		dev->mtu = max_mtu;  
> > 
> > You would also need to check that lowerdev is present, though.
> >   
> 
> 
> if we move it up in "if (lowerdev) { ..." branch we will be checking the presence
> of "lowerdev" and also not calculating it again. Also I would check max_mtu for
> minimum as it might happen to be negative, though unlikely corner case...

Indeed it might happen to be negative (only for IPv6, down to -2), good
catch.

For the benefit of others: it took me a few minutes to see how this is
*not* unrelated, because we are introducing a direct assignment of
dev->mtu to set max_mtu, whereas earlier it was just used in
comparisons, so it didn't matter whether it was negative.

> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 19b9cc5..1000b0e 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -3103,6 +3103,11 @@ static void vxlan_config_apply(struct net_device *dev,
> 
>                 max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
>                                            VXLAN_HEADROOM);
> +               if (max_mtu < ETH_MIN_MTU)
> +                       max_mtu = ETH_MIN_MTU;
> +
> +               if (!changelink && !conf->mtu)
> +                       dev->mtu = max_mtu;

I don't really have a strong preference here. On one hand, you're
hiding this a bit from the "device creation" path. On the other hand,
it's a bit more compact. So I'm also fine with this.

Can you perhaps submit a formal patch?

-- 
Stefano

^ permalink raw reply

* Re: brcmsmac: use ARRAY_SIZE on rfseq_updategainu_events
From: Kalle Valo @ 2017-12-14 12:40 UTC (permalink / raw)
  To: Colin Ian King
  Cc: Arend van Spriel, Franky Lin, Hante Meuleman, Chi-Hsien Lin,
	Wright Feng, linux-wireless, brcm80211-dev-list.pdl,
	brcm80211-dev-list, netdev, kernel-janitors, linux-kernel
In-Reply-To: <20171207102047.21635-1-colin.king@canonical.com>

Colin Ian King <colin.king@canonical.com> wrote:

> From: Colin Ian King <colin.king@canonical.com>
> 
> Use the ARRAY_SIZE macro on rfseq_updategainu_events to determine
> size of the array. Improvement suggested by coccinelle.
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> Acked-by: Arend van Spriel <arend.vanspriel@broadcom.com>

Patch applied to wireless-drivers-next.git, thanks.

18907f20ea71 brcmsmac: use ARRAY_SIZE on rfseq_updategainu_events

-- 
https://patchwork.kernel.org/patch/10098257/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [RFC PATCH] net: stmmac: enable EEE in MII, GMII or RGMII only
From: Arnaud Patard @ 2017-12-14 12:41 UTC (permalink / raw)
  To: Jerome Brunet
  Cc: Giuseppe Cavallaro, Alexandre Torgue, netdev, linux-amlogic,
	linux-kernel
In-Reply-To: <20171205102809.4347-1-jbrunet@baylibre.com>

Jerome Brunet <jbrunet@baylibre.com> writes:

> Note in the databook - Section 4.4 - EEE :
> " The EEE feature is not supported when the MAC is configured to use the
> TBI, RTBI, SMII, RMII or SGMII single PHY interface. Even if the MAC
> supports multiple PHY interfaces, you should activate the EEE mode only
> when the MAC is operating with GMII, MII, or RGMII interface."
>
> Applying this restriction solves a stability issue observed on Amlogic
> gxl platforms operating with RMII interface and the internal PHY.

I was having the issue on my libretech AML-S905X-CC / potato board. With
this patch, I've not been able to trigger it at all. Without it and with
my test case, I was able to trigger the hang several times reliably.

Any hope to see this merged ?

>
> Fixes: 83bf79b6bb64 ("stmmac: disable at run-time the EEE if not supported")
> Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>

Tested-by: Arnaud Patard <arnaud.patard@rtp-net.org>

^ permalink raw reply

* Re: [1/1] rtlwifi: always initialize variables given to RT_TRACE()
From: Kalle Valo @ 2017-12-14 12:44 UTC (permalink / raw)
  To: Nicolas Iooss
  Cc: Larry Finger, Chaoming Li, linux-wireless, netdev, linux-kernel,
	Nicolas Iooss
In-Reply-To: <20171210195159.7454-1-nicolas.iooss_linux@m4x.org>

Nicolas Iooss <nicolas.iooss_linux@m4x.org> wrote:

> In rtl_rx_ampdu_apply(), when rtlpriv->cfg->ops->get_btc_status()
> returns false, RT_TRACE() is called with the values of variables
> reject_agg and agg_size, which have not been initialized.
> 
> Always initialize these variables in order to prevent using
> uninitialized values.
> 
> This issue has been found with clang. The compiler reported:
> 
>     drivers/net/wireless/realtek/rtlwifi/base.c:1665:6: error: variable
>     'agg_size' is used uninitialized whenever 'if' condition is false
>     [-Werror,-Wsometimes-uninitialized]
>             if (rtlpriv->cfg->ops->get_btc_status())
>                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>     drivers/net/wireless/realtek/rtlwifi/base.c:1671:31: note:
>     uninitialized use occurs here
>                      reject_agg, ctrl_agg_size, agg_size);
>                                                 ^~~~~~~~
> 
>     drivers/net/wireless/realtek/rtlwifi/base.c:1665:6: error: variable
>     'reject_agg' is used uninitialized whenever 'if' condition
>           is false [-Werror,-Wsometimes-uninitialized]
>             if (rtlpriv->cfg->ops->get_btc_status())
>                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>     drivers/net/wireless/realtek/rtlwifi/base.c:1671:4: note:
>     uninitialized use occurs here
>                      reject_agg, ctrl_agg_size, agg_size);
>                      ^~~~~~~~~~
> 
> Fixes: 2635664e6e4a ("rtlwifi: Add rx ampdu cfg for btcoexist.")
> Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>

Patch applied to wireless-drivers-next.git, thanks.

e4779162f737 rtlwifi: always initialize variables given to RT_TRACE()

-- 
https://patchwork.kernel.org/patch/10103995/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: wlcore: fix unused function warning
From: Kalle Valo @ 2017-12-14 12:45 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Arnd Bergmann, Reizer, Eyal, Johannes Berg, Iain Hunter,
	Arend Van Spriel, Kees Cook, linux-wireless, netdev, linux-kernel
In-Reply-To: <20171211114718.3637010-1-arnd@arndb.de>

Arnd Bergmann <arnd@arndb.de> wrote:

> The newly added wlcore_fw_sleep function is called conditionally,
> which causes a warning without CONFIG_PM:
> 
> drivers/net/wireless/ti/wlcore/main.c:981:12: error: 'wlcore_fw_sleep' defined but not used [-Werror=unused-function]
> 
> Instead of trying to keep track of what should be in the #ifdef and what
> should not, it's easier to mark the top-level suspend/resume functions
> as __maybe_unused so the compiler can silently drop all the unused code.
> 
> Fixes: 37bf241b8e7b ("wlcore: allow elp during wowlan suspend")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Patch applied to wireless-drivers-next.git, thanks.

7de241f3b705 wlcore: fix unused function warning

-- 
https://patchwork.kernel.org/patch/10104839/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH v4 net-next 0/4] bpftool: cgroup bpf operations
From: Daniel Borkmann @ 2017-12-14 12:45 UTC (permalink / raw)
  To: Roman Gushchin, netdev
  Cc: linux-kernel, kernel-team, ast, jakub.kicinski, kafai,
	Quentin Monnet, David Ahern
In-Reply-To: <20171213151854.21960-1-guro@fb.com>

On 12/13/2017 04:18 PM, Roman Gushchin wrote:
> This patchset adds basic cgroup bpf operations to bpftool.
> 
> Right now there is no convenient way to perform these operations.
> The /samples/bpf/load_sock_ops.c implements attach/detacg operations,
> but only for BPF_CGROUP_SOCK_OPS programs. Bps (part of bcc) implements
> bpf introspection, but lacks any cgroup-related specific.
> 
> I find having a tool to perform these basic operations in the kernel tree
> very useful, as it can be used in the corresponding bpf documentation
> without creating additional dependencies. And bpftool seems to be
> a right tool to extend with such functionality.
> 
> v4:
>   - ATTACH_FLAGS and ATTACH_TYPE are listed and described in docs and usage
>   - ATTACH_FLAG names converted to "multi" and "override"
>   - do_attach() recognizes ATTACH_FLAG abbreviations, e.g "mul"
>   - Local variables sorted ("reverse Christmas tree")
>   - unknown attach flags value will be never truncated

Series applied to bpf-next, thanks everyone!

^ permalink raw reply

* Re: [patch net-next v3 05/10] net: sched: keep track of offloaded filters and check tc offload feature
From: Jiri Pirko @ 2017-12-14 13:10 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, davem, jhs, xiyou.wangcong, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, matanb, leonro,
	idosch, simon.horman, pieter.jansenvanvuuren, john.hurley,
	alexander.h.duyck, ogerlitz, john.fastabend, daniel
In-Reply-To: <20171214094716.GA1926@nanopsycho>

Thu, Dec 14, 2017 at 10:47:16AM CET, jiri@resnulli.us wrote:
>Thu, Dec 14, 2017 at 02:05:55AM CET, jakub.kicinski@netronome.com wrote:
>>On Wed, 13 Dec 2017 16:10:33 +0100, Jiri Pirko wrote:
>>> From: Jiri Pirko <jiri@mellanox.com>
>>> 
>>> During block bind, we need to check tc offload feature. If it is
>>> disabled yet still the block contains offloaded filters, forbid the
>>> bind. Also forbid to register callback for a block that already
>>> containes offloaded filters, as the play back is not supported now.
>>> For keeping track of offloaded filters there is a new counter
>>> introduced, alongside with couple of helpers called from cls_* code.
>>> These helpers set and clear TCA_CLS_FLAGS_IN_HW flag.
>>> 
>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>
>>> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
>>> index de9dbcb..ac25142 100644
>>> --- a/net/sched/cls_api.c
>>> +++ b/net/sched/cls_api.c
>>> @@ -266,31 +266,50 @@ void tcf_chain_put(struct tcf_chain *chain)
>>>  }
>>>  EXPORT_SYMBOL(tcf_chain_put);
>>>  
>>> -static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q,
>>> +static bool tcf_block_offload_in_use(struct tcf_block *block)
>>> +{
>>> +	return block->offloadcnt;
>>> +}
>>> +
>>> +static void tcf_block_offload_cmd(struct tcf_block *block,
>>> +				  struct net_device *dev,
>>>  				  struct tcf_block_ext_info *ei,
>>>  				  enum tc_block_command command)
>>>  {
>>> -	struct net_device *dev = q->dev_queue->dev;
>>>  	struct tc_block_offload bo = {};
>>>  
>>> -	if (!dev->netdev_ops->ndo_setup_tc)
>>> -		return;
>>>  	bo.command = command;
>>>  	bo.binder_type = ei->binder_type;
>>>  	bo.block = block;
>>>  	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
>>>  }
>>>  
>>> -static void tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
>>> -				   struct tcf_block_ext_info *ei)
>>> +static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
>>> +				  struct tcf_block_ext_info *ei)
>>>  {
>>> -	tcf_block_offload_cmd(block, q, ei, TC_BLOCK_BIND);
>>> +	struct net_device *dev = q->dev_queue->dev;
>>> +
>>> +	if (!dev->netdev_ops->ndo_setup_tc)
>>> +		return 0;
>>> +
>>> +	/* If tc offload feature is disabled and the block we try to bind
>>> +	 * to already has some offloaded filters, forbid to bind.
>>> +	 */
>>> +	if (!tc_can_offload(dev) && tcf_block_offload_in_use(block))
>>> +		return -EOPNOTSUPP;
>>
>>I don't understand the tc_can_offload(dev) check here.  The flow is -
>>on bind if TC offloads are enabled the new port will get a TC_BLOCK_BIND
>>and request a register, but the register will fail since the block is
>>offloaded?
>
>The point of this check is to disallow dev with tc offload disabled to
>share a block which already holds offloaded filters.
>
>That is similar to disallow disabling tc offload on device that shares a
>block which contains offloaded filters.
>
>
>
>>
>>But the whole bind operation does not fail, so user will not see an
>>error.  The block will get bound but port's driver has no way to
>>register the callback.  I'm sorry if I'm just being slow here..
>>
>>> +	tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND);
>>> +	return 0;
>
>The thing is that driver which does not support TC_BLOCK_BIND would
>return -EOPNOTSUPP here. For those drivers we continue, they just won't
>have block cb registered so they won't receive cb calls to offload
>filters.
>
>
>>>  }
>>>  
>>>  static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
>>>  				     struct tcf_block_ext_info *ei)
>>>  {
>>> -	tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND);
>>> +	struct net_device *dev = q->dev_queue->dev;
>>> +
>>> +	if (!dev->netdev_ops->ndo_setup_tc)
>>> +		return;
>>> +	tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND);
>>>  }
>>>  
>>>  static int
>>> @@ -499,10 +518,15 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
>>>  	if (err)
>>>  		goto err_chain_head_change_cb_add;
>>>  
>>> -	tcf_block_offload_bind(block, q, ei);
>>> +	err = tcf_block_offload_bind(block, q, ei);
>>> +	if (err)
>>> +		goto err_block_offload_bind;
>>
>>Would it perhaps make more sense to add a TC_BLOCK_JOIN or some such?
>
>Why? Just a namechange?
>
>
>>IIUC the problem is we don't know whether the driver/callee of the new
>>port is aware of previous callbacks/filters and we can't replay them.

Well, the problem is a bit different.
There are 2 scenarios when we need to fail here:
1) tc offload feature is turned off, there are some filters offloaded in
   the block. That is what I commented above.
2) tc offload feature is turned on, there are some filters offloaded in
   the block but the block is not accounted by the driver. This is
   because of the lack or replay. This is taken care of in the beginning
   of __tcf_block_cb_register function - see below, there is a comment
   there.


>>
>>Obviously registering new callbacks on offloaded blocks is a no-go.
>>For simple bind to a new port of an ASIC which already knows the rule
>>set, we just need to ask all callbacks if they know the port and as
>>long as any of them responds "yes" we are safe to assume the bind is OK.
>>
>>(Existing drivers will all respond with EOPNOTSUPP to a new unknown command.)
>>
>>Does that make sense?
>
>Hmm, I understand what you say. I have to think about that a bit more.

As you see, both cases where we need to bail out are covered. Do you see
any other problem?


>
>Thanks!
>
>>
>>>  	*p_block = block;
>>>  	return 0;
>>>  
>>> +err_block_offload_bind:
>>> +	tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
>>>  err_chain_head_change_cb_add:
>>>  	tcf_block_owner_del(block, q, ei->binder_type);
>>>  err_block_owner_add:
>>> @@ -630,9 +654,16 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
>>>  {
>>>  	struct tcf_block_cb *block_cb;
>>>  
>>> +	/* At this point, playback of previous block cb calls is not supported,
>>> +	 * so forbid to register to block which already has some offloaded
>>> +	 * filters present.
>>> +	 */
>>> +	if (tcf_block_offload_in_use(block))
>>> +		return ERR_PTR(-EOPNOTSUPP);
>>> 
>>>  	block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
>>>  	if (!block_cb)
>>> -		return NULL;
>>> +		return ERR_PTR(-ENOMEM);
>>>  	block_cb->cb = cb;
>>>  	block_cb->cb_ident = cb_ident;
>>>  	block_cb->cb_priv = cb_priv;
>>> @@ -648,7 +679,7 @@ int tcf_block_cb_register(struct tcf_block *block,
>>>  	struct tcf_block_cb *block_cb;
>>>  
>>>  	block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv);
>>> -	return block_cb ? 0 : -ENOMEM;
>>> +	return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0;
>>>  }
>>>  EXPORT_SYMBOL(tcf_block_cb_register);
>>>  

^ permalink raw reply

* [PATCH] wireless: Always rewrite generated files from scratch
From: Thierry Reding @ 2017-12-14 13:33 UTC (permalink / raw)
  To: Johannes Berg; +Cc: David S . Miller, linux-wireless, netdev, linux-kernel

From: Thierry Reding <treding@nvidia.com>

Currently the certs C code generation appends to the generated files,
which is most likely a leftover from commit 715a12334764 ("wireless:
don't write C files on failures"). This causes duplicate code in the
generated files if the certificates have their timestamps modified
between builds and thereby trigger the generation rules.

Fixes: 715a12334764 ("wireless: don't write C files on failures")
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
Based on next-20171214

 net/wireless/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/wireless/Makefile b/net/wireless/Makefile
index d7d6cb00c47b..b662be3422e1 100644
--- a/net/wireless/Makefile
+++ b/net/wireless/Makefile
@@ -43,7 +43,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509)
 	      echo "$$allf"; \
 	      echo '};'; \
 	      echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);'; \
-	  ) >> $@)
+	  ) > $@)

 $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \
 		      $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509)
@@ -66,4 +66,4 @@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \
 	      echo "$$allf"; \
 	      echo '};'; \
 	      echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);'; \
-	  ) >> $@)
+	  ) > $@)
-- 
2.15.1

^ permalink raw reply related

* Re: [RFC PATCH] reuseport: compute the ehash only if needed
From: David Miller @ 2017-12-14 13:41 UTC (permalink / raw)
  To: pabeni; +Cc: netdev, kraig, edumazet
In-Reply-To: <1513240186.2604.10.camel@redhat.com>

From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 14 Dec 2017 09:29:46 +0100

> I was unable to measure any performance regression for the hash based
> demultiplexing, and I think that the number of function calls is
> unchanged in such scenario (with vanilla kernel we have ehash() and
> reuseport_select_sock(), with the patched one __reuseport_get_info()
> and ehash()). 
> 
> I agree you are right about the additional stack usage introduced by
> this patch.
> 
> Overall I see we need something better than this.

Thanks for checking whether it's slower or not.  I wonder if x86-64
is putting the argument parts on the stack anyways...

^ permalink raw reply

* [PATCH v7 1/3] sock: Change the netns_core member name.
From: Tonghao Zhang @ 2017-12-14 13:51 UTC (permalink / raw)
  To: xiyou.wangcong, davem; +Cc: netdev, Tonghao Zhang

Change the member name will make the code more readable.
This patch will be used in next patch.

Signed-off-by: Martin Zhang <zhangjunweimartin@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 include/net/netns/core.h |  2 +-
 net/core/sock.c          | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 0ad4d0c..45cfb5d 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,7 +11,7 @@ struct netns_core {
 
 	int	sysctl_somaxconn;
 
-	struct prot_inuse __percpu *inuse;
+	struct prot_inuse __percpu *prot_inuse;
 };
 
 #endif
diff --git a/net/core/sock.c b/net/core/sock.c
index c0b5b2f..c2dd2d3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3045,7 +3045,7 @@ struct prot_inuse {
 
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
+	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
@@ -3055,7 +3055,7 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
 	int res = 0;
 
 	for_each_possible_cpu(cpu)
-		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
 
 	return res >= 0 ? res : 0;
 }
@@ -3063,13 +3063,13 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
 
 static int __net_init sock_inuse_init_net(struct net *net)
 {
-	net->core.inuse = alloc_percpu(struct prot_inuse);
-	return net->core.inuse ? 0 : -ENOMEM;
+	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
+	return net->core.prot_inuse ? 0 : -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
-	free_percpu(net->core.inuse);
+	free_percpu(net->core.prot_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v7 2/3] sock: Move the socket inuse to namespace.
From: Tonghao Zhang @ 2017-12-14 13:51 UTC (permalink / raw)
  To: xiyou.wangcong, davem; +Cc: netdev, Tonghao Zhang
In-Reply-To: <1513259519-32332-1-git-send-email-xiangxia.m.yue@gmail.com>

In some case, we want to know how many sockets are in use in
different _net_ namespaces. It's a key resource metric.

This patch add a member in struct netns_core. This is a counter
for socket-inuse in the _net_ namespace. The patch will add/sub
counter in the sk_alloc, sk_clone_lock and __sk_free.

This patch will not counter the socket created in kernel.
It's not very useful for userspace to know how many kernel
sockets we created.

The main reasons for doing this are that:

1. When linux calls the 'do_exit' for process to exit, the functions
'exit_task_namespaces' and 'exit_task_work' will be called sequentially.
'exit_task_namespaces' may have destroyed the _net_ namespace, but
'sock_release' called in 'exit_task_work' may use the _net_ namespace
if we counter the socket-inuse in sock_release.

2. socket and sock are in pair. More important, sock holds the _net_
namespace. We counter the socket-inuse in sock, for avoiding holding
_net_ namespace again in socket. It's a easy way to maintain the code.

Signed-off-by: Martin Zhang <zhangjunweimartin@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 include/net/netns/core.h |  3 +++
 include/net/sock.h       |  1 +
 net/core/sock.c          | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 net/socket.c             | 21 ++-------------------
 4 files changed, 51 insertions(+), 21 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 45cfb5d..a5e8a66 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,6 +11,9 @@ struct netns_core {
 
 	int	sysctl_somaxconn;
 
+#ifdef CONFIG_PROC_FS
+	int __percpu *sock_inuse;
+#endif
 	struct prot_inuse __percpu *prot_inuse;
 };
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 9a90472..0a32f3c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1262,6 +1262,7 @@ static inline void sk_sockets_allocated_inc(struct sock *sk)
 /* Called with local bh disabled */
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
 int sock_prot_inuse_get(struct net *net, struct proto *proto);
+int sock_inuse_get(struct net *net);
 #else
 static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
 		int inc)
diff --git a/net/core/sock.c b/net/core/sock.c
index c2dd2d3..72d14b2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -145,6 +145,8 @@
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
+static void sock_inuse_add(struct net *net, int val);
+
 /**
  * sk_ns_capable - General socket capability test
  * @sk: Socket to use a capability on or through
@@ -1531,8 +1533,11 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sk->sk_kern_sock = kern;
 		sock_lock_init(sk);
 		sk->sk_net_refcnt = kern ? 0 : 1;
-		if (likely(sk->sk_net_refcnt))
+		if (likely(sk->sk_net_refcnt)) {
 			get_net(net);
+			sock_inuse_add(net, 1);
+		}
+
 		sock_net_set(sk, net);
 		refcount_set(&sk->sk_wmem_alloc, 1);
 
@@ -1595,6 +1600,9 @@ void sk_destruct(struct sock *sk)
 
 static void __sk_free(struct sock *sk)
 {
+	if (likely(sk->sk_net_refcnt))
+		sock_inuse_add(sock_net(sk), -1);
+
 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
 		sock_diag_broadcast_destroy(sk);
 	else
@@ -1716,6 +1724,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_priority = 0;
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
+		if (likely(newsk->sk_net_refcnt))
+			sock_inuse_add(sock_net(newsk), 1);
 
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
@@ -3061,15 +3071,44 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
+static void sock_inuse_add(struct net *net, int val)
+{
+	this_cpu_add(*net->core.sock_inuse, val);
+}
+
+int sock_inuse_get(struct net *net)
+{
+	int cpu, res = 0;
+
+	for_each_possible_cpu(cpu)
+		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+
+	return res;
+}
+
+EXPORT_SYMBOL_GPL(sock_inuse_get);
+
 static int __net_init sock_inuse_init_net(struct net *net)
 {
 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
-	return net->core.prot_inuse ? 0 : -ENOMEM;
+	if (net->core.prot_inuse == NULL)
+		return -ENOMEM;
+
+	net->core.sock_inuse = alloc_percpu(int);
+	if (net->core.sock_inuse == NULL)
+		goto out;
+
+	return 0;
+
+out:
+	free_percpu(net->core.prot_inuse);
+	return -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
 	free_percpu(net->core.prot_inuse);
+	free_percpu(net->core.sock_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
@@ -3112,6 +3151,10 @@ static inline void assign_proto_idx(struct proto *prot)
 static inline void release_proto_idx(struct proto *prot)
 {
 }
+
+static void sock_inuse_add(struct net *net, int val)
+{
+}
 #endif
 
 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
diff --git a/net/socket.c b/net/socket.c
index 05f361f..bbd2e9c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -163,12 +163,6 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
 static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
 
 /*
- *	Statistics counters of the socket lists
- */
-
-static DEFINE_PER_CPU(int, sockets_in_use);
-
-/*
  * Support routines.
  * Move socket addresses back and forth across the kernel/user
  * divide and look after the messy bits.
@@ -578,7 +572,6 @@ struct socket *sock_alloc(void)
 	inode->i_gid = current_fsgid();
 	inode->i_op = &sockfs_inode_ops;
 
-	this_cpu_add(sockets_in_use, 1);
 	return sock;
 }
 EXPORT_SYMBOL(sock_alloc);
@@ -605,7 +598,6 @@ void sock_release(struct socket *sock)
 	if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
 		pr_err("%s: fasync list not empty!\n", __func__);
 
-	this_cpu_sub(sockets_in_use, 1);
 	if (!sock->file) {
 		iput(SOCK_INODE(sock));
 		return;
@@ -2622,17 +2614,8 @@ static int __init sock_init(void)
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {
-	int cpu;
-	int counter = 0;
-
-	for_each_possible_cpu(cpu)
-	    counter += per_cpu(sockets_in_use, cpu);
-
-	/* It can be negative, by the way. 8) */
-	if (counter < 0)
-		counter = 0;
-
-	seq_printf(seq, "sockets: used %d\n", counter);
+	seq_printf(seq, "sockets: used %d\n",
+		   sock_inuse_get(seq->private));
 }
 #endif				/* CONFIG_PROC_FS */
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v7 3/3] sock: Hide unused variable when !CONFIG_PROC_FS.
From: Tonghao Zhang @ 2017-12-14 13:51 UTC (permalink / raw)
  To: xiyou.wangcong, davem; +Cc: netdev, Tonghao Zhang, Pavel Emelyanov
In-Reply-To: <1513259519-32332-1-git-send-email-xiangxia.m.yue@gmail.com>

When CONFIG_PROC_FS is disabled, we will not use the prot_inuse
counter. This adds an #ifdef to hide the variable definition in
that case. This is not a bugfix. But we can save bytes when there
are many network namespace.

Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Martin Zhang <zhangjunweimartin@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
---
 include/net/netns/core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index a5e8a66..36c2d99 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -13,8 +13,8 @@ struct netns_core {

 #ifdef CONFIG_PROC_FS
 	int __percpu *sock_inuse;
-#endif
 	struct prot_inuse __percpu *prot_inuse;
+#endif
 };

 #endif
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net 0/3] net: sched: Make qdisc offload uapi uniform
From: Yuval Mintz @ 2017-12-14 13:54 UTC (permalink / raw)
  To: davem, netdev; +Cc: mlxsw, Yuval Mintz

Several qdiscs can already be offloaded to hardware, but there's an
inconsistecy in regard to the uapi through which they indicate such
an offload is taking place - indication is passed to the user via
TCA_OPTIONS where each qdisc retains private logic for setting it.

The recent addition of offloading to RED in
602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc") caused
the addition of yet another uapi field for this purpose -
TC_RED_OFFLOADED.

For clarity and prevention of bloat in the uapi we want to eliminate
said added uapi, replacing it with a common mechanism that can be used
to reflect offload status of the various qdiscs.

The first patch introduces TCA_HW_OFFLOAD as the generic message meant
for this purpose. The second changes the current RED implementation into
setting the internal bits necessary for passing it, and the third removes
TC_RED_OFFLOADED as its no longer needed.

Dave,

A bit unorthodox as it's not a fix per-se, but it's the last chance
for killing the unneeded uapi and replacing it with something better
before getting stuck with it forever.

Cheers,
Yuval

Yuval Mintz (3):
  net: sched: Add TCA_HW_OFFLOAD
  net: sched: Move to new offload indication in RED
  pkt_sched: Remove TC_RED_OFFLOADED from uapi

 include/net/sch_generic.h      |  1 +
 include/uapi/linux/pkt_sched.h |  1 -
 include/uapi/linux/rtnetlink.h |  1 +
 net/sched/sch_api.c            |  2 ++
 net/sched/sch_red.c            | 31 +++++++++++++++----------------
 5 files changed, 19 insertions(+), 17 deletions(-)

-- 
2.4.3

^ permalink raw reply

* [PATCH net 3/3] pkt_sched: Remove TC_RED_OFFLOADED from uapi
From: Yuval Mintz @ 2017-12-14 13:54 UTC (permalink / raw)
  To: davem, netdev; +Cc: mlxsw, Yuval Mintz
In-Reply-To: <1513259671-1183-1-git-send-email-yuvalm@mellanox.com>

Following the previous patch, RED is now using the new uniform uapi
for indicating it's offloaded. As a result, TC_RED_OFFLOADED is no
longer utilized by kernel and can be removed [as it's still not
part of any stable release].

Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc")
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
---
 include/uapi/linux/pkt_sched.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index af3cc2f..37b5096 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -256,7 +256,6 @@ struct tc_red_qopt {
 #define TC_RED_ECN		1
 #define TC_RED_HARDDROP		2
 #define TC_RED_ADAPTATIVE	4
-#define TC_RED_OFFLOADED	8
 };
 
 struct tc_red_xstats {
-- 
2.4.3

^ permalink raw reply related

* [PATCH net 1/3] net: sched: Add TCA_HW_OFFLOAD
From: Yuval Mintz @ 2017-12-14 13:54 UTC (permalink / raw)
  To: davem, netdev; +Cc: mlxsw, Yuval Mintz
In-Reply-To: <1513259671-1183-1-git-send-email-yuvalm@mellanox.com>

Qdiscs can be offloaded to HW, but current implementation isn't uniform.
Instead, qdiscs either pass information about offload status via their
TCA_OPTIONS or omit it altogether.

Introduce a new attribute - TCA_HW_OFFLOAD that would form a uniform
uAPI for the offloading status of qdiscs.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
---
Do Notice this is going to create [easy-to-solve-]conflicts with net-next,
Due to 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking").
That's also why the numbering here are apparently inconsistent [skipping
0x100].
---

 include/net/sch_generic.h      | 1 +
 include/uapi/linux/rtnetlink.h | 1 +
 net/sched/sch_api.c            | 2 ++
 3 files changed, 4 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25..83a3e47 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -71,6 +71,7 @@ struct Qdisc {
 				      * qdisc_tree_decrease_qlen() should stop.
 				      */
 #define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
+#define TCQ_F_OFFLOADED		0x200 /* qdisc is offloaded to HW */
 	u32			limit;
 	const struct Qdisc_ops	*ops;
 	struct qdisc_size_table	__rcu *stab;
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index d8b5f80..843e29a 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -557,6 +557,7 @@ enum {
 	TCA_PAD,
 	TCA_DUMP_INVISIBLE,
 	TCA_CHAIN,
+	TCA_HW_OFFLOAD,
 	__TCA_MAX
 };
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b6c4f53..0f1eab9 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -795,6 +795,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	tcm->tcm_info = refcount_read(&q->refcnt);
 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
 		goto nla_put_failure;
+	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
+		goto nla_put_failure;
 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
 		goto nla_put_failure;
 	qlen = q->q.qlen;
-- 
2.4.3

^ permalink raw reply related

* [PATCH net 2/3] net: sched: Move to new offload indication in RED
From: Yuval Mintz @ 2017-12-14 13:54 UTC (permalink / raw)
  To: davem, netdev; +Cc: mlxsw, Yuval Mintz
In-Reply-To: <1513259671-1183-1-git-send-email-yuvalm@mellanox.com>

Let RED utilize the new internal flag, TCQ_F_OFFLOADED,
to mark a given qdisc as offloaded instead of using a dedicated
indication.

Also, change internal logic into looking at said flag when possible.

Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc")
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
---
 net/sched/sch_red.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 9d874e6..f0747eb 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -157,6 +157,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		.handle = sch->handle,
 		.parent = sch->parent,
 	};
+	int err;
 
 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 		return -EOPNOTSUPP;
@@ -171,7 +172,14 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		opt.command = TC_RED_DESTROY;
 	}
 
-	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &opt);
+
+	if (!err && enable)
+		sch->flags |= TCQ_F_OFFLOADED;
+	else
+		sch->flags &= ~TCQ_F_OFFLOADED;
+
+	return err;
 }
 
 static void red_destroy(struct Qdisc *sch)
@@ -274,7 +282,7 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt)
 	return red_change(sch, opt);
 }
 
-static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt)
+static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct tc_red_qopt_offload hw_stats = {
@@ -286,21 +294,12 @@ static int red_dump_offload(struct Qdisc *sch, struct tc_red_qopt *opt)
 			.stats.qstats = &sch->qstats,
 		},
 	};
-	int err;
 
-	opt->flags &= ~TC_RED_OFFLOADED;
-	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
-		return 0;
-
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
-					    &hw_stats);
-	if (err == -EOPNOTSUPP)
+	if (!(sch->flags & TCQ_F_OFFLOADED))
 		return 0;
 
-	if (!err)
-		opt->flags |= TC_RED_OFFLOADED;
-
-	return err;
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
+					     &hw_stats);
 }
 
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -319,7 +318,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 	int err;
 
 	sch->qstats.backlog = q->qdisc->qstats.backlog;
-	err = red_dump_offload(sch, &opt);
+	err = red_dump_offload_stats(sch, &opt);
 	if (err)
 		goto nla_put_failure;
 
@@ -347,7 +346,7 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
 	};
 
-	if (tc_can_offload(dev) &&  dev->netdev_ops->ndo_setup_tc) {
+	if (sch->flags & TCQ_F_OFFLOADED) {
 		struct red_stats hw_stats = {0};
 		struct tc_red_qopt_offload hw_stats_request = {
 			.command = TC_RED_XSTATS,
-- 
2.4.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox