Netdev List
 help / color / mirror / Atom feed
* Fwd: Problem with the kernel 4.15 - cutting the band (tc)
From: Linus Torvalds @ 2018-04-06 21:56 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <42a29f8a-beb3-84da-9129-fbda7ef81be4@hostcenter.eu>

Forwarding a report about what looks like a regression between 4.14 and 4.15.

New ENOSPC issue? I don't even knew where to start guessing where to look.

Help me, Davem-Wan Kenobi, you are my only hope.

(But adding netdev just in case somebody else goes "That's obviously Xyz")

              Linus

---------- Forwarded message ----------
From: Marcin Kabiesz <admin@hostcenter.eu>
Date: Thu, Apr 5, 2018 at 10:38 AM
Subject: Problem with the kernel 4.15 - cutting the band (tc)


Hello,
I have a problem with bandwidth cutting on kernel 4.15. On the version
up to 4.15, i.e. 4.14, this problem does not occur.

uname -a: Linux router 4.14.15 #1 SMP x86_64 Intel Xeon E3-1230 v6
command to reproduce:

tc qdisc add dev ifb0 root handle 1: htb r2q 2
tc class add dev ifb0 parent 1: classid 1:1 htb rate 10gbit ceil
10gbit quantum 16000
tc filter add dev ifb0 parent 1: prio 5 handle 1: protocol all u32 divisor 256
tc filter add dev ifb0 protocol all parent 1: prio 5 u32 ht 800::
match ip dst 0.0.0.0/0 hashkey mask 0x000000ff at 16 link 1:
tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
tc filter del dev ifb0 parent 1:0 handle 1:2c:1 prio 5 u32
tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
tc filter del dev ifb0 parent 1:0 handle 1:2c:1 prio 5 u32

This ok, no error/warnings and dmesg log.

uname -a: Linux router 4.15.8 #1 SMP x86_64 Intel Xeon E3-1230 v6 (or
4.15.14 this same effect)
command to reproduce:

tc qdisc add dev ifb0 root handle 1: htb r2q 2
tc class add dev ifb0 parent 1: classid 1:1 htb rate 10gbit ceil
10gbit quantum 16000
tc filter add dev ifb0 parent 1: prio 5 handle 1: protocol all u32 divisor 256
tc filter add dev ifb0 protocol all parent 1: prio 5 u32 ht 800::
match ip dst 0.0.0.0/0 hashkey mask 0x000000ff at 16 link 1:
tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
tc filter del dev ifb0 parent 1:0 handle 1:2c:1 prio 5 u32
tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
RTNETLINK answers: No space left on device
We have an error talking to the kernel

This not ok, on error/warnings and no dmesg log.

Best Regards
Please forgive my English
Marcin Kabiesz

^ permalink raw reply

* Re: [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-04-06 22:59 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh
In-Reply-To: <20180406124814.GA24525@nanopsycho>

On 4/6/2018 5:48 AM, Jiri Pirko wrote:
> Thu, Apr 05, 2018 at 11:08:22PM CEST, sridhar.samudrala@intel.com wrote:

<snip>
> 	
>> +
>> +static void virtnet_bypass_set_rx_mode(struct net_device *dev)
>> +{
>> +	struct virtnet_bypass_info *vbi = netdev_priv(dev);
>> +	struct net_device *child_netdev;
>> +
>> +	rcu_read_lock();
>> +
>> +	child_netdev = rcu_dereference(vbi->active_netdev);
>> +	if (child_netdev) {
>> +		dev_uc_sync_multiple(child_netdev, dev);
>> +		dev_mc_sync_multiple(child_netdev, dev);
>> +	}
>> +
>> +	child_netdev = rcu_dereference(vbi->backup_netdev);
>> +	if (child_netdev) {
>> +		dev_uc_sync_multiple(child_netdev, dev);
>> +		dev_mc_sync_multiple(child_netdev, dev);
>> +	}
>> +
>> +	rcu_read_unlock();
>> +}
> This should be moved to bypass module.

Sure. All these bypass ndo_ops can be moved to bypass module and any
paravirtual driver that want to go with 3 netdev model can reuse these functions.


>
>
>> +
>> +static const struct net_device_ops virtnet_bypass_netdev_ops = {
>> +	.ndo_open		= virtnet_bypass_open,
>> +	.ndo_stop		= virtnet_bypass_close,
>> +	.ndo_start_xmit		= virtnet_bypass_start_xmit,
>> +	.ndo_select_queue	= virtnet_bypass_select_queue,
>> +	.ndo_get_stats64	= virtnet_bypass_get_stats,
>> +	.ndo_change_mtu		= virtnet_bypass_change_mtu,
>> +	.ndo_set_rx_mode	= virtnet_bypass_set_rx_mode,
>> +	.ndo_validate_addr	= eth_validate_addr,
>> +	.ndo_features_check	= passthru_features_check,
>> +};
>> +
>> +static int
>> +virtnet_bypass_ethtool_get_link_ksettings(struct net_device *dev,
>> +					  struct ethtool_link_ksettings *cmd)
>> +{
>> +	struct virtnet_bypass_info *vbi = netdev_priv(dev);
>> +	struct net_device *child_netdev;
>> +
>> +	child_netdev = rtnl_dereference(vbi->active_netdev);
>> +	if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
>> +		child_netdev = rtnl_dereference(vbi->backup_netdev);
>> +		if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
>> +			cmd->base.duplex = DUPLEX_UNKNOWN;
>> +			cmd->base.port = PORT_OTHER;
>> +			cmd->base.speed = SPEED_UNKNOWN;
>> +
>> +			return 0;
>> +		}
>> +	}
>> +
>> +	return __ethtool_get_link_ksettings(child_netdev, cmd);
>> +}
>> +
>> +#define BYPASS_DRV_NAME "virtnet_bypass"
>> +#define BYPASS_DRV_VERSION "0.1"
>> +
>> +static void virtnet_bypass_ethtool_get_drvinfo(struct net_device *dev,
>> +					       struct ethtool_drvinfo *drvinfo)
>> +{
>> +	strlcpy(drvinfo->driver, BYPASS_DRV_NAME, sizeof(drvinfo->driver));
>> +	strlcpy(drvinfo->version, BYPASS_DRV_VERSION, sizeof(drvinfo->version));
>> +}
>> +
>> +static const struct ethtool_ops virtnet_bypass_ethtool_ops = {
>> +	.get_drvinfo            = virtnet_bypass_ethtool_get_drvinfo,
>> +	.get_link               = ethtool_op_get_link,
>> +	.get_link_ksettings     = virtnet_bypass_ethtool_get_link_ksettings,
>> +};
>> +
>> +static int virtnet_bypass_join_child(struct net_device *bypass_netdev,
>> +				     struct net_device *child_netdev)
>> +{
>> +	struct virtnet_bypass_info *vbi;
>> +	bool backup;
>> +
>> +	vbi = netdev_priv(bypass_netdev);
>> +	backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>> +	if (backup ? rtnl_dereference(vbi->backup_netdev) :
>> +			rtnl_dereference(vbi->active_netdev)) {
>> +		netdev_info(bypass_netdev,
>> +			    "%s attempting to join bypass dev when %s already present\n",
>> +			    child_netdev->name, backup ? "backup" : "active");
> Bypass module should check if there is already some other netdev
> enslaved and refuse right there.

This will work for virtio-net with 3 netdev model, but this check has to be done by netvsc
as its bypass_netdev is same as the backup_netdev.
Will add a flag while registering with the bypass module to indicate if the driver is doing
a 2 netdev or 3 netdev model and based on that flag this check can be done in bypass module
for 3 netdev scenario.



>
> The active/backup terminology is quite confusing. From the bonding world
> that means active is the one which is currently used for tx of the
> packets. And it depends on link and other things what netdev is declared
> active. However here, it is different. Backup is always the virtio_net
> instance even when it is active. Odd. Please change the terminology.
> For "active" I suggest to use name "stolen".

I am not too happy with 'stolen' name. Will see if i can come up with a
better name.


>
> *** Also, the 2 slave netdev pointers should be stored in the bypass
> module instance, not in the drivers.

Will move virtnet_bypass_info struct to bypass.h

>
>
>
>> +		return -EEXIST;
>> +	}
>> +
>> +	dev_hold(child_netdev);
>> +
>> +	if (backup) {
>> +		rcu_assign_pointer(vbi->backup_netdev, child_netdev);
>> +		dev_get_stats(vbi->backup_netdev, &vbi->backup_stats);
>> +	} else {
>> +		rcu_assign_pointer(vbi->active_netdev, child_netdev);
>> +		dev_get_stats(vbi->active_netdev, &vbi->active_stats);
>> +		bypass_netdev->min_mtu = child_netdev->min_mtu;
>> +		bypass_netdev->max_mtu = child_netdev->max_mtu;
>> +	}
>> +
>> +	netdev_info(bypass_netdev, "child:%s joined\n", child_netdev->name);
>> +
>> +	return 0;
>> +}
>> +
>> +static int virtnet_bypass_register_child(struct net_device *bypass_netdev,
>> +					 struct net_device *child_netdev)
>> +{
>> +	struct virtnet_bypass_info *vbi;
>> +	bool backup;
>> +
>> +	vbi = netdev_priv(bypass_netdev);
>> +	backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>> +	if (backup ? rtnl_dereference(vbi->backup_netdev) :
>> +			rtnl_dereference(vbi->active_netdev)) {
>> +		netdev_info(bypass_netdev,
>> +			    "%s attempting to register bypass dev when %s already present\n",
>> +			    child_netdev->name, backup ? "backup" : "active");
>> +		return -EEXIST;
>> +	}
>> +
>> +	/* Avoid non pci devices as active netdev */
>> +	if (!backup && (!child_netdev->dev.parent ||
>> +			!dev_is_pci(child_netdev->dev.parent)))
>> +		return -EINVAL;
>> +
>> +	netdev_info(bypass_netdev, "child:%s registered\n", child_netdev->name);
>> +
>> +	return 0;
>> +}
>> +
>> +static int virtnet_bypass_release_child(struct net_device *bypass_netdev,
>> +					struct net_device *child_netdev)
>> +{
>> +	struct net_device *backup_netdev, *active_netdev;
>> +	struct virtnet_bypass_info *vbi;
>> +
>> +	vbi = netdev_priv(bypass_netdev);
>> +	active_netdev = rtnl_dereference(vbi->active_netdev);
>> +	backup_netdev = rtnl_dereference(vbi->backup_netdev);
>> +
>> +	if (child_netdev != active_netdev && child_netdev != backup_netdev)
>> +		return -EINVAL;
>> +
>> +	netdev_info(bypass_netdev, "child:%s released\n", child_netdev->name);
>> +
>> +	return 0;
>> +}
>> +
>> +static int virtnet_bypass_unregister_child(struct net_device *bypass_netdev,
>> +					   struct net_device *child_netdev)
>> +{
>> +	struct net_device *backup_netdev, *active_netdev;
>> +	struct virtnet_bypass_info *vbi;
>> +
>> +	vbi = netdev_priv(bypass_netdev);
>> +	active_netdev = rtnl_dereference(vbi->active_netdev);
>> +	backup_netdev = rtnl_dereference(vbi->backup_netdev);
>> +
>> +	if (child_netdev != active_netdev && child_netdev != backup_netdev)
>> +		return -EINVAL;
>> +
>> +	if (child_netdev == backup_netdev) {
>> +		RCU_INIT_POINTER(vbi->backup_netdev, NULL);
>> +	} else {
>> +		RCU_INIT_POINTER(vbi->active_netdev, NULL);
>> +		if (backup_netdev) {
>> +			bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> +			bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> +		}
>> +	}
>> +
>> +	dev_put(child_netdev);
>> +
>> +	netdev_info(bypass_netdev, "child:%s unregistered\n",
>> +		    child_netdev->name);
>> +
>> +	return 0;
>> +}
>> +
>> +static int virtnet_bypass_update_link(struct net_device *bypass_netdev,
>> +				      struct net_device *child_netdev)
>> +{
>> +	struct net_device *active_netdev, *backup_netdev;
>> +	struct virtnet_bypass_info *vbi;
>> +
>> +	if (!netif_running(bypass_netdev))
>> +		return 0;
>> +
>> +	vbi = netdev_priv(bypass_netdev);
>> +
>> +	active_netdev = rtnl_dereference(vbi->active_netdev);
>> +	backup_netdev = rtnl_dereference(vbi->backup_netdev);
>> +
>> +	if (child_netdev != active_netdev && child_netdev != backup_netdev)
>> +		return -EINVAL;
>> +
>> +	if ((active_netdev && virtnet_bypass_xmit_ready(active_netdev)) ||
>> +	    (backup_netdev && virtnet_bypass_xmit_ready(backup_netdev))) {
>> +		netif_carrier_on(bypass_netdev);
>> +		netif_tx_wake_all_queues(bypass_netdev);
>> +	} else {
>> +		netif_carrier_off(bypass_netdev);
>> +		netif_tx_stop_all_queues(bypass_netdev);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/* Called when child dev is injecting data into network stack.
>> + * Change the associated network device from lower dev to virtio.
>> + * note: already called with rcu_read_lock
>> + */
>> +static rx_handler_result_t virtnet_bypass_handle_frame(struct sk_buff **pskb)
>> +{
>> +	struct sk_buff *skb = *pskb;
>> +	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
>> +
>> +	skb->dev = ndev;
>> +
>> +	return RX_HANDLER_ANOTHER;
>> +}
> Hmm, you have the rx_handler defined in drivers and you register it in
> bypass module. It is odd because here you assume that the bypass module
> passed "ndev" and rx_handler_data. Also, you don't need advanced
> features rx_handler provides.
> Instead, just register a common rx_handler defined in the bypass module
> and have simple skb_rx callback here (static void).

Will move the rx_handler also to bypass.c


>
>
>> +
>> +static const struct bypass_ops virtnet_bypass_ops = {
>> +	.register_child		= virtnet_bypass_register_child,
>> +	.join_child		= virtnet_bypass_join_child,
>> +	.unregister_child	= virtnet_bypass_unregister_child,
>> +	.release_child		= virtnet_bypass_release_child,
>> +	.update_link		= virtnet_bypass_update_link,
>> +	.handle_frame		= virtnet_bypass_handle_frame,
>> +};
>> +
>> +static struct bypass *virtnet_bypass;
>> +
>> +static int virtnet_bypass_create(struct virtnet_info *vi)
>> +{
>> +	struct net_device *backup_netdev = vi->dev;
>> +	struct device *dev = &vi->vdev->dev;
>> +	struct net_device *bypass_netdev;
>> +	int res;
>> +
>> +	/* Alloc at least 2 queues, for now we are going with 16 assuming
>> +	 * that most devices being bonded won't have too many queues.
>> +	 */
>> +	bypass_netdev = alloc_etherdev_mq(sizeof(struct virtnet_bypass_info),
>> +					  16);
>> +	if (!bypass_netdev) {
>> +		dev_err(dev, "Unable to allocate bypass_netdev!\n");
>> +		return -ENOMEM;
>> +	}
>> +
>> +	dev_net_set(bypass_netdev, dev_net(backup_netdev));
>> +	SET_NETDEV_DEV(bypass_netdev, dev);
>> +
>> +	bypass_netdev->netdev_ops = &virtnet_bypass_netdev_ops;
>> +	bypass_netdev->ethtool_ops = &virtnet_bypass_ethtool_ops;
>> +
>> +	/* Initialize the device options */
>> +	bypass_netdev->flags |= IFF_MASTER;
> I think I pointed that out already. Don't use "IFF_MASTER". That is
> specific to bonding. As I suggested in the reply to the patch #2, you
> should introduce IFF_BYPASS. Also, this flag should be set by the bypass
> module. Just create the netdev and do things specific to virtio and then
> call to bypass module, pass the netdev so it can do the rest. I think
> that the flags, features etc would be also fine to set there.

OK.


>
>
>> +	bypass_netdev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
>> +	bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
>> +				       IFF_TX_SKB_SHARING);
>> +
>> +	/* don't acquire bypass netdev's netif_tx_lock when transmitting */
>> +	bypass_netdev->features |= NETIF_F_LLTX;
>> +
>> +	/* Don't allow bypass devices to change network namespaces. */
>> +	bypass_netdev->features |= NETIF_F_NETNS_LOCAL;
>> +
>> +	bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
>> +				     NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
>> +				     NETIF_F_HIGHDMA | NETIF_F_LRO;
>> +
>> +	bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
>> +	bypass_netdev->features |= bypass_netdev->hw_features;
>> +
>> +	/* For now treat bypass netdev as VLAN challenged since we
>> +	 * cannot assume VLAN functionality with a VF
> Why? I don't see such drivers. But to be 100% correct, you should check
> the NETIF_F_VLAN_CHALLENGED feature in bypass module during VF enslave
> and forbid to do so if it is on.

Will look into it.


>
>
>> +	 */
>> +	bypass_netdev->features |= NETIF_F_VLAN_CHALLENGED;
>> +
>> +	memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr,
>> +	       bypass_netdev->addr_len);
>> +
>> +	bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> +	bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> +
>> +	res = register_netdev(bypass_netdev);
>> +	if (res < 0) {
>> +		dev_err(dev, "Unable to register bypass_netdev!\n");
>> +		goto err_register_netdev;
>> +	}
>> +
>> +	netif_carrier_off(bypass_netdev);
>> +
>> +	res = bypass_register_instance(virtnet_bypass, bypass_netdev);
>> +	if (res < 0)
>> +		goto err_bypass;
>> +
>> +	rcu_assign_pointer(vi->bypass_netdev, bypass_netdev);
>> +
>> +	return 0;
>> +
>> +err_bypass:
>> +	unregister_netdev(bypass_netdev);
>> +err_register_netdev:
>> +	free_netdev(bypass_netdev);
>> +
>> +	return res;
>> +}
>> +
>> +static void virtnet_bypass_destroy(struct virtnet_info *vi)
>> +{
>> +	struct net_device *bypass_netdev;
>> +	struct virtnet_bypass_info *vbi;
>> +	struct net_device *child_netdev;
>> +
>> +	bypass_netdev = rcu_dereference(vi->bypass_netdev);
>> +	/* no device found, nothing to free */
>> +	if (!bypass_netdev)
>> +		return;
>> +
>> +	vbi = netdev_priv(bypass_netdev);
>> +
>> +	netif_device_detach(bypass_netdev);
>> +
>> +	rtnl_lock();
>> +
>> +	child_netdev = rtnl_dereference(vbi->active_netdev);
>> +	if (child_netdev)
>> +		bypass_unregister_child(child_netdev);
>> +
>> +	child_netdev = rtnl_dereference(vbi->backup_netdev);
>> +	if (child_netdev)
>> +		bypass_unregister_child(child_netdev);
>> +
>> +	unregister_netdevice(bypass_netdev);
>> +
>> +	bypass_unregister_instance(virtnet_bypass, bypass_netdev);
>> +
>> +	rtnl_unlock();
>> +
>> +	free_netdev(bypass_netdev);
>> +}
>> +
>> +/* END of functions supporting VIRTIO_NET_F_BACKUP feature. */
>> +
>> static int virtnet_probe(struct virtio_device *vdev)
>> {
>> 	int i, err = -ENOMEM;
>> @@ -2839,10 +3432,15 @@ static int virtnet_probe(struct virtio_device *vdev)
>>
>> 	virtnet_init_settings(dev);
>>
>> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_BACKUP)) {
>> +		if (virtnet_bypass_create(vi) != 0)
> You need to do:
> 		err = virtnet_bypass_create(vi);
> 		if (err)
> otherwise you ignore err and virtnet_probe would return 0;

Sure.


>
>
>> +			goto free_vqs;
>> +	}
>> +
>> 	err = register_netdev(dev);
>> 	if (err) {
>> 		pr_debug("virtio_net: registering device failed\n");
>> -		goto free_vqs;
>> +		goto free_bypass;
>> 	}
>>
>> 	virtio_device_ready(vdev);
>> @@ -2879,6 +3477,8 @@ static int virtnet_probe(struct virtio_device *vdev)
>> 	vi->vdev->config->reset(vdev);
>>
>> 	unregister_netdev(dev);
>> +free_bypass:
>> +	virtnet_bypass_destroy(vi);
>> free_vqs:
>> 	cancel_delayed_work_sync(&vi->refill);
>> 	free_receive_page_frags(vi);
>> @@ -2913,6 +3513,8 @@ static void virtnet_remove(struct virtio_device *vdev)
>>
>> 	unregister_netdev(vi->dev);
>>
>> +	virtnet_bypass_destroy(vi);
>> +
>> 	remove_vq_common(vi);
>>
>> 	free_netdev(vi->dev);
>> @@ -2996,6 +3598,11 @@ static __init int virtio_net_driver_init(void)
>> {
>> 	int ret;
>>
>> +	virtnet_bypass = bypass_register_driver(&virtnet_bypass_ops,
>> +						&virtnet_bypass_netdev_ops);
>> +	if (!virtnet_bypass)
>> +		return -ENOMEM;
> If CONFIG_NET_BYPASS is undefined, you will always return -ENOMEM here.

Will fix.


>
>
>> +
>> 	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
>> 				      virtnet_cpu_online,
>> 				      virtnet_cpu_down_prep);
>> @@ -3010,12 +3617,14 @@ static __init int virtio_net_driver_init(void)
>>          ret = register_virtio_driver(&virtio_net_driver);
>> 	if (ret)
>> 		goto err_virtio;
>> +
>> 	return 0;
>> err_virtio:
>> 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>> err_dead:
>> 	cpuhp_remove_multi_state(virtionet_online);
>> out:
>> +	bypass_unregister_driver(virtnet_bypass);
>> 	return ret;
>> }
>> module_init(virtio_net_driver_init);
>> @@ -3025,6 +3634,7 @@ static __exit void virtio_net_driver_exit(void)
>> 	unregister_virtio_driver(&virtio_net_driver);
>> 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>> 	cpuhp_remove_multi_state(virtionet_online);
>> +	bypass_unregister_driver(virtnet_bypass);
>> }
>> module_exit(virtio_net_driver_exit);
>>
>> -- 
>> 2.14.3
>>

^ permalink raw reply

* Re: [RFC PATCH net-next v5 2/4] net: Introduce generic bypass module
From: Samudrala, Sridhar @ 2018-04-06 23:08 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh
In-Reply-To: <20180406125758.GC19345@nanopsycho>

On 4/6/2018 5:57 AM, Jiri Pirko wrote:
> Thu, Apr 05, 2018 at 11:08:21PM CEST, sridhar.samudrala@intel.com wrote:
>> This provides a generic interface for paravirtual drivers to listen
>> for netdev register/unregister/link change events from pci ethernet
>> devices with the same MAC and takeover their datapath. The notifier and
>> event handling code is based on the existing netvsc implementation. A
>> paravirtual driver can use this module by registering a set of ops and
>> each instance of the device when it is probed.
>>
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> ---
>> include/net/bypass.h |  80 ++++++++++
>> net/Kconfig          |  18 +++
>> net/core/Makefile    |   1 +
>> net/core/bypass.c    | 406 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 505 insertions(+)
>> create mode 100644 include/net/bypass.h
>> create mode 100644 net/core/bypass.c
>>
>> diff --git a/include/net/bypass.h b/include/net/bypass.h
>> new file mode 100644
>> index 000000000000..e2dd122f951a
>> --- /dev/null
>> +++ b/include/net/bypass.h
>> @@ -0,0 +1,80 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2018, Intel Corporation. */
>> +
>> +#ifndef _NET_BYPASS_H
>> +#define _NET_BYPASS_H
>> +
>> +#include <linux/netdevice.h>
>> +
>> +struct bypass_ops {
> Perhaps "net_bypass_" would be better prefix for this module structs
> and functions. No strong opinion though.
>
>
>> +	int (*register_child)(struct net_device *bypass_netdev,
>> +			      struct net_device *child_netdev);
> We have master/slave upper/lower netdevices. This adds "child". Consider
> using some existing names. Not sure if possible without loss of meaning.

OK. will change this to register_slave()

>
>
>> +	int (*join_child)(struct net_device *bypass_netdev,
>> +			  struct net_device *child_netdev);
>> +	int (*unregister_child)(struct net_device *bypass_netdev,
>> +				struct net_device *child_netdev);
>> +	int (*release_child)(struct net_device *bypass_netdev,
>> +			     struct net_device *child_netdev);
>> +	int (*update_link)(struct net_device *bypass_netdev,
>> +			   struct net_device *child_netdev);
>> +	rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>> +};
>> +
>> +struct bypass_instance {
>> +	struct list_head list;
>> +	struct net_device __rcu *bypass_netdev;
>> +	struct bypass *bypass;
>> +};
>> +
>> +struct bypass {
>> +	struct list_head list;
>> +	const struct bypass_ops *ops;
>> +	const struct net_device_ops *netdev_ops;
>> +	struct list_head instance_list;
>> +	struct mutex lock;
>> +};
>> +
>> +#if IS_ENABLED(CONFIG_NET_BYPASS)
>> +
>> +struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>> +				      const struct net_device_ops *netdev_ops);
>> +void bypass_unregister_driver(struct bypass *bypass);
>> +
>> +int bypass_register_instance(struct bypass *bypass, struct net_device *dev);
>> +int bypass_unregister_instance(struct bypass *bypass, struct net_device	*dev);
>> +
>> +int bypass_unregister_child(struct net_device *child_netdev);
>> +
>> +#else
>> +
>> +static inline
>> +struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>> +				      const struct net_device_ops *netdev_ops)
>> +{
>> +	return NULL;
>> +}
>> +
>> +static inline void bypass_unregister_driver(struct bypass *bypass)
>> +{
>> +}
>> +
>> +static inline int bypass_register_instance(struct bypass *bypass,
>> +					   struct net_device *dev)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline int bypass_unregister_instance(struct bypass *bypass,
>> +					     struct net_device *dev)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline int bypass_unregister_child(struct net_device *child_netdev)
>> +{
>> +	return 0;
>> +}
>> +
>> +#endif
>> +
>> +#endif /* _NET_BYPASS_H */
>> diff --git a/net/Kconfig b/net/Kconfig
>> index 0428f12c25c2..994445f4a96a 100644
>> --- a/net/Kconfig
>> +++ b/net/Kconfig
>> @@ -423,6 +423,24 @@ config MAY_USE_DEVLINK
>> 	  on MAY_USE_DEVLINK to ensure they do not cause link errors when
>> 	  devlink is a loadable module and the driver using it is built-in.
>>
>> +config NET_BYPASS
>> +	tristate "Bypass interface"
>> +	---help---
>> +	  This provides a generic interface for paravirtual drivers to listen
>> +	  for netdev register/unregister/link change events from pci ethernet
>> +	  devices with the same MAC and takeover their datapath. This also
>> +	  enables live migration of a VM with direct attached VF by failing
>> +	  over to the paravirtual datapath when the VF is unplugged.
>> +
>> +config MAY_USE_BYPASS
>> +	tristate
>> +	default m if NET_BYPASS=m
>> +	default y if NET_BYPASS=y || NET_BYPASS=n
>> +	help
>> +	  Drivers using the bypass infrastructure should have a dependency
>> +	  on MAY_USE_BYPASS to ensure they do not cause link errors when
>> +	  bypass is a loadable module and the driver using it is built-in.
>> +
>> endif   # if NET
>>
>> # Used by archs to tell that they support BPF JIT compiler plus which flavour.
>> diff --git a/net/core/Makefile b/net/core/Makefile
>> index 6dbbba8c57ae..a9727ed1c8fc 100644
>> --- a/net/core/Makefile
>> +++ b/net/core/Makefile
>> @@ -30,3 +30,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
>> obj-$(CONFIG_HWBM) += hwbm.o
>> obj-$(CONFIG_NET_DEVLINK) += devlink.o
>> obj-$(CONFIG_GRO_CELLS) += gro_cells.o
>> +obj-$(CONFIG_NET_BYPASS) += bypass.o
>> diff --git a/net/core/bypass.c b/net/core/bypass.c
>> new file mode 100644
>> index 000000000000..7bde962ec3d4
>> --- /dev/null
>> +++ b/net/core/bypass.c
>> @@ -0,0 +1,406 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2018, Intel Corporation. */
>> +
>> +/* A common module to handle registrations and notifications for paravirtual
>> + * drivers to enable accelerated datapath and support VF live migration.
>> + *
>> + * The notifier and event handling code is based on netvsc driver.
>> + */
>> +
>> +#include <linux/netdevice.h>
>> +#include <linux/etherdevice.h>
>> +#include <linux/ethtool.h>
>> +#include <linux/module.h>
>> +#include <linux/slab.h>
>> +#include <linux/netdevice.h>
>> +#include <linux/netpoll.h>
>> +#include <linux/rtnetlink.h>
>> +#include <linux/if_vlan.h>
>> +#include <net/sch_generic.h>
>> +#include <uapi/linux/if_arp.h>
>> +#include <net/bypass.h>
>> +
>> +static LIST_HEAD(bypass_list);
>> +
>> +static DEFINE_MUTEX(bypass_mutex);
> Why mutex? Apparently you don't need to sleep while holding a lock.
> Simple spinlock would do.
>
>
>> +
>> +struct bypass_instance *bypass_instance_alloc(struct net_device *dev)
>> +{
>> +	struct bypass_instance *bypass_instance;
>> +
>> +	bypass_instance = kzalloc(sizeof(*bypass_instance), GFP_KERNEL);
>> +	if (!bypass_instance)
>> +		return NULL;
>> +
>> +	dev_hold(dev);
>> +	rcu_assign_pointer(bypass_instance->bypass_netdev, dev);
>> +
>> +	return bypass_instance;
>> +}
>> +
>> +void bypass_instance_free(struct bypass_instance *bypass_instance)
>> +{
>> +	struct net_device *bypass_netdev;
>> +
>> +	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> +
>> +	dev_put(bypass_netdev);
>> +	kfree(bypass_instance);
>> +}
>> +
>> +static struct bypass_instance *bypass_get_instance_bymac(u8 *mac)
>> +{
>> +	struct bypass_instance *bypass_instance;
>> +	struct net_device *bypass_netdev;
>> +	struct bypass *bypass;
>> +
>> +	list_for_each_entry(bypass, &bypass_list, list) {
>> +		mutex_lock(&bypass->lock);
>> +		list_for_each_entry(bypass_instance, &bypass->instance_list,
>> +				    list) {
>> +			bypass_netdev =
>> +				rcu_dereference(bypass_instance->bypass_netdev);
>> +			if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
>> +				mutex_unlock(&bypass->lock);
>> +				goto out;
>> +			}
>> +		}
>> +		mutex_unlock(&bypass->lock);
>> +	}
>> +
>> +	bypass_instance = NULL;
>> +out:
>> +	return bypass_instance;
>> +}
>> +
>> +static int bypass_register_child(struct net_device *child_netdev)
>> +{
>> +	struct bypass_instance *bypass_instance;
>> +	struct bypass *bypass;
>> +	struct net_device *bypass_netdev;
>> +	int ret, orig_mtu;
>> +
>> +	ASSERT_RTNL();
>> +
>> +	mutex_lock(&bypass_mutex);
>> +	bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
>> +	if (!bypass_instance) {
>> +		mutex_unlock(&bypass_mutex);
>> +		goto done;
>> +	}
>> +
>> +	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> +	bypass = bypass_instance->bypass;
>> +	mutex_unlock(&bypass_mutex);
>> +
>> +	if (!bypass->ops->register_child)
>> +		goto done;
>> +
>> +	ret = bypass->ops->register_child(bypass_netdev, child_netdev);
>> +	if (ret != 0)
>> +		goto done;
>> +
>> +	ret = netdev_rx_handler_register(child_netdev,
>> +					 bypass->ops->handle_frame,
>> +					 bypass_netdev);
>> +	if (ret != 0) {
>> +		netdev_err(child_netdev,
>> +			   "can not register bypass rx handler (err = %d)\n",
>> +			   ret);
>> +		goto rx_handler_failed;
>> +	}
>> +
>> +	ret = netdev_upper_dev_link(child_netdev, bypass_netdev, NULL);
>> +	if (ret != 0) {
>> +		netdev_err(child_netdev,
> No line-wrap is needed here and in other cases like this.
>
>
>> +			   "can not set master device %s (err = %d)\n",
>> +			   bypass_netdev->name, ret);
>> +		goto upper_link_failed;
>> +	}
>> +
>> +	child_netdev->flags |= IFF_SLAVE;
> Don't reuse IFF_SLAVE. That is bonding-specific thing. I know that
> netvsc uses it, it is wrong.
> Please rather introduce:
> IFF_BYPASS for master
> and IFF_BYPASS_SLAVE for slaves.

OK. Will add these flags.

>
>
>
>
>> +
>> +	if (netif_running(bypass_netdev)) {
>> +		ret = dev_open(child_netdev);
>> +		if (ret && (ret != -EBUSY)) {
>> +			netdev_err(bypass_netdev,
>> +				   "Opening child %s failed ret:%d\n",
>> +				   child_netdev->name, ret);
>> +			goto err_interface_up;
>> +		}
>> +	}
>> +
>> +	/* Align MTU of child with master */
>> +	orig_mtu = child_netdev->mtu;
>> +	ret = dev_set_mtu(child_netdev, bypass_netdev->mtu);
>> +	if (ret != 0) {
>> +		netdev_err(bypass_netdev,
>> +			   "unable to change mtu of %s to %u register failed\n",
>> +			   child_netdev->name, bypass_netdev->mtu);
>> +		goto err_set_mtu;
>> +	}
>> +
>> +	ret = bypass->ops->join_child(bypass_netdev, child_netdev);
>> +	if (ret != 0)
>> +		goto err_join;
>> +
>> +	call_netdevice_notifiers(NETDEV_JOIN, child_netdev);
>> +
>> +	goto done;
>> +
>> +err_join:
>> +	dev_set_mtu(child_netdev, orig_mtu);
>> +err_set_mtu:
>> +	dev_close(child_netdev);
>> +err_interface_up:
>> +	netdev_upper_dev_unlink(child_netdev, bypass_netdev);
>> +	child_netdev->flags &= ~IFF_SLAVE;
>> +upper_link_failed:
>> +	netdev_rx_handler_unregister(child_netdev);
>> +rx_handler_failed:
>> +	bypass->ops->unregister_child(bypass_netdev, child_netdev);
>> +
>> +done:
>> +	return NOTIFY_DONE;
>> +}
>> +
>> +int bypass_unregister_child(struct net_device *child_netdev)
>> +{
>> +	struct bypass_instance *bypass_instance;
>> +	struct net_device *bypass_netdev;
>> +	struct bypass *bypass;
>> +	int ret;
>> +
>> +	ASSERT_RTNL();
>> +
>> +	mutex_lock(&bypass_mutex);
>> +	bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
>> +	if (!bypass_instance) {
>> +		mutex_unlock(&bypass_mutex);
>> +		goto done;
>> +	}
>> +
>> +	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> +	bypass = bypass_instance->bypass;
>> +	mutex_unlock(&bypass_mutex);
>> +
>> +	ret = bypass->ops->release_child(bypass_netdev, child_netdev);
>> +	if (ret != 0)
>> +		goto done;
>> +
>> +	netdev_rx_handler_unregister(child_netdev);
>> +	netdev_upper_dev_unlink(child_netdev, bypass_netdev);
>> +	child_netdev->flags &= ~IFF_SLAVE;
>> +
>> +	if (!bypass->ops->unregister_child)
>> +		goto done;
>> +
>> +	bypass->ops->unregister_child(bypass_netdev, child_netdev);
>> +
>> +done:
>> +	return NOTIFY_DONE;
>> +}
>> +EXPORT_SYMBOL(bypass_unregister_child);
> Please use "EXPORT_SYMBOL_GPL" for all exported symbols.

OK.

>
>
>> +
>> +static int bypass_update_link(struct net_device *child_netdev)
>> +{
>> +	struct bypass_instance *bypass_instance;
>> +	struct net_device *bypass_netdev;
>> +	struct bypass *bypass;
>> +
>> +	ASSERT_RTNL();
>> +
>> +	mutex_lock(&bypass_mutex);
>> +	bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
> You don't really need this lookup. The kernel knows about the master
> device, you can just use netdev_master_upper_dev_get_rcu() to get it.

Sure.


>
>
>> +	if (!bypass_instance) {
>> +		mutex_unlock(&bypass_mutex);
>> +		goto done;
>> +	}
>> +
>> +	bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> +	bypass = bypass_instance->bypass;
>> +	mutex_unlock(&bypass_mutex);
>> +
>> +	if (!bypass->ops->update_link)
>> +		goto done;
>> +
>> +	bypass->ops->update_link(bypass_netdev, child_netdev);
>> +
>> +done:
>> +	return NOTIFY_DONE;
>> +}
>> +
>> +static bool bypass_validate_child_dev(struct net_device *dev)
>> +{
>> +	/* Avoid non-Ethernet type devices */
>> +	if (dev->type != ARPHRD_ETHER)
>> +		return false;
>> +
>> +	/* Avoid Vlan dev with same MAC registering as VF */
>> +	if (is_vlan_dev(dev))
>> +		return false;
>> +
>> +	/* Avoid Bonding master dev with same MAC registering as child dev */
>> +	if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
>> +		return false;
>> +
>> +	return true;
>> +}
>> +
>> +static int
>> +bypass_event(struct notifier_block *this, unsigned long event, void *ptr)
>> +{
>> +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
>> +	struct bypass *bypass;
>> +
>> +	/* Skip Parent events */
>> +	mutex_lock(&bypass_mutex);
>> +	list_for_each_entry(bypass, &bypass_list, list) {
>> +		if (event_dev->netdev_ops == bypass->netdev_ops) {
>> +			mutex_unlock(&bypass_mutex);
>> +			return NOTIFY_DONE;
>> +		}
> What you need instead of this is an identification helper
> netif_is_bypass_master()
> similar to
> netif_is_team_master()
> netif_is_bridge_master()
> etc

OK. Will introduce this helper.

>
>   
>
>> +	}
>> +	mutex_unlock(&bypass_mutex);
>> +
>> +	if (!bypass_validate_child_dev(event_dev))
>> +		return NOTIFY_DONE;
>> +
>> +	switch (event) {
>> +	case NETDEV_REGISTER:
>> +		return bypass_register_child(event_dev);
>> +	case NETDEV_UNREGISTER:
>> +		return bypass_unregister_child(event_dev);
>> +	case NETDEV_UP:
>> +	case NETDEV_DOWN:
>> +	case NETDEV_CHANGE:
>> +		return bypass_update_link(event_dev);
>> +	default:
>> +		return NOTIFY_DONE;
>> +	}
>> +}
>> +
>> +static struct notifier_block bypass_notifier = {
>> +	.notifier_call = bypass_event,
>> +};
>> +
>> +static void bypass_register_existing_child(struct net_device *bypass_netdev)
>> +{
>> +	struct net *net = dev_net(bypass_netdev);
>> +	struct net_device *dev;
>> +
>> +	rtnl_lock();
>> +	for_each_netdev(net, dev) {
>> +		if (dev == bypass_netdev)
>> +			continue;
>> +		if (!bypass_validate_child_dev(dev))
>> +			continue;
>> +		if (ether_addr_equal(bypass_netdev->perm_addr, dev->perm_addr))
>> +			bypass_register_child(dev);
>> +	}
>> +	rtnl_unlock();
>> +}
>> +
>> +int bypass_register_instance(struct bypass *bypass, struct net_device *dev)
>> +{
>> +	struct bypass_instance *bypass_instance;
> No need to allocate this instace here. You can just have is embedded
> inside netdevice priv and pass pointer to it here. You can pass the
> pointer back to the driver when you call ops as the driver can get priv
> back by it.
>
> I would also call it "struct bypass_master" and this function
> "bypass_master_register".
>
> It should contain the ops pointer too.

OK.


>
>
>> +	struct net_device *bypass_netdev;
>> +	int ret = 0;
>> +
>> +	mutex_lock(&bypass->lock);
>> +	list_for_each_entry(bypass_instance, &bypass->instance_list, list) {
>> +		bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> +		if (bypass_netdev == dev) {
> This means the driver registered one netdev twice. That is a bug in
> driver, so WARN_ON would be nice here to point that out.

OK.


>
>
>> +			ret = -EEXIST;
>> +			goto done;
>> +		}
>> +	}
>> +
>> +	bypass_instance = bypass_instance_alloc(dev);
>> +	if (!bypass_instance) {
>> +		ret = -ENOMEM;
>> +		goto done;
>> +	}
>> +
>> +	bypass_instance->bypass = bypass;
>> +	list_add_tail(&bypass_instance->list, &bypass->instance_list);
>> +
>> +done:
>> +	mutex_unlock(&bypass->lock);
>> +	bypass_register_existing_child(dev);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(bypass_register_instance);
>> +
>> +int bypass_unregister_instance(struct bypass *bypass, struct net_device *dev)
>> +{
>> +	struct bypass_instance *bypass_instance;
>> +	struct net_device *bypass_netdev;
>> +	int ret = 0;
>> +
>> +	mutex_lock(&bypass->lock);
>> +	list_for_each_entry(bypass_instance, &bypass->instance_list, list) {
>> +		bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> +		if (bypass_netdev == dev) {
>> +			list_del(&bypass_instance->list);
>> +			bypass_instance_free(bypass_instance);
>> +			goto done;
>> +		}
>> +	}
>> +
>> +	ret = -ENOENT;
>> +done:
>> +	mutex_unlock(&bypass->lock);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(bypass_unregister_instance);
>> +
>> +struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>> +				      const struct net_device_ops *netdev_ops)
> I don't see why you need a list of drivers. What you need is just a list
> of instances - bypass masters (probably to call them like that in the
> code as well). Well, you can use the common netdevice list for that
> purpose with the identification helper I mentioned above. Then you need
> no lists and no mutexes/spinlocks.

OK. looks like i should be able to just register instances and pass bypass_netdev,
bypass_ops and a flag to indicate 3/2 netdev model.


>
>
>> +{
>> +	struct bypass *bypass;
>> +
>> +	bypass = kzalloc(sizeof(*bypass), GFP_KERNEL);
>> +	if (!bypass)
>> +		return NULL;
>> +
>> +	bypass->ops = ops;
>> +	bypass->netdev_ops = netdev_ops;
>> +	INIT_LIST_HEAD(&bypass->instance_list);
>> +
>> +	mutex_lock(&bypass_mutex);
>> +	list_add_tail(&bypass->list, &bypass_list);
>> +	mutex_unlock(&bypass_mutex);
>> +
>> +	return bypass;
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_register_driver);
>> +
>> +void bypass_unregister_driver(struct bypass *bypass)
>> +{
>> +	mutex_lock(&bypass_mutex);
>> +	list_del(&bypass->list);
>> +	mutex_unlock(&bypass_mutex);
>> +
>> +	kfree(bypass);
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_unregister_driver);
>> +
>> +static __init int
>> +bypass_init(void)
>> +{
>> +	register_netdevice_notifier(&bypass_notifier);
>> +
>> +	return 0;
>> +}
>> +module_init(bypass_init);
>> +
>> +static __exit
>> +void bypass_exit(void)
>> +{
>> +	unregister_netdevice_notifier(&bypass_notifier);
>> +}
>> +module_exit(bypass_exit);
>> +
>> +MODULE_DESCRIPTION("Bypass infrastructure/interface for Paravirtual drivers");
>> +MODULE_LICENSE("GPL v2");
>> -- 
>> 2.14.3
>>

^ permalink raw reply

* [PATCH net 0/5] ibmvnic: Fix driver reset and DMA bugs
From: Thomas Falcon @ 2018-04-06 23:37 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, Thomas Falcon

This patch series introduces some fixes to the driver reset
routines and a patch that fixes mistakes caught by the kernel
DMA debugger.

The reset fixes include a fix to reset TX queue counters properly
after a reset as well as updates to driver reset error-handling code.
It also provides updates to the reset handling routine for redundant
backing VF failover and partition migration cases.

Nathan Fontenot (1):
  ibmvnic: Do not reset CRQ for Mobility driver resets

Thomas Falcon (4):
  ibmvnic: Fix DMA mapping mistakes
  ibmvnic: Zero used TX descriptor counter on reset
  ibmvnic: Fix reset scheduler error handling
  ibmvnic: Fix failover case for non-redundant configuration

 drivers/net/ethernet/ibm/ibmvnic.c | 146 ++++++++++++++++++++++++-------------
 drivers/net/ethernet/ibm/ibmvnic.h |   1 +
 2 files changed, 98 insertions(+), 49 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH net 1/5] ibmvnic: Fix DMA mapping mistakes
From: Thomas Falcon @ 2018-04-06 23:37 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, Thomas Falcon
In-Reply-To: <1523057826-5262-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

Fix some mistakes caught by the DMA debugger. The first change
fixes a unnecessary unmap that should have been removed in an
earlier update. The next hunk fixes another bad unmap by zeroing
the bit checked to determine that an unmap is needed. The final 
change fixes some buffers that are unmapped with the wrong
direction specified.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index b492af6..58e0143 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -320,9 +320,6 @@ static void replenish_rx_pool(struct ibmvnic_adapter *adapter,
 	dev_info(dev, "replenish pools failure\n");
 	pool->free_map[pool->next_free] = index;
 	pool->rx_buff[index].skb = NULL;
-	if (!dma_mapping_error(dev, dma_addr))
-		dma_unmap_single(dev, dma_addr, pool->buff_size,
-				 DMA_FROM_DEVICE);
 
 	dev_kfree_skb_any(skb);
 	adapter->replenish_add_buff_failure++;
@@ -2574,7 +2571,7 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 	union sub_crq *next;
 	int index;
 	int i, j;
-	u8 first;
+	u8 *first;
 
 restart_loop:
 	while (pending_scrq(adapter, scrq)) {
@@ -2605,11 +2602,12 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 				txbuff->data_dma[j] = 0;
 			}
 			/* if sub_crq was sent indirectly */
-			first = txbuff->indir_arr[0].generic.first;
-			if (first == IBMVNIC_CRQ_CMD) {
+			first = &txbuff->indir_arr[0].generic.first;
+			if (*first == IBMVNIC_CRQ_CMD) {
 				dma_unmap_single(dev, txbuff->indir_dma,
 						 sizeof(txbuff->indir_arr),
 						 DMA_TO_DEVICE);
+				*first = 0;
 			}
 
 			if (txbuff->last_frag) {
@@ -3882,9 +3880,9 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq,
 	int i;
 
 	dma_unmap_single(dev, adapter->login_buf_token, adapter->login_buf_sz,
-			 DMA_BIDIRECTIONAL);
+			 DMA_TO_DEVICE);
 	dma_unmap_single(dev, adapter->login_rsp_buf_token,
-			 adapter->login_rsp_buf_sz, DMA_BIDIRECTIONAL);
+			 adapter->login_rsp_buf_sz, DMA_FROM_DEVICE);
 
 	/* If the number of queues requested can't be allocated by the
 	 * server, the login response will return with code 1. We will need
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net 2/5] ibmvnic: Zero used TX descriptor counter on reset
From: Thomas Falcon @ 2018-04-06 23:37 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, Thomas Falcon
In-Reply-To: <1523057826-5262-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

The counter that tracks used TX descriptors pending completion
needs to be zeroed as part of a device reset. This change fixes
a bug causing transmit queues to be stopped unnecessarily and in
some cases a transmit queue stall and timeout reset. If the counter
is not reset, the remaining descriptors will not be "removed",
effectively reducing queue capacity. If the queue is over half full,
it will cause the queue to stall if stopped.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 58e0143..153a868 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -2361,6 +2361,7 @@ static int reset_one_sub_crq_queue(struct ibmvnic_adapter *adapter,
 	}
 
 	memset(scrq->msgs, 0, 4 * PAGE_SIZE);
+	atomic_set(&scrq->used, 0);
 	scrq->cur = 0;
 
 	rc = h_reg_sub_crq(adapter->vdev->unit_address, scrq->msg_token,
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net 3/5] ibmvnic: Fix reset scheduler error handling
From: Thomas Falcon @ 2018-04-06 23:37 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, Thomas Falcon
In-Reply-To: <1523057826-5262-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

In some cases, if the driver is waiting for a reset following
a device parameter change, failure to schedule a reset can result
in a hang since a completion signal is never sent.

If the device configuration is being altered by a tool such
as ethtool or ifconfig, it could cause the console to hang
if the reset request does not get scheduled. Add some additional
error handling code to exit the wait_for_completion if there is
one in progress.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 39 ++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 153a868..bbcd07a 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1875,23 +1875,25 @@ static void __ibmvnic_reset(struct work_struct *work)
 	mutex_unlock(&adapter->reset_lock);
 }
 
-static void ibmvnic_reset(struct ibmvnic_adapter *adapter,
-			  enum ibmvnic_reset_reason reason)
+static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
+			 enum ibmvnic_reset_reason reason)
 {
 	struct ibmvnic_rwi *rwi, *tmp;
 	struct net_device *netdev = adapter->netdev;
 	struct list_head *entry;
+	int ret;
 
 	if (adapter->state == VNIC_REMOVING ||
 	    adapter->state == VNIC_REMOVED) {
+		ret = EBUSY;
 		netdev_dbg(netdev, "Adapter removing, skipping reset\n");
-		return;
+		goto err;
 	}
 
 	if (adapter->state == VNIC_PROBING) {
 		netdev_warn(netdev, "Adapter reset during probe\n");
-		adapter->init_done_rc = EAGAIN;
-		return;
+		ret = adapter->init_done_rc = EAGAIN;
+		goto err;
 	}
 
 	mutex_lock(&adapter->rwi_lock);
@@ -1901,7 +1903,8 @@ static void ibmvnic_reset(struct ibmvnic_adapter *adapter,
 		if (tmp->reset_reason == reason) {
 			netdev_dbg(netdev, "Skipping matching reset\n");
 			mutex_unlock(&adapter->rwi_lock);
-			return;
+			ret = EBUSY;
+			goto err;
 		}
 	}
 
@@ -1909,7 +1912,8 @@ static void ibmvnic_reset(struct ibmvnic_adapter *adapter,
 	if (!rwi) {
 		mutex_unlock(&adapter->rwi_lock);
 		ibmvnic_close(netdev);
-		return;
+		ret = ENOMEM;
+		goto err;
 	}
 
 	rwi->reset_reason = reason;
@@ -1918,6 +1922,12 @@ static void ibmvnic_reset(struct ibmvnic_adapter *adapter,
 
 	netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason);
 	schedule_work(&adapter->ibmvnic_reset);
+
+	return 0;
+err:
+	if (adapter->wait_for_reset)
+		adapter->wait_for_reset = false;
+	return -ret;
 }
 
 static void ibmvnic_tx_timeout(struct net_device *dev)
@@ -2052,6 +2062,8 @@ static void ibmvnic_netpoll_controller(struct net_device *dev)
 
 static int wait_for_reset(struct ibmvnic_adapter *adapter)
 {
+	int rc, ret;
+
 	adapter->fallback.mtu = adapter->req_mtu;
 	adapter->fallback.rx_queues = adapter->req_rx_queues;
 	adapter->fallback.tx_queues = adapter->req_tx_queues;
@@ -2059,11 +2071,15 @@ static int wait_for_reset(struct ibmvnic_adapter *adapter)
 	adapter->fallback.tx_entries = adapter->req_tx_entries_per_subcrq;
 
 	init_completion(&adapter->reset_done);
-	ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM);
 	adapter->wait_for_reset = true;
+	rc = ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM);
+	if (rc)
+		return rc;
 	wait_for_completion(&adapter->reset_done);
 
+	ret = 0;
 	if (adapter->reset_done_rc) {
+		ret = -EIO;
 		adapter->desired.mtu = adapter->fallback.mtu;
 		adapter->desired.rx_queues = adapter->fallback.rx_queues;
 		adapter->desired.tx_queues = adapter->fallback.tx_queues;
@@ -2071,12 +2087,15 @@ static int wait_for_reset(struct ibmvnic_adapter *adapter)
 		adapter->desired.tx_entries = adapter->fallback.tx_entries;
 
 		init_completion(&adapter->reset_done);
-		ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM);
+		adapter->wait_for_reset = true;
+		rc = ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM);
+		if (rc)
+			return ret;
 		wait_for_completion(&adapter->reset_done);
 	}
 	adapter->wait_for_reset = false;
 
-	return adapter->reset_done_rc;
+	return ret;
 }
 
 static int ibmvnic_change_mtu(struct net_device *netdev, int new_mtu)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net 5/5] ibmvnic: Do not reset CRQ for Mobility driver resets
From: Thomas Falcon @ 2018-04-06 23:37 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, Thomas Falcon
In-Reply-To: <1523057826-5262-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

From: Nathan Fontenot <nfont@linux.vnet.ibm.com>

When resetting the ibmvnic driver after a partition migration occurs
there is no requirement to do a reset of the main CRQ. The current
driver code does the required re-enable of the main CRQ, then does
a reset of the main CRQ later.

What we should be doing for a driver reset after a migration is to
re-enable the main CRQ, release all the sub-CRQs, and then allocate
new sub-CRQs after capability negotiation.

This patch updates the handling of mobility resets to do the proper
work and not reset the main CRQ. To do this the initialization/reset
of the main CRQ had to be moved out of the ibmvnic_init routine
and in to the ibmvnic_probe and do_reset routines.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 55 ++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 151542e..aad5658 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -118,6 +118,7 @@ static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *,
 static int ibmvnic_init(struct ibmvnic_adapter *);
 static void release_crq_queue(struct ibmvnic_adapter *);
 static int __ibmvnic_set_mac(struct net_device *netdev, struct sockaddr *p);
+static int init_crq_queue(struct ibmvnic_adapter *adapter);
 
 struct ibmvnic_stat {
 	char name[ETH_GSTRING_LEN];
@@ -1224,7 +1225,6 @@ static int __ibmvnic_close(struct net_device *netdev)
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
 	if (rc)
 		return rc;
-	ibmvnic_cleanup(netdev);
 	adapter->state = VNIC_CLOSED;
 	return 0;
 }
@@ -1244,6 +1244,7 @@ static int ibmvnic_close(struct net_device *netdev)
 
 	mutex_lock(&adapter->reset_lock);
 	rc = __ibmvnic_close(netdev);
+	ibmvnic_cleanup(netdev);
 	mutex_unlock(&adapter->reset_lock);
 
 	return rc;
@@ -1726,14 +1727,10 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 	old_num_rx_queues = adapter->req_rx_queues;
 	old_num_tx_queues = adapter->req_tx_queues;
 
-	if (rwi->reset_reason == VNIC_RESET_MOBILITY) {
-		rc = ibmvnic_reenable_crq_queue(adapter);
-		if (rc)
-			return 0;
-		ibmvnic_cleanup(netdev);
-	} else if (rwi->reset_reason == VNIC_RESET_FAILOVER) {
-		ibmvnic_cleanup(netdev);
-	} else {
+	ibmvnic_cleanup(netdev);
+
+	if (adapter->reset_reason != VNIC_RESET_MOBILITY &&
+	    adapter->reset_reason != VNIC_RESET_FAILOVER) {
 		rc = __ibmvnic_close(netdev);
 		if (rc)
 			return rc;
@@ -1752,6 +1749,23 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 		 */
 		adapter->state = VNIC_PROBED;
 
+		if (adapter->wait_for_reset) {
+			rc = init_crq_queue(adapter);
+		} else if (adapter->reset_reason == VNIC_RESET_MOBILITY) {
+			rc = ibmvnic_reenable_crq_queue(adapter);
+			release_sub_crqs(adapter, 1);
+		} else {
+			rc = ibmvnic_reset_crq(adapter);
+			if (!rc)
+				rc = vio_enable_interrupts(adapter->vdev);
+		}
+
+		if (rc) {
+			netdev_err(adapter->netdev,
+				   "Couldn't initialize crq. rc=%d\n", rc);
+			return rc;
+		}
+
 		rc = ibmvnic_init(adapter);
 		if (rc)
 			return IBMVNIC_INIT_FAILED;
@@ -4500,19 +4514,6 @@ static int ibmvnic_init(struct ibmvnic_adapter *adapter)
 	u64 old_num_rx_queues, old_num_tx_queues;
 	int rc;
 
-	if (adapter->resetting && !adapter->wait_for_reset) {
-		rc = ibmvnic_reset_crq(adapter);
-		if (!rc)
-			rc = vio_enable_interrupts(adapter->vdev);
-	} else {
-		rc = init_crq_queue(adapter);
-	}
-
-	if (rc) {
-		dev_err(dev, "Couldn't initialize crq. rc=%d\n", rc);
-		return rc;
-	}
-
 	adapter->from_passive_init = false;
 
 	old_num_rx_queues = adapter->req_rx_queues;
@@ -4537,7 +4538,8 @@ static int ibmvnic_init(struct ibmvnic_adapter *adapter)
 		return -1;
 	}
 
-	if (adapter->resetting && !adapter->wait_for_reset) {
+	if (adapter->resetting && !adapter->wait_for_reset &&
+	    adapter->reset_reason != VNIC_RESET_MOBILITY) {
 		if (adapter->req_rx_queues != old_num_rx_queues ||
 		    adapter->req_tx_queues != old_num_tx_queues) {
 			release_sub_crqs(adapter, 0);
@@ -4625,6 +4627,13 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	adapter->mac_change_pending = false;
 
 	do {
+		rc = init_crq_queue(adapter);
+		if (rc) {
+			dev_err(&dev->dev, "Couldn't initialize crq. rc=%d\n",
+				rc);
+			goto ibmvnic_init_fail;
+		}
+
 		rc = ibmvnic_init(adapter);
 		if (rc && rc != EAGAIN)
 			goto ibmvnic_init_fail;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net 4/5] ibmvnic: Fix failover case for non-redundant configuration
From: Thomas Falcon @ 2018-04-06 23:37 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, Thomas Falcon
In-Reply-To: <1523057826-5262-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

There is a failover case for a non-redundant pseries VNIC
configuration that was not being handled properly. The current
implementation assumes that the driver will always have a redandant
device to communicate with following a failover notification. There
are cases, however, when a non-redundant configuration can receive
a failover request. If that happens, the driver should wait until
it receives a signal that the device is ready for operation.

The driver is agnostic of its backing hardware configuration,
so this fix necessarily affects all device failover management.
The driver needs to wait until it receives a signal that the device
is ready for resetting. A flag is introduced to track this intermediary
state where the driver is waiting for an active device.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 37 +++++++++++++++++++++++++++++--------
 drivers/net/ethernet/ibm/ibmvnic.h |  1 +
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index bbcd07a..151542e 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -325,10 +325,11 @@ static void replenish_rx_pool(struct ibmvnic_adapter *adapter,
 	adapter->replenish_add_buff_failure++;
 	atomic_add(buffers_added, &pool->available);
 
-	if (lpar_rc == H_CLOSED) {
+	if (lpar_rc == H_CLOSED || adapter->failover_pending) {
 		/* Disable buffer pool replenishment and report carrier off if
-		 * queue is closed. Firmware guarantees that a signal will
-		 * be sent to the driver, triggering a reset.
+		 * queue is closed or pending failover.
+		 * Firmware guarantees that a signal will be sent to the
+		 * driver, triggering a reset.
 		 */
 		deactivate_rx_pools(adapter);
 		netif_carrier_off(adapter->netdev);
@@ -1068,6 +1069,14 @@ static int ibmvnic_open(struct net_device *netdev)
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int rc;
 
+	/* If device failover is pending, just set device state and return.
+	 * Device operation will be handled by reset routine.
+	 */
+	if (adapter->failover_pending) {
+		adapter->state = VNIC_OPEN;
+		return 0;
+	}
+
 	mutex_lock(&adapter->reset_lock);
 
 	if (adapter->state != VNIC_CLOSED) {
@@ -1225,6 +1234,14 @@ static int ibmvnic_close(struct net_device *netdev)
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int rc;
 
+	/* If device failover is pending, just set device state and return.
+	 * Device operation will be handled by reset routine.
+	 */
+	if (adapter->failover_pending) {
+		adapter->state = VNIC_CLOSED;
+		return 0;
+	}
+
 	mutex_lock(&adapter->reset_lock);
 	rc = __ibmvnic_close(netdev);
 	mutex_unlock(&adapter->reset_lock);
@@ -1559,8 +1576,9 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 		dev_kfree_skb_any(skb);
 		tx_buff->skb = NULL;
 
-		if (lpar_rc == H_CLOSED) {
-			/* Disable TX and report carrier off if queue is closed.
+		if (lpar_rc == H_CLOSED || adapter->failover_pending) {
+			/* Disable TX and report carrier off if queue is closed
+			 * or pending failover.
 			 * Firmware guarantees that a signal will be sent to the
 			 * driver, triggering a reset or some other action.
 			 */
@@ -1884,9 +1902,10 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
 	int ret;
 
 	if (adapter->state == VNIC_REMOVING ||
-	    adapter->state == VNIC_REMOVED) {
+	    adapter->state == VNIC_REMOVED ||
+	    adapter->failover_pending) {
 		ret = EBUSY;
-		netdev_dbg(netdev, "Adapter removing, skipping reset\n");
+		netdev_dbg(netdev, "Adapter removing or pending failover, skipping reset\n");
 		goto err;
 	}
 
@@ -4162,7 +4181,9 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 		case IBMVNIC_CRQ_INIT:
 			dev_info(dev, "Partner initialized\n");
 			adapter->from_passive_init = true;
+			adapter->failover_pending = false;
 			complete(&adapter->init_done);
+			ibmvnic_reset(adapter, VNIC_RESET_FAILOVER);
 			break;
 		case IBMVNIC_CRQ_INIT_COMPLETE:
 			dev_info(dev, "Partner initialization complete\n");
@@ -4179,7 +4200,7 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 			ibmvnic_reset(adapter, VNIC_RESET_MOBILITY);
 		} else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) {
 			dev_info(dev, "Backing device failover detected\n");
-			ibmvnic_reset(adapter, VNIC_RESET_FAILOVER);
+			adapter->failover_pending = true;
 		} else {
 			/* The adapter lost the connection */
 			dev_err(dev, "Virtual Adapter failed (rc=%d)\n",
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 89efe70..99c0b58 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -1108,6 +1108,7 @@ struct ibmvnic_adapter {
 	bool napi_enabled, from_passive_init;
 
 	bool mac_change_pending;
+	bool failover_pending;
 
 	struct ibmvnic_tunables desired;
 	struct ibmvnic_tunables fallback;
-- 
1.8.3.1

^ permalink raw reply related

* Re: Problem with the kernel 4.15 - cutting the band (tc)
From: Cong Wang @ 2018-04-06 23:50 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: David Miller, Linux Kernel Network Developers
In-Reply-To: <CAADWXX-z+=E=Da3DtJsyLes6uc+ohAF4NSG7kUboCwqPX3+ArA@mail.gmail.com>

On Fri, Apr 6, 2018 at 2:56 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> Forwarding a report about what looks like a regression between 4.14 and 4.15.
>
> New ENOSPC issue? I don't even knew where to start guessing where to look.
>
> Help me, Davem-Wan Kenobi, you are my only hope.
>
> (But adding netdev just in case somebody else goes "That's obviously Xyz")
>
>               Linus
>
> ---------- Forwarded message ----------
> From: Marcin Kabiesz <admin@hostcenter.eu>
> Date: Thu, Apr 5, 2018 at 10:38 AM
> Subject: Problem with the kernel 4.15 - cutting the band (tc)
>
>
> Hello,
> I have a problem with bandwidth cutting on kernel 4.15. On the version
> up to 4.15, i.e. 4.14, this problem does not occur.
>
> uname -a: Linux router 4.14.15 #1 SMP x86_64 Intel Xeon E3-1230 v6
> command to reproduce:
>
> tc qdisc add dev ifb0 root handle 1: htb r2q 2
> tc class add dev ifb0 parent 1: classid 1:1 htb rate 10gbit ceil
> 10gbit quantum 16000
> tc filter add dev ifb0 parent 1: prio 5 handle 1: protocol all u32 divisor 256
> tc filter add dev ifb0 protocol all parent 1: prio 5 u32 ht 800::
> match ip dst 0.0.0.0/0 hashkey mask 0x000000ff at 16 link 1:
> tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
> ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
> tc filter del dev ifb0 parent 1:0 handle 1:2c:1 prio 5 u32
> tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
> ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
> tc filter del dev ifb0 parent 1:0 handle 1:2c:1 prio 5 u32
>
> This ok, no error/warnings and dmesg log.
>
> uname -a: Linux router 4.15.8 #1 SMP x86_64 Intel Xeon E3-1230 v6 (or
> 4.15.14 this same effect)
> command to reproduce:
>
> tc qdisc add dev ifb0 root handle 1: htb r2q 2
> tc class add dev ifb0 parent 1: classid 1:1 htb rate 10gbit ceil
> 10gbit quantum 16000
> tc filter add dev ifb0 parent 1: prio 5 handle 1: protocol all u32 divisor 256
> tc filter add dev ifb0 protocol all parent 1: prio 5 u32 ht 800::
> match ip dst 0.0.0.0/0 hashkey mask 0x000000ff at 16 link 1:
> tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
> ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
> tc filter del dev ifb0 parent 1:0 handle 1:2c:1 prio 5 u32
> tc filter add dev ifb0 parent 1:0 handle ::1 protocol all prio 5 u32
> ht 1:2c: match ip dst 192.168.3.44/32 flowid 1:2
> RTNETLINK answers: No space left on device
> We have an error talking to the kernel
>
> This not ok, on error/warnings and no dmesg log.

We forgot to call idr_remove() when deleting u32 key...

I am cooking a fix now.

Thanks!

^ permalink raw reply

* [Patch net] net_sched: fix a missing idr_remove() in u32_delete_key()
From: Cong Wang @ 2018-04-07  0:19 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Linus Torvalds, Jamal Hadi Salim

When we delete a u32 key via u32_delete_key(), we forget to
call idr_remove() to remove its handle from IDR.

Fixes: e7614370d6f0 ("net_sched: use idr to allocate u32 filter handles")
Reported-by: Marcin Kabiesz <admin@hostcenter.eu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/sched/cls_u32.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index ed8b6a24b9e9..bac47b5d18fd 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -489,6 +489,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
 				RCU_INIT_POINTER(*kp, key->next);
 
 				tcf_unbind_filter(tp, &key->res);
+				idr_remove(&ht->handle_idr, key->handle);
 				tcf_exts_get_net(&key->exts);
 				call_rcu(&key->rcu, u32_delete_key_freepf_rcu);
 				return 0;
-- 
2.13.0

^ permalink raw reply related

* [Patch net] tipc: use the right skb in tipc_sk_fill_sock_diag()
From: Cong Wang @ 2018-04-07  1:54 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, GhantaKrishnamurthy MohanKrishna, Jon Maloy, Ying Xue

Commit 4b2e6877b879 ("tipc: Fix namespace violation in tipc_sk_fill_sock_diag")
tried to fix the crash but failed, the crash is still 100% reproducible
with it.

In tipc_sk_fill_sock_diag(), skb is the diag dump we are filling, it is not
correct to retrieve its NETLINK_CB(), instead, like other protocol diag,
we should use NETLINK_CB(cb->skb).sk here.

Reported-by: <syzbot+326e587eff1074657718@syzkaller.appspotmail.com>
Fixes: 4b2e6877b879 ("tipc: Fix namespace violation in tipc_sk_fill_sock_diag")
Fixes: c30b70deb5f4 (tipc: implement socket diagnostics for AF_TIPC)
Cc: GhantaKrishnamurthy MohanKrishna <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/tipc/diag.c   | 2 +-
 net/tipc/socket.c | 6 +++---
 net/tipc/socket.h | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/tipc/diag.c b/net/tipc/diag.c
index 46d9cd62f781..aaabb0b776dd 100644
--- a/net/tipc/diag.c
+++ b/net/tipc/diag.c
@@ -59,7 +59,7 @@ static int __tipc_add_sock_diag(struct sk_buff *skb,
 	if (!nlh)
 		return -EMSGSIZE;
 
-	err = tipc_sk_fill_sock_diag(skb, tsk, req->tidiag_states,
+	err = tipc_sk_fill_sock_diag(skb, cb, tsk, req->tidiag_states,
 				     __tipc_diag_gen_cookie);
 	if (err)
 		return err;
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index cee6674a3bf4..1fd1c8b5ce03 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3257,8 +3257,8 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
 }
 EXPORT_SYMBOL(tipc_nl_sk_walk);
 
-int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk,
-			   u32 sk_filter_state,
+int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
+			   struct tipc_sock *tsk, u32 sk_filter_state,
 			   u64 (*tipc_diag_gen_cookie)(struct sock *sk))
 {
 	struct sock *sk = &tsk->sk;
@@ -3280,7 +3280,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk,
 	    nla_put_u32(skb, TIPC_NLA_SOCK_TIPC_STATE, (u32)sk->sk_state) ||
 	    nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)) ||
 	    nla_put_u32(skb, TIPC_NLA_SOCK_UID,
-			from_kuid_munged(sk_user_ns(NETLINK_CB(skb).sk),
+			from_kuid_munged(sk_user_ns(NETLINK_CB(cb->skb).sk),
 					 sock_i_uid(sk))) ||
 	    nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE,
 			      tipc_diag_gen_cookie(sk),
diff --git a/net/tipc/socket.h b/net/tipc/socket.h
index aae3fd4cd06c..aff9b2ae5a1f 100644
--- a/net/tipc/socket.h
+++ b/net/tipc/socket.h
@@ -61,8 +61,8 @@ int tipc_sk_rht_init(struct net *net);
 void tipc_sk_rht_destroy(struct net *net);
 int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb);
 int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb);
-int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk,
-			   u32 sk_filter_state,
+int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
+			   struct tipc_sock *tsk, u32 sk_filter_state,
 			   u64 (*tipc_diag_gen_cookie)(struct sock *sk));
 int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
 		    int (*skb_handler)(struct sk_buff *skb,
-- 
2.13.0

^ permalink raw reply related

* RE: [Intel-wired-lan] [next-queue PATCH v6 08/10] igb: Add MAC address support for ethtool nftuple filters
From: Brown, Aaron F @ 2018-04-07  2:11 UTC (permalink / raw)
  To: Gomes, Vinicius, intel-wired-lan@lists.osuosl.org
  Cc: netdev@vger.kernel.org, Sanchez-Palencia, Jesus
In-Reply-To: <87h8opmt4m.fsf@intel.com>

> From: Gomes, Vinicius
> Sent: Thursday, April 5, 2018 11:00 AM
> To: Brown, Aaron F <aaron.f.brown@intel.com>; intel-wired-
> lan@lists.osuosl.org
> Cc: netdev@vger.kernel.org; Sanchez-Palencia, Jesus <jesus.sanchez-
> palencia@intel.com>
> Subject: RE: [Intel-wired-lan] [next-queue PATCH v6 08/10] igb: Add MAC
> address support for ethtool nftuple filters
> 
> Hi,
> 
> "Brown, Aaron F" <aaron.f.brown@intel.com> writes:
> 
> >> From: Intel-wired-lan [mailto:intel-wired-lan-bounces@osuosl.org] On
> >> Behalf Of Vinicius Costa Gomes
> >> Sent: Thursday, March 29, 2018 2:08 PM
> >> To: intel-wired-lan@lists.osuosl.org
> >> Cc: netdev@vger.kernel.org; Sanchez-Palencia, Jesus <jesus.sanchez-
> >> palencia@intel.com>
> >> Subject: [Intel-wired-lan] [next-queue PATCH v6 08/10] igb: Add MAC
> >> address support for ethtool nftuple filters
> >>
> >> This adds the capability of configuring the queue steering of arriving
> >> packets based on their source and destination MAC addresses.
> >>
> >> In practical terms this adds support for the following use cases,
> >> characterized by these examples:
> >>
> >> $ ethtool -N eth0 flow-type ether dst aa:aa:aa:aa:aa:aa action 0
> >> (this will direct packets with destination address "aa:aa:aa:aa:aa:aa"
> >> to the RX queue 0)
> >
> > This is now working for me, testing with the dst MAC being the MAC on the
> i210.  I set the filter and all the traffic to the destination MAC address gets
> routed to the chosen RX queue.
> >
> >> $ ethtool -N eth0 flow-type ether src 44:44:44:44:44:44 action 3
> >> (this will direct packets with source address "44:44:44:44:44:44" to
> >> the RX queue 3)

Since this apparently does not work without refining the filter down to an ethertype I would like to see this example touched up to include the proto keyword.

> >
> > However, I am still not getting the raw ethernet source filter to
> > work.  Even back to back with no other system to "confuse" the stream,
> > I set the filter so the source MAC is the same as the MAC on the link
> > partner, send traffic and the traffic bounces around the queues as if
> > the filter is not set.
> 
> It seems there is at least a documentation issue in the i210 datasheet,
> steering (placing traffic into a specific queue) by source address
> doesn't work, filtering (accepting the traffic based on some rule) does
> work. I pointed this out in the cover letter of v5 as a known issue, but
> forgot to repeat it for v6, sorry about the confusion.

Yes, I recall that now.  I don't think I quite understood the implication at the time, but after trying it out it that makes perfect sense with what I am seeing.

> 
> But only the filtering part is useful, I think, it enables cases like
> this:
> 
> $ ethtool -N enp2s0 flow-type ether src 68:05:ca:4a:c9:73 proto 0x22f0 action
> 3

Ok, yes, this works.  If I tack on the proto keyword I can filter on whatever ethertype I choose and it seems to direct to the queue as expected.

> 
> I added that note in the hope that someone else would have an stronger
> opinion about what to do.

I don't have a strong opinion beyond my preference for an ideal world where everything works :)  If the part simply cannot filter on the src address as a whole without the protocol I would ideally prefer an attempt in ethtool to set the filter on src address as a whole to return an error WHILE still allowing the filter to be set on an ethertype when the proto keyword is issued.  If ethtool does not allow that fine grain of control then I think the way it is now is good, I'd rather have the annoyance of being able to set a filter that does nothing then not be able to set the more specific filter at all.  

> 
> Anyway, my plan for now will be to document this better and turn the
> case that only the source address is specified into an error.
> 
> >
> >>
> >> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> >> ---
> >>  drivers/net/ethernet/intel/igb/igb_ethtool.c | 35
> >> ++++++++++++++++++++++++----
> >>  1 file changed, 31 insertions(+), 4 deletions(-)
> 
> 
> Cheers,
> --
> Vinicius

^ permalink raw reply

* RE: [Intel-wired-lan] [next-queue PATCH v6 10/10] igb: Add support for adding offloaded clsflower filters
From: Brown, Aaron F @ 2018-04-07  2:19 UTC (permalink / raw)
  To: Gomes, Vinicius, intel-wired-lan@lists.osuosl.org
  Cc: netdev@vger.kernel.org, Sanchez-Palencia, Jesus
In-Reply-To: <20180329210751.11531-11-vinicius.gomes@intel.com>

> From: Intel-wired-lan [mailto:intel-wired-lan-bounces@osuosl.org] On
> Behalf Of Vinicius Costa Gomes
> Sent: Thursday, March 29, 2018 2:08 PM
> To: intel-wired-lan@lists.osuosl.org
> Cc: netdev@vger.kernel.org; Sanchez-Palencia, Jesus <jesus.sanchez-
> palencia@intel.com>
> Subject: [Intel-wired-lan] [next-queue PATCH v6 10/10] igb: Add support for
> adding offloaded clsflower filters
> 
> This allows filters added by tc-flower and specifying MAC addresses,
> Ethernet types, and the VLAN priority field, to be offloaded to the
> controller.

Can I get a brief explanation for enabling this?  I'm currently happy with this patch series from a regression perspective, but am personally a bit, umm, challenged with tc in general but would like to run it through the paces a bit.  If it can be done in a one or two liner I think it would be a good addition to the patch description.

> 
> This reuses most of the infrastructure used by ethtool, but clsflower
> filters are kept in a separated list, so they are invisible to
> ethtool.
> 
> Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
> ---
>  drivers/net/ethernet/intel/igb/igb.h      |   2 +
>  drivers/net/ethernet/intel/igb/igb_main.c | 188
> +++++++++++++++++++++++++++++-
>  2 files changed, 188 insertions(+), 2 deletions(-)

^ permalink raw reply

* Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: Siwei Liu @ 2018-04-07  2:32 UTC (permalink / raw)
  To: David Miller
  Cc: Alexander Duyck, virtio-dev, Jiri Pirko, Michael S. Tsirkin,
	Jakub Kicinski, Samudrala, Sridhar, virtualization, Netdev,
	David Ahern, si-wei liu
In-Reply-To: <20180404.133749.1802514210170809419.davem@davemloft.net>

On Wed, Apr 4, 2018 at 10:37 AM, David Miller <davem@davemloft.net> wrote:
> From: David Ahern <dsahern@gmail.com>
> Date: Wed, 4 Apr 2018 11:21:54 -0600
>
>> It is a netdev so there is no reason to have a separate ip command to
>> inspect it. 'ip link' is the right place.
>
> I agree on this.

I'm completely fine of having an API for inspection purpose. The thing
is that we'd perhaps need to go for the namespace approach, for which
I think everyone seems to agree not to fiddle with the ":" prefix, but
rather have a new class of network subsystem under /sys/class thus a
separate device namespace e.g. /sys/class/net-kernel for those
auto-managed lower netdevs is needed.

And I assume everyone here understands the use case for live migration
(in the context of providing cloud service) is very different, and we
have to hide the netdevs. If not, I'm more than happy to clarify.

With that in mind, if having a new class of net-kernel namespace, we
can name the kernel device elaborately which is not neccessarily equal
to the device name exposed to userspace. For example, we can use
driver name as the prefix as opposed to "eth" or ":eth". And we don't
need to have auto-managed netdevs locked into the ":" prefix at all (I
intentionally left it out in the this RFC patch to ask for comments on
the namespace solution which is much cleaner). That said, an userpsace
named device through udev may call something like ens3 and
switch1-port2, but in the kernel-net namespace, it may look like
ixgbevf0 and mlxsw1p2.

So if we all agree introducing a new namespace is the rigth thing to
do, `ip link' will no longer serve the purpose of displaying the
information for kernel-net devnames for the sake of avoiding ambiguity
and namespace collision: it's entirely possible the ip link name could
collide with a kernel-net devname, it's become unclear which name of a
netdev object the command is expected to operate on. That's why I
thought showing the kernel-only netdevs using a separate subcommand
makes more sense.

Thoughts and comments? Please let me know.

Thanks,
-Siwei

>
> What I really don't understand still is the use case... really.
>
> So there are control netdevs, what exactly is the problem with that?
>
> Are we not exporting enough information for applications to handle
> these devices sanely?  If so, then's let add that information.
>
> We can set netdev->type to ETH_P_LINUXCONTROL or something like that.
>
> Another alternative is to add an interface flag like IFF_CONTROL or
> similar, and that probably is much nicer.
>
> Hiding the devices means that we acknowledge that applications are
> currently broken with control netdevs... and we want them to stay
> broken!
>
> That doesn't sound like a good plan to me.
>
> So let's fix handling of control netdevs instead of hiding them.
>
> Thanks.

^ permalink raw reply

* Re: Re: [RFC PATCH 1/3] qemu: virtio-bypass should explicitly bind to a passthrough device
From: Siwei Liu @ 2018-04-07  2:54 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Michael S. Tsirkin, Si-Wei Liu, Jiri Pirko, Stephen Hemminger,
	Alexander Duyck, David Miller, Brandeburg, Jesse, Jakub Kicinski,
	Jason Wang, Samudrala, Sridhar, Netdev, virtualization,
	virtio-dev
In-Reply-To: <d666b8a9-9ff8-4a56-0faa-ef3d0bf81d25@redhat.com>

(click the wrong reply button again, sorry)


On Thu, Apr 5, 2018 at 8:31 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 04/04/2018 10:02, Siwei Liu wrote:
>>> pci_bus_num is almost always a bug if not done within
>>> a context of a PCI host, bridge, etc.
>>>
>>> In particular this will not DTRT if run before guest assigns bus
>>> numbers.
>>>
>> I was seeking means to reserve a specific pci bus slot from drivers,
>> and update the driver when guest assigns the bus number but it seems
>> there's no low-hanging fruits. Because of that reason the bus_num is
>> only obtained until it's really needed (during get_config) and I
>> assume at that point the pci bus assignment is already done. I know
>> the current one is not perfect, but we need that information (PCI
>> bus:slot.func number) to name the guest device correctly.
>
> Can you use the -device "id", and look it up as
>
>     devices = container_get(qdev_get_machine(), "/peripheral");
>     return object_resolve_path_component(devices, id);


No. The problem of using device id is that the vfio device may come
and go at any time, this is particularly true when live migration is
happening. There's no gurantee we can get the bus:device.func info if
that device is gone. Currently the binding between vfio and virtio-net
is weakly coupled through the backup property, there's no better way
than specifying the bus id and addr property directly.

Regards,
-Siwei

>
> ?
>
> Thanks,
>
> Paolo

^ permalink raw reply

* Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: Andrew Lunn @ 2018-04-07  3:19 UTC (permalink / raw)
  To: Siwei Liu
  Cc: Alexander Duyck, virtio-dev, Jiri Pirko, Michael S. Tsirkin,
	Jakub Kicinski, Samudrala, Sridhar, virtualization, Netdev,
	David Ahern, si-wei liu, David Miller
In-Reply-To: <CADGSJ206RKi704uoLe3xVs6PyWpURPLb75OPCM_YO9DfnnEzpw@mail.gmail.com>

Hi Siwei

> I think everyone seems to agree not to fiddle with the ":" prefix, but
> rather have a new class of network subsystem under /sys/class thus a
> separate device namespace e.g. /sys/class/net-kernel for those
> auto-managed lower netdevs is needed.
 
How do you get a device into this new class? I don't know the Linux
driver model too well, but to get a device out of one class and into
another, i think you need to device_del(dev). modify dev->class and
then device_add(dev). However, device_add() says you are not allowed
to do this.

And i don't even see how this helps. Are you also not going to call
list_netdevice()? Are you going to add some other list for these
devices in a different class?

   Andrew

^ permalink raw reply

* [PATCH 0/8] ipconfig: NTP server support, bug fixes, documentation improvements
From: Chris Novakovic @ 2018-04-07  4:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Chris Novakovic

This series (against net-next) makes various improvements to ipconfig:

 - Patch #1 correctly documents the behaviour of parameter 4 in the
   "ip=" and "nfsaddrs=" command line parameter.
 - Patch #2 tidies up the printk()s for reporting configured name
   servers.
 - Patch #3 fixes a bug in autoconfiguration via BOOTP whereby the IP
   addresses of IEN-116 name servers are requested from the BOOTP
   server, rather than those of DNS name servers.
 - Patch #4 requests the number of DNS servers specified by
   CONF_NAMESERVERS_MAX when autoconfiguring via BOOTP, rather than
   hardcoding it to 2.
 - Patch #5 fully documents the contents and format of /proc/net/pnp in
   Documentation/filesystems/nfs/nfsroot.txt.
 - Patch #6 fixes a bug whereby bogus information is written to
   /proc/net/pnp when ipconfig is not used.
 - Patch #7 allows for NTP servers to be configured (manually on the
   kernel command line or automatically via DHCP), enabling systems with
   an NFS root filesystem to synchronise their clock before mounting
   their root filesystem.

Patch #7 gets a few warnings when run through checkpatch.pl, but I felt
it'd be better to use the same style as surrounding code where
appropriate, rather than adhering strictly to the net/ style guide.

Chris Novakovic (8):
  ipconfig: Document setting of NIS domain name
  ipconfig: Tidy up reporting of name servers
  ipconfig: BOOTP: Don't request IEN-116 name servers
  ipconfig: BOOTP: Request CONF_NAMESERVERS_MAX name servers
  ipconfig: Document /proc/net/pnp
  ipconfig: Correctly initialise ic_nameservers
  ipconfig: Write NTP server IPs to /proc/net/ntp
  CREDITS: Add Chris Novakovic

 CREDITS                                   |   4 +
 Documentation/filesystems/nfs/nfsroot.txt |  70 ++++++++++++++---
 net/ipv4/ipconfig.c                       | 121 +++++++++++++++++++++++++++---
 3 files changed, 174 insertions(+), 21 deletions(-)

-- 
2.14.1

^ permalink raw reply

* [PATCH 1/8] ipconfig: Document setting of NIS domain name
From: Chris Novakovic @ 2018-04-07  4:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Chris Novakovic
In-Reply-To: <20180407040903.8997-1-chris@chrisn.me.uk>

ic_do_bootp_ext() is responsible for parsing the "ip=" and "nfsaddrs="
kernel parameters. If a "." character is found in parameter 4 (the
client's hostname), everything before the first "." is used as the
hostname, and everything after it is used as the NIS domain name (but
not necessarily the DNS domain name).

Document this behaviour in Documentation/filesystems/nfs/nfsroot.txt,
as it is not made explicit.

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
---
 Documentation/filesystems/nfs/nfsroot.txt | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index 5efae00f6c7f..1513e5d663fd 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -123,10 +123,13 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
 
 		Default:  Determined using autoconfiguration.
 
-  <hostname>	Name of the client. May be supplied by autoconfiguration,
-  		but its absence will not trigger autoconfiguration.
-		If specified and DHCP is used, the user provided hostname will
-		be carried in the DHCP request to hopefully update DNS record.
+  <hostname>	Name of the client. If a '.' character is present, anything
+		before the first '.' is used as the client's hostname, and anything
+		after it is used as its NIS domain name. May be supplied by
+		autoconfiguration, but its absence will not trigger autoconfiguration.
+		If specified and DHCP is used, the user-provided hostname (and NIS
+		domain name, if present) will be carried in the DHCP request; this
+		may cause a DNS record to be created or updated for the client.
 
   		Default: Client IP address is used in ASCII notation.
 
-- 
2.14.1

^ permalink raw reply related

* [PATCH 2/8] ipconfig: Tidy up reporting of name servers
From: Chris Novakovic @ 2018-04-07  4:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Chris Novakovic
In-Reply-To: <20180407040903.8997-1-chris@chrisn.me.uk>

Commit 5e953778a2aab04929a5e7b69f53dc26e39b079e ("ipconfig: add
nameserver IPs to kernel-parameter ip=") adds the IP addresses of
discovered name servers to the summary printed by ipconfig when
configuration is complete. It appears the intention in ip_auto_config()
was to print the name servers on a new line (especially given the
spacing and lack of comma before "nameserver0="), but they're actually
printed on the same line as the NFS root filesystem configuration
summary:

  [    0.686186] IP-Config: Complete:
  [    0.686226]      device=eth0, hwaddr=xx:xx:xx:xx:xx:xx, ipaddr=10.0.0.2, mask=255.255.255.0, gw=10.0.0.1
  [    0.686328]      host=test, domain=example.com, nis-domain=(none)
  [    0.686386]      bootserver=10.0.0.1, rootserver=10.0.0.1, rootpath=     nameserver0=10.0.0.1

This makes it harder to read and parse ipconfig's output. Instead, print
the name servers on a separate line:

  [    0.791250] IP-Config: Complete:
  [    0.791289]      device=eth0, hwaddr=xx:xx:xx:xx:xx:xx, ipaddr=10.0.0.2, mask=255.255.255.0, gw=10.0.0.1
  [    0.791407]      host=test, domain=example.com, nis-domain=(none)
  [    0.791475]      bootserver=10.0.0.1, rootserver=10.0.0.1, rootpath=
  [    0.791476]      nameserver0=10.0.0.1

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
---
 net/ipv4/ipconfig.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 43f620feb1c4..d0ea0ecc9008 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1481,16 +1481,19 @@ static int __init ip_auto_config(void)
 		&ic_servaddr, &root_server_addr, root_server_path);
 	if (ic_dev_mtu)
 		pr_cont(", mtu=%d", ic_dev_mtu);
-	for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+	/* Name servers (if any): */
+	for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
 		if (ic_nameservers[i] != NONE) {
-			pr_cont("     nameserver%u=%pI4",
-				i, &ic_nameservers[i]);
-			break;
+			if (i == 0)
+				pr_info("     nameserver%u=%pI4",
+					i, &ic_nameservers[i]);
+			else
+				pr_cont(", nameserver%u=%pI4",
+					i, &ic_nameservers[i]);
 		}
-	for (i++; i < CONF_NAMESERVERS_MAX; i++)
-		if (ic_nameservers[i] != NONE)
-			pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]);
-	pr_cont("\n");
+		if (i + 1 == CONF_NAMESERVERS_MAX)
+			pr_cont("\n");
+	}
 #endif /* !SILENT */
 
 	/*
-- 
2.14.1

^ permalink raw reply related

* [PATCH 3/8] ipconfig: BOOTP: Don't request IEN-116 name servers
From: Chris Novakovic @ 2018-04-07  4:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Chris Novakovic
In-Reply-To: <20180407040903.8997-1-chris@chrisn.me.uk>

When ipconfig is autoconfigured via BOOTP, the request packet
initialised by ic_bootp_init_ext() allocates 8 bytes for tag 5 ("Name
Server" [1, §3.7]), but tag 5 in the response isn't processed by
ic_do_bootp_ext(). Instead, allocate the 8 bytes to tag 6 ("Domain Name
Server" [1, §3.8]), which is processed by ic_do_bootp_ext(), and appears
to have been the intended tag to request.

This won't cause any breakage for existing users, as tag 5 responses
provided by BOOTP servers weren't being processed anyway.

[1] RFC 2132, "DHCP Options and BOOTP Vendor Extensions":
    https://tools.ietf.org/rfc/rfc2132.txt

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
---
 net/ipv4/ipconfig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index d0ea0ecc9008..bcf3c4f9882d 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -721,7 +721,7 @@ static void __init ic_bootp_init_ext(u8 *e)
 	*e++ = 3;		/* Default gateway request */
 	*e++ = 4;
 	e += 4;
-	*e++ = 5;		/* Name server request */
+	*e++ = 6;		/* (DNS) name server request */
 	*e++ = 8;
 	e += 8;
 	*e++ = 12;		/* Host name request */
-- 
2.14.1

^ permalink raw reply related

* [PATCH 4/8] ipconfig: BOOTP: Request CONF_NAMESERVERS_MAX name servers
From: Chris Novakovic @ 2018-04-07  4:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Chris Novakovic
In-Reply-To: <20180407040903.8997-1-chris@chrisn.me.uk>

When ipconfig is autoconfigured via BOOTP, the request packet
initialised by ic_bootp_init_ext() always allocates 8 bytes for the name
server option, limiting the BOOTP server to responding with at most 2
name servers even though ipconfig in fact supports an arbitrary number
of name servers (as defined by CONF_NAMESERVERS_MAX, which is currently
3).

Only request name servers in the request packet if CONF_NAMESERVERS_MAX
is positive (to comply with [1, §3.8]), and allocate enough space in the
packet for CONF_NAMESERVERS_MAX name servers to indicate the maximum
number we can accept in response.

[1] RFC 2132, "DHCP Options and BOOTP Vendor Extensions":
    https://tools.ietf.org/rfc/rfc2132.txt

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
---
 net/ipv4/ipconfig.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index bcf3c4f9882d..0f460d6d3cce 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -721,9 +721,11 @@ static void __init ic_bootp_init_ext(u8 *e)
 	*e++ = 3;		/* Default gateway request */
 	*e++ = 4;
 	e += 4;
+#if CONF_NAMESERVERS_MAX > 0
 	*e++ = 6;		/* (DNS) name server request */
-	*e++ = 8;
-	e += 8;
+	*e++ = 4 * CONF_NAMESERVERS_MAX;
+	e += 4 * CONF_NAMESERVERS_MAX;
+#endif
 	*e++ = 12;		/* Host name request */
 	*e++ = 32;
 	e += 32;
-- 
2.14.1

^ permalink raw reply related

* [PATCH 5/8] ipconfig: Document /proc/net/pnp
From: Chris Novakovic @ 2018-04-07  4:09 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Chris Novakovic
In-Reply-To: <20180407040903.8997-1-chris@chrisn.me.uk>

Fully document the format used by the /proc/net/pnp file written by
ipconfig, explain where its values originate from, and clarify that the
tertiary name server IP and DNS domain name are only written to the file
when autoconfiguration is used.

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
---
 Documentation/filesystems/nfs/nfsroot.txt | 34 ++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index 1513e5d663fd..a1030bea60d3 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -110,6 +110,9 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
 		will not be triggered if it is missing and NFS root is not
 		in operation.
 
+		Value is exported to /proc/net/pnp with the prefix "bootserver "
+		(see below).
+
 		Default: Determined using autoconfiguration.
 		         The address of the autoconfiguration server is used.
 
@@ -165,12 +168,33 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
 
                 Default: any
 
-  <dns0-ip>	IP address of first nameserver.
-		Value gets exported by /proc/net/pnp which is often linked
-		on embedded systems by /etc/resolv.conf.
+  <dns0-ip>	IP address of primary nameserver.
+		Value is exported to /proc/net/pnp with the prefix "nameserver "
+		(see below).
+
+		Default: None if not using autoconfiguration; determined
+		automatically if using autoconfiguration.
+
+  <dns1-ip>	IP address of secondary nameserver.
+		See <dns0-ip>.
+
+  After configuration (whether manual or automatic) is complete, a file is
+  created at /proc/net/pnp in the following format; lines are omitted if
+  their respective value is empty following configuration.
+
+	#PROTO: <DHCP|BOOTP|RARP|MANUAL>	(depending on configuration method)
+	domain <dns-domain>			(if autoconfigured, the DNS domain)
+	nameserver <dns0-ip>			(primary name server IP)
+	nameserver <dns1-ip>			(secondary name server IP)
+	nameserver <dns2-ip>			(tertiary name server IP)
+	bootserver <server-ip>			(NFS server IP)
+
+  <dns-domain> and <dns2-ip> are requested during autoconfiguration; they
+  cannot be specified as part of the "ip=" kernel command line parameter.
 
-  <dns1-ip>	IP address of second nameserver.
-		Same as above.
+  Because the "domain" and "nameserver" options are recognised by DNS
+  resolvers, /etc/resolv.conf is often linked to /proc/net/pnp on systems
+  that use an NFS root filesystem.
 
 
 nfsrootdebug
-- 
2.14.1

^ permalink raw reply related

* [PATCH 6/8] ipconfig: Correctly initialise ic_nameservers
From: Chris Novakovic @ 2018-04-07  4:09 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Chris Novakovic
In-Reply-To: <20180407040903.8997-1-chris@chrisn.me.uk>

ic_nameservers, which stores the list of name servers discovered by
ipconfig, is initialised (i.e. has all of its elements set to NONE, or
0xffffffff) by ic_nameservers_predef() in the following scenarios:

 - before the "ip=" and "nfsaddrs=" kernel command line parameters are
   parsed (in ip_auto_config_setup());
 - before autoconfiguring via DHCP or BOOTP (in ic_bootp_init()), in
   order to clear any values that may have been set after parsing "ip="
   or "nfsaddrs=" and are no longer needed.

This means that ic_nameservers_predef() is not called when neither "ip="
nor "nfsaddrs=" is specified on the kernel command line. In this
scenario, every element in ic_nameservers remains set to 0x00000000,
which is indistinguishable from ANY and causes pnp_seq_show() to write
the following (bogus) information to /proc/net/pnp:

  #MANUAL
  nameserver 0.0.0.0
  nameserver 0.0.0.0
  nameserver 0.0.0.0

This is potentially problematic for systems that blindly link
/etc/resolv.conf to /proc/net/pnp.

Ensure that ic_nameservers is also initialised when neither "ip=" nor
"nfsaddrs=" is specified by calling ic_nameservers_predef() in
ip_auto_config(), but only when ip_auto_config_setup() was not called
earlier. This causes the following to be written to /proc/net/pnp, and
is consistent with what gets written when ipconfig is configured
manually but no name servers are specified on the kernel command line:

  #MANUAL

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
---
 net/ipv4/ipconfig.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 0f460d6d3cce..e11dfd29a929 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -750,6 +750,11 @@ static void __init ic_bootp_init_ext(u8 *e)
  */
 static inline void __init ic_bootp_init(void)
 {
+	/* Re-initialise all name servers to NONE, in case any were set via the
+	 * "ip=" or "nfsaddrs=" kernel command line parameters: any IP addresses
+	 * specified there will already have been decoded but are no longer
+	 * needed
+	 */
 	ic_nameservers_predef();
 
 	dev_add_pack(&bootp_packet_type);
@@ -1370,6 +1375,13 @@ static int __init ip_auto_config(void)
 	int err;
 	unsigned int i;
 
+	/* Initialise all name servers to NONE (but only if the "ip=" or
+	 * "nfsaddrs=" kernel command line parameters weren't decoded, otherwise
+	 * we'll overwrite the IP addresses specified there)
+	 */
+	if (ic_set_manually == 0)
+		ic_nameservers_predef();
+
 #ifdef CONFIG_PROC_FS
 	proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops);
 #endif /* CONFIG_PROC_FS */
@@ -1593,6 +1605,7 @@ static int __init ip_auto_config_setup(char *addrs)
 		return 1;
 	}
 
+	/* Initialise all name servers to NONE */
 	ic_nameservers_predef();
 
 	/* Parse string for static IP assignment.  */
-- 
2.14.1

^ permalink raw reply related

* [PATCH 7/8] ipconfig: Write NTP server IPs to /proc/net/ntp
From: Chris Novakovic @ 2018-04-07  4:09 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Chris Novakovic
In-Reply-To: <20180407040903.8997-1-chris@chrisn.me.uk>

Distributed filesystems are most effective when the server and client
clocks are synchronised. Embedded devices often use NFS for their
root filesystem but typically do not contain an RTC, so the clocks of
the NFS server and the embedded device will be out-of-sync when the root
filesystem is mounted (and may not be synchronised until late in the
boot process).

Extend ipconfig with the ability to export IP addresses of NTP servers
it discovers to /proc/net/ntp. They can be supplied as follows:

 - If ipconfig is configured manually via the "ip=" or "nfsaddrs="
   kernel command line parameters, one NTP server can be specified in
   the new "<ntp-ip>" parameter.
 - If ipconfig is autoconfigured via DHCP, request DHCP option 42 in
   the DHCPDISCOVER message, and record the IP addresses of up to three
   NTP servers sent by the responding DHCP server in the subsequent
   DHCPOFFER message.

ipconfig will only write the NTP server IP addresses it discovers to
/proc/net/ntp, one per line (in the order received from the DHCP server,
if DHCP autoconfiguration is used); making use of these NTP servers is
the responsibility of a user space process (e.g. an initrd/initram
script that invokes an NTP client before mounting an NFS root
filesystem).

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
---
 Documentation/filesystems/nfs/nfsroot.txt | 35 +++++++++--
 net/ipv4/ipconfig.c                       | 99 ++++++++++++++++++++++++++++---
 2 files changed, 119 insertions(+), 15 deletions(-)

diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index a1030bea60d3..4d55470f7ca9 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -5,6 +5,7 @@ Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
 Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
 Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
 Updated 2006 by Horms <horms@verge.net.au>
+Updated 2018 by Chris Novakovic <chris@chrisn.me.uk>
 
 
 
@@ -79,7 +80,7 @@ nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
 
 
 ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
-   <dns0-ip>:<dns1-ip>
+   <dns0-ip>:<dns1-ip>:<nfs0-ip>
 
   This parameter tells the kernel how to configure IP addresses of devices
   and also how to set up the IP routing table. It was originally called
@@ -178,9 +179,18 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
   <dns1-ip>	IP address of secondary nameserver.
 		See <dns0-ip>.
 
-  After configuration (whether manual or automatic) is complete, a file is
-  created at /proc/net/pnp in the following format; lines are omitted if
-  their respective value is empty following configuration.
+  <ntp-ip>	IP address of a Network Time Protocol (NTP) server.
+		Value is exported to /proc/net/ntp, but is otherwise unused
+		(see below).
+
+		Default: None if not using autoconfiguration; determined
+		automatically if using autoconfiguration.
+
+  After configuration (whether manual or automatic) is complete, two files
+  are created in the following format; lines are omitted if their respective
+  value is empty following configuration:
+
+  - /proc/net/pnp:
 
 	#PROTO: <DHCP|BOOTP|RARP|MANUAL>	(depending on configuration method)
 	domain <dns-domain>			(if autoconfigured, the DNS domain)
@@ -189,13 +199,26 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
 	nameserver <dns2-ip>			(tertiary name server IP)
 	bootserver <server-ip>			(NFS server IP)
 
-  <dns-domain> and <dns2-ip> are requested during autoconfiguration; they
-  cannot be specified as part of the "ip=" kernel command line parameter.
+  - /proc/net/ntp:
+
+	<ntp0-ip>				(NTP server IP)
+	<ntp1-ip>				(NTP server IP)
+	<ntp2-ip>				(NTP server IP)
+
+  <dns-domain> and <dns2-ip> (in /proc/net/pnp) and <ntp1-ip> and <ntp2-ip>
+  (in /proc/net/ntp) are requested during autoconfiguration; they cannot be
+  specified as part of the "ip=" kernel command line parameter.
 
   Because the "domain" and "nameserver" options are recognised by DNS
   resolvers, /etc/resolv.conf is often linked to /proc/net/pnp on systems
   that use an NFS root filesystem.
 
+  Note that the kernel will not synchronise the system time with any NTP
+  servers it discovers; this is the responsibility of a user space process
+  (e.g. an initrd/initramfs script that passes the IP addresses listed in
+  /proc/net/ntp to an NTP client before mounting the real root filesystem
+  if it is on NFS).
+
 
 nfsrootdebug
 
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index e11dfd29a929..a5d68e506494 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -28,6 +28,9 @@
  *
  *  Multiple Nameservers in /proc/net/pnp
  *              --  Josef Siemes <jsiemes@web.de>, Aug 2002
+ *
+ *  NTP servers in /proc/net/ntp
+ *              --  Chris Novakovic <chris@chrisn.me.uk>, April 2018
  */
 
 #include <linux/types.h>
@@ -93,6 +96,7 @@
 #define CONF_TIMEOUT_MAX	(HZ*30)	/* Maximum allowed timeout */
 #define CONF_NAMESERVERS_MAX   3       /* Maximum number of nameservers
 					   - '3' from resolv.h */
+#define CONF_NTP_SERVERS_MAX   3	/* Maximum number of NTP servers */
 
 #define NONE cpu_to_be32(INADDR_NONE)
 #define ANY cpu_to_be32(INADDR_ANY)
@@ -152,6 +156,7 @@ static int ic_proto_used;			/* Protocol used, if any */
 #define ic_proto_used 0
 #endif
 static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static __be32 ic_ntp_servers[CONF_NTP_SERVERS_MAX]; /* NTP server IP addresses */
 static u8 ic_domain[64];		/* DNS (not NIS) domain name */
 
 /*
@@ -576,6 +581,17 @@ static inline void __init ic_nameservers_predef(void)
 		ic_nameservers[i] = NONE;
 }
 
+/*
+ *  Predefine NTP servers
+ */
+static inline void __init ic_ntp_servers_predef(void)
+{
+	int i;
+
+	for (i = 0; i < CONF_NTP_SERVERS_MAX; i++)
+		ic_ntp_servers[i] = NONE;
+}
+
 /*
  *	DHCP/BOOTP support.
  */
@@ -671,6 +687,7 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d)
 			17,	/* Boot path */
 			26,	/* MTU */
 			40,	/* NIS domain name */
+			42,	/* NTP servers */
 		};
 
 		*e++ = 55;	/* Parameter request list */
@@ -750,12 +767,13 @@ static void __init ic_bootp_init_ext(u8 *e)
  */
 static inline void __init ic_bootp_init(void)
 {
-	/* Re-initialise all name servers to NONE, in case any were set via the
-	 * "ip=" or "nfsaddrs=" kernel command line parameters: any IP addresses
-	 * specified there will already have been decoded but are no longer
-	 * needed
+	/* Re-initialise all name servers and NTP servers to NONE, in case any
+	 * were set via the "ip=" or "nfsaddrs=" kernel command line parameters:
+	 * any IP addresses specified there will already have been decoded but
+	 * are no longer needed
 	 */
 	ic_nameservers_predef();
+	ic_ntp_servers_predef();
 
 	dev_add_pack(&bootp_packet_type);
 }
@@ -919,6 +937,15 @@ static void __init ic_do_bootp_ext(u8 *ext)
 		ic_bootp_string(utsname()->domainname, ext+1, *ext,
 				__NEW_UTS_LEN);
 		break;
+	case 42:	/* NTP servers */
+		servers = *ext / 4;
+		if (servers > CONF_NTP_SERVERS_MAX)
+			servers = CONF_NTP_SERVERS_MAX;
+		for (i = 0; i < servers; i++) {
+			if (ic_ntp_servers[i] == NONE)
+				memcpy(&ic_ntp_servers[i], ext+1+4*i, 4);
+		}
+		break;
 	}
 }
 
@@ -1265,6 +1292,9 @@ static int __init ic_dynamic(void)
 
 #ifdef CONFIG_PROC_FS
 
+/*
+ *  Name servers:
+ */
 static int pnp_seq_show(struct seq_file *seq, void *v)
 {
 	int i;
@@ -1301,6 +1331,33 @@ static const struct file_operations pnp_seq_fops = {
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
+
+/*
+ *  NTP servers:
+ */
+static int ntp_seq_show(struct seq_file *seq, void *v)
+{
+	int i;
+
+	for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+		if (ic_ntp_servers[i] != NONE)
+			seq_printf(seq, "%pI4\n", &ic_ntp_servers[i]);
+	}
+	return 0;
+}
+
+static int ntp_seq_open(struct inode *indoe, struct file *file)
+{
+	return single_open(file, ntp_seq_show, NULL);
+}
+
+static const struct file_operations ntp_seq_fops = {
+	.open		= ntp_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 #endif /* CONFIG_PROC_FS */
 
 /*
@@ -1375,15 +1432,18 @@ static int __init ip_auto_config(void)
 	int err;
 	unsigned int i;
 
-	/* Initialise all name servers to NONE (but only if the "ip=" or
-	 * "nfsaddrs=" kernel command line parameters weren't decoded, otherwise
-	 * we'll overwrite the IP addresses specified there)
+	/* Initialise all name servers and NTP servers to NONE (but only if the
+	 * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded,
+	 * otherwise we'll overwrite the IP addresses specified there)
 	 */
-	if (ic_set_manually == 0)
+	if (ic_set_manually == 0) {
 		ic_nameservers_predef();
+		ic_ntp_servers_predef();
+	}
 
 #ifdef CONFIG_PROC_FS
 	proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops);
+	proc_create("ntp", 0444, init_net.proc_net, &ntp_seq_fops);
 #endif /* CONFIG_PROC_FS */
 
 	if (!ic_enable)
@@ -1508,6 +1568,19 @@ static int __init ip_auto_config(void)
 		if (i + 1 == CONF_NAMESERVERS_MAX)
 			pr_cont("\n");
 	}
+	/* NTP servers (if any): */
+	for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+		if (ic_ntp_servers[i] != NONE) {
+			if (i == 0)
+				pr_info("     ntpserver%u=%pI4",
+					i, &ic_ntp_servers[i]);
+			else
+				pr_cont(", ntpserver%u=%pI4",
+					i, &ic_ntp_servers[i]);
+		}
+		if (i + 1 == CONF_NTP_SERVERS_MAX)
+			pr_cont("\n");
+	}
 #endif /* !SILENT */
 
 	/*
@@ -1605,8 +1678,9 @@ static int __init ip_auto_config_setup(char *addrs)
 		return 1;
 	}
 
-	/* Initialise all name servers to NONE */
+	/* Initialise all name servers and NTP servers to NONE */
 	ic_nameservers_predef();
+	ic_ntp_servers_predef();
 
 	/* Parse string for static IP assignment.  */
 	ip = addrs;
@@ -1665,6 +1739,13 @@ static int __init ip_auto_config_setup(char *addrs)
 						ic_nameservers[1] = NONE;
 				}
 				break;
+			case 9:
+				if (CONF_NTP_SERVERS_MAX >= 1) {
+					ic_ntp_servers[0] = in_aton(ip);
+					if (ic_ntp_servers[0] == ANY)
+						ic_ntp_servers[0] = NONE;
+				}
+				break;
 			}
 		}
 		ip = cp;
-- 
2.14.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox