* Re: [RFC PATCH net-next v5 2/4] net: Introduce generic bypass module
From: Jiri Pirko @ 2018-04-06 12:57 UTC (permalink / raw)
To: Sridhar Samudrala
Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
virtualization, loseweigh, davem
In-Reply-To: <1522962503-3963-3-git-send-email-sridhar.samudrala@intel.com>
Thu, Apr 05, 2018 at 11:08:21PM CEST, sridhar.samudrala@intel.com wrote:
>This provides a generic interface for paravirtual drivers to listen
>for netdev register/unregister/link change events from pci ethernet
>devices with the same MAC and takeover their datapath. The notifier and
>event handling code is based on the existing netvsc implementation. A
>paravirtual driver can use this module by registering a set of ops and
>each instance of the device when it is probed.
>
>Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>---
> include/net/bypass.h | 80 ++++++++++
> net/Kconfig | 18 +++
> net/core/Makefile | 1 +
> net/core/bypass.c | 406 +++++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 505 insertions(+)
> create mode 100644 include/net/bypass.h
> create mode 100644 net/core/bypass.c
>
>diff --git a/include/net/bypass.h b/include/net/bypass.h
>new file mode 100644
>index 000000000000..e2dd122f951a
>--- /dev/null
>+++ b/include/net/bypass.h
>@@ -0,0 +1,80 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/* Copyright (c) 2018, Intel Corporation. */
>+
>+#ifndef _NET_BYPASS_H
>+#define _NET_BYPASS_H
>+
>+#include <linux/netdevice.h>
>+
>+struct bypass_ops {
Perhaps "net_bypass_" would be better prefix for this module structs
and functions. No strong opinion though.
>+ int (*register_child)(struct net_device *bypass_netdev,
>+ struct net_device *child_netdev);
We have master/slave upper/lower netdevices. This adds "child". Consider
using some existing names. Not sure if possible without loss of meaning.
>+ int (*join_child)(struct net_device *bypass_netdev,
>+ struct net_device *child_netdev);
>+ int (*unregister_child)(struct net_device *bypass_netdev,
>+ struct net_device *child_netdev);
>+ int (*release_child)(struct net_device *bypass_netdev,
>+ struct net_device *child_netdev);
>+ int (*update_link)(struct net_device *bypass_netdev,
>+ struct net_device *child_netdev);
>+ rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>+};
>+
>+struct bypass_instance {
>+ struct list_head list;
>+ struct net_device __rcu *bypass_netdev;
>+ struct bypass *bypass;
>+};
>+
>+struct bypass {
>+ struct list_head list;
>+ const struct bypass_ops *ops;
>+ const struct net_device_ops *netdev_ops;
>+ struct list_head instance_list;
>+ struct mutex lock;
>+};
>+
>+#if IS_ENABLED(CONFIG_NET_BYPASS)
>+
>+struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>+ const struct net_device_ops *netdev_ops);
>+void bypass_unregister_driver(struct bypass *bypass);
>+
>+int bypass_register_instance(struct bypass *bypass, struct net_device *dev);
>+int bypass_unregister_instance(struct bypass *bypass, struct net_device *dev);
>+
>+int bypass_unregister_child(struct net_device *child_netdev);
>+
>+#else
>+
>+static inline
>+struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>+ const struct net_device_ops *netdev_ops)
>+{
>+ return NULL;
>+}
>+
>+static inline void bypass_unregister_driver(struct bypass *bypass)
>+{
>+}
>+
>+static inline int bypass_register_instance(struct bypass *bypass,
>+ struct net_device *dev)
>+{
>+ return 0;
>+}
>+
>+static inline int bypass_unregister_instance(struct bypass *bypass,
>+ struct net_device *dev)
>+{
>+ return 0;
>+}
>+
>+static inline int bypass_unregister_child(struct net_device *child_netdev)
>+{
>+ return 0;
>+}
>+
>+#endif
>+
>+#endif /* _NET_BYPASS_H */
>diff --git a/net/Kconfig b/net/Kconfig
>index 0428f12c25c2..994445f4a96a 100644
>--- a/net/Kconfig
>+++ b/net/Kconfig
>@@ -423,6 +423,24 @@ config MAY_USE_DEVLINK
> on MAY_USE_DEVLINK to ensure they do not cause link errors when
> devlink is a loadable module and the driver using it is built-in.
>
>+config NET_BYPASS
>+ tristate "Bypass interface"
>+ ---help---
>+ This provides a generic interface for paravirtual drivers to listen
>+ for netdev register/unregister/link change events from pci ethernet
>+ devices with the same MAC and takeover their datapath. This also
>+ enables live migration of a VM with direct attached VF by failing
>+ over to the paravirtual datapath when the VF is unplugged.
>+
>+config MAY_USE_BYPASS
>+ tristate
>+ default m if NET_BYPASS=m
>+ default y if NET_BYPASS=y || NET_BYPASS=n
>+ help
>+ Drivers using the bypass infrastructure should have a dependency
>+ on MAY_USE_BYPASS to ensure they do not cause link errors when
>+ bypass is a loadable module and the driver using it is built-in.
>+
> endif # if NET
>
> # Used by archs to tell that they support BPF JIT compiler plus which flavour.
>diff --git a/net/core/Makefile b/net/core/Makefile
>index 6dbbba8c57ae..a9727ed1c8fc 100644
>--- a/net/core/Makefile
>+++ b/net/core/Makefile
>@@ -30,3 +30,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
> obj-$(CONFIG_HWBM) += hwbm.o
> obj-$(CONFIG_NET_DEVLINK) += devlink.o
> obj-$(CONFIG_GRO_CELLS) += gro_cells.o
>+obj-$(CONFIG_NET_BYPASS) += bypass.o
>diff --git a/net/core/bypass.c b/net/core/bypass.c
>new file mode 100644
>index 000000000000..7bde962ec3d4
>--- /dev/null
>+++ b/net/core/bypass.c
>@@ -0,0 +1,406 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/* Copyright (c) 2018, Intel Corporation. */
>+
>+/* A common module to handle registrations and notifications for paravirtual
>+ * drivers to enable accelerated datapath and support VF live migration.
>+ *
>+ * The notifier and event handling code is based on netvsc driver.
>+ */
>+
>+#include <linux/netdevice.h>
>+#include <linux/etherdevice.h>
>+#include <linux/ethtool.h>
>+#include <linux/module.h>
>+#include <linux/slab.h>
>+#include <linux/netdevice.h>
>+#include <linux/netpoll.h>
>+#include <linux/rtnetlink.h>
>+#include <linux/if_vlan.h>
>+#include <net/sch_generic.h>
>+#include <uapi/linux/if_arp.h>
>+#include <net/bypass.h>
>+
>+static LIST_HEAD(bypass_list);
>+
>+static DEFINE_MUTEX(bypass_mutex);
Why mutex? Apparently you don't need to sleep while holding a lock.
Simple spinlock would do.
>+
>+struct bypass_instance *bypass_instance_alloc(struct net_device *dev)
>+{
>+ struct bypass_instance *bypass_instance;
>+
>+ bypass_instance = kzalloc(sizeof(*bypass_instance), GFP_KERNEL);
>+ if (!bypass_instance)
>+ return NULL;
>+
>+ dev_hold(dev);
>+ rcu_assign_pointer(bypass_instance->bypass_netdev, dev);
>+
>+ return bypass_instance;
>+}
>+
>+void bypass_instance_free(struct bypass_instance *bypass_instance)
>+{
>+ struct net_device *bypass_netdev;
>+
>+ bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+
>+ dev_put(bypass_netdev);
>+ kfree(bypass_instance);
>+}
>+
>+static struct bypass_instance *bypass_get_instance_bymac(u8 *mac)
>+{
>+ struct bypass_instance *bypass_instance;
>+ struct net_device *bypass_netdev;
>+ struct bypass *bypass;
>+
>+ list_for_each_entry(bypass, &bypass_list, list) {
>+ mutex_lock(&bypass->lock);
>+ list_for_each_entry(bypass_instance, &bypass->instance_list,
>+ list) {
>+ bypass_netdev =
>+ rcu_dereference(bypass_instance->bypass_netdev);
>+ if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
>+ mutex_unlock(&bypass->lock);
>+ goto out;
>+ }
>+ }
>+ mutex_unlock(&bypass->lock);
>+ }
>+
>+ bypass_instance = NULL;
>+out:
>+ return bypass_instance;
>+}
>+
>+static int bypass_register_child(struct net_device *child_netdev)
>+{
>+ struct bypass_instance *bypass_instance;
>+ struct bypass *bypass;
>+ struct net_device *bypass_netdev;
>+ int ret, orig_mtu;
>+
>+ ASSERT_RTNL();
>+
>+ mutex_lock(&bypass_mutex);
>+ bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
>+ if (!bypass_instance) {
>+ mutex_unlock(&bypass_mutex);
>+ goto done;
>+ }
>+
>+ bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+ bypass = bypass_instance->bypass;
>+ mutex_unlock(&bypass_mutex);
>+
>+ if (!bypass->ops->register_child)
>+ goto done;
>+
>+ ret = bypass->ops->register_child(bypass_netdev, child_netdev);
>+ if (ret != 0)
>+ goto done;
>+
>+ ret = netdev_rx_handler_register(child_netdev,
>+ bypass->ops->handle_frame,
>+ bypass_netdev);
>+ if (ret != 0) {
>+ netdev_err(child_netdev,
>+ "can not register bypass rx handler (err = %d)\n",
>+ ret);
>+ goto rx_handler_failed;
>+ }
>+
>+ ret = netdev_upper_dev_link(child_netdev, bypass_netdev, NULL);
>+ if (ret != 0) {
>+ netdev_err(child_netdev,
No line-wrap is needed here and in other cases like this.
>+ "can not set master device %s (err = %d)\n",
>+ bypass_netdev->name, ret);
>+ goto upper_link_failed;
>+ }
>+
>+ child_netdev->flags |= IFF_SLAVE;
Don't reuse IFF_SLAVE. That is bonding-specific thing. I know that
netvsc uses it, it is wrong.
Please rather introduce:
IFF_BYPASS for master
and IFF_BYPASS_SLAVE for slaves.
>+
>+ if (netif_running(bypass_netdev)) {
>+ ret = dev_open(child_netdev);
>+ if (ret && (ret != -EBUSY)) {
>+ netdev_err(bypass_netdev,
>+ "Opening child %s failed ret:%d\n",
>+ child_netdev->name, ret);
>+ goto err_interface_up;
>+ }
>+ }
>+
>+ /* Align MTU of child with master */
>+ orig_mtu = child_netdev->mtu;
>+ ret = dev_set_mtu(child_netdev, bypass_netdev->mtu);
>+ if (ret != 0) {
>+ netdev_err(bypass_netdev,
>+ "unable to change mtu of %s to %u register failed\n",
>+ child_netdev->name, bypass_netdev->mtu);
>+ goto err_set_mtu;
>+ }
>+
>+ ret = bypass->ops->join_child(bypass_netdev, child_netdev);
>+ if (ret != 0)
>+ goto err_join;
>+
>+ call_netdevice_notifiers(NETDEV_JOIN, child_netdev);
>+
>+ goto done;
>+
>+err_join:
>+ dev_set_mtu(child_netdev, orig_mtu);
>+err_set_mtu:
>+ dev_close(child_netdev);
>+err_interface_up:
>+ netdev_upper_dev_unlink(child_netdev, bypass_netdev);
>+ child_netdev->flags &= ~IFF_SLAVE;
>+upper_link_failed:
>+ netdev_rx_handler_unregister(child_netdev);
>+rx_handler_failed:
>+ bypass->ops->unregister_child(bypass_netdev, child_netdev);
>+
>+done:
>+ return NOTIFY_DONE;
>+}
>+
>+int bypass_unregister_child(struct net_device *child_netdev)
>+{
>+ struct bypass_instance *bypass_instance;
>+ struct net_device *bypass_netdev;
>+ struct bypass *bypass;
>+ int ret;
>+
>+ ASSERT_RTNL();
>+
>+ mutex_lock(&bypass_mutex);
>+ bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
>+ if (!bypass_instance) {
>+ mutex_unlock(&bypass_mutex);
>+ goto done;
>+ }
>+
>+ bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+ bypass = bypass_instance->bypass;
>+ mutex_unlock(&bypass_mutex);
>+
>+ ret = bypass->ops->release_child(bypass_netdev, child_netdev);
>+ if (ret != 0)
>+ goto done;
>+
>+ netdev_rx_handler_unregister(child_netdev);
>+ netdev_upper_dev_unlink(child_netdev, bypass_netdev);
>+ child_netdev->flags &= ~IFF_SLAVE;
>+
>+ if (!bypass->ops->unregister_child)
>+ goto done;
>+
>+ bypass->ops->unregister_child(bypass_netdev, child_netdev);
>+
>+done:
>+ return NOTIFY_DONE;
>+}
>+EXPORT_SYMBOL(bypass_unregister_child);
Please use "EXPORT_SYMBOL_GPL" for all exported symbols.
>+
>+static int bypass_update_link(struct net_device *child_netdev)
>+{
>+ struct bypass_instance *bypass_instance;
>+ struct net_device *bypass_netdev;
>+ struct bypass *bypass;
>+
>+ ASSERT_RTNL();
>+
>+ mutex_lock(&bypass_mutex);
>+ bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
You don't really need this lookup. The kernel knows about the master
device, you can just use netdev_master_upper_dev_get_rcu() to get it.
>+ if (!bypass_instance) {
>+ mutex_unlock(&bypass_mutex);
>+ goto done;
>+ }
>+
>+ bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+ bypass = bypass_instance->bypass;
>+ mutex_unlock(&bypass_mutex);
>+
>+ if (!bypass->ops->update_link)
>+ goto done;
>+
>+ bypass->ops->update_link(bypass_netdev, child_netdev);
>+
>+done:
>+ return NOTIFY_DONE;
>+}
>+
>+static bool bypass_validate_child_dev(struct net_device *dev)
>+{
>+ /* Avoid non-Ethernet type devices */
>+ if (dev->type != ARPHRD_ETHER)
>+ return false;
>+
>+ /* Avoid Vlan dev with same MAC registering as VF */
>+ if (is_vlan_dev(dev))
>+ return false;
>+
>+ /* Avoid Bonding master dev with same MAC registering as child dev */
>+ if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
>+ return false;
>+
>+ return true;
>+}
>+
>+static int
>+bypass_event(struct notifier_block *this, unsigned long event, void *ptr)
>+{
>+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
>+ struct bypass *bypass;
>+
>+ /* Skip Parent events */
>+ mutex_lock(&bypass_mutex);
>+ list_for_each_entry(bypass, &bypass_list, list) {
>+ if (event_dev->netdev_ops == bypass->netdev_ops) {
>+ mutex_unlock(&bypass_mutex);
>+ return NOTIFY_DONE;
>+ }
What you need instead of this is an identification helper
netif_is_bypass_master()
similar to
netif_is_team_master()
netif_is_bridge_master()
etc
>+ }
>+ mutex_unlock(&bypass_mutex);
>+
>+ if (!bypass_validate_child_dev(event_dev))
>+ return NOTIFY_DONE;
>+
>+ switch (event) {
>+ case NETDEV_REGISTER:
>+ return bypass_register_child(event_dev);
>+ case NETDEV_UNREGISTER:
>+ return bypass_unregister_child(event_dev);
>+ case NETDEV_UP:
>+ case NETDEV_DOWN:
>+ case NETDEV_CHANGE:
>+ return bypass_update_link(event_dev);
>+ default:
>+ return NOTIFY_DONE;
>+ }
>+}
>+
>+static struct notifier_block bypass_notifier = {
>+ .notifier_call = bypass_event,
>+};
>+
>+static void bypass_register_existing_child(struct net_device *bypass_netdev)
>+{
>+ struct net *net = dev_net(bypass_netdev);
>+ struct net_device *dev;
>+
>+ rtnl_lock();
>+ for_each_netdev(net, dev) {
>+ if (dev == bypass_netdev)
>+ continue;
>+ if (!bypass_validate_child_dev(dev))
>+ continue;
>+ if (ether_addr_equal(bypass_netdev->perm_addr, dev->perm_addr))
>+ bypass_register_child(dev);
>+ }
>+ rtnl_unlock();
>+}
>+
>+int bypass_register_instance(struct bypass *bypass, struct net_device *dev)
>+{
>+ struct bypass_instance *bypass_instance;
No need to allocate this instace here. You can just have is embedded
inside netdevice priv and pass pointer to it here. You can pass the
pointer back to the driver when you call ops as the driver can get priv
back by it.
I would also call it "struct bypass_master" and this function
"bypass_master_register".
It should contain the ops pointer too.
>+ struct net_device *bypass_netdev;
>+ int ret = 0;
>+
>+ mutex_lock(&bypass->lock);
>+ list_for_each_entry(bypass_instance, &bypass->instance_list, list) {
>+ bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+ if (bypass_netdev == dev) {
This means the driver registered one netdev twice. That is a bug in
driver, so WARN_ON would be nice here to point that out.
>+ ret = -EEXIST;
>+ goto done;
>+ }
>+ }
>+
>+ bypass_instance = bypass_instance_alloc(dev);
>+ if (!bypass_instance) {
>+ ret = -ENOMEM;
>+ goto done;
>+ }
>+
>+ bypass_instance->bypass = bypass;
>+ list_add_tail(&bypass_instance->list, &bypass->instance_list);
>+
>+done:
>+ mutex_unlock(&bypass->lock);
>+ bypass_register_existing_child(dev);
>+ return ret;
>+}
>+EXPORT_SYMBOL(bypass_register_instance);
>+
>+int bypass_unregister_instance(struct bypass *bypass, struct net_device *dev)
>+{
>+ struct bypass_instance *bypass_instance;
>+ struct net_device *bypass_netdev;
>+ int ret = 0;
>+
>+ mutex_lock(&bypass->lock);
>+ list_for_each_entry(bypass_instance, &bypass->instance_list, list) {
>+ bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>+ if (bypass_netdev == dev) {
>+ list_del(&bypass_instance->list);
>+ bypass_instance_free(bypass_instance);
>+ goto done;
>+ }
>+ }
>+
>+ ret = -ENOENT;
>+done:
>+ mutex_unlock(&bypass->lock);
>+ return ret;
>+}
>+EXPORT_SYMBOL(bypass_unregister_instance);
>+
>+struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>+ const struct net_device_ops *netdev_ops)
I don't see why you need a list of drivers. What you need is just a list
of instances - bypass masters (probably to call them like that in the
code as well). Well, you can use the common netdevice list for that
purpose with the identification helper I mentioned above. Then you need
no lists and no mutexes/spinlocks.
>+{
>+ struct bypass *bypass;
>+
>+ bypass = kzalloc(sizeof(*bypass), GFP_KERNEL);
>+ if (!bypass)
>+ return NULL;
>+
>+ bypass->ops = ops;
>+ bypass->netdev_ops = netdev_ops;
>+ INIT_LIST_HEAD(&bypass->instance_list);
>+
>+ mutex_lock(&bypass_mutex);
>+ list_add_tail(&bypass->list, &bypass_list);
>+ mutex_unlock(&bypass_mutex);
>+
>+ return bypass;
>+}
>+EXPORT_SYMBOL_GPL(bypass_register_driver);
>+
>+void bypass_unregister_driver(struct bypass *bypass)
>+{
>+ mutex_lock(&bypass_mutex);
>+ list_del(&bypass->list);
>+ mutex_unlock(&bypass_mutex);
>+
>+ kfree(bypass);
>+}
>+EXPORT_SYMBOL_GPL(bypass_unregister_driver);
>+
>+static __init int
>+bypass_init(void)
>+{
>+ register_netdevice_notifier(&bypass_notifier);
>+
>+ return 0;
>+}
>+module_init(bypass_init);
>+
>+static __exit
>+void bypass_exit(void)
>+{
>+ unregister_netdevice_notifier(&bypass_notifier);
>+}
>+module_exit(bypass_exit);
>+
>+MODULE_DESCRIPTION("Bypass infrastructure/interface for Paravirtual drivers");
>+MODULE_LICENSE("GPL v2");
>--
>2.14.3
>
^ permalink raw reply
* Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: Siwei Liu @ 2018-04-06 21:29 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Alexander Duyck, virtio-dev, Jiri Pirko, Michael S. Tsirkin,
Jakub Kicinski, Samudrala, Sridhar, virtualization, Netdev,
si-wei liu, David Miller
In-Reply-To: <20180403160834.51594373@xeon-e3>
(put discussions back on list which got accidentally removed)
On Tue, Apr 3, 2018 at 4:08 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Tue, 3 Apr 2018 12:04:38 -0700
> Siwei Liu <loseweigh@gmail.com> wrote:
>
>> On Tue, Apr 3, 2018 at 10:35 AM, Stephen Hemminger
>> <stephen@networkplumber.org> wrote:
>> > On Sun, 1 Apr 2018 05:13:09 -0400
>> > Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>> >
>> >> Hidden netdevice is not visible to userspace such that
>> >> typical network utilites e.g. ip, ifconfig and et al,
>> >> cannot sense its existence or configure it. Internally
>> >> hidden netdev may associate with an upper level netdev
>> >> that userspace has access to. Although userspace cannot
>> >> manipulate the lower netdev directly, user may control
>> >> or configure the underlying hidden device through the
>> >> upper-level netdev. For identification purpose, the
>> >> kobject for hidden netdev still presents in the sysfs
>> >> hierarchy, however, no uevent message will be generated
>> >> when the sysfs entry is created, modified or destroyed.
>> >>
>> >> For that end, a separate namescope needs to be carved
>> >> out for IFF_HIDDEN netdevs. As of now netdev name that
>> >> starts with colon i.e. ':' is invalid in userspace,
>> >> since socket ioctls such as SIOCGIFCONF use ':' as the
>> >> separator for ifname. The absence of namescope started
>> >> with ':' can rightly be used as the namescope for
>> >> the kernel-only IFF_HIDDEN netdevs.
>> >>
>> >> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>> >> ---
>> >
>> > I understand the use case. I proposed using . as a prefix before
>> > but that ran into resistance. Using colon seems worse.
>>
>> Using dot (.) can't be good because it would cause namespace collision
>> and thus breaking apps when you hide the device. Imagine a user really
>> wants to add a link with the same name as the one hidden and it starts
>> with a dot. It would fail, and users don't know its just because the
>> name starts with dot. IMHO users should be agnostic of (the namespace
>> of) hidden device at all if what they pick is a valid name.
>>
>> ":" is an invalid prefix to userspace, there's no such problem if
>> being used to construct the namescope for hidden devices.
>>
>> However, technically, just as what I alluded to in the reply earlier,
>> it might really be consistent to put this under a separeate namespace
>> instead than fiddling with name prefix. But I am just not sure if that
>> is a big hammer and would like to earn enough feedback and attention
>> before going that way too quickly.
>>
>>
>> >
>> > Rather than playing with names and all the issues that can cause,
>> > why not make it an attribute flag of the device in netlink.
>>
>> Atrribute flag doesn't help. It's a matter of namespace.
>>
>> Regards,
>> -Siwei
>
> In Vyatta, we used names like ".isatap" for devices that would clutter up
> the user experience. They are naturally not visible by simple scans of
> /sys/class/net, and there was a patch to ignore them in iproute2.
> It was a hack which worked but not really worth upstreaming.
>
> The question is if this a security feature then it needs to be more
I don't expect the namespace to be a security aspect of feature, but
rather a way to make old userspace unmodified to work with a new
feature. And, we're going to add API to expose the netdev info for the
invisible IFF_AUTO_MANAGED links anyway. We don't need to make it
secure and all hidden under the dark to be honest.
> robust than just name prefix. Plus it took years to handle network
> namespaces everywhere; this kind of flag would start same problems.
>
> Network namespaces work but have the problem namespaces only weakly
> support hierarchy and nesting. I prefer the namespace approach
> because it fits better and has less impact.
Great, thanks!
-Siwei
^ permalink raw reply
* Re: [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-04-06 22:59 UTC (permalink / raw)
To: Jiri Pirko
Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
virtualization, loseweigh, davem
In-Reply-To: <20180406124814.GA24525@nanopsycho>
On 4/6/2018 5:48 AM, Jiri Pirko wrote:
> Thu, Apr 05, 2018 at 11:08:22PM CEST, sridhar.samudrala@intel.com wrote:
<snip>
>
>> +
>> +static void virtnet_bypass_set_rx_mode(struct net_device *dev)
>> +{
>> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
>> + struct net_device *child_netdev;
>> +
>> + rcu_read_lock();
>> +
>> + child_netdev = rcu_dereference(vbi->active_netdev);
>> + if (child_netdev) {
>> + dev_uc_sync_multiple(child_netdev, dev);
>> + dev_mc_sync_multiple(child_netdev, dev);
>> + }
>> +
>> + child_netdev = rcu_dereference(vbi->backup_netdev);
>> + if (child_netdev) {
>> + dev_uc_sync_multiple(child_netdev, dev);
>> + dev_mc_sync_multiple(child_netdev, dev);
>> + }
>> +
>> + rcu_read_unlock();
>> +}
> This should be moved to bypass module.
Sure. All these bypass ndo_ops can be moved to bypass module and any
paravirtual driver that want to go with 3 netdev model can reuse these functions.
>
>
>> +
>> +static const struct net_device_ops virtnet_bypass_netdev_ops = {
>> + .ndo_open = virtnet_bypass_open,
>> + .ndo_stop = virtnet_bypass_close,
>> + .ndo_start_xmit = virtnet_bypass_start_xmit,
>> + .ndo_select_queue = virtnet_bypass_select_queue,
>> + .ndo_get_stats64 = virtnet_bypass_get_stats,
>> + .ndo_change_mtu = virtnet_bypass_change_mtu,
>> + .ndo_set_rx_mode = virtnet_bypass_set_rx_mode,
>> + .ndo_validate_addr = eth_validate_addr,
>> + .ndo_features_check = passthru_features_check,
>> +};
>> +
>> +static int
>> +virtnet_bypass_ethtool_get_link_ksettings(struct net_device *dev,
>> + struct ethtool_link_ksettings *cmd)
>> +{
>> + struct virtnet_bypass_info *vbi = netdev_priv(dev);
>> + struct net_device *child_netdev;
>> +
>> + child_netdev = rtnl_dereference(vbi->active_netdev);
>> + if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
>> + child_netdev = rtnl_dereference(vbi->backup_netdev);
>> + if (!child_netdev || !virtnet_bypass_xmit_ready(child_netdev)) {
>> + cmd->base.duplex = DUPLEX_UNKNOWN;
>> + cmd->base.port = PORT_OTHER;
>> + cmd->base.speed = SPEED_UNKNOWN;
>> +
>> + return 0;
>> + }
>> + }
>> +
>> + return __ethtool_get_link_ksettings(child_netdev, cmd);
>> +}
>> +
>> +#define BYPASS_DRV_NAME "virtnet_bypass"
>> +#define BYPASS_DRV_VERSION "0.1"
>> +
>> +static void virtnet_bypass_ethtool_get_drvinfo(struct net_device *dev,
>> + struct ethtool_drvinfo *drvinfo)
>> +{
>> + strlcpy(drvinfo->driver, BYPASS_DRV_NAME, sizeof(drvinfo->driver));
>> + strlcpy(drvinfo->version, BYPASS_DRV_VERSION, sizeof(drvinfo->version));
>> +}
>> +
>> +static const struct ethtool_ops virtnet_bypass_ethtool_ops = {
>> + .get_drvinfo = virtnet_bypass_ethtool_get_drvinfo,
>> + .get_link = ethtool_op_get_link,
>> + .get_link_ksettings = virtnet_bypass_ethtool_get_link_ksettings,
>> +};
>> +
>> +static int virtnet_bypass_join_child(struct net_device *bypass_netdev,
>> + struct net_device *child_netdev)
>> +{
>> + struct virtnet_bypass_info *vbi;
>> + bool backup;
>> +
>> + vbi = netdev_priv(bypass_netdev);
>> + backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>> + if (backup ? rtnl_dereference(vbi->backup_netdev) :
>> + rtnl_dereference(vbi->active_netdev)) {
>> + netdev_info(bypass_netdev,
>> + "%s attempting to join bypass dev when %s already present\n",
>> + child_netdev->name, backup ? "backup" : "active");
> Bypass module should check if there is already some other netdev
> enslaved and refuse right there.
This will work for virtio-net with 3 netdev model, but this check has to be done by netvsc
as its bypass_netdev is same as the backup_netdev.
Will add a flag while registering with the bypass module to indicate if the driver is doing
a 2 netdev or 3 netdev model and based on that flag this check can be done in bypass module
for 3 netdev scenario.
>
> The active/backup terminology is quite confusing. From the bonding world
> that means active is the one which is currently used for tx of the
> packets. And it depends on link and other things what netdev is declared
> active. However here, it is different. Backup is always the virtio_net
> instance even when it is active. Odd. Please change the terminology.
> For "active" I suggest to use name "stolen".
I am not too happy with 'stolen' name. Will see if i can come up with a
better name.
>
> *** Also, the 2 slave netdev pointers should be stored in the bypass
> module instance, not in the drivers.
Will move virtnet_bypass_info struct to bypass.h
>
>
>
>> + return -EEXIST;
>> + }
>> +
>> + dev_hold(child_netdev);
>> +
>> + if (backup) {
>> + rcu_assign_pointer(vbi->backup_netdev, child_netdev);
>> + dev_get_stats(vbi->backup_netdev, &vbi->backup_stats);
>> + } else {
>> + rcu_assign_pointer(vbi->active_netdev, child_netdev);
>> + dev_get_stats(vbi->active_netdev, &vbi->active_stats);
>> + bypass_netdev->min_mtu = child_netdev->min_mtu;
>> + bypass_netdev->max_mtu = child_netdev->max_mtu;
>> + }
>> +
>> + netdev_info(bypass_netdev, "child:%s joined\n", child_netdev->name);
>> +
>> + return 0;
>> +}
>> +
>> +static int virtnet_bypass_register_child(struct net_device *bypass_netdev,
>> + struct net_device *child_netdev)
>> +{
>> + struct virtnet_bypass_info *vbi;
>> + bool backup;
>> +
>> + vbi = netdev_priv(bypass_netdev);
>> + backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>> + if (backup ? rtnl_dereference(vbi->backup_netdev) :
>> + rtnl_dereference(vbi->active_netdev)) {
>> + netdev_info(bypass_netdev,
>> + "%s attempting to register bypass dev when %s already present\n",
>> + child_netdev->name, backup ? "backup" : "active");
>> + return -EEXIST;
>> + }
>> +
>> + /* Avoid non pci devices as active netdev */
>> + if (!backup && (!child_netdev->dev.parent ||
>> + !dev_is_pci(child_netdev->dev.parent)))
>> + return -EINVAL;
>> +
>> + netdev_info(bypass_netdev, "child:%s registered\n", child_netdev->name);
>> +
>> + return 0;
>> +}
>> +
>> +static int virtnet_bypass_release_child(struct net_device *bypass_netdev,
>> + struct net_device *child_netdev)
>> +{
>> + struct net_device *backup_netdev, *active_netdev;
>> + struct virtnet_bypass_info *vbi;
>> +
>> + vbi = netdev_priv(bypass_netdev);
>> + active_netdev = rtnl_dereference(vbi->active_netdev);
>> + backup_netdev = rtnl_dereference(vbi->backup_netdev);
>> +
>> + if (child_netdev != active_netdev && child_netdev != backup_netdev)
>> + return -EINVAL;
>> +
>> + netdev_info(bypass_netdev, "child:%s released\n", child_netdev->name);
>> +
>> + return 0;
>> +}
>> +
>> +static int virtnet_bypass_unregister_child(struct net_device *bypass_netdev,
>> + struct net_device *child_netdev)
>> +{
>> + struct net_device *backup_netdev, *active_netdev;
>> + struct virtnet_bypass_info *vbi;
>> +
>> + vbi = netdev_priv(bypass_netdev);
>> + active_netdev = rtnl_dereference(vbi->active_netdev);
>> + backup_netdev = rtnl_dereference(vbi->backup_netdev);
>> +
>> + if (child_netdev != active_netdev && child_netdev != backup_netdev)
>> + return -EINVAL;
>> +
>> + if (child_netdev == backup_netdev) {
>> + RCU_INIT_POINTER(vbi->backup_netdev, NULL);
>> + } else {
>> + RCU_INIT_POINTER(vbi->active_netdev, NULL);
>> + if (backup_netdev) {
>> + bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> + bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> + }
>> + }
>> +
>> + dev_put(child_netdev);
>> +
>> + netdev_info(bypass_netdev, "child:%s unregistered\n",
>> + child_netdev->name);
>> +
>> + return 0;
>> +}
>> +
>> +static int virtnet_bypass_update_link(struct net_device *bypass_netdev,
>> + struct net_device *child_netdev)
>> +{
>> + struct net_device *active_netdev, *backup_netdev;
>> + struct virtnet_bypass_info *vbi;
>> +
>> + if (!netif_running(bypass_netdev))
>> + return 0;
>> +
>> + vbi = netdev_priv(bypass_netdev);
>> +
>> + active_netdev = rtnl_dereference(vbi->active_netdev);
>> + backup_netdev = rtnl_dereference(vbi->backup_netdev);
>> +
>> + if (child_netdev != active_netdev && child_netdev != backup_netdev)
>> + return -EINVAL;
>> +
>> + if ((active_netdev && virtnet_bypass_xmit_ready(active_netdev)) ||
>> + (backup_netdev && virtnet_bypass_xmit_ready(backup_netdev))) {
>> + netif_carrier_on(bypass_netdev);
>> + netif_tx_wake_all_queues(bypass_netdev);
>> + } else {
>> + netif_carrier_off(bypass_netdev);
>> + netif_tx_stop_all_queues(bypass_netdev);
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +/* Called when child dev is injecting data into network stack.
>> + * Change the associated network device from lower dev to virtio.
>> + * note: already called with rcu_read_lock
>> + */
>> +static rx_handler_result_t virtnet_bypass_handle_frame(struct sk_buff **pskb)
>> +{
>> + struct sk_buff *skb = *pskb;
>> + struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
>> +
>> + skb->dev = ndev;
>> +
>> + return RX_HANDLER_ANOTHER;
>> +}
> Hmm, you have the rx_handler defined in drivers and you register it in
> bypass module. It is odd because here you assume that the bypass module
> passed "ndev" and rx_handler_data. Also, you don't need advanced
> features rx_handler provides.
> Instead, just register a common rx_handler defined in the bypass module
> and have simple skb_rx callback here (static void).
Will move the rx_handler also to bypass.c
>
>
>> +
>> +static const struct bypass_ops virtnet_bypass_ops = {
>> + .register_child = virtnet_bypass_register_child,
>> + .join_child = virtnet_bypass_join_child,
>> + .unregister_child = virtnet_bypass_unregister_child,
>> + .release_child = virtnet_bypass_release_child,
>> + .update_link = virtnet_bypass_update_link,
>> + .handle_frame = virtnet_bypass_handle_frame,
>> +};
>> +
>> +static struct bypass *virtnet_bypass;
>> +
>> +static int virtnet_bypass_create(struct virtnet_info *vi)
>> +{
>> + struct net_device *backup_netdev = vi->dev;
>> + struct device *dev = &vi->vdev->dev;
>> + struct net_device *bypass_netdev;
>> + int res;
>> +
>> + /* Alloc at least 2 queues, for now we are going with 16 assuming
>> + * that most devices being bonded won't have too many queues.
>> + */
>> + bypass_netdev = alloc_etherdev_mq(sizeof(struct virtnet_bypass_info),
>> + 16);
>> + if (!bypass_netdev) {
>> + dev_err(dev, "Unable to allocate bypass_netdev!\n");
>> + return -ENOMEM;
>> + }
>> +
>> + dev_net_set(bypass_netdev, dev_net(backup_netdev));
>> + SET_NETDEV_DEV(bypass_netdev, dev);
>> +
>> + bypass_netdev->netdev_ops = &virtnet_bypass_netdev_ops;
>> + bypass_netdev->ethtool_ops = &virtnet_bypass_ethtool_ops;
>> +
>> + /* Initialize the device options */
>> + bypass_netdev->flags |= IFF_MASTER;
> I think I pointed that out already. Don't use "IFF_MASTER". That is
> specific to bonding. As I suggested in the reply to the patch #2, you
> should introduce IFF_BYPASS. Also, this flag should be set by the bypass
> module. Just create the netdev and do things specific to virtio and then
> call to bypass module, pass the netdev so it can do the rest. I think
> that the flags, features etc would be also fine to set there.
OK.
>
>
>> + bypass_netdev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
>> + bypass_netdev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
>> + IFF_TX_SKB_SHARING);
>> +
>> + /* don't acquire bypass netdev's netif_tx_lock when transmitting */
>> + bypass_netdev->features |= NETIF_F_LLTX;
>> +
>> + /* Don't allow bypass devices to change network namespaces. */
>> + bypass_netdev->features |= NETIF_F_NETNS_LOCAL;
>> +
>> + bypass_netdev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
>> + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
>> + NETIF_F_HIGHDMA | NETIF_F_LRO;
>> +
>> + bypass_netdev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
>> + bypass_netdev->features |= bypass_netdev->hw_features;
>> +
>> + /* For now treat bypass netdev as VLAN challenged since we
>> + * cannot assume VLAN functionality with a VF
> Why? I don't see such drivers. But to be 100% correct, you should check
> the NETIF_F_VLAN_CHALLENGED feature in bypass module during VF enslave
> and forbid to do so if it is on.
Will look into it.
>
>
>> + */
>> + bypass_netdev->features |= NETIF_F_VLAN_CHALLENGED;
>> +
>> + memcpy(bypass_netdev->dev_addr, backup_netdev->dev_addr,
>> + bypass_netdev->addr_len);
>> +
>> + bypass_netdev->min_mtu = backup_netdev->min_mtu;
>> + bypass_netdev->max_mtu = backup_netdev->max_mtu;
>> +
>> + res = register_netdev(bypass_netdev);
>> + if (res < 0) {
>> + dev_err(dev, "Unable to register bypass_netdev!\n");
>> + goto err_register_netdev;
>> + }
>> +
>> + netif_carrier_off(bypass_netdev);
>> +
>> + res = bypass_register_instance(virtnet_bypass, bypass_netdev);
>> + if (res < 0)
>> + goto err_bypass;
>> +
>> + rcu_assign_pointer(vi->bypass_netdev, bypass_netdev);
>> +
>> + return 0;
>> +
>> +err_bypass:
>> + unregister_netdev(bypass_netdev);
>> +err_register_netdev:
>> + free_netdev(bypass_netdev);
>> +
>> + return res;
>> +}
>> +
>> +static void virtnet_bypass_destroy(struct virtnet_info *vi)
>> +{
>> + struct net_device *bypass_netdev;
>> + struct virtnet_bypass_info *vbi;
>> + struct net_device *child_netdev;
>> +
>> + bypass_netdev = rcu_dereference(vi->bypass_netdev);
>> + /* no device found, nothing to free */
>> + if (!bypass_netdev)
>> + return;
>> +
>> + vbi = netdev_priv(bypass_netdev);
>> +
>> + netif_device_detach(bypass_netdev);
>> +
>> + rtnl_lock();
>> +
>> + child_netdev = rtnl_dereference(vbi->active_netdev);
>> + if (child_netdev)
>> + bypass_unregister_child(child_netdev);
>> +
>> + child_netdev = rtnl_dereference(vbi->backup_netdev);
>> + if (child_netdev)
>> + bypass_unregister_child(child_netdev);
>> +
>> + unregister_netdevice(bypass_netdev);
>> +
>> + bypass_unregister_instance(virtnet_bypass, bypass_netdev);
>> +
>> + rtnl_unlock();
>> +
>> + free_netdev(bypass_netdev);
>> +}
>> +
>> +/* END of functions supporting VIRTIO_NET_F_BACKUP feature. */
>> +
>> static int virtnet_probe(struct virtio_device *vdev)
>> {
>> int i, err = -ENOMEM;
>> @@ -2839,10 +3432,15 @@ static int virtnet_probe(struct virtio_device *vdev)
>>
>> virtnet_init_settings(dev);
>>
>> + if (virtio_has_feature(vdev, VIRTIO_NET_F_BACKUP)) {
>> + if (virtnet_bypass_create(vi) != 0)
> You need to do:
> err = virtnet_bypass_create(vi);
> if (err)
> otherwise you ignore err and virtnet_probe would return 0;
Sure.
>
>
>> + goto free_vqs;
>> + }
>> +
>> err = register_netdev(dev);
>> if (err) {
>> pr_debug("virtio_net: registering device failed\n");
>> - goto free_vqs;
>> + goto free_bypass;
>> }
>>
>> virtio_device_ready(vdev);
>> @@ -2879,6 +3477,8 @@ static int virtnet_probe(struct virtio_device *vdev)
>> vi->vdev->config->reset(vdev);
>>
>> unregister_netdev(dev);
>> +free_bypass:
>> + virtnet_bypass_destroy(vi);
>> free_vqs:
>> cancel_delayed_work_sync(&vi->refill);
>> free_receive_page_frags(vi);
>> @@ -2913,6 +3513,8 @@ static void virtnet_remove(struct virtio_device *vdev)
>>
>> unregister_netdev(vi->dev);
>>
>> + virtnet_bypass_destroy(vi);
>> +
>> remove_vq_common(vi);
>>
>> free_netdev(vi->dev);
>> @@ -2996,6 +3598,11 @@ static __init int virtio_net_driver_init(void)
>> {
>> int ret;
>>
>> + virtnet_bypass = bypass_register_driver(&virtnet_bypass_ops,
>> + &virtnet_bypass_netdev_ops);
>> + if (!virtnet_bypass)
>> + return -ENOMEM;
> If CONFIG_NET_BYPASS is undefined, you will always return -ENOMEM here.
Will fix.
>
>
>> +
>> ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
>> virtnet_cpu_online,
>> virtnet_cpu_down_prep);
>> @@ -3010,12 +3617,14 @@ static __init int virtio_net_driver_init(void)
>> ret = register_virtio_driver(&virtio_net_driver);
>> if (ret)
>> goto err_virtio;
>> +
>> return 0;
>> err_virtio:
>> cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>> err_dead:
>> cpuhp_remove_multi_state(virtionet_online);
>> out:
>> + bypass_unregister_driver(virtnet_bypass);
>> return ret;
>> }
>> module_init(virtio_net_driver_init);
>> @@ -3025,6 +3634,7 @@ static __exit void virtio_net_driver_exit(void)
>> unregister_virtio_driver(&virtio_net_driver);
>> cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>> cpuhp_remove_multi_state(virtionet_online);
>> + bypass_unregister_driver(virtnet_bypass);
>> }
>> module_exit(virtio_net_driver_exit);
>>
>> --
>> 2.14.3
>>
^ permalink raw reply
* Re: [RFC PATCH net-next v5 2/4] net: Introduce generic bypass module
From: Samudrala, Sridhar @ 2018-04-06 23:08 UTC (permalink / raw)
To: Jiri Pirko
Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
virtualization, loseweigh, davem
In-Reply-To: <20180406125758.GC19345@nanopsycho>
On 4/6/2018 5:57 AM, Jiri Pirko wrote:
> Thu, Apr 05, 2018 at 11:08:21PM CEST, sridhar.samudrala@intel.com wrote:
>> This provides a generic interface for paravirtual drivers to listen
>> for netdev register/unregister/link change events from pci ethernet
>> devices with the same MAC and takeover their datapath. The notifier and
>> event handling code is based on the existing netvsc implementation. A
>> paravirtual driver can use this module by registering a set of ops and
>> each instance of the device when it is probed.
>>
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> ---
>> include/net/bypass.h | 80 ++++++++++
>> net/Kconfig | 18 +++
>> net/core/Makefile | 1 +
>> net/core/bypass.c | 406 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 505 insertions(+)
>> create mode 100644 include/net/bypass.h
>> create mode 100644 net/core/bypass.c
>>
>> diff --git a/include/net/bypass.h b/include/net/bypass.h
>> new file mode 100644
>> index 000000000000..e2dd122f951a
>> --- /dev/null
>> +++ b/include/net/bypass.h
>> @@ -0,0 +1,80 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2018, Intel Corporation. */
>> +
>> +#ifndef _NET_BYPASS_H
>> +#define _NET_BYPASS_H
>> +
>> +#include <linux/netdevice.h>
>> +
>> +struct bypass_ops {
> Perhaps "net_bypass_" would be better prefix for this module structs
> and functions. No strong opinion though.
>
>
>> + int (*register_child)(struct net_device *bypass_netdev,
>> + struct net_device *child_netdev);
> We have master/slave upper/lower netdevices. This adds "child". Consider
> using some existing names. Not sure if possible without loss of meaning.
OK. will change this to register_slave()
>
>
>> + int (*join_child)(struct net_device *bypass_netdev,
>> + struct net_device *child_netdev);
>> + int (*unregister_child)(struct net_device *bypass_netdev,
>> + struct net_device *child_netdev);
>> + int (*release_child)(struct net_device *bypass_netdev,
>> + struct net_device *child_netdev);
>> + int (*update_link)(struct net_device *bypass_netdev,
>> + struct net_device *child_netdev);
>> + rx_handler_result_t (*handle_frame)(struct sk_buff **pskb);
>> +};
>> +
>> +struct bypass_instance {
>> + struct list_head list;
>> + struct net_device __rcu *bypass_netdev;
>> + struct bypass *bypass;
>> +};
>> +
>> +struct bypass {
>> + struct list_head list;
>> + const struct bypass_ops *ops;
>> + const struct net_device_ops *netdev_ops;
>> + struct list_head instance_list;
>> + struct mutex lock;
>> +};
>> +
>> +#if IS_ENABLED(CONFIG_NET_BYPASS)
>> +
>> +struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>> + const struct net_device_ops *netdev_ops);
>> +void bypass_unregister_driver(struct bypass *bypass);
>> +
>> +int bypass_register_instance(struct bypass *bypass, struct net_device *dev);
>> +int bypass_unregister_instance(struct bypass *bypass, struct net_device *dev);
>> +
>> +int bypass_unregister_child(struct net_device *child_netdev);
>> +
>> +#else
>> +
>> +static inline
>> +struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>> + const struct net_device_ops *netdev_ops)
>> +{
>> + return NULL;
>> +}
>> +
>> +static inline void bypass_unregister_driver(struct bypass *bypass)
>> +{
>> +}
>> +
>> +static inline int bypass_register_instance(struct bypass *bypass,
>> + struct net_device *dev)
>> +{
>> + return 0;
>> +}
>> +
>> +static inline int bypass_unregister_instance(struct bypass *bypass,
>> + struct net_device *dev)
>> +{
>> + return 0;
>> +}
>> +
>> +static inline int bypass_unregister_child(struct net_device *child_netdev)
>> +{
>> + return 0;
>> +}
>> +
>> +#endif
>> +
>> +#endif /* _NET_BYPASS_H */
>> diff --git a/net/Kconfig b/net/Kconfig
>> index 0428f12c25c2..994445f4a96a 100644
>> --- a/net/Kconfig
>> +++ b/net/Kconfig
>> @@ -423,6 +423,24 @@ config MAY_USE_DEVLINK
>> on MAY_USE_DEVLINK to ensure they do not cause link errors when
>> devlink is a loadable module and the driver using it is built-in.
>>
>> +config NET_BYPASS
>> + tristate "Bypass interface"
>> + ---help---
>> + This provides a generic interface for paravirtual drivers to listen
>> + for netdev register/unregister/link change events from pci ethernet
>> + devices with the same MAC and takeover their datapath. This also
>> + enables live migration of a VM with direct attached VF by failing
>> + over to the paravirtual datapath when the VF is unplugged.
>> +
>> +config MAY_USE_BYPASS
>> + tristate
>> + default m if NET_BYPASS=m
>> + default y if NET_BYPASS=y || NET_BYPASS=n
>> + help
>> + Drivers using the bypass infrastructure should have a dependency
>> + on MAY_USE_BYPASS to ensure they do not cause link errors when
>> + bypass is a loadable module and the driver using it is built-in.
>> +
>> endif # if NET
>>
>> # Used by archs to tell that they support BPF JIT compiler plus which flavour.
>> diff --git a/net/core/Makefile b/net/core/Makefile
>> index 6dbbba8c57ae..a9727ed1c8fc 100644
>> --- a/net/core/Makefile
>> +++ b/net/core/Makefile
>> @@ -30,3 +30,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
>> obj-$(CONFIG_HWBM) += hwbm.o
>> obj-$(CONFIG_NET_DEVLINK) += devlink.o
>> obj-$(CONFIG_GRO_CELLS) += gro_cells.o
>> +obj-$(CONFIG_NET_BYPASS) += bypass.o
>> diff --git a/net/core/bypass.c b/net/core/bypass.c
>> new file mode 100644
>> index 000000000000..7bde962ec3d4
>> --- /dev/null
>> +++ b/net/core/bypass.c
>> @@ -0,0 +1,406 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2018, Intel Corporation. */
>> +
>> +/* A common module to handle registrations and notifications for paravirtual
>> + * drivers to enable accelerated datapath and support VF live migration.
>> + *
>> + * The notifier and event handling code is based on netvsc driver.
>> + */
>> +
>> +#include <linux/netdevice.h>
>> +#include <linux/etherdevice.h>
>> +#include <linux/ethtool.h>
>> +#include <linux/module.h>
>> +#include <linux/slab.h>
>> +#include <linux/netdevice.h>
>> +#include <linux/netpoll.h>
>> +#include <linux/rtnetlink.h>
>> +#include <linux/if_vlan.h>
>> +#include <net/sch_generic.h>
>> +#include <uapi/linux/if_arp.h>
>> +#include <net/bypass.h>
>> +
>> +static LIST_HEAD(bypass_list);
>> +
>> +static DEFINE_MUTEX(bypass_mutex);
> Why mutex? Apparently you don't need to sleep while holding a lock.
> Simple spinlock would do.
>
>
>> +
>> +struct bypass_instance *bypass_instance_alloc(struct net_device *dev)
>> +{
>> + struct bypass_instance *bypass_instance;
>> +
>> + bypass_instance = kzalloc(sizeof(*bypass_instance), GFP_KERNEL);
>> + if (!bypass_instance)
>> + return NULL;
>> +
>> + dev_hold(dev);
>> + rcu_assign_pointer(bypass_instance->bypass_netdev, dev);
>> +
>> + return bypass_instance;
>> +}
>> +
>> +void bypass_instance_free(struct bypass_instance *bypass_instance)
>> +{
>> + struct net_device *bypass_netdev;
>> +
>> + bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> +
>> + dev_put(bypass_netdev);
>> + kfree(bypass_instance);
>> +}
>> +
>> +static struct bypass_instance *bypass_get_instance_bymac(u8 *mac)
>> +{
>> + struct bypass_instance *bypass_instance;
>> + struct net_device *bypass_netdev;
>> + struct bypass *bypass;
>> +
>> + list_for_each_entry(bypass, &bypass_list, list) {
>> + mutex_lock(&bypass->lock);
>> + list_for_each_entry(bypass_instance, &bypass->instance_list,
>> + list) {
>> + bypass_netdev =
>> + rcu_dereference(bypass_instance->bypass_netdev);
>> + if (ether_addr_equal(bypass_netdev->perm_addr, mac)) {
>> + mutex_unlock(&bypass->lock);
>> + goto out;
>> + }
>> + }
>> + mutex_unlock(&bypass->lock);
>> + }
>> +
>> + bypass_instance = NULL;
>> +out:
>> + return bypass_instance;
>> +}
>> +
>> +static int bypass_register_child(struct net_device *child_netdev)
>> +{
>> + struct bypass_instance *bypass_instance;
>> + struct bypass *bypass;
>> + struct net_device *bypass_netdev;
>> + int ret, orig_mtu;
>> +
>> + ASSERT_RTNL();
>> +
>> + mutex_lock(&bypass_mutex);
>> + bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
>> + if (!bypass_instance) {
>> + mutex_unlock(&bypass_mutex);
>> + goto done;
>> + }
>> +
>> + bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> + bypass = bypass_instance->bypass;
>> + mutex_unlock(&bypass_mutex);
>> +
>> + if (!bypass->ops->register_child)
>> + goto done;
>> +
>> + ret = bypass->ops->register_child(bypass_netdev, child_netdev);
>> + if (ret != 0)
>> + goto done;
>> +
>> + ret = netdev_rx_handler_register(child_netdev,
>> + bypass->ops->handle_frame,
>> + bypass_netdev);
>> + if (ret != 0) {
>> + netdev_err(child_netdev,
>> + "can not register bypass rx handler (err = %d)\n",
>> + ret);
>> + goto rx_handler_failed;
>> + }
>> +
>> + ret = netdev_upper_dev_link(child_netdev, bypass_netdev, NULL);
>> + if (ret != 0) {
>> + netdev_err(child_netdev,
> No line-wrap is needed here and in other cases like this.
>
>
>> + "can not set master device %s (err = %d)\n",
>> + bypass_netdev->name, ret);
>> + goto upper_link_failed;
>> + }
>> +
>> + child_netdev->flags |= IFF_SLAVE;
> Don't reuse IFF_SLAVE. That is bonding-specific thing. I know that
> netvsc uses it, it is wrong.
> Please rather introduce:
> IFF_BYPASS for master
> and IFF_BYPASS_SLAVE for slaves.
OK. Will add these flags.
>
>
>
>
>> +
>> + if (netif_running(bypass_netdev)) {
>> + ret = dev_open(child_netdev);
>> + if (ret && (ret != -EBUSY)) {
>> + netdev_err(bypass_netdev,
>> + "Opening child %s failed ret:%d\n",
>> + child_netdev->name, ret);
>> + goto err_interface_up;
>> + }
>> + }
>> +
>> + /* Align MTU of child with master */
>> + orig_mtu = child_netdev->mtu;
>> + ret = dev_set_mtu(child_netdev, bypass_netdev->mtu);
>> + if (ret != 0) {
>> + netdev_err(bypass_netdev,
>> + "unable to change mtu of %s to %u register failed\n",
>> + child_netdev->name, bypass_netdev->mtu);
>> + goto err_set_mtu;
>> + }
>> +
>> + ret = bypass->ops->join_child(bypass_netdev, child_netdev);
>> + if (ret != 0)
>> + goto err_join;
>> +
>> + call_netdevice_notifiers(NETDEV_JOIN, child_netdev);
>> +
>> + goto done;
>> +
>> +err_join:
>> + dev_set_mtu(child_netdev, orig_mtu);
>> +err_set_mtu:
>> + dev_close(child_netdev);
>> +err_interface_up:
>> + netdev_upper_dev_unlink(child_netdev, bypass_netdev);
>> + child_netdev->flags &= ~IFF_SLAVE;
>> +upper_link_failed:
>> + netdev_rx_handler_unregister(child_netdev);
>> +rx_handler_failed:
>> + bypass->ops->unregister_child(bypass_netdev, child_netdev);
>> +
>> +done:
>> + return NOTIFY_DONE;
>> +}
>> +
>> +int bypass_unregister_child(struct net_device *child_netdev)
>> +{
>> + struct bypass_instance *bypass_instance;
>> + struct net_device *bypass_netdev;
>> + struct bypass *bypass;
>> + int ret;
>> +
>> + ASSERT_RTNL();
>> +
>> + mutex_lock(&bypass_mutex);
>> + bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
>> + if (!bypass_instance) {
>> + mutex_unlock(&bypass_mutex);
>> + goto done;
>> + }
>> +
>> + bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> + bypass = bypass_instance->bypass;
>> + mutex_unlock(&bypass_mutex);
>> +
>> + ret = bypass->ops->release_child(bypass_netdev, child_netdev);
>> + if (ret != 0)
>> + goto done;
>> +
>> + netdev_rx_handler_unregister(child_netdev);
>> + netdev_upper_dev_unlink(child_netdev, bypass_netdev);
>> + child_netdev->flags &= ~IFF_SLAVE;
>> +
>> + if (!bypass->ops->unregister_child)
>> + goto done;
>> +
>> + bypass->ops->unregister_child(bypass_netdev, child_netdev);
>> +
>> +done:
>> + return NOTIFY_DONE;
>> +}
>> +EXPORT_SYMBOL(bypass_unregister_child);
> Please use "EXPORT_SYMBOL_GPL" for all exported symbols.
OK.
>
>
>> +
>> +static int bypass_update_link(struct net_device *child_netdev)
>> +{
>> + struct bypass_instance *bypass_instance;
>> + struct net_device *bypass_netdev;
>> + struct bypass *bypass;
>> +
>> + ASSERT_RTNL();
>> +
>> + mutex_lock(&bypass_mutex);
>> + bypass_instance = bypass_get_instance_bymac(child_netdev->perm_addr);
> You don't really need this lookup. The kernel knows about the master
> device, you can just use netdev_master_upper_dev_get_rcu() to get it.
Sure.
>
>
>> + if (!bypass_instance) {
>> + mutex_unlock(&bypass_mutex);
>> + goto done;
>> + }
>> +
>> + bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> + bypass = bypass_instance->bypass;
>> + mutex_unlock(&bypass_mutex);
>> +
>> + if (!bypass->ops->update_link)
>> + goto done;
>> +
>> + bypass->ops->update_link(bypass_netdev, child_netdev);
>> +
>> +done:
>> + return NOTIFY_DONE;
>> +}
>> +
>> +static bool bypass_validate_child_dev(struct net_device *dev)
>> +{
>> + /* Avoid non-Ethernet type devices */
>> + if (dev->type != ARPHRD_ETHER)
>> + return false;
>> +
>> + /* Avoid Vlan dev with same MAC registering as VF */
>> + if (is_vlan_dev(dev))
>> + return false;
>> +
>> + /* Avoid Bonding master dev with same MAC registering as child dev */
>> + if ((dev->priv_flags & IFF_BONDING) && (dev->flags & IFF_MASTER))
>> + return false;
>> +
>> + return true;
>> +}
>> +
>> +static int
>> +bypass_event(struct notifier_block *this, unsigned long event, void *ptr)
>> +{
>> + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
>> + struct bypass *bypass;
>> +
>> + /* Skip Parent events */
>> + mutex_lock(&bypass_mutex);
>> + list_for_each_entry(bypass, &bypass_list, list) {
>> + if (event_dev->netdev_ops == bypass->netdev_ops) {
>> + mutex_unlock(&bypass_mutex);
>> + return NOTIFY_DONE;
>> + }
> What you need instead of this is an identification helper
> netif_is_bypass_master()
> similar to
> netif_is_team_master()
> netif_is_bridge_master()
> etc
OK. Will introduce this helper.
>
>
>
>> + }
>> + mutex_unlock(&bypass_mutex);
>> +
>> + if (!bypass_validate_child_dev(event_dev))
>> + return NOTIFY_DONE;
>> +
>> + switch (event) {
>> + case NETDEV_REGISTER:
>> + return bypass_register_child(event_dev);
>> + case NETDEV_UNREGISTER:
>> + return bypass_unregister_child(event_dev);
>> + case NETDEV_UP:
>> + case NETDEV_DOWN:
>> + case NETDEV_CHANGE:
>> + return bypass_update_link(event_dev);
>> + default:
>> + return NOTIFY_DONE;
>> + }
>> +}
>> +
>> +static struct notifier_block bypass_notifier = {
>> + .notifier_call = bypass_event,
>> +};
>> +
>> +static void bypass_register_existing_child(struct net_device *bypass_netdev)
>> +{
>> + struct net *net = dev_net(bypass_netdev);
>> + struct net_device *dev;
>> +
>> + rtnl_lock();
>> + for_each_netdev(net, dev) {
>> + if (dev == bypass_netdev)
>> + continue;
>> + if (!bypass_validate_child_dev(dev))
>> + continue;
>> + if (ether_addr_equal(bypass_netdev->perm_addr, dev->perm_addr))
>> + bypass_register_child(dev);
>> + }
>> + rtnl_unlock();
>> +}
>> +
>> +int bypass_register_instance(struct bypass *bypass, struct net_device *dev)
>> +{
>> + struct bypass_instance *bypass_instance;
> No need to allocate this instace here. You can just have is embedded
> inside netdevice priv and pass pointer to it here. You can pass the
> pointer back to the driver when you call ops as the driver can get priv
> back by it.
>
> I would also call it "struct bypass_master" and this function
> "bypass_master_register".
>
> It should contain the ops pointer too.
OK.
>
>
>> + struct net_device *bypass_netdev;
>> + int ret = 0;
>> +
>> + mutex_lock(&bypass->lock);
>> + list_for_each_entry(bypass_instance, &bypass->instance_list, list) {
>> + bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> + if (bypass_netdev == dev) {
> This means the driver registered one netdev twice. That is a bug in
> driver, so WARN_ON would be nice here to point that out.
OK.
>
>
>> + ret = -EEXIST;
>> + goto done;
>> + }
>> + }
>> +
>> + bypass_instance = bypass_instance_alloc(dev);
>> + if (!bypass_instance) {
>> + ret = -ENOMEM;
>> + goto done;
>> + }
>> +
>> + bypass_instance->bypass = bypass;
>> + list_add_tail(&bypass_instance->list, &bypass->instance_list);
>> +
>> +done:
>> + mutex_unlock(&bypass->lock);
>> + bypass_register_existing_child(dev);
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(bypass_register_instance);
>> +
>> +int bypass_unregister_instance(struct bypass *bypass, struct net_device *dev)
>> +{
>> + struct bypass_instance *bypass_instance;
>> + struct net_device *bypass_netdev;
>> + int ret = 0;
>> +
>> + mutex_lock(&bypass->lock);
>> + list_for_each_entry(bypass_instance, &bypass->instance_list, list) {
>> + bypass_netdev = rcu_dereference(bypass_instance->bypass_netdev);
>> + if (bypass_netdev == dev) {
>> + list_del(&bypass_instance->list);
>> + bypass_instance_free(bypass_instance);
>> + goto done;
>> + }
>> + }
>> +
>> + ret = -ENOENT;
>> +done:
>> + mutex_unlock(&bypass->lock);
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(bypass_unregister_instance);
>> +
>> +struct bypass *bypass_register_driver(const struct bypass_ops *ops,
>> + const struct net_device_ops *netdev_ops)
> I don't see why you need a list of drivers. What you need is just a list
> of instances - bypass masters (probably to call them like that in the
> code as well). Well, you can use the common netdevice list for that
> purpose with the identification helper I mentioned above. Then you need
> no lists and no mutexes/spinlocks.
OK. looks like i should be able to just register instances and pass bypass_netdev,
bypass_ops and a flag to indicate 3/2 netdev model.
>
>
>> +{
>> + struct bypass *bypass;
>> +
>> + bypass = kzalloc(sizeof(*bypass), GFP_KERNEL);
>> + if (!bypass)
>> + return NULL;
>> +
>> + bypass->ops = ops;
>> + bypass->netdev_ops = netdev_ops;
>> + INIT_LIST_HEAD(&bypass->instance_list);
>> +
>> + mutex_lock(&bypass_mutex);
>> + list_add_tail(&bypass->list, &bypass_list);
>> + mutex_unlock(&bypass_mutex);
>> +
>> + return bypass;
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_register_driver);
>> +
>> +void bypass_unregister_driver(struct bypass *bypass)
>> +{
>> + mutex_lock(&bypass_mutex);
>> + list_del(&bypass->list);
>> + mutex_unlock(&bypass_mutex);
>> +
>> + kfree(bypass);
>> +}
>> +EXPORT_SYMBOL_GPL(bypass_unregister_driver);
>> +
>> +static __init int
>> +bypass_init(void)
>> +{
>> + register_netdevice_notifier(&bypass_notifier);
>> +
>> + return 0;
>> +}
>> +module_init(bypass_init);
>> +
>> +static __exit
>> +void bypass_exit(void)
>> +{
>> + unregister_netdevice_notifier(&bypass_notifier);
>> +}
>> +module_exit(bypass_exit);
>> +
>> +MODULE_DESCRIPTION("Bypass infrastructure/interface for Paravirtual drivers");
>> +MODULE_LICENSE("GPL v2");
>> --
>> 2.14.3
>>
^ permalink raw reply
* Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: Siwei Liu @ 2018-04-07 2:32 UTC (permalink / raw)
To: David Miller
Cc: Alexander Duyck, virtio-dev, Jiri Pirko, Michael S. Tsirkin,
Jakub Kicinski, Samudrala, Sridhar, virtualization, Netdev,
David Ahern, si-wei liu
In-Reply-To: <20180404.133749.1802514210170809419.davem@davemloft.net>
On Wed, Apr 4, 2018 at 10:37 AM, David Miller <davem@davemloft.net> wrote:
> From: David Ahern <dsahern@gmail.com>
> Date: Wed, 4 Apr 2018 11:21:54 -0600
>
>> It is a netdev so there is no reason to have a separate ip command to
>> inspect it. 'ip link' is the right place.
>
> I agree on this.
I'm completely fine of having an API for inspection purpose. The thing
is that we'd perhaps need to go for the namespace approach, for which
I think everyone seems to agree not to fiddle with the ":" prefix, but
rather have a new class of network subsystem under /sys/class thus a
separate device namespace e.g. /sys/class/net-kernel for those
auto-managed lower netdevs is needed.
And I assume everyone here understands the use case for live migration
(in the context of providing cloud service) is very different, and we
have to hide the netdevs. If not, I'm more than happy to clarify.
With that in mind, if having a new class of net-kernel namespace, we
can name the kernel device elaborately which is not neccessarily equal
to the device name exposed to userspace. For example, we can use
driver name as the prefix as opposed to "eth" or ":eth". And we don't
need to have auto-managed netdevs locked into the ":" prefix at all (I
intentionally left it out in the this RFC patch to ask for comments on
the namespace solution which is much cleaner). That said, an userpsace
named device through udev may call something like ens3 and
switch1-port2, but in the kernel-net namespace, it may look like
ixgbevf0 and mlxsw1p2.
So if we all agree introducing a new namespace is the rigth thing to
do, `ip link' will no longer serve the purpose of displaying the
information for kernel-net devnames for the sake of avoiding ambiguity
and namespace collision: it's entirely possible the ip link name could
collide with a kernel-net devname, it's become unclear which name of a
netdev object the command is expected to operate on. That's why I
thought showing the kernel-only netdevs using a separate subcommand
makes more sense.
Thoughts and comments? Please let me know.
Thanks,
-Siwei
>
> What I really don't understand still is the use case... really.
>
> So there are control netdevs, what exactly is the problem with that?
>
> Are we not exporting enough information for applications to handle
> these devices sanely? If so, then's let add that information.
>
> We can set netdev->type to ETH_P_LINUXCONTROL or something like that.
>
> Another alternative is to add an interface flag like IFF_CONTROL or
> similar, and that probably is much nicer.
>
> Hiding the devices means that we acknowledge that applications are
> currently broken with control netdevs... and we want them to stay
> broken!
>
> That doesn't sound like a good plan to me.
>
> So let's fix handling of control netdevs instead of hiding them.
>
> Thanks.
^ permalink raw reply
* Re: [virtio-dev] Re: [RFC PATCH 1/3] qemu: virtio-bypass should explicitly bind to a passthrough device
From: Siwei Liu @ 2018-04-07 2:54 UTC (permalink / raw)
To: Paolo Bonzini
Cc: Alexander Duyck, virtio-dev, Jiri Pirko, Michael S. Tsirkin,
Jakub Kicinski, Samudrala, Sridhar, virtualization, Netdev,
Si-Wei Liu, David Miller
In-Reply-To: <d666b8a9-9ff8-4a56-0faa-ef3d0bf81d25@redhat.com>
(click the wrong reply button again, sorry)
On Thu, Apr 5, 2018 at 8:31 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 04/04/2018 10:02, Siwei Liu wrote:
>>> pci_bus_num is almost always a bug if not done within
>>> a context of a PCI host, bridge, etc.
>>>
>>> In particular this will not DTRT if run before guest assigns bus
>>> numbers.
>>>
>> I was seeking means to reserve a specific pci bus slot from drivers,
>> and update the driver when guest assigns the bus number but it seems
>> there's no low-hanging fruits. Because of that reason the bus_num is
>> only obtained until it's really needed (during get_config) and I
>> assume at that point the pci bus assignment is already done. I know
>> the current one is not perfect, but we need that information (PCI
>> bus:slot.func number) to name the guest device correctly.
>
> Can you use the -device "id", and look it up as
>
> devices = container_get(qdev_get_machine(), "/peripheral");
> return object_resolve_path_component(devices, id);
No. The problem of using device id is that the vfio device may come
and go at any time, this is particularly true when live migration is
happening. There's no gurantee we can get the bus:device.func info if
that device is gone. Currently the binding between vfio and virtio-net
is weakly coupled through the backup property, there's no better way
than specifying the bus id and addr property directly.
Regards,
-Siwei
>
> ?
>
> Thanks,
>
> Paolo
^ permalink raw reply
* Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: Andrew Lunn @ 2018-04-07 3:19 UTC (permalink / raw)
To: Siwei Liu
Cc: Alexander Duyck, virtio-dev, Jiri Pirko, Michael S. Tsirkin,
Jakub Kicinski, Samudrala, Sridhar, virtualization, Netdev,
David Ahern, si-wei liu, David Miller
In-Reply-To: <CADGSJ206RKi704uoLe3xVs6PyWpURPLb75OPCM_YO9DfnnEzpw@mail.gmail.com>
Hi Siwei
> I think everyone seems to agree not to fiddle with the ":" prefix, but
> rather have a new class of network subsystem under /sys/class thus a
> separate device namespace e.g. /sys/class/net-kernel for those
> auto-managed lower netdevs is needed.
How do you get a device into this new class? I don't know the Linux
driver model too well, but to get a device out of one class and into
another, i think you need to device_del(dev). modify dev->class and
then device_add(dev). However, device_add() says you are not allowed
to do this.
And i don't even see how this helps. Are you also not going to call
list_netdevice()? Are you going to add some other list for these
devices in a different class?
Andrew
^ permalink raw reply
* Re: [RFC PATCH 0/2] use larger max_request_size for virtio_blk
From: Weiping Zhang @ 2018-04-07 8:30 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-block, Cornelia Huck, virtualization, Michael S . Tsirkin
In-Reply-To: <04280c25-aecb-3d79-e589-5fafa974d9e0@kernel.dk>
2018-04-05 22:29 GMT+08:00 Jens Axboe <axboe@kernel.dk>:
> On 4/5/18 4:09 AM, Weiping Zhang wrote:
>> Hi,
>>
>> For virtio block device, actually there is no a hard limit for max request
>> size, and virtio_blk driver set -1 to blk_queue_max_hw_sectors(q, -1U);.
>> But it doesn't work, because there is a default upper limitation
>> BLK_DEF_MAX_SECTORS (1280 sectors). So this series want to add a new helper
>> blk_queue_max_hw_sectors_no_limit to set a proper max reqeust size.
>>
>> Weiping Zhang (2):
>> blk-setting: add new helper blk_queue_max_hw_sectors_no_limit
>> virtio_blk: add new module parameter to set max request size
>>
>> block/blk-settings.c | 20 ++++++++++++++++++++
>> drivers/block/virtio_blk.c | 32 ++++++++++++++++++++++++++++++--
>> include/linux/blkdev.h | 2 ++
>> 3 files changed, 52 insertions(+), 2 deletions(-)
>
> The driver should just use blk_queue_max_hw_sectors() to set the limit,
> and then the soft limit can be modified by a udev rule. Technically the
> driver doesn't own the software limit, it's imposed to ensure that we
> don't introduce too much latency per request.
>
> Your situation is no different from many other setups, where the
> hw limit is much higher than the default 1280k.
>
Hi Martin, Jens,
It seems more reasonable to change software limitation by udev rule,
thanks you.
>
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: David Miller @ 2018-04-08 16:32 UTC (permalink / raw)
To: loseweigh
Cc: alexander.h.duyck, virtio-dev, jiri, mst, kubakici,
sridhar.samudrala, virtualization, netdev, dsahern, si-wei.liu
In-Reply-To: <CADGSJ206RKi704uoLe3xVs6PyWpURPLb75OPCM_YO9DfnnEzpw@mail.gmail.com>
From: Siwei Liu <loseweigh@gmail.com>
Date: Fri, 6 Apr 2018 19:32:05 -0700
> And I assume everyone here understands the use case for live
> migration (in the context of providing cloud service) is very
> different, and we have to hide the netdevs. If not, I'm more than
> happy to clarify.
I think you still need to clarify.
netdevs are netdevs. If they have special attributes, mark them as
such and the tools base their actions upon that.
"Hiding", or changing classes, doesn't make any sense to me still.
^ permalink raw reply
* Re: [PATCH] vhost-net: set packet weight of tx polling to 2 * vq size
From: David Miller @ 2018-04-08 16:52 UTC (permalink / raw)
To: haibinzhang
Cc: kvm, mst, netdev, linux-kernel, virtualization, yunfangtai,
lidongchen
In-Reply-To: <88D661ADF6AFBF42B2AB88D8E7682B0901FC47D3@EXMBX-SZMAIL011.tencent.com>
From: haibinzhang(张海斌) <haibinzhang@tencent.com>
Date: Fri, 6 Apr 2018 08:22:37 +0000
> handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy
> polling udp packets with small length(e.g. 1byte udp payload), because setting
> VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet length.
>
> Ping-Latencies shown below were tested between two Virtual Machines using
> netperf (UDP_STREAM, len=1), and then another machine pinged the client:
...
> Signed-off-by: Haibin Zhang <haibinzhang@tencent.com>
> Signed-off-by: Yunfang Tai <yunfangtai@tencent.com>
> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Michael and Jason, please review.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: kernel BUG at drivers/vhost/vhost.c:LINE! (2)
From: Stefan Hajnoczi @ 2018-04-09 2:37 UTC (permalink / raw)
To: syzbot
Cc: kvm, Michael S. Tsirkin, netdev, syzkaller-bugs, linux-kernel,
Linux Virtualization
In-Reply-To: <001a11449424f11322056932b09c@google.com>
On Sat, Apr 7, 2018 at 3:02 AM, syzbot
<syzbot+65a84dde0214b0387ccd@syzkaller.appspotmail.com> wrote:
> syzbot hit the following crash on upstream commit
> 38c23685b273cfb4ccf31a199feccce3bdcb5d83 (Fri Apr 6 04:29:35 2018 +0000)
> Merge tag 'armsoc-drivers' of
> git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc
> syzbot dashboard link:
> https://syzkaller.appspot.com/bug?extid=65a84dde0214b0387ccd
To prevent duplicated work: I am working on this one.
Stefan
>
> So far this crash happened 4 times on upstream.
> C reproducer: https://syzkaller.appspot.com/x/repro.c?id=6586748079439872
> syzkaller reproducer:
> https://syzkaller.appspot.com/x/repro.syz?id=5974272052822016
> Raw console output:
> https://syzkaller.appspot.com/x/log.txt?id=6224632407392256
> Kernel config:
> https://syzkaller.appspot.com/x/.config?id=-5813481738265533882
> compiler: gcc (GCC) 8.0.1 20180301 (experimental)
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+65a84dde0214b0387ccd@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for
> details.
> If you forward the report, please keep this part and the footer.
>
> ------------[ cut here ]------------
> kernel BUG at drivers/vhost/vhost.c:1652!
> invalid opcode: 0000 [#1] SMP KASAN
> ------------[ cut here ]------------
> Dumping ftrace buffer:
> kernel BUG at drivers/vhost/vhost.c:1652!
> (ftrace buffer empty)
> Modules linked in:
> CPU: 1 PID: 4461 Comm: syzkaller684218 Not tainted 4.16.0+ #3
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> RIP: 0010:set_bit_to_user drivers/vhost/vhost.c:1652 [inline]
> RIP: 0010:log_write+0x42a/0x4d0 drivers/vhost/vhost.c:1676
> RSP: 0018:ffff8801b256f920 EFLAGS: 00010293
> RAX: ffff8801adc9e2c0 RBX: dffffc0000000000 RCX: ffffffff85924a0f
> RDX: 0000000000000000 RSI: ffffffff85924cea RDI: 0000000000000005
> RBP: ffff8801b256fa58 R08: ffff8801adc9e2c0 R09: ffffed003962412d
> R10: ffff8801b256fad8 R11: ffff8801cb12096f R12: 0001ffffffffffff
> R13: ffffed00364adf36 R14: 0000000000000000 R15: ffff8801b256fa30
> FS: 00007fdf24b19700(0000) GS:ffff8801db100000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000020bf6000 CR3: 00000001ae6a7000 CR4: 00000000001406e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
> vhost_update_used_flags+0x3af/0x4a0 drivers/vhost/vhost.c:1723
> vhost_vq_init_access+0x117/0x590 drivers/vhost/vhost.c:1763
> vhost_vsock_start drivers/vhost/vsock.c:446 [inline]
> vhost_vsock_dev_ioctl+0x751/0x920 drivers/vhost/vsock.c:678
> vfs_ioctl fs/ioctl.c:46 [inline]
> file_ioctl fs/ioctl.c:500 [inline]
> do_vfs_ioctl+0x1cf/0x1650 fs/ioctl.c:684
> ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701
> SYSC_ioctl fs/ioctl.c:708 [inline]
> SyS_ioctl+0x24/0x30 fs/ioctl.c:706
> do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287
> entry_SYSCALL_64_after_hwframe+0x42/0xb7
> RIP: 0033:0x4456c9
> RSP: 002b:00007fdf24b18da8 EFLAGS: 00000297 ORIG_RAX: 0000000000000010
> RAX: ffffffffffffffda RBX: 00000000006dac24 RCX: 00000000004456c9
> RDX: 0000000020f82ffc RSI: 000000004004af61 RDI: 000000000000001b
> RBP: 00000000006dac20 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000297 R12: 6b636f73762d7473
> R13: 6f68762f7665642f R14: fffffffffffffffc R15: 0000000000000007
> Code: e8 7c 5e e4 fb 4c 89 ef e8 e4 16 06 fc 48 8d 85 58 ff ff ff 48 c1 e8
> 03 c6 04 18 f8 e9 46 ff ff ff 45 31 f6 eb 91 e8 56 5e e4 fb <0f> 0b e8 4f 5e
> e4 fb 48 c7 c6 a0 a3 24 88 4c 89 ef e8 60 b6 10
> RIP: set_bit_to_user drivers/vhost/vhost.c:1652 [inline] RSP:
> ffff8801b256f920
> RIP: log_write+0x42a/0x4d0 drivers/vhost/vhost.c:1676 RSP: ffff8801b256f920
> invalid opcode: 0000 [#2] SMP KASAN
> ---[ end trace 0d0ff45aa44d8a23 ]---
> Dumping ftrace buffer:
> (ftrace buffer empty)
> Modules linked in:
>
>
> ---
> This bug is generated by a dumb bot. It may contain errors.
> See https://goo.gl/tpsmEJ for details.
> Direct all questions to syzkaller@googlegroups.com.
>
> syzbot will keep track of this bug report.
> If you forgot to add the Reported-by tag, once the fix for this bug is
> merged
> into any tree, please reply to this email with:
> #syz fix: exact-commit-title
> If you want to test a patch for this bug, please reply with:
> #syz test: git://repo/address.git branch
> and provide the patch inline or as an attachment.
> To mark this as a duplicate of another syzbot report, please reply with:
> #syz dup: exact-subject-of-another-report
> If it's a one-off invalid bug report, please reply with:
> #syz invalid
> Note: if the crash happens again, it will cause creation of a new bug
> report.
> Note: all commands must start from beginning of the line in the email body.
^ permalink raw reply
* Re: [PATCH] vhost-net: set packet weight of tx polling to 2 * vq size
From: Michael S. Tsirkin @ 2018-04-09 2:42 UTC (permalink / raw)
To: haibinzhang(张海斌)
Cc: kvm@vger.kernel.org, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org,
virtualization@lists.linux-foundation.org,
yunfangtai(台运方),
lidongchen(陈立东)
In-Reply-To: <88D661ADF6AFBF42B2AB88D8E7682B0901FC47D3@EXMBX-SZMAIL011.tencent.com>
On Fri, Apr 06, 2018 at 08:22:37AM +0000, haibinzhang(张海斌) wrote:
> handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy
> polling udp packets with small length(e.g. 1byte udp payload), because setting
> VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet length.
>
> Ping-Latencies shown below were tested between two Virtual Machines using
> netperf (UDP_STREAM, len=1), and then another machine pinged the client:
>
> Packet-Weight Ping-Latencies(millisecond)
> min avg max
> Origin 3.319 18.489 57.303
> 64 1.643 2.021 2.552
> 128 1.825 2.600 3.224
> 256 1.997 2.710 4.295
> 512 1.860 3.171 4.631
> 1024 2.002 4.173 9.056
> 2048 2.257 5.650 9.688
> 4096 2.093 8.508 15.943
And this is with Q size 256 right?
> Ring size is a hint from device about a burst size it can tolerate. Based on
> benchmarks, set the weight to 2 * vq size.
>
> To evaluate this change, another tests were done using netperf(RR, TX) between
> two machines with Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz, and vq size was
> tweaked through qemu. Results shown below does not show obvious changes.
What I asked for is ping-latency with different VQ sizes,
streaming below does not show anything.
> vq size=256 TCP_RR vq size=512 TCP_RR
> size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
> 1/ 1/ -7%/ -2% 1/ 1/ 0%/ -2%
> 1/ 4/ +1%/ 0% 1/ 4/ +1%/ 0%
> 1/ 8/ +1%/ -2% 1/ 8/ 0%/ +1%
> 64/ 1/ -6%/ 0% 64/ 1/ +7%/ +3%
> 64/ 4/ 0%/ +2% 64/ 4/ -1%/ +1%
> 64/ 8/ 0%/ 0% 64/ 8/ -1%/ -2%
> 256/ 1/ -3%/ -4% 256/ 1/ -4%/ -2%
> 256/ 4/ +3%/ +4% 256/ 4/ +1%/ +2%
> 256/ 8/ +2%/ 0% 256/ 8/ +1%/ -1%
>
> vq size=256 UDP_RR vq size=512 UDP_RR
> size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
> 1/ 1/ -5%/ +1% 1/ 1/ -3%/ -2%
> 1/ 4/ +4%/ +1% 1/ 4/ -2%/ +2%
> 1/ 8/ -1%/ -1% 1/ 8/ -1%/ 0%
> 64/ 1/ -2%/ -3% 64/ 1/ +1%/ +1%
> 64/ 4/ -5%/ -1% 64/ 4/ +2%/ 0%
> 64/ 8/ 0%/ -1% 64/ 8/ -2%/ +1%
> 256/ 1/ +7%/ +1% 256/ 1/ -7%/ 0%
> 256/ 4/ +1%/ +1% 256/ 4/ -3%/ -4%
> 256/ 8/ +2%/ +2% 256/ 8/ +1%/ +1%
>
> vq size=256 TCP_STREAM vq size=512 TCP_STREAM
> size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
> 64/ 1/ 0%/ -3% 64/ 1/ 0%/ 0%
> 64/ 4/ +3%/ -1% 64/ 4/ -2%/ +4%
> 64/ 8/ +9%/ -4% 64/ 8/ -1%/ +2%
> 256/ 1/ +1%/ -4% 256/ 1/ +1%/ +1%
> 256/ 4/ -1%/ -1% 256/ 4/ -3%/ 0%
> 256/ 8/ +7%/ +5% 256/ 8/ -3%/ 0%
> 512/ 1/ +1%/ 0% 512/ 1/ -1%/ -1%
> 512/ 4/ +1%/ -1% 512/ 4/ 0%/ 0%
> 512/ 8/ +7%/ -5% 512/ 8/ +6%/ -1%
> 1024/ 1/ 0%/ -1% 1024/ 1/ 0%/ +1%
> 1024/ 4/ +3%/ 0% 1024/ 4/ +1%/ 0%
> 1024/ 8/ +8%/ +5% 1024/ 8/ -1%/ 0%
> 2048/ 1/ +2%/ +2% 2048/ 1/ -1%/ 0%
> 2048/ 4/ +1%/ 0% 2048/ 4/ 0%/ -1%
> 2048/ 8/ -2%/ 0% 2048/ 8/ 5%/ -1%
> 4096/ 1/ -2%/ 0% 4096/ 1/ -2%/ 0%
> 4096/ 4/ +2%/ 0% 4096/ 4/ 0%/ 0%
> 4096/ 8/ +9%/ -2% 4096/ 8/ -5%/ -1%
>
> Signed-off-by: Haibin Zhang <haibinzhang@tencent.com>
> Signed-off-by: Yunfang Tai <yunfangtai@tencent.com>
> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Code is fine but I'd like to see validation of the heuristic
2*vq->num with another vq size.
> ---
> drivers/vhost/net.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 8139bc70ad7d..3563a305cc0a 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -44,6 +44,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
> * Using this limit prevents one virtqueue from starving others. */
> #define VHOST_NET_WEIGHT 0x80000
>
> +/* Max number of packets transferred before requeueing the job.
> + * Using this limit prevents one virtqueue from starving rx. */
> +#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2)
> +
> /* MAX number of TX used buffers for outstanding zerocopy */
> #define VHOST_MAX_PEND 128
> #define VHOST_GOODCOPY_LEN 256
> @@ -473,6 +477,7 @@ static void handle_tx(struct vhost_net *net)
> struct socket *sock;
> struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
> bool zcopy, zcopy_used;
> + int sent_pkts = 0;
>
> mutex_lock(&vq->mutex);
> sock = vq->private_data;
> @@ -580,7 +585,8 @@ static void handle_tx(struct vhost_net *net)
> else
> vhost_zerocopy_signal_used(net, vq);
> vhost_net_tx_packet(net);
> - if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> + if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
> + unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) {
> vhost_poll_queue(&vq->poll);
> break;
> }
> --
> 2.12.3
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: kernel BUG at drivers/vhost/vhost.c:LINE! (2)
From: Michael S. Tsirkin @ 2018-04-09 2:44 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: kvm, netdev, syzkaller-bugs, syzbot, linux-kernel,
Linux Virtualization
In-Reply-To: <CAJSP0QVHk7NSQcjfLWq13=1Vzm=_vtmKZqp4dDjuh8ETLO5g_g@mail.gmail.com>
On Mon, Apr 09, 2018 at 10:37:45AM +0800, Stefan Hajnoczi wrote:
> On Sat, Apr 7, 2018 at 3:02 AM, syzbot
> <syzbot+65a84dde0214b0387ccd@syzkaller.appspotmail.com> wrote:
> > syzbot hit the following crash on upstream commit
> > 38c23685b273cfb4ccf31a199feccce3bdcb5d83 (Fri Apr 6 04:29:35 2018 +0000)
> > Merge tag 'armsoc-drivers' of
> > git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc
> > syzbot dashboard link:
> > https://syzkaller.appspot.com/bug?extid=65a84dde0214b0387ccd
>
> To prevent duplicated work: I am working on this one.
>
> Stefan
Do you want to try this patchset:
https://lkml.org/lkml/2018/4/5/665
?
--
MST
^ permalink raw reply
* Re: kernel BUG at drivers/vhost/vhost.c:LINE! (2)
From: Stefan Hajnoczi @ 2018-04-09 3:28 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: kvm, netdev, syzkaller-bugs, syzbot, linux-kernel,
Linux Virtualization
In-Reply-To: <20180409054316-mutt-send-email-mst@kernel.org>
[-- Attachment #1.1: Type: text/plain, Size: 961 bytes --]
On Mon, Apr 09, 2018 at 05:44:36AM +0300, Michael S. Tsirkin wrote:
> On Mon, Apr 09, 2018 at 10:37:45AM +0800, Stefan Hajnoczi wrote:
> > On Sat, Apr 7, 2018 at 3:02 AM, syzbot
> > <syzbot+65a84dde0214b0387ccd@syzkaller.appspotmail.com> wrote:
> > > syzbot hit the following crash on upstream commit
> > > 38c23685b273cfb4ccf31a199feccce3bdcb5d83 (Fri Apr 6 04:29:35 2018 +0000)
> > > Merge tag 'armsoc-drivers' of
> > > git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc
> > > syzbot dashboard link:
> > > https://syzkaller.appspot.com/bug?extid=65a84dde0214b0387ccd
> >
> > To prevent duplicated work: I am working on this one.
> >
> > Stefan
>
> Do you want to try this patchset:
> https://lkml.org/lkml/2018/4/5/665
>
> ?
Thanks, I'll give it a shot.
I also noticed a regression in commit
d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log when
IOTLB is enabled") and am currently testing a fix.
Stefan
[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 455 bytes --]
[-- Attachment #2: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH] vhost-net: set packet weight of tx polling to 2 * vq size
From: Michael S. Tsirkin @ 2018-04-09 5:46 UTC (permalink / raw)
To: haibinzhang(张海斌)
Cc: kvm@vger.kernel.org, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org,
virtualization@lists.linux-foundation.org,
yunfangtai(台运方),
lidongchen(陈立东)
In-Reply-To: <88D661ADF6AFBF42B2AB88D8E7682B0901FC49BE@EXMBX-SZMAIL011.tencent.com>
On Mon, Apr 09, 2018 at 04:09:20AM +0000, haibinzhang(张海斌) wrote:
>
> > On Fri, Apr 06, 2018 at 08:22:37AM +0000, haibinzhang(张海斌) wrote:
> > > handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy
> > > polling udp packets with small length(e.g. 1byte udp payload), because setting
> > > VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet length.
> > >
> > > Ping-Latencies shown below were tested between two Virtual Machines using
> > > netperf (UDP_STREAM, len=1), and then another machine pinged the client:
> > >
> > > Packet-Weight Ping-Latencies(millisecond)
> > > min avg max
> > > Origin 3.319 18.489 57.303
> > > 64 1.643 2.021 2.552
> > > 128 1.825 2.600 3.224
> > > 256 1.997 2.710 4.295
> > > 512 1.860 3.171 4.631
> > > 1024 2.002 4.173 9.056
> > > 2048 2.257 5.650 9.688
> > > 4096 2.093 8.508 15.943
> >
> > And this is with Q size 256 right?
>
> Yes. Ping-latencies with 512 VQ size show below.
>
> Packet-Weight Ping-Latencies(millisecond)
> min avg max
> Origin 6.357 29.177 66.245
> 64 2.798 3.614 4.403
> 128 2.861 3.820 4.775
> 256 3.008 4.018 4.807
> 512 3.254 4.523 5.824
> 1024 3.079 5.335 7.747
> 2048 3.944 8.201 12.762
> 4096 4.158 11.057 19.985
>
> We will submit again. Is there anything else?
Seems pretty consistent, a small dip at 2 VQ sizes.
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> >
> > > Ring size is a hint from device about a burst size it can tolerate. Based on
> > > benchmarks, set the weight to 2 * vq size.
> > >
> > > To evaluate this change, another tests were done using netperf(RR, TX) between
> > > two machines with Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz, and vq size was
> > > tweaked through qemu. Results shown below does not show obvious changes.
> >
> > What I asked for is ping-latency with different VQ sizes,
> > streaming below does not show anything.
> >
> > > vq size=256 TCP_RR vq size=512 TCP_RR
> > > size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
> > > 1/ 1/ -7%/ -2% 1/ 1/ 0%/ -2%
> > > 1/ 4/ +1%/ 0% 1/ 4/ +1%/ 0%
> > > 1/ 8/ +1%/ -2% 1/ 8/ 0%/ +1%
> > > 64/ 1/ -6%/ 0% 64/ 1/ +7%/ +3%
> > > 64/ 4/ 0%/ +2% 64/ 4/ -1%/ +1%
> > > 64/ 8/ 0%/ 0% 64/ 8/ -1%/ -2%
> > > 256/ 1/ -3%/ -4% 256/ 1/ -4%/ -2%
> > > 256/ 4/ +3%/ +4% 256/ 4/ +1%/ +2%
> > > 256/ 8/ +2%/ 0% 256/ 8/ +1%/ -1%
> > >
> > > vq size=256 UDP_RR vq size=512 UDP_RR
> > > size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
> > > 1/ 1/ -5%/ +1% 1/ 1/ -3%/ -2%
> > > 1/ 4/ +4%/ +1% 1/ 4/ -2%/ +2%
> > > 1/ 8/ -1%/ -1% 1/ 8/ -1%/ 0%
> > > 64/ 1/ -2%/ -3% 64/ 1/ +1%/ +1%
> > > 64/ 4/ -5%/ -1% 64/ 4/ +2%/ 0%
> > > 64/ 8/ 0%/ -1% 64/ 8/ -2%/ +1%
> > > 256/ 1/ +7%/ +1% 256/ 1/ -7%/ 0%
> > > 256/ 4/ +1%/ +1% 256/ 4/ -3%/ -4%
> > > 256/ 8/ +2%/ +2% 256/ 8/ +1%/ +1%
> > >
> > > vq size=256 TCP_STREAM vq size=512 TCP_STREAM
> > > size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize%
> > > 64/ 1/ 0%/ -3% 64/ 1/ 0%/ 0%
> > > 64/ 4/ +3%/ -1% 64/ 4/ -2%/ +4%
> > > 64/ 8/ +9%/ -4% 64/ 8/ -1%/ +2%
> > > 256/ 1/ +1%/ -4% 256/ 1/ +1%/ +1%
> > > 256/ 4/ -1%/ -1% 256/ 4/ -3%/ 0%
> > > 256/ 8/ +7%/ +5% 256/ 8/ -3%/ 0%
> > > 512/ 1/ +1%/ 0% 512/ 1/ -1%/ -1%
> > > 512/ 4/ +1%/ -1% 512/ 4/ 0%/ 0%
> > > 512/ 8/ +7%/ -5% 512/ 8/ +6%/ -1%
> > > 1024/ 1/ 0%/ -1% 1024/ 1/ 0%/ +1%
> > > 1024/ 4/ +3%/ 0% 1024/ 4/ +1%/ 0%
> > > 1024/ 8/ +8%/ +5% 1024/ 8/ -1%/ 0%
> > > 2048/ 1/ +2%/ +2% 2048/ 1/ -1%/ 0%
> > > 2048/ 4/ +1%/ 0% 2048/ 4/ 0%/ -1%
> > > 2048/ 8/ -2%/ 0% 2048/ 8/ 5%/ -1%
> > > 4096/ 1/ -2%/ 0% 4096/ 1/ -2%/ 0%
> > > 4096/ 4/ +2%/ 0% 4096/ 4/ 0%/ 0%
> > > 4096/ 8/ +9%/ -2% 4096/ 8/ -5%/ -1%
> > >
> > > Signed-off-by: Haibin Zhang <haibinzhang@tencent.com>
> > > Signed-off-by: Yunfang Tai <yunfangtai@tencent.com>
> > > Signed-off-by: Lidong Chen <lidongchen@tencent.com>
> >
> > Code is fine but I'd like to see validation of the heuristic
> > 2*vq->num with another vq size.
> >
> >
> >
> > > ---
> > > drivers/vhost/net.c | 8 +++++++-
> > > 1 file changed, 7 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index 8139bc70ad7d..3563a305cc0a 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -44,6 +44,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
> > > * Using this limit prevents one virtqueue from starving others. */
> > > #define VHOST_NET_WEIGHT 0x80000
> > >
> > > +/* Max number of packets transferred before requeueing the job.
> > > + * Using this limit prevents one virtqueue from starving rx. */
> > > +#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2)
> > > +
> > > /* MAX number of TX used buffers for outstanding zerocopy */
> > > #define VHOST_MAX_PEND 128
> > > #define VHOST_GOODCOPY_LEN 256
> > > @@ -473,6 +477,7 @@ static void handle_tx(struct vhost_net *net)
> > > struct socket *sock;
> > > struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
> > > bool zcopy, zcopy_used;
> > > + int sent_pkts = 0;
> > >
> > > mutex_lock(&vq->mutex);
> > > sock = vq->private_data;
> > > @@ -580,7 +585,8 @@ static void handle_tx(struct vhost_net *net)
> > > else
> > > vhost_zerocopy_signal_used(net, vq);
> > > vhost_net_tx_packet(net);
> > > - if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> > > + if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
> > > + unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) {
> > > vhost_poll_queue(&vq->poll);
> > > break;
> > > }
> > > --
> > > 2.12.3
> > >
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH v31 2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
From: Michael S. Tsirkin @ 2018-04-09 6:03 UTC (permalink / raw)
To: Wei Wang
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
liliang.opensource, linux-kernel, mhocko, linux-mm, huangzhichao,
pbonzini, akpm, virtualization
In-Reply-To: <1523017045-18315-3-git-send-email-wei.w.wang@intel.com>
On Fri, Apr 06, 2018 at 08:17:23PM +0800, Wei Wang wrote:
> Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the
> support of reporting hints of guest free pages to host via virtio-balloon.
>
> Host requests the guest to report free page hints by sending a new cmd
> id to the guest via the free_page_report_cmd_id configuration register.
>
> When the guest starts to report, the first element added to the free page
> vq is the cmd id given by host. When the guest finishes the reporting
> of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added
> to the vq to tell host that the reporting is done. Host polls the free
> page vq after sending the starting cmd id, so the guest doesn't need to
> kick after filling an element to the vq.
>
> Host may also requests the guest to stop the reporting in advance by
> sending the stop cmd id to the guest via the configuration register.
>
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Michal Hocko <mhocko@kernel.org>
Pretty good by now, Minor comments below.
> ---
> drivers/virtio/virtio_balloon.c | 272 +++++++++++++++++++++++++++++++-----
> include/uapi/linux/virtio_balloon.h | 4 +
> 2 files changed, 240 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index dfe5684..aef73ee 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -51,9 +51,22 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
> static struct vfsmount *balloon_mnt;
> #endif
>
> +enum virtio_balloon_vq {
> + VIRTIO_BALLOON_VQ_INFLATE,
> + VIRTIO_BALLOON_VQ_DEFLATE,
> + VIRTIO_BALLOON_VQ_STATS,
> + VIRTIO_BALLOON_VQ_FREE_PAGE,
> + VIRTIO_BALLOON_VQ_MAX
> +};
> +
> struct virtio_balloon {
> struct virtio_device *vdev;
> - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
> + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> +
> + /* Balloon's own wq for cpu-intensive work items */
> + struct workqueue_struct *balloon_wq;
> + /* The free page reporting work item submitted to the balloon wq */
> + struct work_struct report_free_page_work;
>
> /* The balloon servicing is delegated to a freezable workqueue. */
> struct work_struct update_balloon_stats_work;
> @@ -63,6 +76,13 @@ struct virtio_balloon {
> spinlock_t stop_update_lock;
> bool stop_update;
>
> + /* The new cmd id received from host */
> + uint32_t cmd_id_received;
> + /* The cmd id that is in use */
> + __virtio32 cmd_id_use;
I'd prefer cmd_id_active but it's not critical.
> + /* Buffer to store the stop sign */
> + __virtio32 stop_cmd_id;
> +
> /* Waiting for host to ack the pages we released. */
> wait_queue_head_t acked;
>
> @@ -320,17 +340,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
> virtqueue_kick(vq);
> }
>
> -static void virtballoon_changed(struct virtio_device *vdev)
> -{
> - struct virtio_balloon *vb = vdev->priv;
> - unsigned long flags;
> -
> - spin_lock_irqsave(&vb->stop_update_lock, flags);
> - if (!vb->stop_update)
> - queue_work(system_freezable_wq, &vb->update_balloon_size_work);
> - spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> -}
> -
> static inline s64 towards_target(struct virtio_balloon *vb)
> {
> s64 target;
> @@ -347,6 +356,34 @@ static inline s64 towards_target(struct virtio_balloon *vb)
> return target - vb->num_pages;
> }
>
> +static void virtballoon_changed(struct virtio_device *vdev)
> +{
> + struct virtio_balloon *vb = vdev->priv;
> + unsigned long flags;
> + s64 diff = towards_target(vb);
> +
> + if (diff) {
> + spin_lock_irqsave(&vb->stop_update_lock, flags);
> + if (!vb->stop_update)
> + queue_work(system_freezable_wq,
> + &vb->update_balloon_size_work);
> + spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> + }
> +
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> + virtio_cread(vdev, struct virtio_balloon_config,
> + free_page_report_cmd_id, &vb->cmd_id_received);
> + if (vb->cmd_id_received !=
> + VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID) {
> + spin_lock_irqsave(&vb->stop_update_lock, flags);
> + if (!vb->stop_update)
> + queue_work(vb->balloon_wq,
> + &vb->report_free_page_work);
> + spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> + }
> + }
> +}
> +
> static void update_balloon_size(struct virtio_balloon *vb)
> {
> u32 actual = vb->num_pages;
> @@ -421,42 +458,178 @@ static void update_balloon_size_func(struct work_struct *work)
>
> static int init_vqs(struct virtio_balloon *vb)
> {
> - struct virtqueue *vqs[3];
> - vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
> - static const char * const names[] = { "inflate", "deflate", "stats" };
> - int err, nvqs;
> + struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX];
> + vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX];
> + const char *names[VIRTIO_BALLOON_VQ_MAX];
> + struct scatterlist sg;
> + int ret;
>
> /*
> - * We expect two virtqueues: inflate and deflate, and
> - * optionally stat.
> + * Inflateq and deflateq are used unconditionally. The names[]
> + * will be NULL if the related feature is not enabled, which will
> + * cause no allocation for the corresponding virtqueue in find_vqs.
> */
> - nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
> - err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
> - if (err)
> - return err;
> + callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack;
> + names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate";
> + callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack;
> + names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> + names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> + names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>
> - vb->inflate_vq = vqs[0];
> - vb->deflate_vq = vqs[1];
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> - struct scatterlist sg;
> - unsigned int num_stats;
> - vb->stats_vq = vqs[2];
> + names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> + callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request;
> + }
>
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> + names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq";
> + callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> + }
> +
> + ret = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> + vqs, callbacks, names, NULL, NULL);
> + if (ret)
> + return ret;
> +
> + vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> + vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> + vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS];
> /*
> * Prime this virtqueue with one buffer so the hypervisor can
> * use it to signal us later (it can't be broken yet!).
> */
> - num_stats = update_balloon_stats(vb);
> -
> - sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
> - if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
> - < 0)
> - BUG();
> + sg_init_one(&sg, vb->stats, sizeof(vb->stats));
> + ret = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb,
> + GFP_KERNEL);
> + if (ret) {
> + dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
> + __func__);
> + return ret;
> + }
> virtqueue_kick(vb->stats_vq);
> }
> +
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
> + vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE];
> +
> return 0;
> }
>
> +static int add_one_sg(struct virtqueue *vq, unsigned long pfn, uint32_t len)
> +{
> + struct scatterlist sg;
> + unsigned int unused;
> +
> + sg_init_table(&sg, 1);
> + sg_set_page(&sg, pfn_to_page(pfn), len, 0);
> +
> + /* Detach all the used buffers from the vq */
> + while (virtqueue_get_buf(vq, &unused))
> + ;
> +
> + /*
> + * Since this is an optimization feature, losing a couple of free
> + * pages to report isn't important. We simply return without adding
> + * this page hint if the vq is full.
> + * We are adding one entry each time, which essentially results in no
> + * memory allocation, so the GFP_KERNEL flag below can be ignored.
> + * Host works by polling the free page vq for hints after sending the
> + * starting cmd id, so the driver doesn't need to kick after filling
> + * the vq.
> + * Lastly, there is always one entry reserved for the cmd id to use.
> + *
> + * TODO: The current implementation could be further improved by
> + * stopping the reporting when the vq is full and continuing the
> + * reporting when host notifies the driver that entries have been
> + * used.
> + */
> + if (vq->num_free > 1)
> + return virtqueue_add_inbuf(vq, &sg, 1, vq, GFP_KERNEL);
> +
> + return 0;
> +}
> +
> +static int virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
> + unsigned long nr_pages)
> +{
> + struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
> + uint32_t len = nr_pages << PAGE_SHIFT;
> +
> + /*
> + * If a stop id or a new cmd id was just received from host, stop
> + * the reporting, and return -EINTR to indicate an active stop.
> + *
> + * Ideally, we could have cmd_id_received accessed under locks, which
> + * ensures that no more entries are added to the vq once a stop cmd
> + * id is received from host. This requires host to wait for the
> + * driver's ACK about finishing the update of cmd_id_received. But
> + * this is not how host side works, because host doesn't work on an
> + * assumption that the driver would always be responsive. So
> + * theorically, there are possibilities that some entries may stay in
> + * the vq when host has exited the optimization. This isn't an issue,
> + * because entries simply contain guest physical addresses. There is no
> + * allocated memory that needs to be freed or dma mapped pages that
> + * need to be unmapped since virtio-balloon works with
> + * VIRTIO_F_IOMMU_PLATFORM disabled.
> + */
> + if (virtio32_to_cpu(vb->vdev, vb->cmd_id_use) != vb->cmd_id_received)
> + return -EINTR;
> +
> + return add_one_sg(vb->free_page_vq, pfn, len);
> +}
> +
> +static int send_start_cmd_id(struct virtio_balloon *vb, uint32_t cmd_id)
> +{
> + struct scatterlist sg;
> + struct virtqueue *vq = vb->free_page_vq;
> +
> + vb->cmd_id_use = cpu_to_virtio32(vb->vdev, cmd_id);
> + sg_init_one(&sg, &vb->cmd_id_use, sizeof(vb->cmd_id_use));
> + return virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
> +}
> +
> +static int send_stop_cmd_id(struct virtio_balloon *vb)
> +{
> + struct scatterlist sg;
> + struct virtqueue *vq = vb->free_page_vq;
> +
> + sg_init_one(&sg, &vb->stop_cmd_id, sizeof(vb->stop_cmd_id));
> + return virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
> +}
> +
> +static void report_free_page_func(struct work_struct *work)
> +{
> + struct virtio_balloon *vb;
> + struct virtqueue *vq;
> + unsigned int unused;
> + int ret;
> +
> + vb = container_of(work, struct virtio_balloon, report_free_page_work);
> + vq = vb->free_page_vq;
> +
> + /* Start by sending the received cmd id to host with an outbuf. */
> + ret = send_start_cmd_id(vb, vb->cmd_id_received);
> + if (unlikely(ret))
> + goto err;
> +
> + ret = walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> + if (unlikely(ret == -EIO))
> + goto err;
why is EIO special? I think you should special-case EINTR maybe.
> +
> + /* End by sending a stop id to host with an outbuf. */
> + ret = send_stop_cmd_id(vb);
> + if (likely(!ret)) {
What happens on failure? Don't we need to detach anyway?
> + /* Ending: detach all the used buffers from the vq. */
> + while (vq->num_free != virtqueue_get_vring_size(vq))
> + virtqueue_get_buf(vq, &unused);
This isn't all that happens here. It also waits for buffers to
be consumed. Is this by design? And why is it good idea to
busy poll while doing it?
> + return;
Better move this return outside. Fall through to error is confusing.
> + }
> +err:
> + dev_err(&vb->vdev->dev, "%s: free page vq failure, ret=%d\n",
> + __func__, ret);
> +}
> +
> #ifdef CONFIG_BALLOON_COMPACTION
> /*
> * virtballoon_migratepage - perform the balloon page migration on behalf of
> @@ -570,18 +743,36 @@ static int virtballoon_probe(struct virtio_device *vdev)
> if (err)
> goto out_free_vb;
>
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> + /*
> + * There is always one entry reserved for cmd id, so the ring
> + * size needs to be at least two to report free page hints.
> + */
> + if (virtqueue_get_vring_size(vb->free_page_vq) < 2)
> + goto out_free_vb;
> + vb->balloon_wq = alloc_workqueue("balloon-wq",
> + WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
> + if (!vb->balloon_wq) {
> + err = -ENOMEM;
> + goto out_del_vqs;
> + }
> + vb->stop_cmd_id = cpu_to_virtio32(vb->vdev,
> + VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
> + INIT_WORK(&vb->report_free_page_work, report_free_page_func);
> + }
> +
> vb->nb.notifier_call = virtballoon_oom_notify;
> vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
> err = register_oom_notifier(&vb->nb);
> if (err < 0)
> - goto out_del_vqs;
> + goto out_del_balloon_wq;
>
> #ifdef CONFIG_BALLOON_COMPACTION
> balloon_mnt = kern_mount(&balloon_fs);
> if (IS_ERR(balloon_mnt)) {
> err = PTR_ERR(balloon_mnt);
> unregister_oom_notifier(&vb->nb);
> - goto out_del_vqs;
> + goto out_del_balloon_wq;
> }
>
> vb->vb_dev_info.migratepage = virtballoon_migratepage;
> @@ -591,7 +782,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
> kern_unmount(balloon_mnt);
> unregister_oom_notifier(&vb->nb);
> vb->vb_dev_info.inode = NULL;
> - goto out_del_vqs;
> + goto out_del_balloon_wq;
> }
> vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
> #endif
> @@ -602,6 +793,9 @@ static int virtballoon_probe(struct virtio_device *vdev)
> virtballoon_changed(vdev);
> return 0;
>
> +out_del_balloon_wq:
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
> + destroy_workqueue(vb->balloon_wq);
> out_del_vqs:
> vdev->config->del_vqs(vdev);
> out_free_vb:
> @@ -635,6 +829,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
> cancel_work_sync(&vb->update_balloon_size_work);
> cancel_work_sync(&vb->update_balloon_stats_work);
>
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> + cancel_work_sync(&vb->report_free_page_work);
> + destroy_workqueue(vb->balloon_wq);
> + }
> +
> remove_common(vb);
> #ifdef CONFIG_BALLOON_COMPACTION
> if (vb->vb_dev_info.inode)
> @@ -686,6 +885,7 @@ static unsigned int features[] = {
> VIRTIO_BALLOON_F_MUST_TELL_HOST,
> VIRTIO_BALLOON_F_STATS_VQ,
> VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> + VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> };
>
> static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index 4e8b830..b2d86c2 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -34,15 +34,19 @@
> #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */
> #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */
> #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */
> +#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
>
> /* Size of a PFN in the balloon interface. */
> #define VIRTIO_BALLOON_PFN_SHIFT 12
>
> +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID 0
> struct virtio_balloon_config {
> /* Number of pages host wants Guest to give up. */
> __u32 num_pages;
> /* Number of pages we've actually got in balloon. */
> __u32 actual;
> + /* Free page report command id, readonly by guest */
> + __u32 free_page_report_cmd_id;
> };
>
> #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */
> --
> 2.7.4
^ permalink raw reply
* Re: [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Jiri Pirko @ 2018-04-09 8:07 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
virtualization, loseweigh, davem
In-Reply-To: <bb4f0821-246f-1052-de8b-d96be3388ed3@intel.com>
Sat, Apr 07, 2018 at 12:59:14AM CEST, sridhar.samudrala@intel.com wrote:
>On 4/6/2018 5:48 AM, Jiri Pirko wrote:
>> Thu, Apr 05, 2018 at 11:08:22PM CEST, sridhar.samudrala@intel.com wrote:
[...]
>> > +static int virtnet_bypass_join_child(struct net_device *bypass_netdev,
>> > + struct net_device *child_netdev)
>> > +{
>> > + struct virtnet_bypass_info *vbi;
>> > + bool backup;
>> > +
>> > + vbi = netdev_priv(bypass_netdev);
>> > + backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>> > + if (backup ? rtnl_dereference(vbi->backup_netdev) :
>> > + rtnl_dereference(vbi->active_netdev)) {
>> > + netdev_info(bypass_netdev,
>> > + "%s attempting to join bypass dev when %s already present\n",
>> > + child_netdev->name, backup ? "backup" : "active");
>> Bypass module should check if there is already some other netdev
>> enslaved and refuse right there.
>
>This will work for virtio-net with 3 netdev model, but this check has to be done by netvsc
>as its bypass_netdev is same as the backup_netdev.
>Will add a flag while registering with the bypass module to indicate if the driver is doing
>a 2 netdev or 3 netdev model and based on that flag this check can be done in bypass module
>for 3 netdev scenario.
Just let me undestand it clearly. What I expect the difference would be
between 2netdev and3 netdev model is this:
2netdev:
bypass_master
/
/
VF_slave
3netdev:
bypass_master
/ \
/ \
VF_slave backup_slave
Is that correct? If not, how does it look like?
Thanks!
^ permalink raw reply
* Re: [PATCH v31 2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
From: Wei Wang @ 2018-04-09 11:46 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
liliang.opensource, linux-kernel, mhocko, linux-mm, huangzhichao,
pbonzini, akpm, virtualization
In-Reply-To: <20180409085457-mutt-send-email-mst@kernel.org>
On 04/09/2018 02:03 PM, Michael S. Tsirkin wrote:
> On Fri, Apr 06, 2018 at 08:17:23PM +0800, Wei Wang wrote:
>> Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the
>> support of reporting hints of guest free pages to host via virtio-balloon.
>>
>> Host requests the guest to report free page hints by sending a new cmd
>> id to the guest via the free_page_report_cmd_id configuration register.
>>
>> When the guest starts to report, the first element added to the free page
>> vq is the cmd id given by host. When the guest finishes the reporting
>> of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added
>> to the vq to tell host that the reporting is done. Host polls the free
>> page vq after sending the starting cmd id, so the guest doesn't need to
>> kick after filling an element to the vq.
>>
>> Host may also requests the guest to stop the reporting in advance by
>> sending the stop cmd id to the guest via the configuration register.
>>
>> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
>> Signed-off-by: Liang Li <liang.z.li@intel.com>
>> Cc: Michael S. Tsirkin <mst@redhat.com>
>> Cc: Michal Hocko <mhocko@kernel.org>
>
> Pretty good by now, Minor comments below.
Thanks for the comments.
>
>> ---
>> drivers/virtio/virtio_balloon.c | 272 +++++++++++++++++++++++++++++++-----
>> include/uapi/linux/virtio_balloon.h | 4 +
>> 2 files changed, 240 insertions(+), 36 deletions(-)
>>
>> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
>> index dfe5684..aef73ee 100644
>> --- a/drivers/virtio/virtio_balloon.c
>> +++ b/drivers/virtio/virtio_balloon.c
>> @@ -51,9 +51,22 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
>> static struct vfsmount *balloon_mnt;
>> #endif
>>
>> +enum virtio_balloon_vq {
>> + VIRTIO_BALLOON_VQ_INFLATE,
>> + VIRTIO_BALLOON_VQ_DEFLATE,
>> + VIRTIO_BALLOON_VQ_STATS,
>> + VIRTIO_BALLOON_VQ_FREE_PAGE,
>> + VIRTIO_BALLOON_VQ_MAX
>> +};
>> +
>> struct virtio_balloon {
>> struct virtio_device *vdev;
>> - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
>> + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
>> +
>> + /* Balloon's own wq for cpu-intensive work items */
>> + struct workqueue_struct *balloon_wq;
>> + /* The free page reporting work item submitted to the balloon wq */
>> + struct work_struct report_free_page_work;
>>
>> /* The balloon servicing is delegated to a freezable workqueue. */
>> struct work_struct update_balloon_stats_work;
>> @@ -63,6 +76,13 @@ struct virtio_balloon {
>> spinlock_t stop_update_lock;
>> bool stop_update;
>>
>> + /* The new cmd id received from host */
>> + uint32_t cmd_id_received;
>> + /* The cmd id that is in use */
>> + __virtio32 cmd_id_use;
> I'd prefer cmd_id_active but it's not critical.
OK, will change.
>
> +
> +static void report_free_page_func(struct work_struct *work)
> +{
> + struct virtio_balloon *vb;
> + struct virtqueue *vq;
> + unsigned int unused;
> + int ret;
> +
> + vb = container_of(work, struct virtio_balloon, report_free_page_work);
> + vq = vb->free_page_vq;
> +
> + /* Start by sending the received cmd id to host with an outbuf. */
> + ret = send_start_cmd_id(vb, vb->cmd_id_received);
> + if (unlikely(ret))
> + goto err;
> +
> + ret = walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> + if (unlikely(ret == -EIO))
> + goto err;
> why is EIO special? I think you should special-case EINTR maybe.
Actually EINTR isn't an error which needs to bail out. That's just the
case that the vq is full, that hint isn't added. Maybe it is not
necessary to treat the "vq full" case as an error.
How about just returning "0" when the vq is full, instead of returning
"EINTR"? (The next hint will continue to be added)
>
>> +
>> + /* End by sending a stop id to host with an outbuf. */
>> + ret = send_stop_cmd_id(vb);
>> + if (likely(!ret)) {
> What happens on failure? Don't we need to detach anyway?
Yes. Please see below, we could make some more change.
>
>> + /* Ending: detach all the used buffers from the vq. */
>> + while (vq->num_free != virtqueue_get_vring_size(vq))
>> + virtqueue_get_buf(vq, &unused);
> This isn't all that happens here. It also waits for buffers to
> be consumed. Is this by design? And why is it good idea to
> busy poll while doing it?
Because host and guest operations happen asynchronously. When the guest
reaches here, host may have not put anything to the vq yet. How about
doing this via the free page vq handler?
Host will send a free page vq interrupt before exiting the optimization.
I think this would be nicer.
Best,
Wei
^ permalink raw reply
* [PATCH] vhost: fix vhost_vq_access_ok() log check
From: Stefan Hajnoczi @ 2018-04-09 13:10 UTC (permalink / raw)
To: virtualization; +Cc: kvm, mst, syzkaller-bugs, linux-kernel, Stefan Hajnoczi
Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log
when IOTLB is enabled") introduced a regression. The logic was
originally:
if (vq->iotlb)
return 1;
return A && B;
After the patch the short-circuit logic for A was inverted:
if (A || vq->iotlb)
return A;
return B;
The correct logic is:
if (!A || vq->iotlb)
return A;
return B;
Reported-by: syzbot+65a84dde0214b0387ccd@syzkaller.appspotmail.com
Cc: Jason Wang <jasowang@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
drivers/vhost/vhost.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5320039671b7..f6af4210679a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
{
int ret = vq_log_access_ok(vq, vq->log_base);
- if (ret || vq->iotlb)
+ if (!ret || vq->iotlb)
return ret;
return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
--
2.14.3
^ permalink raw reply related
* Re: kernel BUG at drivers/vhost/vhost.c:LINE! (2)
From: Stefan Hajnoczi @ 2018-04-09 13:17 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: kvm, netdev, syzkaller-bugs, syzbot, linux-kernel,
Linux Virtualization
In-Reply-To: <20180409032835.GB1648@stefanha-x1.localdomain>
On Mon, Apr 9, 2018 at 11:28 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Mon, Apr 09, 2018 at 05:44:36AM +0300, Michael S. Tsirkin wrote:
>> On Mon, Apr 09, 2018 at 10:37:45AM +0800, Stefan Hajnoczi wrote:
>> > On Sat, Apr 7, 2018 at 3:02 AM, syzbot
>> > <syzbot+65a84dde0214b0387ccd@syzkaller.appspotmail.com> wrote:
>> > > syzbot hit the following crash on upstream commit
>> > > 38c23685b273cfb4ccf31a199feccce3bdcb5d83 (Fri Apr 6 04:29:35 2018 +0000)
>> > > Merge tag 'armsoc-drivers' of
>> > > git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc
>> > > syzbot dashboard link:
>> > > https://syzkaller.appspot.com/bug?extid=65a84dde0214b0387ccd
>> >
>> > To prevent duplicated work: I am working on this one.
>> >
>> > Stefan
>>
>> Do you want to try this patchset:
>> https://lkml.org/lkml/2018/4/5/665
>>
>> ?
>
> Thanks, I'll give it a shot.
>
> I also noticed a regression in commit
> d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log when
> IOTLB is enabled") and am currently testing a fix.
I have sent a fix:
https://lkml.org/lkml/2018/4/9/390
Stefan
^ permalink raw reply
* Re: [PATCH] vhost: fix vhost_vq_access_ok() log check
From: Michael S. Tsirkin @ 2018-04-09 13:58 UTC (permalink / raw)
To: Stefan Hajnoczi; +Cc: syzkaller-bugs, linux-kernel, kvm, virtualization
In-Reply-To: <20180409131021.5132-1-stefanha@redhat.com>
On Mon, Apr 09, 2018 at 09:10:21PM +0800, Stefan Hajnoczi wrote:
> Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log
> when IOTLB is enabled") introduced a regression. The logic was
> originally:
>
> if (vq->iotlb)
> return 1;
> return A && B;
>
> After the patch the short-circuit logic for A was inverted:
>
> if (A || vq->iotlb)
> return A;
> return B;
>
> The correct logic is:
>
> if (!A || vq->iotlb)
> return A;
> return B;
>
> Reported-by: syzbot+65a84dde0214b0387ccd@syzkaller.appspotmail.com
> Cc: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/vhost/vhost.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 5320039671b7..f6af4210679a 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
> {
> int ret = vq_log_access_ok(vq, vq->log_base);
>
> - if (ret || vq->iotlb)
> + if (!ret || vq->iotlb)
> return ret;
>
> return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
> --
> 2.14.3
^ permalink raw reply
* Re: [PATCH RESEND v2] vhost-net: set packet weight of tx polling to 2 * vq size
From: David Miller @ 2018-04-09 15:02 UTC (permalink / raw)
To: haibinzhang
Cc: kvm, mst, netdev, linux-kernel, virtualization, yunfangtai,
lidongchen
In-Reply-To: <88D661ADF6AFBF42B2AB88D8E7682B0901FC4A32@EXMBX-SZMAIL011.tencent.com>
From: haibinzhang(张海斌) <haibinzhang@tencent.com>
Date: Mon, 9 Apr 2018 07:22:17 +0000
> handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy
> polling udp packets with small length(e.g. 1byte udp payload), because setting
> VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet length.
>
> Ping-Latencies shown below were tested between two Virtual Machines using
> netperf (UDP_STREAM, len=1), and then another machine pinged the client:
...
> Acked-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Haibin Zhang <haibinzhang@tencent.com>
> Signed-off-by: Yunfang Tai <yunfangtai@tencent.com>
> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Applied, thank you.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH] vhost: fix vhost_vq_access_ok() log check
From: Linus Torvalds @ 2018-04-09 16:52 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: KVM list, Michael S. Tsirkin, syzkaller-bugs,
Linux Kernel Mailing List, virtualization
In-Reply-To: <20180409131021.5132-1-stefanha@redhat.com>
On Mon, Apr 9, 2018 at 6:10 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> @@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
> {
> int ret = vq_log_access_ok(vq, vq->log_base);
>
> - if (ret || vq->iotlb)
> + if (!ret || vq->iotlb)
> return ret;
That logic is still very non-obvious.
This code already had one bug because of an odd illegible test
sequence. Let's not keep the crazy code.
Why not just do the *obvious* thing, and get rid of "ret" entirely,
and make the damn thing return a boolean, and then just write it all
as
/* Caller should have vq mutex and device mutex */
bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
{
if (!vq_log_access_ok(vq, vq->log_base))
return false;
if (vq->iotlb || vq_access_ok(vq, vq->num, vq->desc,
vq->avail, vq->used);
}
which makes the logic obvious: if vq_log_access_ok() fails, then then
vhost_vq_access_ok() fails unconditionally.
Otherwise, we need to have an iotlb, or a successful vq_access_ok() check.
Doesn't that all make more sense, and avoid the insane "ret" value use
that is really quite subtle?
Linus
^ permalink raw reply
* Re: [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-04-09 18:47 UTC (permalink / raw)
To: Jiri Pirko
Cc: alexander.h.duyck, virtio-dev, mst, kubakici, netdev,
virtualization, loseweigh, davem
In-Reply-To: <20180409080751.GE19345@nanopsycho>
On 4/9/2018 1:07 AM, Jiri Pirko wrote:
> Sat, Apr 07, 2018 at 12:59:14AM CEST, sridhar.samudrala@intel.com wrote:
>> On 4/6/2018 5:48 AM, Jiri Pirko wrote:
>>> Thu, Apr 05, 2018 at 11:08:22PM CEST, sridhar.samudrala@intel.com wrote:
> [...]
>
>>>> +static int virtnet_bypass_join_child(struct net_device *bypass_netdev,
>>>> + struct net_device *child_netdev)
>>>> +{
>>>> + struct virtnet_bypass_info *vbi;
>>>> + bool backup;
>>>> +
>>>> + vbi = netdev_priv(bypass_netdev);
>>>> + backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>>>> + if (backup ? rtnl_dereference(vbi->backup_netdev) :
>>>> + rtnl_dereference(vbi->active_netdev)) {
>>>> + netdev_info(bypass_netdev,
>>>> + "%s attempting to join bypass dev when %s already present\n",
>>>> + child_netdev->name, backup ? "backup" : "active");
>>> Bypass module should check if there is already some other netdev
>>> enslaved and refuse right there.
>> This will work for virtio-net with 3 netdev model, but this check has to be done by netvsc
>> as its bypass_netdev is same as the backup_netdev.
>> Will add a flag while registering with the bypass module to indicate if the driver is doing
>> a 2 netdev or 3 netdev model and based on that flag this check can be done in bypass module
>> for 3 netdev scenario.
> Just let me undestand it clearly. What I expect the difference would be
> between 2netdev and3 netdev model is this:
> 2netdev:
> bypass_master
> /
> /
> VF_slave
>
> 3netdev:
> bypass_master
> / \
> / \
> VF_slave backup_slave
>
> Is that correct? If not, how does it look like?
>
>
Looks correct.
VF_slave and backup_slave are the original netdevs and are present in both the models.
In the 3 netdev model, bypass_master netdev is created and VF_slave and backup_slave are
marked as the 2 slaves of this new netdev.
In the 2 netdev model, backup_slave acts as bypass_master and the bypass module doesn't
have access to netdev_priv of the backup_slave.
Once i moved all the ndo_ops of the master netdev to bypass module, i realized that we can
move the create/destroy of the upper netdev also to bypass.c.
That way the changes to virtio_net become very minimal.
With these updates, bypass module now supports both the models by exporting 2 sets of
functions.
3 netdev:
int bypass_master_create(struct net_device *backup_netdev,
struct bypass_master **pbypass_master);
void bypass_master_destroy(struct bypass_master *bypass_master);
2 netdev:
int bypass_master_register(struct net_device *backup_netdev, struct bypass_ops *ops,
struct bypass_master **pbypass_master);
void bypass_master_unregister(struct bypass_master *bypass_master);
Will send the next revision in a day or two.
Thanks
Sridhar
^ permalink raw reply
* [PATCH RESEND net] vhost: fix vhost_vq_access_ok() log check
From: Michael S. Tsirkin @ 2018-04-09 19:40 UTC (permalink / raw)
To: David Miller, netdev, virtualization
Cc: kvm, mst, syzkaller-bugs, linux-kernel, Stefan Hajnoczi
In-Reply-To: <20180409131021.5132-1-stefanha@redhat.com>
From: Stefan Hajnoczi <stefanha@redhat.com>
Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log
when IOTLB is enabled") introduced a regression. The logic was
originally:
if (vq->iotlb)
return 1;
return A && B;
After the patch the short-circuit logic for A was inverted:
if (A || vq->iotlb)
return A;
return B;
The correct logic is:
if (!A || vq->iotlb)
return A;
return B;
Reported-by: syzbot+65a84dde0214b0387ccd@syzkaller.appspotmail.com
Cc: Jason Wang <jasowang@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/vhost.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5320039671b7..f6af4210679a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
{
int ret = vq_log_access_ok(vq, vq->log_base);
- if (ret || vq->iotlb)
+ if (!ret || vq->iotlb)
return ret;
return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
--
2.14.3
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox