Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCHv1 6/7] TAP: tap as an independent module
From: Sainath Grandhi @ 2017-01-06 22:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, mahesh, linux-kernel, Sainath Grandhi
In-Reply-To: <1483742009-19184-1-git-send-email-sainath.grandhi@intel.com>

This patch makes tap a separate module so that other types of virtual interfaces,
for example ipvlan, can use it.

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Tested-by: Sainath Grandhi <sainath.grandhi@intel.com>
---
 drivers/net/Kconfig        |  14 +++
 drivers/net/Makefile       |   3 +-
 drivers/net/macvtap.c      | 247 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/macvtap_main.c | 247 ---------------------------------------------
 drivers/net/tap.c          |  10 ++
 5 files changed, 272 insertions(+), 249 deletions(-)
 create mode 100644 drivers/net/macvtap.c
 delete mode 100644 drivers/net/macvtap_main.c

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 95c32f2..280380d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -284,6 +284,20 @@ config TUN
 
 	  If you don't know what to use this for, you don't need it.
 
+config TAP
+        tristate "TAP module support for virtual interfaces"
+        ---help---
+          The TAP module serves two purposes. It can be used as a library of
+          functions for virtual interfaces to implement tap functionality.
+
+          This module also includes the character device file and socket
+          operations that can be used by a virtual interface implementing tap.
+
+          To compile this driver as a module, choose M here: the module
+          will be called tap.
+
+          If you don't know what to use this for, you don't need it.
+
 config TUN_VNET_CROSS_LE
 	bool "Support for cross-endian vnet headers on little-endian kernels"
 	default n
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 19b03a9..7dd86ca 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_PHYLIB) += phy/
 obj-$(CONFIG_RIONET) += rionet.o
 obj-$(CONFIG_NET_TEAM) += team/
 obj-$(CONFIG_TUN) += tun.o
+obj-$(CONFIG_TAP) += tap.o
 obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
 obj-$(CONFIG_VXLAN) += vxlan.o
@@ -29,8 +30,6 @@ obj-$(CONFIG_GTP) += gtp.o
 obj-$(CONFIG_NLMON) += nlmon.o
 obj-$(CONFIG_NET_VRF) += vrf.o
 
-macvtap-objs := macvtap_main.o tap.o
-
 #
 # Networking Drivers
 #
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
new file mode 100644
index 0000000..3f047b4
--- /dev/null
+++ b/drivers/net/macvtap.c
@@ -0,0 +1,247 @@
+#include <linux/etherdevice.h>
+#include <linux/if_macvlan.h>
+#include <linux/if_tap.h>
+#include <linux/if_vlan.h>
+#include <linux/interrupt.h>
+#include <linux/nsproxy.h>
+#include <linux/compat.h>
+#include <linux/if_tun.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/fs.h>
+#include <linux/uio.h>
+
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+#include <linux/virtio_net.h>
+#include <linux/skb_array.h>
+
+struct macvtap_dev {
+	struct macvlan_dev vlan;
+	struct tap_dev    tap;
+};
+
+/*
+ * Variables for dealing with macvtaps device numbers.
+ */
+static dev_t macvtap_major;
+
+static const void *macvtap_net_namespace(struct device *d)
+{
+	struct net_device *dev = to_net_dev(d->parent);
+	return dev_net(dev);
+}
+
+static struct class macvtap_class = {
+	.name = "macvtap",
+	.owner = THIS_MODULE,
+	.ns_type = &net_ns_type_operations,
+	.namespace = macvtap_net_namespace,
+};
+static struct cdev macvtap_cdev;
+
+#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
+		      NETIF_F_TSO6 | NETIF_F_UFO)
+
+static void macvtap_count_tx_dropped(struct tap_dev *tap)
+{
+	struct macvlan_dev *vlan = (struct macvlan_dev *)container_of(tap, struct macvtap_dev, tap);
+
+	this_cpu_inc(vlan->pcpu_stats->tx_dropped);
+}
+
+static void macvtap_count_rx_dropped(struct tap_dev *tap)
+{
+	struct macvlan_dev *vlan = (struct macvlan_dev *)container_of(tap, struct macvtap_dev, tap);
+
+	macvlan_count_rx(vlan, 0, 0, 0);
+}
+
+static void macvtap_update_features(struct tap_dev *tap,
+				    netdev_features_t features)
+{
+	struct macvlan_dev *vlan = (struct macvlan_dev *)container_of(tap, struct macvtap_dev, tap);
+
+	vlan->set_features = features;
+	netdev_update_features(vlan->dev);
+}
+
+static int macvtap_newlink(struct net *src_net,
+			   struct net_device *dev,
+			   struct nlattr *tb[],
+			   struct nlattr *data[])
+{
+	struct macvtap_dev *vlantap = netdev_priv(dev);
+	int err;
+
+	INIT_LIST_HEAD(&vlantap->tap.queue_list);
+
+	/* Since macvlan supports all offloads by default, make
+	 * tap support all offloads also.
+	 */
+	vlantap->tap.tap_features = TUN_OFFLOADS;
+
+	/* Register callbacks for rx/tx drops accounting and updating
+	 * net_device features
+	 */
+	vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
+	vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
+	vlantap->tap.update_features  = macvtap_update_features;
+
+	err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);
+	if (err)
+		return err;
+
+	/* Don't put anything that may fail after macvlan_common_newlink
+	 * because we can't undo what it does.
+	 */
+	err = macvlan_common_newlink(src_net, dev, tb, data);
+	if (err) {
+		netdev_rx_handler_unregister(dev);
+		return err;
+	}
+
+	vlantap->tap.dev = vlantap->vlan.dev;
+
+	return 0;
+}
+
+static void macvtap_dellink(struct net_device *dev,
+			    struct list_head *head)
+{
+	struct macvtap_dev *vlantap = netdev_priv(dev);
+
+	netdev_rx_handler_unregister(dev);
+	tap_del_queues(&vlantap->tap);
+	macvlan_dellink(dev, head);
+}
+
+static void macvtap_setup(struct net_device *dev)
+{
+	macvlan_common_setup(dev);
+	dev->tx_queue_len = TUN_READQ_SIZE;
+}
+
+static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
+	.kind		= "macvtap",
+	.setup		= macvtap_setup,
+	.newlink	= macvtap_newlink,
+	.dellink	= macvtap_dellink,
+	.priv_size      = sizeof(struct macvtap_dev),
+};
+
+static int macvtap_device_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct macvtap_dev *vlantap;
+	struct device *classdev;
+	dev_t devt;
+	int err;
+	char tap_name[IFNAMSIZ];
+
+	if (dev->rtnl_link_ops != &macvtap_link_ops)
+		return NOTIFY_DONE;
+
+	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
+	vlantap = netdev_priv(dev);
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		/* Create the device node here after the network device has
+		 * been registered but before register_netdevice has
+		 * finished running.
+		 */
+		err = tap_get_minor(macvtap_major, &vlantap->tap);
+		if (err)
+			return notifier_from_errno(err);
+
+		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
+		classdev = device_create(&macvtap_class, &dev->dev, devt,
+					 dev, tap_name);
+		if (IS_ERR(classdev)) {
+			tap_free_minor(macvtap_major, &vlantap->tap);
+			return notifier_from_errno(PTR_ERR(classdev));
+		}
+		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
+					tap_name);
+		if (err)
+			return notifier_from_errno(err);
+		break;
+	case NETDEV_UNREGISTER:
+		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
+		if (vlantap->tap.minor == 0)
+			break;
+		sysfs_remove_link(&dev->dev.kobj, tap_name);
+		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
+		device_destroy(&macvtap_class, devt);
+		tap_free_minor(macvtap_major, &vlantap->tap);
+		break;
+	case NETDEV_CHANGE_TX_QUEUE_LEN:
+		if (tap_queue_resize(&vlantap->tap))
+			return NOTIFY_BAD;
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block macvtap_notifier_block __read_mostly = {
+	.notifier_call	= macvtap_device_event,
+};
+
+static int macvtap_init(void)
+{
+	int err;
+
+	err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap");
+
+	if (err)
+		goto out1;
+
+	err = class_register(&macvtap_class);
+	if (err)
+		goto out2;
+
+	err = register_netdevice_notifier(&macvtap_notifier_block);
+	if (err)
+		goto out3;
+
+	err = macvlan_link_register(&macvtap_link_ops);
+	if (err)
+		goto out4;
+
+	return 0;
+
+out4:
+	unregister_netdevice_notifier(&macvtap_notifier_block);
+out3:
+	class_unregister(&macvtap_class);
+out2:
+	cdev_del(&macvtap_cdev);
+out1:
+	return err;
+}
+module_init(macvtap_init);
+
+extern struct idr minor_idr;
+static void macvtap_exit(void)
+{
+	rtnl_link_unregister(&macvtap_link_ops);
+	unregister_netdevice_notifier(&macvtap_notifier_block);
+	class_unregister(&macvtap_class);
+	tap_destroy_cdev(macvtap_major, &macvtap_cdev);
+}
+module_exit(macvtap_exit);
+
+MODULE_ALIAS_RTNL_LINK("macvtap");
+MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
deleted file mode 100644
index 3f047b4..0000000
--- a/drivers/net/macvtap_main.c
+++ /dev/null
@@ -1,247 +0,0 @@
-#include <linux/etherdevice.h>
-#include <linux/if_macvlan.h>
-#include <linux/if_tap.h>
-#include <linux/if_vlan.h>
-#include <linux/interrupt.h>
-#include <linux/nsproxy.h>
-#include <linux/compat.h>
-#include <linux/if_tun.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/cache.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/wait.h>
-#include <linux/cdev.h>
-#include <linux/idr.h>
-#include <linux/fs.h>
-#include <linux/uio.h>
-
-#include <net/net_namespace.h>
-#include <net/rtnetlink.h>
-#include <net/sock.h>
-#include <linux/virtio_net.h>
-#include <linux/skb_array.h>
-
-struct macvtap_dev {
-	struct macvlan_dev vlan;
-	struct tap_dev    tap;
-};
-
-/*
- * Variables for dealing with macvtaps device numbers.
- */
-static dev_t macvtap_major;
-
-static const void *macvtap_net_namespace(struct device *d)
-{
-	struct net_device *dev = to_net_dev(d->parent);
-	return dev_net(dev);
-}
-
-static struct class macvtap_class = {
-	.name = "macvtap",
-	.owner = THIS_MODULE,
-	.ns_type = &net_ns_type_operations,
-	.namespace = macvtap_net_namespace,
-};
-static struct cdev macvtap_cdev;
-
-#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
-		      NETIF_F_TSO6 | NETIF_F_UFO)
-
-static void macvtap_count_tx_dropped(struct tap_dev *tap)
-{
-	struct macvlan_dev *vlan = (struct macvlan_dev *)container_of(tap, struct macvtap_dev, tap);
-
-	this_cpu_inc(vlan->pcpu_stats->tx_dropped);
-}
-
-static void macvtap_count_rx_dropped(struct tap_dev *tap)
-{
-	struct macvlan_dev *vlan = (struct macvlan_dev *)container_of(tap, struct macvtap_dev, tap);
-
-	macvlan_count_rx(vlan, 0, 0, 0);
-}
-
-static void macvtap_update_features(struct tap_dev *tap,
-				    netdev_features_t features)
-{
-	struct macvlan_dev *vlan = (struct macvlan_dev *)container_of(tap, struct macvtap_dev, tap);
-
-	vlan->set_features = features;
-	netdev_update_features(vlan->dev);
-}
-
-static int macvtap_newlink(struct net *src_net,
-			   struct net_device *dev,
-			   struct nlattr *tb[],
-			   struct nlattr *data[])
-{
-	struct macvtap_dev *vlantap = netdev_priv(dev);
-	int err;
-
-	INIT_LIST_HEAD(&vlantap->tap.queue_list);
-
-	/* Since macvlan supports all offloads by default, make
-	 * tap support all offloads also.
-	 */
-	vlantap->tap.tap_features = TUN_OFFLOADS;
-
-	/* Register callbacks for rx/tx drops accounting and updating
-	 * net_device features
-	 */
-	vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
-	vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
-	vlantap->tap.update_features  = macvtap_update_features;
-
-	err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);
-	if (err)
-		return err;
-
-	/* Don't put anything that may fail after macvlan_common_newlink
-	 * because we can't undo what it does.
-	 */
-	err = macvlan_common_newlink(src_net, dev, tb, data);
-	if (err) {
-		netdev_rx_handler_unregister(dev);
-		return err;
-	}
-
-	vlantap->tap.dev = vlantap->vlan.dev;
-
-	return 0;
-}
-
-static void macvtap_dellink(struct net_device *dev,
-			    struct list_head *head)
-{
-	struct macvtap_dev *vlantap = netdev_priv(dev);
-
-	netdev_rx_handler_unregister(dev);
-	tap_del_queues(&vlantap->tap);
-	macvlan_dellink(dev, head);
-}
-
-static void macvtap_setup(struct net_device *dev)
-{
-	macvlan_common_setup(dev);
-	dev->tx_queue_len = TUN_READQ_SIZE;
-}
-
-static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
-	.kind		= "macvtap",
-	.setup		= macvtap_setup,
-	.newlink	= macvtap_newlink,
-	.dellink	= macvtap_dellink,
-	.priv_size      = sizeof(struct macvtap_dev),
-};
-
-static int macvtap_device_event(struct notifier_block *unused,
-				unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct macvtap_dev *vlantap;
-	struct device *classdev;
-	dev_t devt;
-	int err;
-	char tap_name[IFNAMSIZ];
-
-	if (dev->rtnl_link_ops != &macvtap_link_ops)
-		return NOTIFY_DONE;
-
-	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
-	vlantap = netdev_priv(dev);
-
-	switch (event) {
-	case NETDEV_REGISTER:
-		/* Create the device node here after the network device has
-		 * been registered but before register_netdevice has
-		 * finished running.
-		 */
-		err = tap_get_minor(macvtap_major, &vlantap->tap);
-		if (err)
-			return notifier_from_errno(err);
-
-		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
-		classdev = device_create(&macvtap_class, &dev->dev, devt,
-					 dev, tap_name);
-		if (IS_ERR(classdev)) {
-			tap_free_minor(macvtap_major, &vlantap->tap);
-			return notifier_from_errno(PTR_ERR(classdev));
-		}
-		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
-					tap_name);
-		if (err)
-			return notifier_from_errno(err);
-		break;
-	case NETDEV_UNREGISTER:
-		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
-		if (vlantap->tap.minor == 0)
-			break;
-		sysfs_remove_link(&dev->dev.kobj, tap_name);
-		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
-		device_destroy(&macvtap_class, devt);
-		tap_free_minor(macvtap_major, &vlantap->tap);
-		break;
-	case NETDEV_CHANGE_TX_QUEUE_LEN:
-		if (tap_queue_resize(&vlantap->tap))
-			return NOTIFY_BAD;
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block macvtap_notifier_block __read_mostly = {
-	.notifier_call	= macvtap_device_event,
-};
-
-static int macvtap_init(void)
-{
-	int err;
-
-	err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap");
-
-	if (err)
-		goto out1;
-
-	err = class_register(&macvtap_class);
-	if (err)
-		goto out2;
-
-	err = register_netdevice_notifier(&macvtap_notifier_block);
-	if (err)
-		goto out3;
-
-	err = macvlan_link_register(&macvtap_link_ops);
-	if (err)
-		goto out4;
-
-	return 0;
-
-out4:
-	unregister_netdevice_notifier(&macvtap_notifier_block);
-out3:
-	class_unregister(&macvtap_class);
-out2:
-	cdev_del(&macvtap_cdev);
-out1:
-	return err;
-}
-module_init(macvtap_init);
-
-extern struct idr minor_idr;
-static void macvtap_exit(void)
-{
-	rtnl_link_unregister(&macvtap_link_ops);
-	unregister_netdevice_notifier(&macvtap_notifier_block);
-	class_unregister(&macvtap_class);
-	tap_destroy_cdev(macvtap_major, &macvtap_cdev);
-}
-module_exit(macvtap_exit);
-
-MODULE_ALIAS_RTNL_LINK("macvtap");
-MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 1d5bcf3..ee0d49a 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -311,6 +311,7 @@ void tap_del_queues(struct tap_dev *tap)
 	/* guarantee that any future tap_set_queue will fail */
 	tap->numvtaps = MAX_TAP_QUEUES;
 }
+EXPORT_SYMBOL_GPL(tap_del_queues);
 
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 {
@@ -388,6 +389,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 	kfree_skb(skb);
 	return RX_HANDLER_CONSUMED;
 }
+EXPORT_SYMBOL_GPL(tap_handle_frame);
 
 int tap_get_minor(dev_t major, struct tap_dev *tap)
 {
@@ -416,6 +418,7 @@ int tap_get_minor(dev_t major, struct tap_dev *tap)
 	mutex_unlock(&tap_major->minor_lock);
 	return retval < 0 ? retval : 0;
 }
+EXPORT_SYMBOL_GPL(tap_get_minor);
 
 void tap_free_minor(dev_t major, struct tap_dev *tap)
 {
@@ -439,6 +442,7 @@ void tap_free_minor(dev_t major, struct tap_dev *tap)
 	}
 	mutex_unlock(&tap_major->minor_lock);
 }
+EXPORT_SYMBOL_GPL(tap_free_minor);
 
 static struct tap_dev *dev_get_by_tap_file(int major, int minor)
 {
@@ -1201,6 +1205,7 @@ int tap_queue_resize(struct tap_dev *tap)
 	kfree(arrays);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(tap_queue_resize);
 
 static int tap_list_add(dev_t major, const char *device_name)
 {
@@ -1244,6 +1249,7 @@ int tap_create_cdev(struct cdev *tap_cdev,
 out1:
 	return err;
 }
+EXPORT_SYMBOL_GPL(tap_create_cdev);
 
 void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
 {
@@ -1264,3 +1270,7 @@ void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
 	unregister_chrdev_region(major, TAP_NUM_DEVS);
 	idr_destroy(&tap_major->minor_idr);
 }
+EXPORT_SYMBOL_GPL(tap_destroy_cdev);
+
+MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
+MODULE_LICENSE("GPL");
-- 
2.7.4

^ permalink raw reply related

* [PATCHv1 5/7] TAP: Extending tap device create/destroy APIs
From: Sainath Grandhi @ 2017-01-06 22:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, mahesh, linux-kernel, Sainath Grandhi
In-Reply-To: <1483742009-19184-1-git-send-email-sainath.grandhi@intel.com>

Extend the tap APIs get/free_minor and create/destroy_cdev to handle more than one
type of virtual interface.

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Tested-by: Sainath Grandhi <sainath.grandhi@intel.com>
---
 drivers/net/macvtap_main.c |   6 +--
 drivers/net/tap.c          | 110 ++++++++++++++++++++++++++++++++++++---------
 include/linux/if_tap.h     |   4 +-
 3 files changed, 93 insertions(+), 27 deletions(-)

diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
index 6326a82..3f047b4 100644
--- a/drivers/net/macvtap_main.c
+++ b/drivers/net/macvtap_main.c
@@ -160,7 +160,7 @@ static int macvtap_device_event(struct notifier_block *unused,
 		 * been registered but before register_netdevice has
 		 * finished running.
 		 */
-		err = tap_get_minor(&vlantap->tap);
+		err = tap_get_minor(macvtap_major, &vlantap->tap);
 		if (err)
 			return notifier_from_errno(err);
 
@@ -168,7 +168,7 @@ static int macvtap_device_event(struct notifier_block *unused,
 		classdev = device_create(&macvtap_class, &dev->dev, devt,
 					 dev, tap_name);
 		if (IS_ERR(classdev)) {
-			tap_free_minor(&vlantap->tap);
+			tap_free_minor(macvtap_major, &vlantap->tap);
 			return notifier_from_errno(PTR_ERR(classdev));
 		}
 		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
@@ -183,7 +183,7 @@ static int macvtap_device_event(struct notifier_block *unused,
 		sysfs_remove_link(&dev->dev.kobj, tap_name);
 		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
 		device_destroy(&macvtap_class, devt);
-		tap_free_minor(&vlantap->tap);
+		tap_free_minor(macvtap_major, &vlantap->tap);
 		break;
 	case NETDEV_CHANGE_TX_QUEUE_LEN:
 		if (tap_queue_resize(&vlantap->tap))
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 6306ab9..1d5bcf3 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -99,12 +99,16 @@ static struct proto tap_proto = {
 };
 
 #define TAP_NUM_DEVS (1U << MINORBITS)
+
+LIST_HEAD(major_list);
+
 struct major_info {
 	dev_t major;
 	struct idr minor_idr;
 	struct mutex minor_lock;
 	const char *device_name;
-} macvtap_major;
+	struct list_head next;
+};
 
 #define GOODCOPY_LEN 128
 
@@ -385,44 +389,81 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 	return RX_HANDLER_CONSUMED;
 }
 
-int tap_get_minor(struct tap_dev *tap)
+int tap_get_minor(dev_t major, struct tap_dev *tap)
 {
 	int retval = -ENOMEM;
+	struct major_info *tap_major, *tmp;
+	bool found = false;
 
-	mutex_lock(&macvtap_major.minor_lock);
-	retval = idr_alloc(&macvtap_major.minor_idr, tap, 1, TAP_NUM_DEVS, GFP_KERNEL);
+	list_for_each_entry_safe(tap_major, tmp, &major_list, next) {
+		if (tap_major->major == MAJOR(major)) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return -EINVAL;
+
+	mutex_lock(&tap_major->minor_lock);
+	retval = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_KERNEL);
 	if (retval >= 0) {
 		tap->minor = retval;
 	} else if (retval == -ENOSPC) {
 		netdev_err(tap->dev, "Too many tap devices\n");
 		retval = -EINVAL;
 	}
-	mutex_unlock(&macvtap_major.minor_lock);
+	mutex_unlock(&tap_major->minor_lock);
 	return retval < 0 ? retval : 0;
 }
 
-void tap_free_minor(struct tap_dev *tap)
+void tap_free_minor(dev_t major, struct tap_dev *tap)
 {
-	mutex_lock(&macvtap_major.minor_lock);
+	struct major_info *tap_major, *tmp;
+	bool found = false;
+
+	list_for_each_entry_safe(tap_major, tmp, &major_list, next) {
+		if (tap_major->major == MAJOR(major)) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return;
+
+	mutex_lock(&tap_major->minor_lock);
 	if (tap->minor) {
-		idr_remove(&macvtap_major.minor_idr, tap->minor);
+		idr_remove(&tap_major->minor_idr, tap->minor);
 		tap->minor = 0;
 	}
-	mutex_unlock(&macvtap_major.minor_lock);
+	mutex_unlock(&tap_major->minor_lock);
 }
 
-static struct tap_dev *dev_get_by_tap_minor(int minor)
+static struct tap_dev *dev_get_by_tap_file(int major, int minor)
 {
 	struct net_device *dev = NULL;
 	struct tap_dev *tap;
+	struct major_info *tap_major, *tmp;
+	bool found = false;
 
-	mutex_lock(&macvtap_major.minor_lock);
-	tap = idr_find(&macvtap_major.minor_idr, minor);
+	list_for_each_entry_safe(tap_major, tmp, &major_list, next) {
+		if (tap_major->major == major) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return NULL;
+
+	mutex_lock(&tap_major->minor_lock);
+	tap = idr_find(&tap_major->minor_idr, minor);
 	if (tap) {
 		dev = tap->dev;
 		dev_hold(dev);
 	}
-	mutex_unlock(&macvtap_major.minor_lock);
+	mutex_unlock(&tap_major->minor_lock);
 	return tap;
 }
 
@@ -454,7 +495,7 @@ static int tap_open(struct inode *inode, struct file *file)
 	int err = -ENODEV;
 
 	rtnl_lock();
-	tap = dev_get_by_tap_minor(iminor(inode));
+	tap = dev_get_by_tap_file(imajor(inode), iminor(inode));
 	if (!tap)
 		goto err;
 
@@ -1161,6 +1202,24 @@ int tap_queue_resize(struct tap_dev *tap)
 	return ret;
 }
 
+static int tap_list_add(dev_t major, const char *device_name)
+{
+	int err = 0;
+	struct major_info *tap_major;
+
+	tap_major = kzalloc(sizeof(*tap_major), GFP_ATOMIC);
+
+	tap_major->major = MAJOR(major);
+
+	idr_init(&tap_major->minor_idr);
+	mutex_init(&tap_major->minor_lock);
+
+	tap_major->device_name = device_name;
+
+	list_add_tail(&tap_major->next, &major_list);
+	return err;
+}
+
 int tap_create_cdev(struct cdev *tap_cdev,
 		    dev_t *tap_major, const char *device_name)
 {
@@ -1176,12 +1235,7 @@ int tap_create_cdev(struct cdev *tap_cdev,
 	if (err)
 		goto out2;
 
-	macvtap_major.major = MAJOR(*tap_major);
-
-	idr_init(&macvtap_major.minor_idr);
-	mutex_init(&macvtap_major.minor_lock);
-
-	macvtap_major.device_name = device_name;
+	err = tap_list_add(*tap_major, device_name);
 
 	return err;
 
@@ -1193,8 +1247,20 @@ int tap_create_cdev(struct cdev *tap_cdev,
 
 void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
 {
+	struct major_info *tap_major, *tmp;
+	bool found = false;
+
+	list_for_each_entry_safe(tap_major, tmp, &major_list, next) {
+		if (tap_major->major == MAJOR(major)) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return;
+
 	cdev_del(tap_cdev);
 	unregister_chrdev_region(major, TAP_NUM_DEVS);
-	idr_destroy(&macvtap_major.minor_idr);
+	idr_destroy(&tap_major->minor_idr);
 }
-
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index e6b2a88..a8a8e16 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -52,8 +52,8 @@ struct tap_queue {
 
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb);
 void tap_del_queues(struct tap_dev *tap);
-int tap_get_minor(struct tap_dev *tap);
-void tap_free_minor(struct tap_dev *tap);
+int tap_get_minor(dev_t major, struct tap_dev *tap);
+void tap_free_minor(dev_t major, struct tap_dev *tap);
 int tap_queue_resize(struct tap_dev *tap);
 int tap_create_cdev(struct cdev *tap_cdev,
 		    dev_t *tap_major, const char *device_name);
-- 
2.7.4

^ permalink raw reply related

* [PATCHv1 4/7] TAP: Abstract type of virtual interface from tap implementation
From: Sainath Grandhi @ 2017-01-06 22:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, mahesh, linux-kernel, Sainath Grandhi
In-Reply-To: <1483742009-19184-1-git-send-email-sainath.grandhi@intel.com>

The macvlan object is restructured to hold tap-related elements in a separate
entity, tap_dev. Upon the NETDEV_REGISTER device event, tap_dev is registered with
the idr and fetched again on tap_open. A few of the tap functions are modified to
accept tap_dev as an argument. The tap_dev object includes callbacks to be used by
the underlying virtual interface to take care of tx and rx accounting.

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Tested-by: Sainath Grandhi <sainath.grandhi@intel.com>
---
 drivers/net/macvlan.c      |   2 +-
 drivers/net/macvtap_main.c |  68 +++++++++---
 drivers/net/tap.c          | 264 ++++++++++++++++++++-------------------------
 include/linux/if_tap.h     |  59 +++++++++-
 4 files changed, 227 insertions(+), 166 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 20b3fdf2..79383f9 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1526,7 +1526,6 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = {
 int macvlan_link_register(struct rtnl_link_ops *ops)
 {
 	/* common fields */
-	ops->priv_size		= sizeof(struct macvlan_dev);
 	ops->validate		= macvlan_validate;
 	ops->maxtype		= IFLA_MACVLAN_MAX;
 	ops->policy		= macvlan_policy;
@@ -1549,6 +1548,7 @@ static struct rtnl_link_ops macvlan_link_ops = {
 	.newlink	= macvlan_newlink,
 	.dellink	= macvlan_dellink,
 	.get_link_net	= macvlan_get_link_net,
+	.priv_size      = sizeof(struct macvlan_dev),
 };
 
 static int macvlan_device_event(struct notifier_block *unused,
diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
index 32ad560..6326a82 100644
--- a/drivers/net/macvtap_main.c
+++ b/drivers/net/macvtap_main.c
@@ -24,6 +24,11 @@
 #include <linux/virtio_net.h>
 #include <linux/skb_array.h>
 
+struct macvtap_dev {
+	struct macvlan_dev vlan;
+	struct tap_dev    tap;
+};
+
 /*
  * Variables for dealing with macvtaps device numbers.
  */
@@ -46,22 +51,52 @@ static struct cdev macvtap_cdev;
 #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
 		      NETIF_F_TSO6 | NETIF_F_UFO)
 
+static void macvtap_count_tx_dropped(struct tap_dev *tap)
+{
+	struct macvlan_dev *vlan = (struct macvlan_dev *)container_of(tap, struct macvtap_dev, tap);
+
+	this_cpu_inc(vlan->pcpu_stats->tx_dropped);
+}
+
+static void macvtap_count_rx_dropped(struct tap_dev *tap)
+{
+	struct macvlan_dev *vlan = (struct macvlan_dev *)container_of(tap, struct macvtap_dev, tap);
+
+	macvlan_count_rx(vlan, 0, 0, 0);
+}
+
+static void macvtap_update_features(struct tap_dev *tap,
+				    netdev_features_t features)
+{
+	struct macvlan_dev *vlan = (struct macvlan_dev *)container_of(tap, struct macvtap_dev, tap);
+
+	vlan->set_features = features;
+	netdev_update_features(vlan->dev);
+}
+
 static int macvtap_newlink(struct net *src_net,
 			   struct net_device *dev,
 			   struct nlattr *tb[],
 			   struct nlattr *data[])
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvtap_dev *vlantap = netdev_priv(dev);
 	int err;
 
-	INIT_LIST_HEAD(&vlan->queue_list);
+	INIT_LIST_HEAD(&vlantap->tap.queue_list);
 
 	/* Since macvlan supports all offloads by default, make
 	 * tap support all offloads also.
 	 */
-	vlan->tap_features = TUN_OFFLOADS;
+	vlantap->tap.tap_features = TUN_OFFLOADS;
 
-	err = netdev_rx_handler_register(dev, tap_handle_frame, vlan);
+	/* Register callbacks for rx/tx drops accounting and updating
+	 * net_device features
+	 */
+	vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
+	vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
+	vlantap->tap.update_features  = macvtap_update_features;
+
+	err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);
 	if (err)
 		return err;
 
@@ -74,14 +109,18 @@ static int macvtap_newlink(struct net *src_net,
 		return err;
 	}
 
+	vlantap->tap.dev = vlantap->vlan.dev;
+
 	return 0;
 }
 
 static void macvtap_dellink(struct net_device *dev,
 			    struct list_head *head)
 {
+	struct macvtap_dev *vlantap = netdev_priv(dev);
+
 	netdev_rx_handler_unregister(dev);
-	tap_del_queues(dev);
+	tap_del_queues(&vlantap->tap);
 	macvlan_dellink(dev, head);
 }
 
@@ -96,13 +135,14 @@ static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
 	.setup		= macvtap_setup,
 	.newlink	= macvtap_newlink,
 	.dellink	= macvtap_dellink,
+	.priv_size      = sizeof(struct macvtap_dev),
 };
 
 static int macvtap_device_event(struct notifier_block *unused,
 				unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct macvlan_dev *vlan;
+	struct macvtap_dev *vlantap;
 	struct device *classdev;
 	dev_t devt;
 	int err;
@@ -112,7 +152,7 @@ static int macvtap_device_event(struct notifier_block *unused,
 		return NOTIFY_DONE;
 
 	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
-	vlan = netdev_priv(dev);
+	vlantap = netdev_priv(dev);
 
 	switch (event) {
 	case NETDEV_REGISTER:
@@ -120,15 +160,15 @@ static int macvtap_device_event(struct notifier_block *unused,
 		 * been registered but before register_netdevice has
 		 * finished running.
 		 */
-		err = tap_get_minor(vlan);
+		err = tap_get_minor(&vlantap->tap);
 		if (err)
 			return notifier_from_errno(err);
 
-		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
+		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
 		classdev = device_create(&macvtap_class, &dev->dev, devt,
 					 dev, tap_name);
 		if (IS_ERR(classdev)) {
-			tap_free_minor(vlan);
+			tap_free_minor(&vlantap->tap);
 			return notifier_from_errno(PTR_ERR(classdev));
 		}
 		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
@@ -138,15 +178,15 @@ static int macvtap_device_event(struct notifier_block *unused,
 		break;
 	case NETDEV_UNREGISTER:
 		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
-		if (vlan->minor == 0)
+		if (vlantap->tap.minor == 0)
 			break;
 		sysfs_remove_link(&dev->dev.kobj, tap_name);
-		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
+		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
 		device_destroy(&macvtap_class, devt);
-		tap_free_minor(vlan);
+		tap_free_minor(&vlantap->tap);
 		break;
 	case NETDEV_CHANGE_TX_QUEUE_LEN:
-		if (tap_queue_resize(vlan))
+		if (tap_queue_resize(&vlantap->tap))
 			return NOTIFY_BAD;
 		break;
 	}
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 52692d2..6306ab9 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1,5 +1,5 @@
 #include <linux/etherdevice.h>
-#include <linux/if_macvlan.h>
+#include <linux/if_tap.h>
 #include <linux/if_vlan.h>
 #include <linux/interrupt.h>
 #include <linux/nsproxy.h>
@@ -23,30 +23,6 @@
 #include <linux/virtio_net.h>
 #include <linux/skb_array.h>
 
-/*
- * A tap queue is the central object of this driver, it connects
- * an open character device to a macvlan interface. There can be
- * multiple queues on one interface, which map back to queues
- * implemented in hardware on the underlying device.
- *
- * tap_proto is used to allocate queues through the sock allocation
- * mechanism.
- *
- */
-struct tap_queue {
-	struct sock sk;
-	struct socket sock;
-	struct socket_wq wq;
-	int vnet_hdr_sz;
-	struct macvlan_dev __rcu *vlan;
-	struct file *file;
-	unsigned int flags;
-	u16 queue_index;
-	bool enabled;
-	struct list_head next;
-	struct skb_array skb_array;
-};
-
 #define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
 
 #define TAP_VNET_LE 0x80000000
@@ -137,7 +113,7 @@ static const struct proto_ops tap_socket_ops;
 #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
 #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)
 
-static struct macvlan_dev *tap_get_vlan_rcu(const struct net_device *dev)
+static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev)
 {
 	return rcu_dereference(dev->rx_handler_data);
 }
@@ -159,10 +135,9 @@ static struct macvlan_dev *tap_get_vlan_rcu(const struct net_device *dev)
  * when both our references and any pending SKBs are gone.
  */
 
-static int tap_enable_queue(struct net_device *dev, struct file *file,
+static int tap_enable_queue(struct tap_dev *tap, struct file *file,
 			    struct tap_queue *q)
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
 	int err = -EINVAL;
 
 	ASSERT_RTNL();
@@ -171,62 +146,60 @@ static int tap_enable_queue(struct net_device *dev, struct file *file,
 		goto out;
 
 	err = 0;
-	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
-	q->queue_index = vlan->numvtaps;
+	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
+	q->queue_index = tap->numvtaps;
 	q->enabled = true;
 
-	vlan->numvtaps++;
+	tap->numvtaps++;
 out:
 	return err;
 }
 
 /* Requires RTNL */
-static int tap_set_queue(struct net_device *dev, struct file *file,
+static int tap_set_queue(struct tap_dev *tap, struct file *file,
 			 struct tap_queue *q)
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
-
-	if (vlan->numqueues == MAX_TAP_QUEUES)
+	if (tap->numqueues == MAX_TAP_QUEUES)
 		return -EBUSY;
 
-	rcu_assign_pointer(q->vlan, vlan);
-	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
+	rcu_assign_pointer(q->tap, tap);
+	rcu_assign_pointer(tap->taps[tap->numvtaps], q);
 	sock_hold(&q->sk);
 
 	q->file = file;
-	q->queue_index = vlan->numvtaps;
+	q->queue_index = tap->numvtaps;
 	q->enabled = true;
 	file->private_data = q;
-	list_add_tail(&q->next, &vlan->queue_list);
+	list_add_tail(&q->next, &tap->queue_list);
 
-	vlan->numvtaps++;
-	vlan->numqueues++;
+	tap->numvtaps++;
+	tap->numqueues++;
 
 	return 0;
 }
 
 static int tap_disable_queue(struct tap_queue *q)
 {
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	struct tap_queue *nq;
 
 	ASSERT_RTNL();
 	if (!q->enabled)
 		return -EINVAL;
 
-	vlan = rtnl_dereference(q->vlan);
+	tap = rtnl_dereference(q->tap);
 
-	if (vlan) {
+	if (tap) {
 		int index = q->queue_index;
-		BUG_ON(index >= vlan->numvtaps);
-		nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
+		BUG_ON(index >= tap->numvtaps);
+		nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]);
 		nq->queue_index = index;
 
-		rcu_assign_pointer(vlan->taps[index], nq);
-		RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
+		rcu_assign_pointer(tap->taps[index], nq);
+		RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL);
 		q->enabled = false;
 
-		vlan->numvtaps--;
+		tap->numvtaps--;
 	}
 
 	return 0;
@@ -242,17 +215,17 @@ static int tap_disable_queue(struct tap_queue *q)
  */
 static void tap_put_queue(struct tap_queue *q)
 {
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 
 	rtnl_lock();
-	vlan = rtnl_dereference(q->vlan);
+	tap = rtnl_dereference(q->tap);
 
-	if (vlan) {
+	if (tap) {
 		if (q->enabled)
 			BUG_ON(tap_disable_queue(q));
 
-		vlan->numqueues--;
-		RCU_INIT_POINTER(q->vlan, NULL);
+		tap->numqueues--;
+		RCU_INIT_POINTER(q->tap, NULL);
 		sock_put(&q->sk);
 		list_del_init(&q->next);
 	}
@@ -270,17 +243,16 @@ static void tap_put_queue(struct tap_queue *q)
  * Cache vlan->numvtaps since it can become zero during the execution
  * of this function.
  */
-static struct tap_queue *tap_get_queue(struct net_device *dev,
+static struct tap_queue *tap_get_queue(struct tap_dev *tap,
 				       struct sk_buff *skb)
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct tap_queue *tap = NULL;
+	struct tap_queue *queue = NULL;
 	/* Access to taps array is protected by rcu, but access to numvtaps
 	 * isn't. Below we use it to lookup a queue, but treat it as a hint
 	 * and validate that the result isn't NULL - in case we are
 	 * racing against queue removal.
 	 */
-	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
+	int numvtaps = ACCESS_ONCE(tap->numvtaps);
 	__u32 rxq;
 
 	if (!numvtaps)
@@ -292,7 +264,7 @@ static struct tap_queue *tap_get_queue(struct net_device *dev,
 	/* Check if we can use flow to select a queue */
 	rxq = skb_get_hash(skb);
 	if (rxq) {
-		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
+		queue = rcu_dereference(tap->taps[rxq % numvtaps]);
 		goto out;
 	}
 
@@ -302,14 +274,14 @@ static struct tap_queue *tap_get_queue(struct net_device *dev,
 		while (unlikely(rxq >= numvtaps))
 			rxq -= numvtaps;
 
-		tap = rcu_dereference(vlan->taps[rxq]);
+		queue = rcu_dereference(tap->taps[rxq]);
 		goto out;
 	}
 
 single:
-	tap = rcu_dereference(vlan->taps[0]);
+	queue = rcu_dereference(tap->taps[0]);
 out:
-	return tap;
+	return queue;
 }
 
 /*
@@ -317,39 +289,38 @@ static struct tap_queue *tap_get_queue(struct net_device *dev,
  * that it holds on all queues and safely set the pointer
  * from the queues to NULL.
  */
-void tap_del_queues(struct net_device *dev)
+void tap_del_queues(struct tap_dev *tap)
 {
-	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct tap_queue *q, *tmp;
 
 	ASSERT_RTNL();
-	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
+	list_for_each_entry_safe(q, tmp, &tap->queue_list, next) {
 		list_del_init(&q->next);
-		RCU_INIT_POINTER(q->vlan, NULL);
+		RCU_INIT_POINTER(q->tap, NULL);
 		if (q->enabled)
-			vlan->numvtaps--;
-		vlan->numqueues--;
+			tap->numvtaps--;
+		tap->numqueues--;
 		sock_put(&q->sk);
 	}
-	BUG_ON(vlan->numvtaps);
-	BUG_ON(vlan->numqueues);
+	BUG_ON(tap->numvtaps);
+	BUG_ON(tap->numqueues);
 	/* guarantee that any future tap_set_queue will fail */
-	vlan->numvtaps = MAX_TAP_QUEUES;
+	tap->numvtaps = MAX_TAP_QUEUES;
 }
 
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct net_device *dev = skb->dev;
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	struct tap_queue *q;
 	netdev_features_t features = TAP_FEATURES;
 
-	vlan = tap_get_vlan_rcu(dev);
-	if (!vlan)
+	tap = tap_dev_get_rcu(dev);
+	if (!tap)
 		return RX_HANDLER_PASS;
 
-	q = tap_get_queue(dev, skb);
+	q = tap_get_queue(tap, skb);
 	if (!q)
 		return RX_HANDLER_PASS;
 
@@ -363,7 +334,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 	 * enabled.
 	 */
 	if (q->flags & IFF_VNET_HDR)
-		features |= vlan->tap_features;
+		features |= tap->tap_features;
 	if (netif_needs_gso(skb, features)) {
 		struct sk_buff *segs = __skb_gso_segment(skb, features, false);
 
@@ -408,50 +379,51 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 
 drop:
 	/* Count errors/drops only here, thus don't care about args. */
-	macvlan_count_rx(vlan, 0, 0, 0);
+	if (tap->count_rx_dropped)
+		tap->count_rx_dropped(tap);
 	kfree_skb(skb);
 	return RX_HANDLER_CONSUMED;
 }
 
-int tap_get_minor(struct macvlan_dev *vlan)
+int tap_get_minor(struct tap_dev *tap)
 {
 	int retval = -ENOMEM;
 
 	mutex_lock(&macvtap_major.minor_lock);
-	retval = idr_alloc(&macvtap_major.minor_idr, vlan, 1, TAP_NUM_DEVS, GFP_KERNEL);
+	retval = idr_alloc(&macvtap_major.minor_idr, tap, 1, TAP_NUM_DEVS, GFP_KERNEL);
 	if (retval >= 0) {
-		vlan->minor = retval;
+		tap->minor = retval;
 	} else if (retval == -ENOSPC) {
-		netdev_err(vlan->dev, "Too many tap devices\n");
+		netdev_err(tap->dev, "Too many tap devices\n");
 		retval = -EINVAL;
 	}
 	mutex_unlock(&macvtap_major.minor_lock);
 	return retval < 0 ? retval : 0;
 }
 
-void tap_free_minor(struct macvlan_dev *vlan)
+void tap_free_minor(struct tap_dev *tap)
 {
 	mutex_lock(&macvtap_major.minor_lock);
-	if (vlan->minor) {
-		idr_remove(&macvtap_major.minor_idr, vlan->minor);
-		vlan->minor = 0;
+	if (tap->minor) {
+		idr_remove(&macvtap_major.minor_idr, tap->minor);
+		tap->minor = 0;
 	}
 	mutex_unlock(&macvtap_major.minor_lock);
 }
 
-static struct net_device *dev_get_by_tap_minor(int minor)
+static struct tap_dev *dev_get_by_tap_minor(int minor)
 {
 	struct net_device *dev = NULL;
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 
 	mutex_lock(&macvtap_major.minor_lock);
-	vlan = idr_find(&macvtap_major.minor_idr, minor);
-	if (vlan) {
-		dev = vlan->dev;
+	tap = idr_find(&macvtap_major.minor_idr, minor);
+	if (tap) {
+		dev = tap->dev;
 		dev_hold(dev);
 	}
 	mutex_unlock(&macvtap_major.minor_lock);
-	return dev;
+	return tap;
 }
 
 static void tap_sock_write_space(struct sock *sk)
@@ -477,13 +449,13 @@ static void tap_sock_destruct(struct sock *sk)
 static int tap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
-	struct net_device *dev;
+	struct tap_dev *tap;
 	struct tap_queue *q;
 	int err = -ENODEV;
 
 	rtnl_lock();
-	dev = dev_get_by_tap_minor(iminor(inode));
-	if (!dev)
+	tap = dev_get_by_tap_minor(iminor(inode));
+	if (!tap)
 		goto err;
 
 	err = -ENOMEM;
@@ -511,18 +483,18 @@ static int tap_open(struct inode *inode, struct file *file)
 	 * The macvlan supports zerocopy iff the lower device supports zero
 	 * copy so we don't have to look at the lower device directly.
 	 */
-	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
+	if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
 		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
 
 	err = -ENOMEM;
-	if (skb_array_init(&q->skb_array, dev->tx_queue_len, GFP_KERNEL))
+	if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL))
 		goto err_array;
 
-	err = tap_set_queue(dev, file, q);
+	err = tap_set_queue(tap, file, q);
 	if (err)
 		goto err_queue;
 
-	dev_put(dev);
+	dev_put(tap->dev);
 
 	rtnl_unlock();
 	return err;
@@ -532,8 +504,8 @@ static int tap_open(struct inode *inode, struct file *file)
 err_array:
 	sock_put(&q->sk);
 err:
-	if (dev)
-		dev_put(dev);
+	if (tap)
+		dev_put(tap->dev);
 
 	rtnl_unlock();
 	return err;
@@ -601,7 +573,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 {
 	int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
 	struct sk_buff *skb;
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	unsigned long total_len = iov_iter_count(from);
 	unsigned long len = total_len;
 	int err;
@@ -698,7 +670,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 		skb_set_network_header(skb, depth);
 
 	rcu_read_lock();
-	vlan = rcu_dereference(q->vlan);
+	tap = rcu_dereference(q->tap);
 	/* copy skb_ubuf_info for callback when skb has no error */
 	if (zerocopy) {
 		skb_shinfo(skb)->destructor_arg = m->msg_control;
@@ -709,8 +681,8 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 		uarg->callback(uarg, false);
 	}
 
-	if (vlan) {
-		skb->dev = vlan->dev;
+	if (tap) {
+		skb->dev = tap->dev;
 		dev_queue_xmit(skb);
 	} else {
 		kfree_skb(skb);
@@ -724,9 +696,9 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
 
 err:
 	rcu_read_lock();
-	vlan = rcu_dereference(q->vlan);
-	if (vlan)
-		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
+	tap = rcu_dereference(q->tap);
+	if (tap && tap->count_tx_dropped)
+		tap->count_tx_dropped(tap);
 	rcu_read_unlock();
 
 	return err;
@@ -853,55 +825,55 @@ static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return ret;
 }
 
-static struct macvlan_dev *tap_get_vlan(struct tap_queue *q)
+static struct tap_dev *tap_get_tap_dev(struct tap_queue *q)
 {
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 
 	ASSERT_RTNL();
-	vlan = rtnl_dereference(q->vlan);
-	if (vlan)
-		dev_hold(vlan->dev);
+	tap = rtnl_dereference(q->tap);
+	if (tap)
+		dev_hold(tap->dev);
 
-	return vlan;
+	return tap;
 }
 
-static void tap_put_vlan(struct macvlan_dev *vlan)
+static void tap_put_tap_dev(struct tap_dev *tap)
 {
-	dev_put(vlan->dev);
+	dev_put(tap->dev);
 }
 
 static int tap_ioctl_set_queue(struct file *file, unsigned int flags)
 {
 	struct tap_queue *q = file->private_data;
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	int ret;
 
-	vlan = tap_get_vlan(q);
-	if (!vlan)
+	tap = tap_get_tap_dev(q);
+	if (!tap)
 		return -EINVAL;
 
 	if (flags & IFF_ATTACH_QUEUE)
-		ret = tap_enable_queue(vlan->dev, file, q);
+		ret = tap_enable_queue(tap, file, q);
 	else if (flags & IFF_DETACH_QUEUE)
 		ret = tap_disable_queue(q);
 	else
 		ret = -EINVAL;
 
-	tap_put_vlan(vlan);
+	tap_put_tap_dev(tap);
 	return ret;
 }
 
 static int set_offload(struct tap_queue *q, unsigned long arg)
 {
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	netdev_features_t features;
 	netdev_features_t feature_mask = 0;
 
-	vlan = rtnl_dereference(q->vlan);
-	if (!vlan)
+	tap = rtnl_dereference(q->tap);
+	if (!tap)
 		return -ENOLINK;
 
-	features = vlan->dev->features;
+	features = tap->dev->features;
 
 	if (arg & TUN_F_CSUM) {
 		feature_mask = NETIF_F_HW_CSUM;
@@ -935,9 +907,9 @@ static int set_offload(struct tap_queue *q, unsigned long arg)
 	/* tap_features are the same as features on tun/tap and
 	 * reflect user expectations.
 	 */
-	vlan->tap_features = feature_mask;
-	vlan->set_features = features;
-	netdev_update_features(vlan->dev);
+	tap->tap_features = feature_mask;
+	if (tap->update_features)
+		tap->update_features(tap, features);
 
 	return 0;
 }
@@ -949,7 +921,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 		      unsigned long arg)
 {
 	struct tap_queue *q = file->private_data;
-	struct macvlan_dev *vlan;
+	struct tap_dev *tap;
 	void __user *argp = (void __user *)arg;
 	struct ifreq __user *ifr = argp;
 	unsigned int __user *up = argp;
@@ -975,18 +947,18 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 
 	case TUNGETIFF:
 		rtnl_lock();
-		vlan = tap_get_vlan(q);
-		if (!vlan) {
+		tap = tap_get_tap_dev(q);
+		if (!tap) {
 			rtnl_unlock();
 			return -ENOLINK;
 		}
 
 		ret = 0;
 		u = q->flags;
-		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
+		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
 		    put_user(u, &ifr->ifr_flags))
 			ret = -EFAULT;
-		tap_put_vlan(vlan);
+		tap_put_tap_dev(tap);
 		rtnl_unlock();
 		return ret;
 
@@ -1059,18 +1031,18 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 
 	case SIOCGIFHWADDR:
 		rtnl_lock();
-		vlan = tap_get_vlan(q);
-		if (!vlan) {
+		tap = tap_get_tap_dev(q);
+		if (!tap) {
 			rtnl_unlock();
 			return -ENOLINK;
 		}
 		ret = 0;
-		u = vlan->dev->type;
-		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
-		    copy_to_user(&ifr->ifr_hwaddr.sa_data, vlan->dev->dev_addr, ETH_ALEN) ||
+		u = tap->dev->type;
+		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
+		    copy_to_user(&ifr->ifr_hwaddr.sa_data, tap->dev->dev_addr, ETH_ALEN) ||
 		    put_user(u, &ifr->ifr_hwaddr.sa_family))
 			ret = -EFAULT;
-		tap_put_vlan(vlan);
+		tap_put_tap_dev(tap);
 		rtnl_unlock();
 		return ret;
 
@@ -1078,13 +1050,13 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
 			return -EFAULT;
 		rtnl_lock();
-		vlan = tap_get_vlan(q);
-		if (!vlan) {
+		tap = tap_get_tap_dev(q);
+		if (!tap) {
 			rtnl_unlock();
 			return -ENOLINK;
 		}
-		ret = dev_set_mac_address(vlan->dev, &sa);
-		tap_put_vlan(vlan);
+		ret = dev_set_mac_address(tap->dev, &sa);
+		tap_put_tap_dev(tap);
 		rtnl_unlock();
 		return ret;
 
@@ -1167,19 +1139,19 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
-int tap_queue_resize(struct macvlan_dev *vlan)
+int tap_queue_resize(struct tap_dev *tap)
 {
-	struct net_device *dev = vlan->dev;
+	struct net_device *dev = tap->dev;
 	struct tap_queue *q;
 	struct skb_array **arrays;
-	int n = vlan->numqueues;
+	int n = tap->numqueues;
 	int ret, i = 0;
 
 	arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
 	if (!arrays)
 		return -ENOMEM;
 
-	list_for_each_entry(q, &vlan->queue_list, next)
+	list_for_each_entry(q, &tap->queue_list, next)
 		arrays[i++] = &q->skb_array;
 
 	ret = skb_array_resize_multiple(arrays, n,
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 89bcd42..e6b2a88 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -1,13 +1,62 @@
 #ifndef _LINUX_IF_TAP_H_
 #define _LINUX_IF_TAP_H_
 
+#include <net/sock.h>
+#include <linux/skb_array.h>
+
+#define MAX_TAP_QUEUES 256
+
+struct tap_queue;
+
+struct tap_dev {
+	struct net_device	*dev;
+	u16			flags;
+	/* This array tracks active taps. */
+	struct tap_queue    __rcu *taps[MAX_TAP_QUEUES];
+	/* This list tracks all taps (both enabled and disabled) */
+	struct list_head	queue_list;
+	int			numvtaps;
+	int			numqueues;
+	netdev_features_t	tap_features;
+	int			minor;
+
+	void (*update_features)(struct tap_dev *tap, netdev_features_t features);
+	void (*count_tx_dropped)(struct tap_dev *tap);
+	void (*count_rx_dropped)(struct tap_dev *tap);
+};
+
+/*
+ * A tap queue is the central object of tap module, it connects
+ * an open character device to virtual interface. There can be
+ * multiple queues on one interface, which map back to queues
+ * implemented in hardware on the underlying device.
+ *
+ * tap_proto is used to allocate queues through the sock allocation
+ * mechanism.
+ *
+ */
+
+struct tap_queue {
+	struct sock sk;
+	struct socket sock;
+	struct socket_wq wq;
+	int vnet_hdr_sz;
+	struct tap_dev __rcu *tap;
+	struct file *file;
+	unsigned int flags;
+	u16 queue_index;
+	bool enabled;
+	struct list_head next;
+	struct skb_array skb_array;
+};
+
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb);
-void tap_del_queues(struct net_device *dev);
-int tap_get_minor(struct macvlan_dev *vlan);
-void tap_free_minor(struct macvlan_dev *vlan);
-int tap_queue_resize(struct macvlan_dev *vlan);
+void tap_del_queues(struct tap_dev *tap);
+int tap_get_minor(struct tap_dev *tap);
+void tap_free_minor(struct tap_dev *tap);
+int tap_queue_resize(struct tap_dev *tap);
 int tap_create_cdev(struct cdev *tap_cdev,
-		     dev_t *tap_major, const char *device_name);
+		    dev_t *tap_major, const char *device_name);
 void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev);
 struct socket *tap_get_socket(struct file *file);
 
-- 
2.7.4

^ permalink raw reply related

* [PATCHv1 3/7] TAP: Tap character device creation/destroy API
From: Sainath Grandhi @ 2017-01-06 22:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, mahesh, linux-kernel, Sainath Grandhi
In-Reply-To: <1483742009-19184-1-git-send-email-sainath.grandhi@intel.com>

This patch adds APIs to tap.c for creating and destroying the tap character device.

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Tested-by: Sainath Grandhi <sainath.grandhi@intel.com>
---
 drivers/net/macvtap_main.c | 29 +++++++--------------
 drivers/net/tap.c          | 64 ++++++++++++++++++++++++++++++++++++++--------
 include/linux/if_tap.h     |  3 +++
 3 files changed, 65 insertions(+), 31 deletions(-)

diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
index 548f339..32ad560 100644
--- a/drivers/net/macvtap_main.c
+++ b/drivers/net/macvtap_main.c
@@ -28,7 +28,6 @@
  * Variables for dealing with macvtaps device numbers.
  */
 static dev_t macvtap_major;
-#define MACVTAP_NUM_DEVS (1U << MINORBITS)
 
 static const void *macvtap_net_namespace(struct device *d)
 {
@@ -159,43 +158,35 @@ static struct notifier_block macvtap_notifier_block __read_mostly = {
 	.notifier_call	= macvtap_device_event,
 };
 
-extern struct file_operations tap_fops;
 static int macvtap_init(void)
 {
 	int err;
 
-	err = alloc_chrdev_region(&macvtap_major, 0,
-				MACVTAP_NUM_DEVS, "macvtap");
-	if (err)
-		goto out1;
+	err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap");
 
-	cdev_init(&macvtap_cdev, &tap_fops);
-	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
 	if (err)
-		goto out2;
+		goto out1;
 
 	err = class_register(&macvtap_class);
 	if (err)
-		goto out3;
+		goto out2;
 
 	err = register_netdevice_notifier(&macvtap_notifier_block);
 	if (err)
-		goto out4;
+		goto out3;
 
 	err = macvlan_link_register(&macvtap_link_ops);
 	if (err)
-		goto out5;
+		goto out4;
 
 	return 0;
 
-out5:
-	unregister_netdevice_notifier(&macvtap_notifier_block);
 out4:
-	class_unregister(&macvtap_class);
+	unregister_netdevice_notifier(&macvtap_notifier_block);
 out3:
-	cdev_del(&macvtap_cdev);
+	class_unregister(&macvtap_class);
 out2:
-	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+	cdev_del(&macvtap_cdev);
 out1:
 	return err;
 }
@@ -207,9 +198,7 @@ static void macvtap_exit(void)
 	rtnl_link_unregister(&macvtap_link_ops);
 	unregister_netdevice_notifier(&macvtap_notifier_block);
 	class_unregister(&macvtap_class);
-	cdev_del(&macvtap_cdev);
-	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
-	idr_destroy(&minor_idr);
+	tap_destroy_cdev(macvtap_major, &macvtap_cdev);
 }
 module_exit(macvtap_exit);
 
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index d0807c2..52692d2 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -123,8 +123,12 @@ static struct proto tap_proto = {
 };
 
 #define TAP_NUM_DEVS (1U << MINORBITS)
-static DEFINE_MUTEX(minor_lock);
-DEFINE_IDR(minor_idr);
+struct major_info {
+	dev_t major;
+	struct idr minor_idr;
+	struct mutex minor_lock;
+	const char *device_name;
+} macvtap_major;
 
 #define GOODCOPY_LEN 128
 
@@ -413,26 +417,26 @@ int tap_get_minor(struct macvlan_dev *vlan)
 {
 	int retval = -ENOMEM;
 
-	mutex_lock(&minor_lock);
-	retval = idr_alloc(&minor_idr, vlan, 1, TAP_NUM_DEVS, GFP_KERNEL);
+	mutex_lock(&macvtap_major.minor_lock);
+	retval = idr_alloc(&macvtap_major.minor_idr, vlan, 1, TAP_NUM_DEVS, GFP_KERNEL);
 	if (retval >= 0) {
 		vlan->minor = retval;
 	} else if (retval == -ENOSPC) {
 		netdev_err(vlan->dev, "Too many tap devices\n");
 		retval = -EINVAL;
 	}
-	mutex_unlock(&minor_lock);
+	mutex_unlock(&macvtap_major.minor_lock);
 	return retval < 0 ? retval : 0;
 }
 
 void tap_free_minor(struct macvlan_dev *vlan)
 {
-	mutex_lock(&minor_lock);
+	mutex_lock(&macvtap_major.minor_lock);
 	if (vlan->minor) {
-		idr_remove(&minor_idr, vlan->minor);
+		idr_remove(&macvtap_major.minor_idr, vlan->minor);
 		vlan->minor = 0;
 	}
-	mutex_unlock(&minor_lock);
+	mutex_unlock(&macvtap_major.minor_lock);
 }
 
 static struct net_device *dev_get_by_tap_minor(int minor)
@@ -440,13 +444,13 @@ static struct net_device *dev_get_by_tap_minor(int minor)
 	struct net_device *dev = NULL;
 	struct macvlan_dev *vlan;
 
-	mutex_lock(&minor_lock);
-	vlan = idr_find(&minor_idr, minor);
+	mutex_lock(&macvtap_major.minor_lock);
+	vlan = idr_find(&macvtap_major.minor_idr, minor);
 	if (vlan) {
 		dev = vlan->dev;
 		dev_hold(dev);
 	}
-	mutex_unlock(&minor_lock);
+	mutex_unlock(&macvtap_major.minor_lock);
 	return dev;
 }
 
@@ -1184,3 +1188,41 @@ int tap_queue_resize(struct macvlan_dev *vlan)
 	kfree(arrays);
 	return ret;
 }
+
+int tap_create_cdev(struct cdev *tap_cdev,
+		    dev_t *tap_major, const char *device_name)
+{
+	int err;
+
+	err = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name);
+
+	if (err)
+		goto out1;
+
+	cdev_init(tap_cdev, &tap_fops);
+	err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS);
+	if (err)
+		goto out2;
+
+	macvtap_major.major = MAJOR(*tap_major);
+
+	idr_init(&macvtap_major.minor_idr);
+	mutex_init(&macvtap_major.minor_lock);
+
+	macvtap_major.device_name = device_name;
+
+	return err;
+
+out2:
+	unregister_chrdev_region(*tap_major, TAP_NUM_DEVS);
+out1:
+	return err;
+}
+
+void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev)
+{
+	cdev_del(tap_cdev);
+	unregister_chrdev_region(major, TAP_NUM_DEVS);
+	idr_destroy(&macvtap_major.minor_idr);
+}
+
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index d9ecc15..89bcd42 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -6,6 +6,9 @@ void tap_del_queues(struct net_device *dev);
 int tap_get_minor(struct macvlan_dev *vlan);
 void tap_free_minor(struct macvlan_dev *vlan);
 int tap_queue_resize(struct macvlan_dev *vlan);
+int tap_create_cdev(struct cdev *tap_cdev,
+		     dev_t *tap_major, const char *device_name);
+void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev);
 struct socket *tap_get_socket(struct file *file);
 
 #endif /*_LINUX_IF_TAP_H_*/
-- 
2.7.4

^ permalink raw reply related

* [PATCHv1 1/7] TAP: Refactoring macvtap.c
From: Sainath Grandhi @ 2017-01-06 22:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, mahesh, linux-kernel, Sainath Grandhi
In-Reply-To: <1483742009-19184-1-git-send-email-sainath.grandhi@intel.com>

The macvtap module has code for both tap/queue management and link management. This patch
splits the code into macvtap_main.c for link management and tap.c for tap/queue management.
The functionality in tap.c can then be reused to implement tap on other virtual interfaces.

Signed-off-by: Sainath Grandhi <sainath.grandhi@intel.com>
Tested-by: Sainath Grandhi <sainath.grandhi@intel.com>
---
 drivers/net/Makefile       |    2 +
 drivers/net/macvtap.c      | 1374 --------------------------------------------
 drivers/net/macvtap_main.c |  218 +++++++
 drivers/net/tap.c          | 1186 ++++++++++++++++++++++++++++++++++++++
 include/linux/if_macvtap.h |   10 +
 5 files changed, 1416 insertions(+), 1374 deletions(-)
 delete mode 100644 drivers/net/macvtap.c
 create mode 100644 drivers/net/macvtap_main.c
 create mode 100644 drivers/net/tap.c
 create mode 100644 include/linux/if_macvtap.h

diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 7336cbd..19b03a9 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -29,6 +29,8 @@ obj-$(CONFIG_GTP) += gtp.o
 obj-$(CONFIG_NLMON) += nlmon.o
 obj-$(CONFIG_NET_VRF) += vrf.o
 
+macvtap-objs := macvtap_main.o tap.o
+
 #
 # Networking Drivers
 #
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
deleted file mode 100644
index 5c26653..0000000
--- a/drivers/net/macvtap.c
+++ /dev/null
@@ -1,1374 +0,0 @@
-#include <linux/etherdevice.h>
-#include <linux/if_macvlan.h>
-#include <linux/if_vlan.h>
-#include <linux/interrupt.h>
-#include <linux/nsproxy.h>
-#include <linux/compat.h>
-#include <linux/if_tun.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/cache.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/wait.h>
-#include <linux/cdev.h>
-#include <linux/idr.h>
-#include <linux/fs.h>
-#include <linux/uio.h>
-
-#include <net/net_namespace.h>
-#include <net/rtnetlink.h>
-#include <net/sock.h>
-#include <linux/virtio_net.h>
-#include <linux/skb_array.h>
-
-/*
- * A macvtap queue is the central object of this driver, it connects
- * an open character device to a macvlan interface. There can be
- * multiple queues on one interface, which map back to queues
- * implemented in hardware on the underlying device.
- *
- * macvtap_proto is used to allocate queues through the sock allocation
- * mechanism.
- *
- */
-struct macvtap_queue {
-	struct sock sk;
-	struct socket sock;
-	struct socket_wq wq;
-	int vnet_hdr_sz;
-	struct macvlan_dev __rcu *vlan;
-	struct file *file;
-	unsigned int flags;
-	u16 queue_index;
-	bool enabled;
-	struct list_head next;
-	struct skb_array skb_array;
-};
-
-#define MACVTAP_FEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
-
-#define MACVTAP_VNET_LE 0x80000000
-#define MACVTAP_VNET_BE 0x40000000
-
-#ifdef CONFIG_TUN_VNET_CROSS_LE
-static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
-{
-	return q->flags & MACVTAP_VNET_BE ? false :
-		virtio_legacy_is_little_endian();
-}
-
-static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *sp)
-{
-	int s = !!(q->flags & MACVTAP_VNET_BE);
-
-	if (put_user(s, sp))
-		return -EFAULT;
-
-	return 0;
-}
-
-static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *sp)
-{
-	int s;
-
-	if (get_user(s, sp))
-		return -EFAULT;
-
-	if (s)
-		q->flags |= MACVTAP_VNET_BE;
-	else
-		q->flags &= ~MACVTAP_VNET_BE;
-
-	return 0;
-}
-#else
-static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
-{
-	return virtio_legacy_is_little_endian();
-}
-
-static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *argp)
-{
-	return -EINVAL;
-}
-
-static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *argp)
-{
-	return -EINVAL;
-}
-#endif /* CONFIG_TUN_VNET_CROSS_LE */
-
-static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
-{
-	return q->flags & MACVTAP_VNET_LE ||
-		macvtap_legacy_is_little_endian(q);
-}
-
-static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
-{
-	return __virtio16_to_cpu(macvtap_is_little_endian(q), val);
-}
-
-static inline __virtio16 cpu_to_macvtap16(struct macvtap_queue *q, u16 val)
-{
-	return __cpu_to_virtio16(macvtap_is_little_endian(q), val);
-}
-
-static struct proto macvtap_proto = {
-	.name = "macvtap",
-	.owner = THIS_MODULE,
-	.obj_size = sizeof (struct macvtap_queue),
-};
-
-/*
- * Variables for dealing with macvtaps device numbers.
- */
-static dev_t macvtap_major;
-#define MACVTAP_NUM_DEVS (1U << MINORBITS)
-static DEFINE_MUTEX(minor_lock);
-static DEFINE_IDR(minor_idr);
-
-#define GOODCOPY_LEN 128
-static const void *macvtap_net_namespace(struct device *d)
-{
-	struct net_device *dev = to_net_dev(d->parent);
-	return dev_net(dev);
-}
-
-static struct class macvtap_class = {
-	.name = "macvtap",
-	.owner = THIS_MODULE,
-	.ns_type = &net_ns_type_operations,
-	.namespace = macvtap_net_namespace,
-};
-static struct cdev macvtap_cdev;
-
-static const struct proto_ops macvtap_socket_ops;
-
-#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
-		      NETIF_F_TSO6 | NETIF_F_UFO)
-#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
-#define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)
-
-static struct macvlan_dev *macvtap_get_vlan_rcu(const struct net_device *dev)
-{
-	return rcu_dereference(dev->rx_handler_data);
-}
-
-/*
- * RCU usage:
- * The macvtap_queue and the macvlan_dev are loosely coupled, the
- * pointers from one to the other can only be read while rcu_read_lock
- * or rtnl is held.
- *
- * Both the file and the macvlan_dev hold a reference on the macvtap_queue
- * through sock_hold(&q->sk). When the macvlan_dev goes away first,
- * q->vlan becomes inaccessible. When the files gets closed,
- * macvtap_get_queue() fails.
- *
- * There may still be references to the struct sock inside of the
- * queue from outbound SKBs, but these never reference back to the
- * file or the dev. The data structure is freed through __sk_free
- * when both our references and any pending SKBs are gone.
- */
-
-static int macvtap_enable_queue(struct net_device *dev, struct file *file,
-				struct macvtap_queue *q)
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	int err = -EINVAL;
-
-	ASSERT_RTNL();
-
-	if (q->enabled)
-		goto out;
-
-	err = 0;
-	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
-	q->queue_index = vlan->numvtaps;
-	q->enabled = true;
-
-	vlan->numvtaps++;
-out:
-	return err;
-}
-
-/* Requires RTNL */
-static int macvtap_set_queue(struct net_device *dev, struct file *file,
-			     struct macvtap_queue *q)
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-
-	if (vlan->numqueues == MAX_MACVTAP_QUEUES)
-		return -EBUSY;
-
-	rcu_assign_pointer(q->vlan, vlan);
-	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
-	sock_hold(&q->sk);
-
-	q->file = file;
-	q->queue_index = vlan->numvtaps;
-	q->enabled = true;
-	file->private_data = q;
-	list_add_tail(&q->next, &vlan->queue_list);
-
-	vlan->numvtaps++;
-	vlan->numqueues++;
-
-	return 0;
-}
-
-static int macvtap_disable_queue(struct macvtap_queue *q)
-{
-	struct macvlan_dev *vlan;
-	struct macvtap_queue *nq;
-
-	ASSERT_RTNL();
-	if (!q->enabled)
-		return -EINVAL;
-
-	vlan = rtnl_dereference(q->vlan);
-
-	if (vlan) {
-		int index = q->queue_index;
-		BUG_ON(index >= vlan->numvtaps);
-		nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
-		nq->queue_index = index;
-
-		rcu_assign_pointer(vlan->taps[index], nq);
-		RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
-		q->enabled = false;
-
-		vlan->numvtaps--;
-	}
-
-	return 0;
-}
-
-/*
- * The file owning the queue got closed, give up both
- * the reference that the files holds as well as the
- * one from the macvlan_dev if that still exists.
- *
- * Using the spinlock makes sure that we don't get
- * to the queue again after destroying it.
- */
-static void macvtap_put_queue(struct macvtap_queue *q)
-{
-	struct macvlan_dev *vlan;
-
-	rtnl_lock();
-	vlan = rtnl_dereference(q->vlan);
-
-	if (vlan) {
-		if (q->enabled)
-			BUG_ON(macvtap_disable_queue(q));
-
-		vlan->numqueues--;
-		RCU_INIT_POINTER(q->vlan, NULL);
-		sock_put(&q->sk);
-		list_del_init(&q->next);
-	}
-
-	rtnl_unlock();
-
-	synchronize_rcu();
-	sock_put(&q->sk);
-}
-
-/*
- * Select a queue based on the rxq of the device on which this packet
- * arrived. If the incoming device is not mq, calculate a flow hash
- * to select a queue. If all fails, find the first available queue.
- * Cache vlan->numvtaps since it can become zero during the execution
- * of this function.
- */
-static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
-					       struct sk_buff *skb)
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct macvtap_queue *tap = NULL;
-	/* Access to taps array is protected by rcu, but access to numvtaps
-	 * isn't. Below we use it to lookup a queue, but treat it as a hint
-	 * and validate that the result isn't NULL - in case we are
-	 * racing against queue removal.
-	 */
-	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
-	__u32 rxq;
-
-	if (!numvtaps)
-		goto out;
-
-	if (numvtaps == 1)
-		goto single;
-
-	/* Check if we can use flow to select a queue */
-	rxq = skb_get_hash(skb);
-	if (rxq) {
-		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
-		goto out;
-	}
-
-	if (likely(skb_rx_queue_recorded(skb))) {
-		rxq = skb_get_rx_queue(skb);
-
-		while (unlikely(rxq >= numvtaps))
-			rxq -= numvtaps;
-
-		tap = rcu_dereference(vlan->taps[rxq]);
-		goto out;
-	}
-
-single:
-	tap = rcu_dereference(vlan->taps[0]);
-out:
-	return tap;
-}
-
-/*
- * The net_device is going away, give up the reference
- * that it holds on all queues and safely set the pointer
- * from the queues to NULL.
- */
-static void macvtap_del_queues(struct net_device *dev)
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct macvtap_queue *q, *tmp;
-
-	ASSERT_RTNL();
-	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
-		list_del_init(&q->next);
-		RCU_INIT_POINTER(q->vlan, NULL);
-		if (q->enabled)
-			vlan->numvtaps--;
-		vlan->numqueues--;
-		sock_put(&q->sk);
-	}
-	BUG_ON(vlan->numvtaps);
-	BUG_ON(vlan->numqueues);
-	/* guarantee that any future macvtap_set_queue will fail */
-	vlan->numvtaps = MAX_MACVTAP_QUEUES;
-}
-
-static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb)
-{
-	struct sk_buff *skb = *pskb;
-	struct net_device *dev = skb->dev;
-	struct macvlan_dev *vlan;
-	struct macvtap_queue *q;
-	netdev_features_t features = TAP_FEATURES;
-
-	vlan = macvtap_get_vlan_rcu(dev);
-	if (!vlan)
-		return RX_HANDLER_PASS;
-
-	q = macvtap_get_queue(dev, skb);
-	if (!q)
-		return RX_HANDLER_PASS;
-
-	if (__skb_array_full(&q->skb_array))
-		goto drop;
-
-	skb_push(skb, ETH_HLEN);
-
-	/* Apply the forward feature mask so that we perform segmentation
-	 * according to users wishes.  This only works if VNET_HDR is
-	 * enabled.
-	 */
-	if (q->flags & IFF_VNET_HDR)
-		features |= vlan->tap_features;
-	if (netif_needs_gso(skb, features)) {
-		struct sk_buff *segs = __skb_gso_segment(skb, features, false);
-
-		if (IS_ERR(segs))
-			goto drop;
-
-		if (!segs) {
-			if (skb_array_produce(&q->skb_array, skb))
-				goto drop;
-			goto wake_up;
-		}
-
-		consume_skb(skb);
-		while (segs) {
-			struct sk_buff *nskb = segs->next;
-
-			segs->next = NULL;
-			if (skb_array_produce(&q->skb_array, segs)) {
-				kfree_skb(segs);
-				kfree_skb_list(nskb);
-				break;
-			}
-			segs = nskb;
-		}
-	} else {
-		/* If we receive a partial checksum and the tap side
-		 * doesn't support checksum offload, compute the checksum.
-		 * Note: it doesn't matter which checksum feature to
-		 *        check, we either support them all or none.
-		 */
-		if (skb->ip_summed == CHECKSUM_PARTIAL &&
-		    !(features & NETIF_F_CSUM_MASK) &&
-		    skb_checksum_help(skb))
-			goto drop;
-		if (skb_array_produce(&q->skb_array, skb))
-			goto drop;
-	}
-
-wake_up:
-	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
-	return RX_HANDLER_CONSUMED;
-
-drop:
-	/* Count errors/drops only here, thus don't care about args. */
-	macvlan_count_rx(vlan, 0, 0, 0);
-	kfree_skb(skb);
-	return RX_HANDLER_CONSUMED;
-}
-
-static int macvtap_get_minor(struct macvlan_dev *vlan)
-{
-	int retval = -ENOMEM;
-
-	mutex_lock(&minor_lock);
-	retval = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
-	if (retval >= 0) {
-		vlan->minor = retval;
-	} else if (retval == -ENOSPC) {
-		netdev_err(vlan->dev, "Too many macvtap devices\n");
-		retval = -EINVAL;
-	}
-	mutex_unlock(&minor_lock);
-	return retval < 0 ? retval : 0;
-}
-
-static void macvtap_free_minor(struct macvlan_dev *vlan)
-{
-	mutex_lock(&minor_lock);
-	if (vlan->minor) {
-		idr_remove(&minor_idr, vlan->minor);
-		vlan->minor = 0;
-	}
-	mutex_unlock(&minor_lock);
-}
-
-static struct net_device *dev_get_by_macvtap_minor(int minor)
-{
-	struct net_device *dev = NULL;
-	struct macvlan_dev *vlan;
-
-	mutex_lock(&minor_lock);
-	vlan = idr_find(&minor_idr, minor);
-	if (vlan) {
-		dev = vlan->dev;
-		dev_hold(dev);
-	}
-	mutex_unlock(&minor_lock);
-	return dev;
-}
-
-static int macvtap_newlink(struct net *src_net,
-			   struct net_device *dev,
-			   struct nlattr *tb[],
-			   struct nlattr *data[])
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	int err;
-
-	INIT_LIST_HEAD(&vlan->queue_list);
-
-	/* Since macvlan supports all offloads by default, make
-	 * tap support all offloads also.
-	 */
-	vlan->tap_features = TUN_OFFLOADS;
-
-	err = netdev_rx_handler_register(dev, macvtap_handle_frame, vlan);
-	if (err)
-		return err;
-
-	/* Don't put anything that may fail after macvlan_common_newlink
-	 * because we can't undo what it does.
-	 */
-	err = macvlan_common_newlink(src_net, dev, tb, data);
-	if (err) {
-		netdev_rx_handler_unregister(dev);
-		return err;
-	}
-
-	return 0;
-}
-
-static void macvtap_dellink(struct net_device *dev,
-			    struct list_head *head)
-{
-	netdev_rx_handler_unregister(dev);
-	macvtap_del_queues(dev);
-	macvlan_dellink(dev, head);
-}
-
-static void macvtap_setup(struct net_device *dev)
-{
-	macvlan_common_setup(dev);
-	dev->tx_queue_len = TUN_READQ_SIZE;
-}
-
-static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
-	.kind		= "macvtap",
-	.setup		= macvtap_setup,
-	.newlink	= macvtap_newlink,
-	.dellink	= macvtap_dellink,
-};
-
-
-static void macvtap_sock_write_space(struct sock *sk)
-{
-	wait_queue_head_t *wqueue;
-
-	if (!sock_writeable(sk) ||
-	    !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
-		return;
-
-	wqueue = sk_sleep(sk);
-	if (wqueue && waitqueue_active(wqueue))
-		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
-}
-
-static void macvtap_sock_destruct(struct sock *sk)
-{
-	struct macvtap_queue *q = container_of(sk, struct macvtap_queue, sk);
-
-	skb_array_cleanup(&q->skb_array);
-}
-
-static int macvtap_open(struct inode *inode, struct file *file)
-{
-	struct net *net = current->nsproxy->net_ns;
-	struct net_device *dev;
-	struct macvtap_queue *q;
-	int err = -ENODEV;
-
-	rtnl_lock();
-	dev = dev_get_by_macvtap_minor(iminor(inode));
-	if (!dev)
-		goto err;
-
-	err = -ENOMEM;
-	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
-					     &macvtap_proto, 0);
-	if (!q)
-		goto err;
-
-	RCU_INIT_POINTER(q->sock.wq, &q->wq);
-	init_waitqueue_head(&q->wq.wait);
-	q->sock.type = SOCK_RAW;
-	q->sock.state = SS_CONNECTED;
-	q->sock.file = file;
-	q->sock.ops = &macvtap_socket_ops;
-	sock_init_data(&q->sock, &q->sk);
-	q->sk.sk_write_space = macvtap_sock_write_space;
-	q->sk.sk_destruct = macvtap_sock_destruct;
-	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
-	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
-
-	/*
-	 * so far only KVM virtio_net uses macvtap, enable zero copy between
-	 * guest kernel and host kernel when lower device supports zerocopy
-	 *
-	 * The macvlan supports zerocopy iff the lower device supports zero
-	 * copy so we don't have to look at the lower device directly.
-	 */
-	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
-		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
-
-	err = -ENOMEM;
-	if (skb_array_init(&q->skb_array, dev->tx_queue_len, GFP_KERNEL))
-		goto err_array;
-
-	err = macvtap_set_queue(dev, file, q);
-	if (err)
-		goto err_queue;
-
-	dev_put(dev);
-
-	rtnl_unlock();
-	return err;
-
-err_queue:
-	skb_array_cleanup(&q->skb_array);
-err_array:
-	sock_put(&q->sk);
-err:
-	if (dev)
-		dev_put(dev);
-
-	rtnl_unlock();
-	return err;
-}
-
-static int macvtap_release(struct inode *inode, struct file *file)
-{
-	struct macvtap_queue *q = file->private_data;
-	macvtap_put_queue(q);
-	return 0;
-}
-
-static unsigned int macvtap_poll(struct file *file, poll_table * wait)
-{
-	struct macvtap_queue *q = file->private_data;
-	unsigned int mask = POLLERR;
-
-	if (!q)
-		goto out;
-
-	mask = 0;
-	poll_wait(file, &q->wq.wait, wait);
-
-	if (!skb_array_empty(&q->skb_array))
-		mask |= POLLIN | POLLRDNORM;
-
-	if (sock_writeable(&q->sk) ||
-	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) &&
-	     sock_writeable(&q->sk)))
-		mask |= POLLOUT | POLLWRNORM;
-
-out:
-	return mask;
-}
-
-static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
-						size_t len, size_t linear,
-						int noblock, int *err)
-{
-	struct sk_buff *skb;
-
-	/* Under a page?  Don't bother with paged skb. */
-	if (prepad + len < PAGE_SIZE || !linear)
-		linear = len;
-
-	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
-				   err, 0);
-	if (!skb)
-		return NULL;
-
-	skb_reserve(skb, prepad);
-	skb_put(skb, linear);
-	skb->data_len = len - linear;
-	skb->len += len - linear;
-
-	return skb;
-}
-
-/* Neighbour code has some assumptions on HH_DATA_MOD alignment */
-#define MACVTAP_RESERVE HH_DATA_OFF(ETH_HLEN)
-
-/* Get packet from user space buffer */
-static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
-				struct iov_iter *from, int noblock)
-{
-	int good_linear = SKB_MAX_HEAD(MACVTAP_RESERVE);
-	struct sk_buff *skb;
-	struct macvlan_dev *vlan;
-	unsigned long total_len = iov_iter_count(from);
-	unsigned long len = total_len;
-	int err;
-	struct virtio_net_hdr vnet_hdr = { 0 };
-	int vnet_hdr_len = 0;
-	int copylen = 0;
-	int depth;
-	bool zerocopy = false;
-	size_t linear;
-
-	if (q->flags & IFF_VNET_HDR) {
-		vnet_hdr_len = q->vnet_hdr_sz;
-
-		err = -EINVAL;
-		if (len < vnet_hdr_len)
-			goto err;
-		len -= vnet_hdr_len;
-
-		err = -EFAULT;
-		if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
-			goto err;
-		iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
-		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
-		     macvtap16_to_cpu(q, vnet_hdr.csum_start) +
-		     macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
-			     macvtap16_to_cpu(q, vnet_hdr.hdr_len))
-			vnet_hdr.hdr_len = cpu_to_macvtap16(q,
-				 macvtap16_to_cpu(q, vnet_hdr.csum_start) +
-				 macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
-		err = -EINVAL;
-		if (macvtap16_to_cpu(q, vnet_hdr.hdr_len) > len)
-			goto err;
-	}
-
-	err = -EINVAL;
-	if (unlikely(len < ETH_HLEN))
-		goto err;
-
-	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
-		struct iov_iter i;
-
-		copylen = vnet_hdr.hdr_len ?
-			macvtap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
-		if (copylen > good_linear)
-			copylen = good_linear;
-		else if (copylen < ETH_HLEN)
-			copylen = ETH_HLEN;
-		linear = copylen;
-		i = *from;
-		iov_iter_advance(&i, copylen);
-		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
-			zerocopy = true;
-	}
-
-	if (!zerocopy) {
-		copylen = len;
-		linear = macvtap16_to_cpu(q, vnet_hdr.hdr_len);
-		if (linear > good_linear)
-			linear = good_linear;
-		else if (linear < ETH_HLEN)
-			linear = ETH_HLEN;
-	}
-
-	skb = macvtap_alloc_skb(&q->sk, MACVTAP_RESERVE, copylen,
-				linear, noblock, &err);
-	if (!skb)
-		goto err;
-
-	if (zerocopy)
-		err = zerocopy_sg_from_iter(skb, from);
-	else
-		err = skb_copy_datagram_from_iter(skb, 0, from, len);
-
-	if (err)
-		goto err_kfree;
-
-	skb_set_network_header(skb, ETH_HLEN);
-	skb_reset_mac_header(skb);
-	skb->protocol = eth_hdr(skb)->h_proto;
-
-	if (vnet_hdr_len) {
-		err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
-					    macvtap_is_little_endian(q));
-		if (err)
-			goto err_kfree;
-	}
-
-	skb_probe_transport_header(skb, ETH_HLEN);
-
-	/* Move network header to the right position for VLAN tagged packets */
-	if ((skb->protocol == htons(ETH_P_8021Q) ||
-	     skb->protocol == htons(ETH_P_8021AD)) &&
-	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
-		skb_set_network_header(skb, depth);
-
-	rcu_read_lock();
-	vlan = rcu_dereference(q->vlan);
-	/* copy skb_ubuf_info for callback when skb has no error */
-	if (zerocopy) {
-		skb_shinfo(skb)->destructor_arg = m->msg_control;
-		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
-		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-	} else if (m && m->msg_control) {
-		struct ubuf_info *uarg = m->msg_control;
-		uarg->callback(uarg, false);
-	}
-
-	if (vlan) {
-		skb->dev = vlan->dev;
-		dev_queue_xmit(skb);
-	} else {
-		kfree_skb(skb);
-	}
-	rcu_read_unlock();
-
-	return total_len;
-
-err_kfree:
-	kfree_skb(skb);
-
-err:
-	rcu_read_lock();
-	vlan = rcu_dereference(q->vlan);
-	if (vlan)
-		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
-	rcu_read_unlock();
-
-	return err;
-}
-
-static ssize_t macvtap_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-	struct file *file = iocb->ki_filp;
-	struct macvtap_queue *q = file->private_data;
-
-	return macvtap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
-}
-
-/* Put packet to the user space buffer */
-static ssize_t macvtap_put_user(struct macvtap_queue *q,
-				const struct sk_buff *skb,
-				struct iov_iter *iter)
-{
-	int ret;
-	int vnet_hdr_len = 0;
-	int vlan_offset = 0;
-	int total;
-
-	if (q->flags & IFF_VNET_HDR) {
-		struct virtio_net_hdr vnet_hdr;
-		vnet_hdr_len = q->vnet_hdr_sz;
-		if (iov_iter_count(iter) < vnet_hdr_len)
-			return -EINVAL;
-
-		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
-					    macvtap_is_little_endian(q)))
-			BUG();
-
-		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
-		    sizeof(vnet_hdr))
-			return -EFAULT;
-
-		iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
-	}
-	total = vnet_hdr_len;
-	total += skb->len;
-
-	if (skb_vlan_tag_present(skb)) {
-		struct {
-			__be16 h_vlan_proto;
-			__be16 h_vlan_TCI;
-		} veth;
-		veth.h_vlan_proto = skb->vlan_proto;
-		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
-
-		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
-		total += VLAN_HLEN;
-
-		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
-		if (ret || !iov_iter_count(iter))
-			goto done;
-
-		ret = copy_to_iter(&veth, sizeof(veth), iter);
-		if (ret != sizeof(veth) || !iov_iter_count(iter))
-			goto done;
-	}
-
-	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
-				     skb->len - vlan_offset);
-
-done:
-	return ret ? ret : total;
-}
-
-static ssize_t macvtap_do_read(struct macvtap_queue *q,
-			       struct iov_iter *to,
-			       int noblock)
-{
-	DEFINE_WAIT(wait);
-	struct sk_buff *skb;
-	ssize_t ret = 0;
-
-	if (!iov_iter_count(to))
-		return 0;
-
-	while (1) {
-		if (!noblock)
-			prepare_to_wait(sk_sleep(&q->sk), &wait,
-					TASK_INTERRUPTIBLE);
-
-		/* Read frames from the queue */
-		skb = skb_array_consume(&q->skb_array);
-		if (skb)
-			break;
-		if (noblock) {
-			ret = -EAGAIN;
-			break;
-		}
-		if (signal_pending(current)) {
-			ret = -ERESTARTSYS;
-			break;
-		}
-		/* Nothing to read, let's sleep */
-		schedule();
-	}
-	if (!noblock)
-		finish_wait(sk_sleep(&q->sk), &wait);
-
-	if (skb) {
-		ret = macvtap_put_user(q, skb, to);
-		if (unlikely(ret < 0))
-			kfree_skb(skb);
-		else
-			consume_skb(skb);
-	}
-	return ret;
-}
-
-static ssize_t macvtap_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
-	struct file *file = iocb->ki_filp;
-	struct macvtap_queue *q = file->private_data;
-	ssize_t len = iov_iter_count(to), ret;
-
-	ret = macvtap_do_read(q, to, file->f_flags & O_NONBLOCK);
-	ret = min_t(ssize_t, ret, len);
-	if (ret > 0)
-		iocb->ki_pos = ret;
-	return ret;
-}
-
-static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
-{
-	struct macvlan_dev *vlan;
-
-	ASSERT_RTNL();
-	vlan = rtnl_dereference(q->vlan);
-	if (vlan)
-		dev_hold(vlan->dev);
-
-	return vlan;
-}
-
-static void macvtap_put_vlan(struct macvlan_dev *vlan)
-{
-	dev_put(vlan->dev);
-}
-
-static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
-{
-	struct macvtap_queue *q = file->private_data;
-	struct macvlan_dev *vlan;
-	int ret;
-
-	vlan = macvtap_get_vlan(q);
-	if (!vlan)
-		return -EINVAL;
-
-	if (flags & IFF_ATTACH_QUEUE)
-		ret = macvtap_enable_queue(vlan->dev, file, q);
-	else if (flags & IFF_DETACH_QUEUE)
-		ret = macvtap_disable_queue(q);
-	else
-		ret = -EINVAL;
-
-	macvtap_put_vlan(vlan);
-	return ret;
-}
-
-static int set_offload(struct macvtap_queue *q, unsigned long arg)
-{
-	struct macvlan_dev *vlan;
-	netdev_features_t features;
-	netdev_features_t feature_mask = 0;
-
-	vlan = rtnl_dereference(q->vlan);
-	if (!vlan)
-		return -ENOLINK;
-
-	features = vlan->dev->features;
-
-	if (arg & TUN_F_CSUM) {
-		feature_mask = NETIF_F_HW_CSUM;
-
-		if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
-			if (arg & TUN_F_TSO_ECN)
-				feature_mask |= NETIF_F_TSO_ECN;
-			if (arg & TUN_F_TSO4)
-				feature_mask |= NETIF_F_TSO;
-			if (arg & TUN_F_TSO6)
-				feature_mask |= NETIF_F_TSO6;
-		}
-
-		if (arg & TUN_F_UFO)
-			feature_mask |= NETIF_F_UFO;
-	}
-
-	/* tun/tap driver inverts the usage for TSO offloads, where
-	 * setting the TSO bit means that the userspace wants to
-	 * accept TSO frames and turning it off means that user space
-	 * does not support TSO.
-	 * For macvtap, we have to invert it to mean the same thing.
-	 * When user space turns off TSO, we turn off GSO/LRO so that
-	 * user-space will not receive TSO frames.
-	 */
-	if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO))
-		features |= RX_OFFLOADS;
-	else
-		features &= ~RX_OFFLOADS;
-
-	/* tap_features are the same as features on tun/tap and
-	 * reflect user expectations.
-	 */
-	vlan->tap_features = feature_mask;
-	vlan->set_features = features;
-	netdev_update_features(vlan->dev);
-
-	return 0;
-}
-
-/*
- * provide compatibility with generic tun/tap interface
- */
-static long macvtap_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
-{
-	struct macvtap_queue *q = file->private_data;
-	struct macvlan_dev *vlan;
-	void __user *argp = (void __user *)arg;
-	struct ifreq __user *ifr = argp;
-	unsigned int __user *up = argp;
-	unsigned short u;
-	int __user *sp = argp;
-	struct sockaddr sa;
-	int s;
-	int ret;
-
-	switch (cmd) {
-	case TUNSETIFF:
-		/* ignore the name, just look at flags */
-		if (get_user(u, &ifr->ifr_flags))
-			return -EFAULT;
-
-		ret = 0;
-		if ((u & ~MACVTAP_FEATURES) != (IFF_NO_PI | IFF_TAP))
-			ret = -EINVAL;
-		else
-			q->flags = (q->flags & ~MACVTAP_FEATURES) | u;
-
-		return ret;
-
-	case TUNGETIFF:
-		rtnl_lock();
-		vlan = macvtap_get_vlan(q);
-		if (!vlan) {
-			rtnl_unlock();
-			return -ENOLINK;
-		}
-
-		ret = 0;
-		u = q->flags;
-		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
-		    put_user(u, &ifr->ifr_flags))
-			ret = -EFAULT;
-		macvtap_put_vlan(vlan);
-		rtnl_unlock();
-		return ret;
-
-	case TUNSETQUEUE:
-		if (get_user(u, &ifr->ifr_flags))
-			return -EFAULT;
-		rtnl_lock();
-		ret = macvtap_ioctl_set_queue(file, u);
-		rtnl_unlock();
-		return ret;
-
-	case TUNGETFEATURES:
-		if (put_user(IFF_TAP | IFF_NO_PI | MACVTAP_FEATURES, up))
-			return -EFAULT;
-		return 0;
-
-	case TUNSETSNDBUF:
-		if (get_user(s, sp))
-			return -EFAULT;
-
-		q->sk.sk_sndbuf = s;
-		return 0;
-
-	case TUNGETVNETHDRSZ:
-		s = q->vnet_hdr_sz;
-		if (put_user(s, sp))
-			return -EFAULT;
-		return 0;
-
-	case TUNSETVNETHDRSZ:
-		if (get_user(s, sp))
-			return -EFAULT;
-		if (s < (int)sizeof(struct virtio_net_hdr))
-			return -EINVAL;
-
-		q->vnet_hdr_sz = s;
-		return 0;
-
-	case TUNGETVNETLE:
-		s = !!(q->flags & MACVTAP_VNET_LE);
-		if (put_user(s, sp))
-			return -EFAULT;
-		return 0;
-
-	case TUNSETVNETLE:
-		if (get_user(s, sp))
-			return -EFAULT;
-		if (s)
-			q->flags |= MACVTAP_VNET_LE;
-		else
-			q->flags &= ~MACVTAP_VNET_LE;
-		return 0;
-
-	case TUNGETVNETBE:
-		return macvtap_get_vnet_be(q, sp);
-
-	case TUNSETVNETBE:
-		return macvtap_set_vnet_be(q, sp);
-
-	case TUNSETOFFLOAD:
-		/* let the user check for future flags */
-		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
-			    TUN_F_TSO_ECN | TUN_F_UFO))
-			return -EINVAL;
-
-		rtnl_lock();
-		ret = set_offload(q, arg);
-		rtnl_unlock();
-		return ret;
-
-	case SIOCGIFHWADDR:
-		rtnl_lock();
-		vlan = macvtap_get_vlan(q);
-		if (!vlan) {
-			rtnl_unlock();
-			return -ENOLINK;
-		}
-		ret = 0;
-		u = vlan->dev->type;
-		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
-		    copy_to_user(&ifr->ifr_hwaddr.sa_data, vlan->dev->dev_addr, ETH_ALEN) ||
-		    put_user(u, &ifr->ifr_hwaddr.sa_family))
-			ret = -EFAULT;
-		macvtap_put_vlan(vlan);
-		rtnl_unlock();
-		return ret;
-
-	case SIOCSIFHWADDR:
-		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
-			return -EFAULT;
-		rtnl_lock();
-		vlan = macvtap_get_vlan(q);
-		if (!vlan) {
-			rtnl_unlock();
-			return -ENOLINK;
-		}
-		ret = dev_set_mac_address(vlan->dev, &sa);
-		macvtap_put_vlan(vlan);
-		rtnl_unlock();
-		return ret;
-
-	default:
-		return -EINVAL;
-	}
-}
-
-#ifdef CONFIG_COMPAT
-static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
-				 unsigned long arg)
-{
-	return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
-}
-#endif
-
-static const struct file_operations macvtap_fops = {
-	.owner		= THIS_MODULE,
-	.open		= macvtap_open,
-	.release	= macvtap_release,
-	.read_iter	= macvtap_read_iter,
-	.write_iter	= macvtap_write_iter,
-	.poll		= macvtap_poll,
-	.llseek		= no_llseek,
-	.unlocked_ioctl	= macvtap_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= macvtap_compat_ioctl,
-#endif
-};
-
-static int macvtap_sendmsg(struct socket *sock, struct msghdr *m,
-			   size_t total_len)
-{
-	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	return macvtap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
-}
-
-static int macvtap_recvmsg(struct socket *sock, struct msghdr *m,
-			   size_t total_len, int flags)
-{
-	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	int ret;
-	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
-		return -EINVAL;
-	ret = macvtap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
-	if (ret > total_len) {
-		m->msg_flags |= MSG_TRUNC;
-		ret = flags & MSG_TRUNC ? ret : total_len;
-	}
-	return ret;
-}
-
-static int macvtap_peek_len(struct socket *sock)
-{
-	struct macvtap_queue *q = container_of(sock, struct macvtap_queue,
-					       sock);
-	return skb_array_peek_len(&q->skb_array);
-}
-
-/* Ops structure to mimic raw sockets with tun */
-static const struct proto_ops macvtap_socket_ops = {
-	.sendmsg = macvtap_sendmsg,
-	.recvmsg = macvtap_recvmsg,
-	.peek_len = macvtap_peek_len,
-};
-
-/* Get an underlying socket object from tun file.  Returns error unless file is
- * attached to a device.  The returned object works like a packet socket, it
- * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
- * holding a reference to the file for as long as the socket is in use. */
-struct socket *macvtap_get_socket(struct file *file)
-{
-	struct macvtap_queue *q;
-	if (file->f_op != &macvtap_fops)
-		return ERR_PTR(-EINVAL);
-	q = file->private_data;
-	if (!q)
-		return ERR_PTR(-EBADFD);
-	return &q->sock;
-}
-EXPORT_SYMBOL_GPL(macvtap_get_socket);
-
-static int macvtap_queue_resize(struct macvlan_dev *vlan)
-{
-	struct net_device *dev = vlan->dev;
-	struct macvtap_queue *q;
-	struct skb_array **arrays;
-	int n = vlan->numqueues;
-	int ret, i = 0;
-
-	arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
-	if (!arrays)
-		return -ENOMEM;
-
-	list_for_each_entry(q, &vlan->queue_list, next)
-		arrays[i++] = &q->skb_array;
-
-	ret = skb_array_resize_multiple(arrays, n,
-					dev->tx_queue_len, GFP_KERNEL);
-
-	kfree(arrays);
-	return ret;
-}
-
-static int macvtap_device_event(struct notifier_block *unused,
-				unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct macvlan_dev *vlan;
-	struct device *classdev;
-	dev_t devt;
-	int err;
-	char tap_name[IFNAMSIZ];
-
-	if (dev->rtnl_link_ops != &macvtap_link_ops)
-		return NOTIFY_DONE;
-
-	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
-	vlan = netdev_priv(dev);
-
-	switch (event) {
-	case NETDEV_REGISTER:
-		/* Create the device node here after the network device has
-		 * been registered but before register_netdevice has
-		 * finished running.
-		 */
-		err = macvtap_get_minor(vlan);
-		if (err)
-			return notifier_from_errno(err);
-
-		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
-		classdev = device_create(&macvtap_class, &dev->dev, devt,
-					 dev, tap_name);
-		if (IS_ERR(classdev)) {
-			macvtap_free_minor(vlan);
-			return notifier_from_errno(PTR_ERR(classdev));
-		}
-		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
-					tap_name);
-		if (err)
-			return notifier_from_errno(err);
-		break;
-	case NETDEV_UNREGISTER:
-		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
-		if (vlan->minor == 0)
-			break;
-		sysfs_remove_link(&dev->dev.kobj, tap_name);
-		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
-		device_destroy(&macvtap_class, devt);
-		macvtap_free_minor(vlan);
-		break;
-	case NETDEV_CHANGE_TX_QUEUE_LEN:
-		if (macvtap_queue_resize(vlan))
-			return NOTIFY_BAD;
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block macvtap_notifier_block __read_mostly = {
-	.notifier_call	= macvtap_device_event,
-};
-
-static int macvtap_init(void)
-{
-	int err;
-
-	err = alloc_chrdev_region(&macvtap_major, 0,
-				MACVTAP_NUM_DEVS, "macvtap");
-	if (err)
-		goto out1;
-
-	cdev_init(&macvtap_cdev, &macvtap_fops);
-	err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
-	if (err)
-		goto out2;
-
-	err = class_register(&macvtap_class);
-	if (err)
-		goto out3;
-
-	err = register_netdevice_notifier(&macvtap_notifier_block);
-	if (err)
-		goto out4;
-
-	err = macvlan_link_register(&macvtap_link_ops);
-	if (err)
-		goto out5;
-
-	return 0;
-
-out5:
-	unregister_netdevice_notifier(&macvtap_notifier_block);
-out4:
-	class_unregister(&macvtap_class);
-out3:
-	cdev_del(&macvtap_cdev);
-out2:
-	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
-out1:
-	return err;
-}
-module_init(macvtap_init);
-
-static void macvtap_exit(void)
-{
-	rtnl_link_unregister(&macvtap_link_ops);
-	unregister_netdevice_notifier(&macvtap_notifier_block);
-	class_unregister(&macvtap_class);
-	cdev_del(&macvtap_cdev);
-	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
-	idr_destroy(&minor_idr);
-}
-module_exit(macvtap_exit);
-
-MODULE_ALIAS_RTNL_LINK("macvtap");
-MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/net/macvtap_main.c b/drivers/net/macvtap_main.c
new file mode 100644
index 0000000..96ffa60
--- /dev/null
+++ b/drivers/net/macvtap_main.c
@@ -0,0 +1,218 @@
+#include <linux/etherdevice.h>
+#include <linux/if_macvlan.h>
+#include <linux/if_macvtap.h>
+#include <linux/if_vlan.h>
+#include <linux/interrupt.h>
+#include <linux/nsproxy.h>
+#include <linux/compat.h>
+#include <linux/if_tun.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/fs.h>
+#include <linux/uio.h>
+
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+#include <linux/virtio_net.h>
+#include <linux/skb_array.h>
+
+/*
+ * Variables for dealing with macvtap device numbers.
+ */
+static dev_t macvtap_major;
+#define MACVTAP_NUM_DEVS (1U << MINORBITS)
+
+static const void *macvtap_net_namespace(struct device *d)
+{
+	/* Resolve the namespace through the parent net_device of @d. */
+	return dev_net(to_net_dev(d->parent));
+}
+
+static struct class macvtap_class = {
+	.name = "macvtap",
+	.owner = THIS_MODULE,
+	.ns_type = &net_ns_type_operations,
+	.namespace = macvtap_net_namespace,
+};
+static struct cdev macvtap_cdev;
+
+#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
+		      NETIF_F_TSO6 | NETIF_F_UFO)
+
+static int macvtap_newlink(struct net *src_net,
+			   struct net_device *dev,
+			   struct nlattr *tb[],
+			   struct nlattr *data[])
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	int ret;
+
+	INIT_LIST_HEAD(&vlan->queue_list);
+
+	/* macvlan enables every offload by default, so start the tap
+	 * side with the full offload set as well.
+	 */
+	vlan->tap_features = TUN_OFFLOADS;
+
+	ret = netdev_rx_handler_register(dev, macvtap_handle_frame, vlan);
+	if (ret)
+		return ret;
+
+	/* macvlan_common_newlink() has to stay the last step that can
+	 * fail: what it sets up cannot be undone from here.
+	 */
+	ret = macvlan_common_newlink(src_net, dev, tb, data);
+	if (!ret)
+		return 0;
+
+	/* Creation failed: drop the rx handler registered above. */
+	netdev_rx_handler_unregister(dev);
+	return ret;
+}
+
+/* Tear down in order: rx handler, then tap queues, then the macvlan link. */
+static void macvtap_dellink(struct net_device *dev, struct list_head *head)
+{
+	netdev_rx_handler_unregister(dev);
+	macvtap_del_queues(dev);
+	macvlan_dellink(dev, head);
+}
+
+static void macvtap_setup(struct net_device *dev)
+{
+	macvlan_common_setup(dev);
+	dev->tx_queue_len = TUN_READQ_SIZE;
+}
+
+static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
+	.kind		= "macvtap",
+	.setup		= macvtap_setup,
+	.newlink	= macvtap_newlink,
+	.dellink	= macvtap_dellink,
+};
+
+/* Netdevice notifier: keeps the "tap%d" class device of a macvtap in sync. */
+static int macvtap_device_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	char tap_name[IFNAMSIZ];
+	struct device *tapdev;
+	dev_t devno;
+	int ret;
+
+	if (dev->rtnl_link_ops != &macvtap_link_ops)
+		return NOTIFY_DONE;
+
+	snprintf(tap_name, sizeof(tap_name), "tap%d", dev->ifindex);
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		/* The network device is registered by now, but
+		 * register_netdevice is still running: create the
+		 * device node at this point.
+		 */
+		ret = macvtap_get_minor(vlan);
+		if (ret)
+			return notifier_from_errno(ret);
+
+		devno = MKDEV(MAJOR(macvtap_major), vlan->minor);
+		tapdev = device_create(&macvtap_class, &dev->dev, devno,
+				       dev, tap_name);
+		if (IS_ERR(tapdev)) {
+			macvtap_free_minor(vlan);
+			return notifier_from_errno(PTR_ERR(tapdev));
+		}
+		ret = sysfs_create_link(&dev->dev.kobj, &tapdev->kobj,
+					tap_name);
+		if (ret)
+			return notifier_from_errno(ret);
+		break;
+	case NETDEV_UNREGISTER:
+		/* vlan->minor stays 0 when NETDEV_REGISTER above failed */
+		if (vlan->minor != 0) {
+			sysfs_remove_link(&dev->dev.kobj, tap_name);
+			devno = MKDEV(MAJOR(macvtap_major), vlan->minor);
+			device_destroy(&macvtap_class, devno);
+			macvtap_free_minor(vlan);
+		}
+		break;
+	case NETDEV_CHANGE_TX_QUEUE_LEN:
+		if (macvtap_queue_resize(vlan))
+			return NOTIFY_BAD;
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block macvtap_notifier_block __read_mostly = {
+	.notifier_call	= macvtap_device_event,
+};
+
+extern struct file_operations macvtap_fops;
+static int macvtap_init(void)
+{
+	int ret;
+
+	ret = alloc_chrdev_region(&macvtap_major, 0,
+				  MACVTAP_NUM_DEVS, "macvtap");
+	if (ret)
+		goto err_out;
+
+	cdev_init(&macvtap_cdev, &macvtap_fops);
+	ret = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
+	if (ret)
+		goto err_region;
+
+	ret = class_register(&macvtap_class);
+	if (ret)
+		goto err_cdev;
+
+	ret = register_netdevice_notifier(&macvtap_notifier_block);
+	if (ret)
+		goto err_class;
+
+	ret = macvlan_link_register(&macvtap_link_ops);
+	if (ret)
+		goto err_notifier;
+
+	return 0;
+
+err_notifier:
+	unregister_netdevice_notifier(&macvtap_notifier_block);
+err_class:
+	class_unregister(&macvtap_class);
+err_cdev:
+	cdev_del(&macvtap_cdev);
+err_region:
+	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+err_out:
+	return ret;
+}
+module_init(macvtap_init);
+
+extern struct idr minor_idr;
+static void macvtap_exit(void)
+{
+	rtnl_link_unregister(&macvtap_link_ops);
+	unregister_netdevice_notifier(&macvtap_notifier_block);
+	class_unregister(&macvtap_class);
+	cdev_del(&macvtap_cdev);
+	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+	idr_destroy(&minor_idr);
+}
+module_exit(macvtap_exit);
+
+MODULE_ALIAS_RTNL_LINK("macvtap");
+MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
new file mode 100644
index 0000000..8f12a39
--- /dev/null
+++ b/drivers/net/tap.c
@@ -0,0 +1,1186 @@
+#include <linux/etherdevice.h>
+#include <linux/if_macvlan.h>
+#include <linux/if_vlan.h>
+#include <linux/interrupt.h>
+#include <linux/nsproxy.h>
+#include <linux/compat.h>
+#include <linux/if_tun.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/fs.h>
+#include <linux/uio.h>
+
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+#include <linux/virtio_net.h>
+#include <linux/skb_array.h>
+
+/*
+ * A macvtap queue is the central object of this driver, it connects
+ * an open character device to a macvlan interface. There can be
+ * multiple queues on one interface, which map back to queues
+ * implemented in hardware on the underlying device.
+ *
+ * macvtap_proto is used to allocate queues through the sock allocation
+ * mechanism.
+ *
+ */
+struct macvtap_queue {
+	/* Must stay first: the queue is allocated via sk_alloc() and cast. */
+	struct sock sk;
+	/* Socket handed out by macvtap_get_socket(). */
+	struct socket sock;
+	/* Wait queue installed as sock.wq in macvtap_open(). */
+	struct socket_wq wq;
+	/* Bytes reserved for the vnet header in front of each packet. */
+	int vnet_hdr_sz;
+	/* Back pointer to the device; NULL once the two are decoupled. */
+	struct macvlan_dev __rcu *vlan;
+	/* File that owns this queue, set in macvtap_set_queue(). */
+	struct file *file;
+	/* IFF_* flags plus the MACVTAP_VNET_LE/BE endianness bits. */
+	unsigned int flags;
+	/* Slot of this queue in vlan->taps[] while enabled. */
+	u16 queue_index;
+	/* True while the queue is present in vlan->taps[]. */
+	bool enabled;
+	/* Link in vlan->queue_list. */
+	struct list_head next;
+	/* Ring filled by macvtap_handle_frame(), drained by macvtap_do_read(). */
+	struct skb_array skb_array;
+};
+
+#define MACVTAP_FEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)
+
+#define MACVTAP_VNET_LE 0x80000000
+#define MACVTAP_VNET_BE 0x40000000
+
+#ifdef CONFIG_TUN_VNET_CROSS_LE
+/* Legacy virtio byte order for @q: big endian if MACVTAP_VNET_BE is set. */
+static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
+{
+	if (q->flags & MACVTAP_VNET_BE)
+		return false;
+
+	return virtio_legacy_is_little_endian();
+}
+
+/* Copy the current MACVTAP_VNET_BE setting (0 or 1) to user space. */
+static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *sp)
+{
+	int be = (q->flags & MACVTAP_VNET_BE) ? 1 : 0;
+
+	return put_user(be, sp) ? -EFAULT : 0;
+}
+
+/* Set or clear MACVTAP_VNET_BE according to the int read from user space. */
+static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *sp)
+{
+	int be;
+
+	if (get_user(be, sp))
+		return -EFAULT;
+
+	if (!be)
+		q->flags &= ~MACVTAP_VNET_BE;
+	else
+		q->flags |= MACVTAP_VNET_BE;
+
+	return 0;
+}
+#else
+/* Without cross-endian support the legacy byte order is the native one. */
+static inline bool macvtap_legacy_is_little_endian(struct macvtap_queue *q)
+{
+	return virtio_legacy_is_little_endian();
+}
+
+/* Cross-endian support is compiled out: the query is rejected. */
+static long macvtap_get_vnet_be(struct macvtap_queue *q, int __user *argp)
+{
+	return -EINVAL;
+}
+
+/* Cross-endian support is compiled out: the request is rejected. */
+static long macvtap_set_vnet_be(struct macvtap_queue *q, int __user *argp)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_TUN_VNET_CROSS_LE */
+
+/* True if vnet headers on @q are little endian (explicit flag or legacy). */
+static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
+{
+	if (q->flags & MACVTAP_VNET_LE)
+		return true;
+
+	return macvtap_legacy_is_little_endian(q);
+}
+
+/* Convert a 16-bit vnet header field to CPU order using @q's endianness. */
+static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
+{
+	bool little_endian = macvtap_is_little_endian(q);
+
+	return __virtio16_to_cpu(little_endian, val);
+}
+
+/* Convert a CPU-order value to a 16-bit vnet field using @q's endianness. */
+static inline __virtio16 cpu_to_macvtap16(struct macvtap_queue *q, u16 val)
+{
+	bool little_endian = macvtap_is_little_endian(q);
+
+	return __cpu_to_virtio16(little_endian, val);
+}
+
+/*
+ * Protocol descriptor passed to sk_alloc() in macvtap_open(). obj_size
+ * makes each allocation large enough for a whole struct macvtap_queue.
+ */
+static struct proto macvtap_proto = {
+	.name = "macvtap",
+	.owner = THIS_MODULE,
+	.obj_size = sizeof (struct macvtap_queue),
+};
+
+/* Size of the minor space; used as the exclusive upper bound for idr_alloc(). */
+#define MACVTAP_NUM_DEVS (1U << MINORBITS)
+/* minor_lock is held around every minor_idr access in this file. */
+static DEFINE_MUTEX(minor_lock);
+DEFINE_IDR(minor_idr);
+
+/* Linear copy length for zerocopy sends when the vnet header gives none. */
+#define GOODCOPY_LEN 128
+
+/* Forward declaration; the ops table is defined near the end of the file. */
+static const struct proto_ops macvtap_socket_ops;
+
+/* Device features toggled by set_offload() depending on user TSO/UFO support. */
+#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO)
+/* Baseline feature set assumed for the tap side in macvtap_handle_frame(). */
+#define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST)
+
+/* Fetch the macvlan_dev stored as @dev's rx_handler_data (RCU read side). */
+static struct macvlan_dev *macvtap_get_vlan_rcu(const struct net_device *dev)
+{
+	struct macvlan_dev *vlan = rcu_dereference(dev->rx_handler_data);
+
+	return vlan;
+}
+
+/*
+ * RCU usage:
+ * The macvtap_queue and the macvlan_dev are loosely coupled, the
+ * pointers from one to the other can only be read while rcu_read_lock
+ * or rtnl is held.
+ *
+ * Both the file and the macvlan_dev hold a reference on the macvtap_queue
+ * through sock_hold(&q->sk). When the macvlan_dev goes away first,
+ * q->vlan becomes inaccessible. When the file gets closed,
+ * macvtap_get_queue() fails.
+ *
+ * There may still be references to the struct sock inside of the
+ * queue from outbound SKBs, but these never reference back to the
+ * file or the dev. The data structure is freed through __sk_free
+ * when both our references and any pending SKBs are gone.
+ */
+
+/*
+ * Re-attach a previously disabled queue by appending it to vlan->taps[].
+ * Caller holds RTNL. Returns 0, or -EINVAL if @q is already enabled.
+ */
+static int macvtap_enable_queue(struct net_device *dev, struct file *file,
+				struct macvtap_queue *q)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	ASSERT_RTNL();
+
+	/* Nothing to do for a queue that is already in the taps array. */
+	if (q->enabled)
+		return -EINVAL;
+
+	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
+	q->queue_index = vlan->numvtaps;
+	q->enabled = true;
+	vlan->numvtaps++;
+
+	return 0;
+}
+
+/*
+ * Attach a newly opened queue @q (owned by @file) to @dev. Requires RTNL.
+ *
+ * Links queue and device in both directions, takes the device's reference
+ * on the queue's sock, and appends the queue to vlan->taps[] and
+ * vlan->queue_list.
+ *
+ * Returns 0 on success, -EBUSY if the device cannot take another queue.
+ */
+static int macvtap_set_queue(struct net_device *dev, struct file *file,
+			     struct macvtap_queue *q)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	/*
+	 * numvtaps is the index written in vlan->taps[] below, so it must be
+	 * bounded too. macvtap_del_queues() parks numvtaps at
+	 * MAX_MACVTAP_QUEUES to refuse new queues on a device that is going
+	 * away while leaving numqueues at zero; testing numqueues alone
+	 * would let that case through.
+	 */
+	if (vlan->numqueues == MAX_MACVTAP_QUEUES ||
+	    vlan->numvtaps >= MAX_MACVTAP_QUEUES)
+		return -EBUSY;
+
+	rcu_assign_pointer(q->vlan, vlan);
+	rcu_assign_pointer(vlan->taps[vlan->numvtaps], q);
+	/* Reference held by the device; dropped when the two are unlinked. */
+	sock_hold(&q->sk);
+
+	q->file = file;
+	q->queue_index = vlan->numvtaps;
+	q->enabled = true;
+	file->private_data = q;
+	list_add_tail(&q->next, &vlan->queue_list);
+
+	vlan->numvtaps++;
+	vlan->numqueues++;
+
+	return 0;
+}
+
+/*
+ * Take @q out of vlan->taps[] without detaching it from the device.
+ * The last enabled queue is moved into the freed slot so the array stays
+ * dense. Caller holds RTNL. Returns 0, or -EINVAL if @q is not enabled.
+ */
+static int macvtap_disable_queue(struct macvtap_queue *q)
+{
+	struct macvlan_dev *vlan;
+	struct macvtap_queue *last;
+	int index;
+
+	ASSERT_RTNL();
+	if (!q->enabled)
+		return -EINVAL;
+
+	vlan = rtnl_dereference(q->vlan);
+	if (!vlan)
+		return 0;
+
+	index = q->queue_index;
+	BUG_ON(index >= vlan->numvtaps);
+
+	/* Move the tail entry into the slot being vacated. */
+	last = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]);
+	last->queue_index = index;
+
+	rcu_assign_pointer(vlan->taps[index], last);
+	RCU_INIT_POINTER(vlan->taps[vlan->numvtaps - 1], NULL);
+	q->enabled = false;
+
+	vlan->numvtaps--;
+
+	return 0;
+}
+
+/*
+ * The file owning the queue got closed, give up both
+ * the reference that the file holds as well as the
+ * one from the macvlan_dev if that still exists.
+ *
+ * The unlinking is done under RTNL and is followed by
+ * synchronize_rcu() before the final sock_put(), so that we
+ * don't get to the queue again after destroying it.
+ */
+static void macvtap_put_queue(struct macvtap_queue *q)
+{
+	struct macvlan_dev *vlan;
+
+	rtnl_lock();
+	vlan = rtnl_dereference(q->vlan);
+
+	/* Still attached to a device: unlink and drop the device's reference. */
+	if (vlan) {
+		if (q->enabled)
+			BUG_ON(macvtap_disable_queue(q));
+
+		vlan->numqueues--;
+		RCU_INIT_POINTER(q->vlan, NULL);
+		sock_put(&q->sk);
+		list_del_init(&q->next);
+	}
+
+	rtnl_unlock();
+
+	/* Wait for an RCU grace period before dropping the file's reference. */
+	synchronize_rcu();
+	sock_put(&q->sk);
+}
+
+/*
+ * Select a queue for an incoming packet. If the skb has a non-zero
+ * flow hash, use it to pick a queue; otherwise fall back to the rx
+ * queue recorded on the skb. If all fails, use the first queue.
+ * Cache vlan->numvtaps since it can become zero during the execution
+ * of this function.
+ */
+static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
+					       struct sk_buff *skb)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	/* numvtaps is read without protection and only serves as a hint;
+	 * the taps[] entries themselves are RCU protected, and callers
+	 * must cope with a NULL result when racing against queue removal.
+	 */
+	int numvtaps = ACCESS_ONCE(vlan->numvtaps);
+	__u32 rxq;
+
+	if (!numvtaps)
+		return NULL;
+
+	if (numvtaps != 1) {
+		/* Prefer the flow hash when the skb carries one. */
+		rxq = skb_get_hash(skb);
+		if (rxq)
+			return rcu_dereference(vlan->taps[rxq % numvtaps]);
+
+		/* Otherwise fold the recorded rx queue into range. */
+		if (likely(skb_rx_queue_recorded(skb))) {
+			rxq = skb_get_rx_queue(skb);
+
+			while (unlikely(rxq >= numvtaps))
+				rxq -= numvtaps;
+
+			return rcu_dereference(vlan->taps[rxq]);
+		}
+	}
+
+	/* Single queue, or nothing better to go on: use the first slot. */
+	return rcu_dereference(vlan->taps[0]);
+}
+
+/*
+ * The net_device is going away, give up the reference
+ * that it holds on all queues and safely set the pointer
+ * from the queues to NULL.
+ */
+/* Detach every queue on @dev's queue_list. Caller holds RTNL. */
+void macvtap_del_queues(struct net_device *dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvtap_queue *q, *tmp;
+
+	ASSERT_RTNL();
+	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
+		list_del_init(&q->next);
+		RCU_INIT_POINTER(q->vlan, NULL);
+		if (q->enabled)
+			vlan->numvtaps--;
+		vlan->numqueues--;
+		/* Drop the reference taken in macvtap_set_queue(). */
+		sock_put(&q->sk);
+	}
+	/* Both counters must have reached zero once the list is empty. */
+	BUG_ON(vlan->numvtaps);
+	BUG_ON(vlan->numqueues);
+	/* guarantee that any future macvtap_set_queue will fail */
+	vlan->numvtaps = MAX_MACVTAP_QUEUES;
+}
+
+/*
+ * rx handler: queue an incoming frame on one of the device's tap queues.
+ *
+ * Returns RX_HANDLER_PASS when the device has no macvlan_dev or no queue
+ * could be selected; otherwise the skb is consumed (queued or dropped)
+ * and RX_HANDLER_CONSUMED is returned.
+ */
+rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	struct net_device *dev = skb->dev;
+	struct macvlan_dev *vlan;
+	struct macvtap_queue *q;
+	netdev_features_t features = TAP_FEATURES;
+
+	vlan = macvtap_get_vlan_rcu(dev);
+	if (!vlan)
+		return RX_HANDLER_PASS;
+
+	q = macvtap_get_queue(dev, skb);
+	if (!q)
+		return RX_HANDLER_PASS;
+
+	/* Early drop when the ring already looks full. */
+	if (__skb_array_full(&q->skb_array))
+		goto drop;
+
+	/* Expose the Ethernet header again; readers get the full frame. */
+	skb_push(skb, ETH_HLEN);
+
+	/* Apply the forward feature mask so that we perform segmentation
+	 * according to users wishes.  This only works if VNET_HDR is
+	 * enabled.
+	 */
+	if (q->flags & IFF_VNET_HDR)
+		features |= vlan->tap_features;
+	if (netif_needs_gso(skb, features)) {
+		struct sk_buff *segs = __skb_gso_segment(skb, features, false);
+
+		if (IS_ERR(segs))
+			goto drop;
+
+		/* No segment list was produced: queue the skb as is. */
+		if (!segs) {
+			if (skb_array_produce(&q->skb_array, skb))
+				goto drop;
+			goto wake_up;
+		}
+
+		/* The original skb is replaced by its segments. */
+		consume_skb(skb);
+		while (segs) {
+			struct sk_buff *nskb = segs->next;
+
+			segs->next = NULL;
+			/* Ring full: free this and all remaining segments. */
+			if (skb_array_produce(&q->skb_array, segs)) {
+				kfree_skb(segs);
+				kfree_skb_list(nskb);
+				break;
+			}
+			segs = nskb;
+		}
+	} else {
+		/* If we receive a partial checksum and the tap side
+		 * doesn't support checksum offload, compute the checksum.
+		 * Note: it doesn't matter which checksum feature to
+		 *	  check, we either support them all or none.
+		 */
+		if (skb->ip_summed == CHECKSUM_PARTIAL &&
+		    !(features & NETIF_F_CSUM_MASK) &&
+		    skb_checksum_help(skb))
+			goto drop;
+		if (skb_array_produce(&q->skb_array, skb))
+			goto drop;
+	}
+
+wake_up:
+	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
+	return RX_HANDLER_CONSUMED;
+
+drop:
+	/* Count errors/drops only here, thus don't care about args. */
+	macvlan_count_rx(vlan, 0, 0, 0);
+	kfree_skb(skb);
+	return RX_HANDLER_CONSUMED;
+}
+
+/*
+ * Allocate a minor number (>= 1) for @vlan and store it in vlan->minor.
+ * Returns 0 on success, -EINVAL when the minor space is exhausted, or the
+ * negative error reported by idr_alloc().
+ */
+int macvtap_get_minor(struct macvlan_dev *vlan)
+{
+	int id;
+
+	mutex_lock(&minor_lock);
+	id = idr_alloc(&minor_idr, vlan, 1, MACVTAP_NUM_DEVS, GFP_KERNEL);
+	if (id == -ENOSPC) {
+		netdev_err(vlan->dev, "Too many macvtap devices\n");
+		id = -EINVAL;
+	} else if (id >= 0) {
+		vlan->minor = id;
+		id = 0;
+	}
+	mutex_unlock(&minor_lock);
+
+	return id;
+}
+
+/* Release @vlan's minor, if it has one, and reset vlan->minor to 0. */
+void macvtap_free_minor(struct macvlan_dev *vlan)
+{
+	mutex_lock(&minor_lock);
+	if (vlan->minor != 0) {
+		idr_remove(&minor_idr, vlan->minor);
+		vlan->minor = 0;
+	}
+	mutex_unlock(&minor_lock);
+}
+
+/*
+ * Look up the net_device registered under @minor. On success the device
+ * is returned with a reference held (dev_hold); NULL if nothing is found.
+ */
+static struct net_device *dev_get_by_macvtap_minor(int minor)
+{
+	struct macvlan_dev *vlan;
+	struct net_device *found = NULL;
+
+	mutex_lock(&minor_lock);
+	vlan = idr_find(&minor_idr, minor);
+	if (vlan != NULL) {
+		found = vlan->dev;
+		dev_hold(found);
+	}
+	mutex_unlock(&minor_lock);
+
+	return found;
+}
+
+/*
+ * sk_write_space callback: wake up writers once the socket is writeable
+ * again and SOCKWQ_ASYNC_NOSPACE had been set.
+ */
+static void macvtap_sock_write_space(struct sock *sk)
+{
+	wait_queue_head_t *wqueue;
+
+	if (!sock_writeable(sk))
+		return;
+
+	/* Only wake up if somebody flagged that space ran out. */
+	if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
+		return;
+
+	wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
+}
+
+/* sk_destruct callback: release the skb_array embedded in the queue. */
+static void macvtap_sock_destruct(struct sock *sk)
+{
+	struct macvtap_queue *q;
+
+	q = container_of(sk, struct macvtap_queue, sk);
+	skb_array_cleanup(&q->skb_array);
+}
+
+/*
+ * open() handler for the macvtap character device.
+ *
+ * Looks up the device by the inode's minor, allocates a queue through
+ * sk_alloc(), initialises its socket and skb_array, and attaches it to
+ * the device with macvtap_set_queue().
+ *
+ * Returns 0 on success, -ENODEV if no device owns the minor, -ENOMEM on
+ * allocation failure, or the error from macvtap_set_queue().
+ */
+static int macvtap_open(struct inode *inode, struct file *file)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct net_device *dev;
+	struct macvtap_queue *q;
+	int err = -ENODEV;
+
+	rtnl_lock();
+	dev = dev_get_by_macvtap_minor(iminor(inode));
+	if (!dev)
+		goto err;
+
+	err = -ENOMEM;
+	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
+					     &macvtap_proto, 0);
+	if (!q)
+		goto err;
+
+	/*
+	 * Initialise the ring before sk_destruct is installed. If this
+	 * fails there is nothing for macvtap_sock_destruct() to clean up,
+	 * so release the bare sock directly.
+	 */
+	if (skb_array_init(&q->skb_array, dev->tx_queue_len, GFP_KERNEL)) {
+		sk_free(&q->sk);
+		goto err;
+	}
+
+	RCU_INIT_POINTER(q->sock.wq, &q->wq);
+	init_waitqueue_head(&q->wq.wait);
+	q->sock.type = SOCK_RAW;
+	q->sock.state = SS_CONNECTED;
+	q->sock.file = file;
+	q->sock.ops = &macvtap_socket_ops;
+	sock_init_data(&q->sock, &q->sk);
+	q->sk.sk_write_space = macvtap_sock_write_space;
+	q->sk.sk_destruct = macvtap_sock_destruct;
+	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
+	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+
+	/*
+	 * so far only KVM virtio_net uses macvtap, enable zero copy between
+	 * guest kernel and host kernel when lower device supports zerocopy
+	 *
+	 * The macvlan supports zerocopy iff the lower device supports zero
+	 * copy so we don't have to look at the lower device directly.
+	 */
+	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
+		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
+
+	err = macvtap_set_queue(dev, file, q);
+	if (err)
+		goto err_put;
+
+	dev_put(dev);
+
+	rtnl_unlock();
+	return err;
+
+err_put:
+	/*
+	 * Do not call skb_array_cleanup() here: dropping the last reference
+	 * runs macvtap_sock_destruct(), which already frees the ring. Doing
+	 * both would free it twice.
+	 */
+	sock_put(&q->sk);
+err:
+	if (dev)
+		dev_put(dev);
+
+	rtnl_unlock();
+	return err;
+}
+
+/* release() handler: give up the queue attached to @file. Always returns 0. */
+static int macvtap_release(struct inode *inode, struct file *file)
+{
+	macvtap_put_queue(file->private_data);
+
+	return 0;
+}
+
+/*
+ * poll() handler. Reports POLLERR when the file has no queue, readable
+ * when the skb_array holds data, and writable when the socket has space.
+ */
+static unsigned int macvtap_poll(struct file *file, poll_table * wait)
+{
+	struct macvtap_queue *q = file->private_data;
+	unsigned int events = 0;
+
+	if (!q)
+		return POLLERR;
+
+	poll_wait(file, &q->wq.wait, wait);
+
+	if (!skb_array_empty(&q->skb_array))
+		events |= POLLIN | POLLRDNORM;
+
+	/*
+	 * If not writeable, flag NOSPACE and re-check so a wakeup racing
+	 * with the flag being set is not lost.
+	 */
+	if (sock_writeable(&q->sk))
+		events |= POLLOUT | POLLWRNORM;
+	else if (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) &&
+		 sock_writeable(&q->sk))
+		events |= POLLOUT | POLLWRNORM;
+
+	return events;
+}
+
+/*
+ * Allocate an skb for @len bytes of payload with @prepad bytes of headroom.
+ * @linear bytes go into the linear area and the rest into paged data; the
+ * whole payload is made linear when it is small or @linear is zero.
+ * Returns NULL on failure with *@err set by sock_alloc_send_pskb().
+ */
+static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
+						size_t len, size_t linear,
+						int noblock, int *err)
+{
+	struct sk_buff *skb;
+	size_t paged;
+
+	/* Under a page?  Don't bother with paged skb. */
+	if (!linear || prepad + len < PAGE_SIZE)
+		linear = len;
+
+	paged = len - linear;
+	skb = sock_alloc_send_pskb(sk, prepad + linear, paged, noblock,
+				   err, 0);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, prepad);
+	skb_put(skb, linear);
+	/* Account for the paged part that skb_put() did not cover. */
+	skb->data_len = paged;
+	skb->len += paged;
+
+	return skb;
+}
+
+/* Neighbour code has some assumptions on HH_DATA_MOD alignment */
+#define MACVTAP_RESERVE HH_DATA_OFF(ETH_HLEN)
+
+/*
+ * Get packet from user space buffer
+ *
+ * Builds an skb from the data in @from and transmits it on the device
+ * attached to @q.
+ *
+ * @m may be NULL (write path). When it carries msg_control and the socket
+ * has SOCK_ZEROCOPY set, the payload pages are attached to the skb rather
+ * than copied, provided they fit into MAX_SKB_FRAGS.
+ *
+ * Returns the number of bytes originally in @from on success (also when
+ * no device is attached any more and the skb is simply freed), or a
+ * negative errno.
+ */
+static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
+				struct iov_iter *from, int noblock)
+{
+	int good_linear = SKB_MAX_HEAD(MACVTAP_RESERVE);
+	struct sk_buff *skb;
+	struct macvlan_dev *vlan;
+	unsigned long total_len = iov_iter_count(from);
+	unsigned long len = total_len;
+	int err;
+	struct virtio_net_hdr vnet_hdr = { 0 };
+	int vnet_hdr_len = 0;
+	int copylen = 0;
+	int depth;
+	bool zerocopy = false;
+	size_t linear;
+
+	/* Strip and validate the vnet header that precedes the frame. */
+	if (q->flags & IFF_VNET_HDR) {
+		vnet_hdr_len = q->vnet_hdr_sz;
+
+		err = -EINVAL;
+		if (len < vnet_hdr_len)
+			goto err;
+		len -= vnet_hdr_len;
+
+		err = -EFAULT;
+		if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
+			goto err;
+		/* Skip any header bytes beyond struct virtio_net_hdr. */
+		iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
+		/* Make hdr_len cover at least the 2-byte checksum field. */
+		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+		     macvtap16_to_cpu(q, vnet_hdr.csum_start) +
+		     macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
+			     macvtap16_to_cpu(q, vnet_hdr.hdr_len))
+			vnet_hdr.hdr_len = cpu_to_macvtap16(q,
+				 macvtap16_to_cpu(q, vnet_hdr.csum_start) +
+				 macvtap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
+		err = -EINVAL;
+		if (macvtap16_to_cpu(q, vnet_hdr.hdr_len) > len)
+			goto err;
+	}
+
+	err = -EINVAL;
+	if (unlikely(len < ETH_HLEN))
+		goto err;
+
+	/* Decide whether the payload can be attached without copying. */
+	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
+		struct iov_iter i;
+
+		copylen = vnet_hdr.hdr_len ?
+			macvtap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
+		if (copylen > good_linear)
+			copylen = good_linear;
+		else if (copylen < ETH_HLEN)
+			copylen = ETH_HLEN;
+		linear = copylen;
+		/* Probe on a copy of the iterator; @from is left untouched. */
+		i = *from;
+		iov_iter_advance(&i, copylen);
+		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
+			zerocopy = true;
+	}
+
+	/* Copy path: everything is copied, hdr_len sizes the linear area. */
+	if (!zerocopy) {
+		copylen = len;
+		linear = macvtap16_to_cpu(q, vnet_hdr.hdr_len);
+		if (linear > good_linear)
+			linear = good_linear;
+		else if (linear < ETH_HLEN)
+			linear = ETH_HLEN;
+	}
+
+	skb = macvtap_alloc_skb(&q->sk, MACVTAP_RESERVE, copylen,
+				linear, noblock, &err);
+	if (!skb)
+		goto err;
+
+	if (zerocopy)
+		err = zerocopy_sg_from_iter(skb, from);
+	else
+		err = skb_copy_datagram_from_iter(skb, 0, from, len);
+
+	if (err)
+		goto err_kfree;
+
+	skb_set_network_header(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+	skb->protocol = eth_hdr(skb)->h_proto;
+
+	if (vnet_hdr_len) {
+		err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
+					    macvtap_is_little_endian(q));
+		if (err)
+			goto err_kfree;
+	}
+
+	skb_probe_transport_header(skb, ETH_HLEN);
+
+	/* Move network header to the right position for VLAN tagged packets */
+	if ((skb->protocol == htons(ETH_P_8021Q) ||
+	     skb->protocol == htons(ETH_P_8021AD)) &&
+	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
+		skb_set_network_header(skb, depth);
+
+	rcu_read_lock();
+	vlan = rcu_dereference(q->vlan);
+	/* copy skb_ubuf_info for callback when skb has no error */
+	if (zerocopy) {
+		skb_shinfo(skb)->destructor_arg = m->msg_control;
+		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+	} else if (m && m->msg_control) {
+		/* Data was copied after all: complete the ubuf right away. */
+		struct ubuf_info *uarg = m->msg_control;
+		uarg->callback(uarg, false);
+	}
+
+	/* Transmit if a device is still attached, otherwise free the skb. */
+	if (vlan) {
+		skb->dev = vlan->dev;
+		dev_queue_xmit(skb);
+	} else {
+		kfree_skb(skb);
+	}
+	rcu_read_unlock();
+
+	return total_len;
+
+err_kfree:
+	kfree_skb(skb);
+
+err:
+	/* Account the failure as a tx drop on the device, if still attached. */
+	rcu_read_lock();
+	vlan = rcu_dereference(q->vlan);
+	if (vlan)
+		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
+	rcu_read_unlock();
+
+	return err;
+}
+
+/* write_iter handler: send one packet from @from; no msghdr, so no zerocopy. */
+static ssize_t macvtap_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	int noblock = file->f_flags & O_NONBLOCK;
+
+	return macvtap_get_user(file->private_data, NULL, from, noblock);
+}
+
+/*
+ * Put packet to the user space buffer
+ *
+ * Writes the optional vnet header followed by the frame in @skb to @iter.
+ * When the skb carries a VLAN tag out of band, the tag is inserted after
+ * the Ethernet addresses.
+ *
+ * Returns the full length (vnet header + frame + inserted tag) on success,
+ * or a negative errno from the header checks or copy helpers.
+ */
+static ssize_t macvtap_put_user(struct macvtap_queue *q,
+				const struct sk_buff *skb,
+				struct iov_iter *iter)
+{
+	int ret;
+	int vnet_hdr_len = 0;
+	int vlan_offset = 0;
+	int total;
+
+	if (q->flags & IFF_VNET_HDR) {
+		struct virtio_net_hdr vnet_hdr;
+		vnet_hdr_len = q->vnet_hdr_sz;
+		if (iov_iter_count(iter) < vnet_hdr_len)
+			return -EINVAL;
+
+		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
+					    macvtap_is_little_endian(q)))
+			BUG();
+
+		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
+		    sizeof(vnet_hdr))
+			return -EFAULT;
+
+		/* Skip the part of the header area beyond virtio_net_hdr. */
+		iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
+	}
+	total = vnet_hdr_len;
+	total += skb->len;
+
+	if (skb_vlan_tag_present(skb)) {
+		struct {
+			__be16 h_vlan_proto;
+			__be16 h_vlan_TCI;
+		} veth;
+		veth.h_vlan_proto = skb->vlan_proto;
+		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+
+		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
+		total += VLAN_HLEN;
+
+		/* First the bytes that precede the tag position... */
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
+			goto done;
+
+		/* ...then the tag itself.
+		 * NOTE(review): ret holds the byte count from copy_to_iter()
+		 * here, so when we bail out below the function returns that
+		 * count instead of total - confirm this is intended.
+		 */
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret != sizeof(veth) || !iov_iter_count(iter))
+			goto done;
+	}
+
+	/* Remainder of the frame (the whole frame when there is no tag). */
+	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
+				     skb->len - vlan_offset);
+
+done:
+	return ret ? ret : total;
+}
+
+/*
+ * Dequeue one skb from @q and copy it to @to.
+ *
+ * With @noblock set, returns -EAGAIN when the ring is empty; otherwise
+ * sleeps interruptibly until a packet arrives and returns -ERESTARTSYS if
+ * a signal is pending. Returns 0 for an empty destination iterator and
+ * otherwise the result of macvtap_put_user().
+ */
+static ssize_t macvtap_do_read(struct macvtap_queue *q,
+			       struct iov_iter *to,
+			       int noblock)
+{
+	DEFINE_WAIT(wait);
+	struct sk_buff *skb;
+	ssize_t ret = 0;
+
+	if (!iov_iter_count(to))
+		return 0;
+
+	while (1) {
+		/* Queue ourselves before checking the ring to avoid a lost wakeup. */
+		if (!noblock)
+			prepare_to_wait(sk_sleep(&q->sk), &wait,
+					TASK_INTERRUPTIBLE);
+
+		/* Read frames from the queue */
+		skb = skb_array_consume(&q->skb_array);
+		if (skb)
+			break;
+		if (noblock) {
+			ret = -EAGAIN;
+			break;
+		}
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		/* Nothing to read, let's sleep */
+		schedule();
+	}
+	if (!noblock)
+		finish_wait(sk_sleep(&q->sk), &wait);
+
+	if (skb) {
+		ret = macvtap_put_user(q, skb, to);
+		/* kfree_skb() on error, consume_skb() on success. */
+		if (unlikely(ret < 0))
+			kfree_skb(skb);
+		else
+			consume_skb(skb);
+	}
+	return ret;
+}
+
+/*
+ * read_iter handler: read one packet into @to. The return value is capped
+ * at the size of the destination; errors are passed through unchanged.
+ */
+static ssize_t macvtap_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	struct macvtap_queue *q = file->private_data;
+	ssize_t room = iov_iter_count(to);
+	ssize_t done;
+
+	done = macvtap_do_read(q, to, file->f_flags & O_NONBLOCK);
+	if (done > room)
+		done = room;
+	if (done > 0)
+		iocb->ki_pos = done;
+
+	return done;
+}
+
+/*
+ * Return the macvlan_dev attached to @q with a reference held on its
+ * net_device, or NULL if the queue is detached. Caller holds RTNL.
+ */
+static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q)
+{
+	struct macvlan_dev *vlan;
+
+	ASSERT_RTNL();
+
+	vlan = rtnl_dereference(q->vlan);
+	if (!vlan)
+		return NULL;
+
+	dev_hold(vlan->dev);
+	return vlan;
+}
+
+/* Drop the device reference taken by macvtap_get_vlan(). */
+static void macvtap_put_vlan(struct macvlan_dev *vlan)
+{
+	struct net_device *dev = vlan->dev;
+
+	dev_put(dev);
+}
+
+/*
+ * TUNSETQUEUE backend: enable (IFF_ATTACH_QUEUE) or disable
+ * (IFF_DETACH_QUEUE) the queue behind @file. Returns -EINVAL when the
+ * queue is detached from its device or neither flag is given.
+ */
+static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags)
+{
+	struct macvtap_queue *q = file->private_data;
+	struct macvlan_dev *vlan = macvtap_get_vlan(q);
+	int ret = -EINVAL;
+
+	if (!vlan)
+		return -EINVAL;
+
+	/* ATTACH takes precedence if both flags happen to be set. */
+	if (flags & IFF_ATTACH_QUEUE)
+		ret = macvtap_enable_queue(vlan->dev, file, q);
+	else if (flags & IFF_DETACH_QUEUE)
+		ret = macvtap_disable_queue(q);
+
+	macvtap_put_vlan(vlan);
+	return ret;
+}
+
+/*
+ * TUNSETOFFLOAD backend: translate the TUN_F_* bits in @arg into netdev
+ * features and apply them to the device behind @q.
+ * Returns 0, or -ENOLINK when the queue has no device.
+ */
+static int set_offload(struct macvtap_queue *q, unsigned long arg)
+{
+	struct macvlan_dev *vlan = rtnl_dereference(q->vlan);
+	netdev_features_t features;
+	netdev_features_t feature_mask = 0;
+
+	if (!vlan)
+		return -ENOLINK;
+
+	features = vlan->dev->features;
+
+	/* Every other offload is only honoured together with TUN_F_CSUM. */
+	if (arg & TUN_F_CSUM) {
+		feature_mask = NETIF_F_HW_CSUM;
+
+		/* ECN only counts when at least one TSO flavour is requested. */
+		if ((arg & TUN_F_TSO_ECN) && (arg & (TUN_F_TSO4 | TUN_F_TSO6)))
+			feature_mask |= NETIF_F_TSO_ECN;
+		if (arg & TUN_F_TSO4)
+			feature_mask |= NETIF_F_TSO;
+		if (arg & TUN_F_TSO6)
+			feature_mask |= NETIF_F_TSO6;
+		if (arg & TUN_F_UFO)
+			feature_mask |= NETIF_F_UFO;
+	}
+
+	/* tun/tap driver inverts the usage for TSO offloads, where
+	 * setting the TSO bit means that the userspace wants to
+	 * accept TSO frames and turning it off means that user space
+	 * does not support TSO.
+	 * For macvtap, we have to invert it to mean the same thing.
+	 * When user space turns off TSO, we turn off GSO/LRO so that
+	 * user-space will not receive TSO frames.
+	 */
+	if (!(feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO)))
+		features &= ~RX_OFFLOADS;
+	else
+		features |= RX_OFFLOADS;
+
+	/* tap_features are the same as features on tun/tap and
+	 * reflect user expectations.
+	 */
+	vlan->tap_features = feature_mask;
+	vlan->set_features = features;
+	netdev_update_features(vlan->dev);
+
+	return 0;
+}
+
+/*
+ * provide compatibility with generic tun/tap interface
+ *
+ * Handles the TUN* ioctls and SIOC[GS]IFHWADDR on a macvtap file.
+ * Commands that touch the attached device take RTNL and return -ENOLINK
+ * when the queue has no device. Unknown commands return -EINVAL.
+ */
+static long macvtap_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct macvtap_queue *q = file->private_data;
+	struct macvlan_dev *vlan;
+	/* The same user pointer viewed through the types each command needs. */
+	void __user *argp = (void __user *)arg;
+	struct ifreq __user *ifr = argp;
+	unsigned int __user *up = argp;
+	unsigned short u;
+	int __user *sp = argp;
+	struct sockaddr sa;
+	int s;
+	int ret;
+
+	switch (cmd) {
+	case TUNSETIFF:
+		/* ignore the name, just look at flags */
+		if (get_user(u, &ifr->ifr_flags))
+			return -EFAULT;
+
+		ret = 0;
+		/* Apart from the optional features, exactly NO_PI|TAP is accepted. */
+		if ((u & ~MACVTAP_FEATURES) != (IFF_NO_PI | IFF_TAP))
+			ret = -EINVAL;
+		else
+			q->flags = (q->flags & ~MACVTAP_FEATURES) | u;
+
+		return ret;
+
+	case TUNGETIFF:
+		rtnl_lock();
+		vlan = macvtap_get_vlan(q);
+		if (!vlan) {
+			rtnl_unlock();
+			return -ENOLINK;
+		}
+
+		ret = 0;
+		u = q->flags;
+		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
+		    put_user(u, &ifr->ifr_flags))
+			ret = -EFAULT;
+		macvtap_put_vlan(vlan);
+		rtnl_unlock();
+		return ret;
+
+	case TUNSETQUEUE:
+		if (get_user(u, &ifr->ifr_flags))
+			return -EFAULT;
+		rtnl_lock();
+		ret = macvtap_ioctl_set_queue(file, u);
+		rtnl_unlock();
+		return ret;
+
+	case TUNGETFEATURES:
+		if (put_user(IFF_TAP | IFF_NO_PI | MACVTAP_FEATURES, up))
+			return -EFAULT;
+		return 0;
+
+	case TUNSETSNDBUF:
+		if (get_user(s, sp))
+			return -EFAULT;
+
+		q->sk.sk_sndbuf = s;
+		return 0;
+
+	case TUNGETVNETHDRSZ:
+		s = q->vnet_hdr_sz;
+		if (put_user(s, sp))
+			return -EFAULT;
+		return 0;
+
+	case TUNSETVNETHDRSZ:
+		if (get_user(s, sp))
+			return -EFAULT;
+		/* The header area must at least hold struct virtio_net_hdr. */
+		if (s < (int)sizeof(struct virtio_net_hdr))
+			return -EINVAL;
+
+		q->vnet_hdr_sz = s;
+		return 0;
+
+	case TUNGETVNETLE:
+		s = !!(q->flags & MACVTAP_VNET_LE);
+		if (put_user(s, sp))
+			return -EFAULT;
+		return 0;
+
+	case TUNSETVNETLE:
+		if (get_user(s, sp))
+			return -EFAULT;
+		if (s)
+			q->flags |= MACVTAP_VNET_LE;
+		else
+			q->flags &= ~MACVTAP_VNET_LE;
+		return 0;
+
+	case TUNGETVNETBE:
+		return macvtap_get_vnet_be(q, sp);
+
+	case TUNSETVNETBE:
+		return macvtap_set_vnet_be(q, sp);
+
+	case TUNSETOFFLOAD:
+		/* let the user check for future flags */
+		if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+			    TUN_F_TSO_ECN | TUN_F_UFO))
+			return -EINVAL;
+
+		rtnl_lock();
+		ret = set_offload(q, arg);
+		rtnl_unlock();
+		return ret;
+
+	case SIOCGIFHWADDR:
+		rtnl_lock();
+		vlan = macvtap_get_vlan(q);
+		if (!vlan) {
+			rtnl_unlock();
+			return -ENOLINK;
+		}
+		ret = 0;
+		u = vlan->dev->type;
+		if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
+		    copy_to_user(&ifr->ifr_hwaddr.sa_data, vlan->dev->dev_addr, ETH_ALEN) ||
+		    put_user(u, &ifr->ifr_hwaddr.sa_family))
+			ret = -EFAULT;
+		macvtap_put_vlan(vlan);
+		rtnl_unlock();
+		return ret;
+
+	case SIOCSIFHWADDR:
+		if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa)))
+			return -EFAULT;
+		rtnl_lock();
+		vlan = macvtap_get_vlan(q);
+		if (!vlan) {
+			rtnl_unlock();
+			return -ENOLINK;
+		}
+		ret = dev_set_mac_address(vlan->dev, &sa);
+		macvtap_put_vlan(vlan);
+		rtnl_unlock();
+		return ret;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/* 32-bit compat entry point: widen the user pointer, then use the native path. */
+static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
+				 unsigned long arg)
+{
+	unsigned long native_arg = (unsigned long)compat_ptr(arg);
+
+	return macvtap_ioctl(file, cmd, native_arg);
+}
+#endif
+
+/*
+ * File operations of the macvtap character device. Not static: the
+ * object is referenced from macvtap.c, and macvtap_get_socket() compares
+ * file->f_op against it to recognise macvtap files.
+ */
+const struct file_operations macvtap_fops = {
+	.owner		= THIS_MODULE,
+	.open		= macvtap_open,
+	.release	= macvtap_release,
+	.read_iter	= macvtap_read_iter,
+	.write_iter	= macvtap_write_iter,
+	.poll		= macvtap_poll,
+	.llseek		= no_llseek,
+	.unlocked_ioctl	= macvtap_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= macvtap_compat_ioctl,
+#endif
+};
+
+/* proto_ops sendmsg: transmit the packet described by @m on the queue. */
+static int macvtap_sendmsg(struct socket *sock, struct msghdr *m,
+			   size_t total_len)
+{
+	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
+	int noblock = m->msg_flags & MSG_DONTWAIT;
+
+	return macvtap_get_user(q, m, &m->msg_iter, noblock);
+}
+
+/*
+ * proto_ops recvmsg: read one packet from the queue into @m.
+ *
+ * Only MSG_DONTWAIT and MSG_TRUNC are accepted in @flags (-EINVAL
+ * otherwise). If the packet is longer than @total_len, MSG_TRUNC is set
+ * in m->msg_flags and the return value is the full packet length when the
+ * caller passed MSG_TRUNC, @total_len otherwise. Errors from
+ * macvtap_do_read() are returned unchanged.
+ */
+static int macvtap_recvmsg(struct socket *sock, struct msghdr *m,
+			   size_t total_len, int flags)
+{
+	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
+	int ret;
+	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
+		return -EINVAL;
+	ret = macvtap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT);
+	/*
+	 * Compare only positive lengths: in "ret > total_len" a negative
+	 * errno would be converted to a huge unsigned value and be mistaken
+	 * for a truncated packet.
+	 */
+	if (ret > 0 && (size_t)ret > total_len) {
+		m->msg_flags |= MSG_TRUNC;
+		ret = flags & MSG_TRUNC ? ret : total_len;
+	}
+	return ret;
+}
+
+/* proto_ops peek_len: report skb_array_peek_len() for the queue's ring. */
+static int macvtap_peek_len(struct socket *sock)
+{
+	struct macvtap_queue *q;
+
+	q = container_of(sock, struct macvtap_queue, sock);
+	return skb_array_peek_len(&q->skb_array);
+}
+
+/*
+ * Ops structure to mimic raw sockets with tun. Installed as q->sock.ops
+ * in macvtap_open(); only the three operations below are provided.
+ */
+static const struct proto_ops macvtap_socket_ops = {
+	.sendmsg = macvtap_sendmsg,
+	.recvmsg = macvtap_recvmsg,
+	.peek_len = macvtap_peek_len,
+};
+
+/* Get an underlying socket object from a macvtap file.  Returns an error
+ * pointer unless the file is a macvtap file with a queue attached.  The
+ * returned object works like a packet socket, it can be used for
+ * sock_sendmsg/sock_recvmsg.  The caller is responsible for holding a
+ * reference to the file for as long as the socket is in use. */
+struct socket *macvtap_get_socket(struct file *file)
+{
+	struct macvtap_queue *q;
+
+	/* Only files opened through macvtap_fops carry a macvtap queue. */
+	if (file->f_op == &macvtap_fops) {
+		q = file->private_data;
+		if (q)
+			return &q->sock;
+		return ERR_PTR(-EBADFD);
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(macvtap_get_socket);
+
+/*
+ * Resize the skb_array of every queue on @vlan to the device's current
+ * tx_queue_len.
+ *
+ * Returns -ENOMEM if the temporary pointer table cannot be allocated,
+ * otherwise the result of skb_array_resize_multiple().
+ */
+int macvtap_queue_resize(struct macvlan_dev *vlan)
+{
+	struct net_device *dev = vlan->dev;
+	struct macvtap_queue *q;
+	struct skb_array **arrays;
+	int n = vlan->numqueues;
+	int ret, i = 0;
+
+	/* kmalloc_array() checks the count * size multiplication for overflow. */
+	arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
+	if (!arrays)
+		return -ENOMEM;
+
+	/* Collect the rings of all queues attached to the device. */
+	list_for_each_entry(q, &vlan->queue_list, next)
+		arrays[i++] = &q->skb_array;
+
+	ret = skb_array_resize_multiple(arrays, n,
+					dev->tx_queue_len, GFP_KERNEL);
+
+	kfree(arrays);
+	return ret;
+}
diff --git a/include/linux/if_macvtap.h b/include/linux/if_macvtap.h
new file mode 100644
index 0000000..c9bf84b
--- /dev/null
+++ b/include/linux/if_macvtap.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_IF_MACVTAP_H_
+#define _LINUX_IF_MACVTAP_H_
+
+rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb);
+void macvtap_del_queues(struct net_device *dev);
+int macvtap_get_minor(struct macvlan_dev *vlan);
+void macvtap_free_minor(struct macvlan_dev *vlan);
+int macvtap_queue_resize(struct macvlan_dev *vlan);
+
+#endif /*_LINUX_IF_MACVTAP_H_*/
-- 
2.7.4

^ permalink raw reply related

* [PATCHv1 0/7] Refactor macvtap to re-use tap functionality by other virtual interfaces
From: Sainath Grandhi @ 2017-01-06 22:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, mahesh, linux-kernel, Sainath Grandhi

Tap character devices can be implemented on other virtual interfaces like
ipvlan, similar to macvtap. Source code for tap functionality in macvtap 
can be re-used for this purpose.

This patch series splits macvtap source into two modules, macvtap and tap.
This patch series also includes a patch for implementing tap character 
device driver based on the IP-VLAN network interface, called ipvtap.

Sainath Grandhi (7):
  TAP: Refactoring macvtap.c
  TAP: Renaming tap related APIs, data structures,  macros
  TAP: Tap character device creation/destroy API
  TAP: Abstract type of virtual interface from tap implementation
  TAP: Extending tap device create/destroy APIs
  TAP: tap as an independent module
  IPVTAP: IP-VLAN based tap driver

 drivers/net/Kconfig              |   26 +
 drivers/net/Makefile             |    2 +
 drivers/net/ipvlan/Makefile      |    1 +
 drivers/net/ipvlan/ipvlan.h      |    7 +
 drivers/net/ipvlan/ipvlan_core.c |    5 +-
 drivers/net/ipvlan/ipvlan_main.c |   37 +-
 drivers/net/ipvlan/ipvtap.c      |  238 +++++++
 drivers/net/macvlan.c            |    2 +-
 drivers/net/macvtap.c            | 1227 ++----------------------------------
 drivers/net/tap.c                | 1276 ++++++++++++++++++++++++++++++++++++++
 drivers/vhost/net.c              |    3 +-
 include/linux/if_macvlan.h       |    4 +-
 include/linux/if_tap.h           |   63 ++
 13 files changed, 1691 insertions(+), 1200 deletions(-)
 create mode 100644 drivers/net/ipvlan/ipvtap.c
 create mode 100644 drivers/net/tap.c
 create mode 100644 include/linux/if_tap.h

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH net-next] net: dsa: move HWMON support to its own file
From: Florian Fainelli @ 2017-01-06 22:24 UTC (permalink / raw)
  To: Vivien Didelot, netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn
In-Reply-To: <20170106214200.15633-1-vivien.didelot@savoirfairelinux.com>

On 01/06/2017 01:42 PM, Vivien Didelot wrote:
> Isolate the HWMON support in DSA in its own file. Currently only the
> legacy DSA code is concerned.
> 
> Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [PATCH v2 00/12] net: ethernet: aquantia: Add AQtion 2.5/5 GB NIC driver
From: David VomLehn @ 2017-01-06 22:23 UTC (permalink / raw)
  To: David Miller, Alexander.Loktionov
  Cc: netdev, Simon.Edelhaus, Dmitrii.Tarakanov, Pavel.Belous
In-Reply-To: <20170106.160206.1149383357421277805.davem@davemloft.net>

On 01/06/2017 01:02 PM, David Miller wrote:
> From: Alexander Loktionov <Alexander.Loktionov@aquantia.com>
> Date: Fri,  6 Jan 2017 00:06:01 -0800
>
>> This series introduced the AQtion NIC driver for the aQuantia
>> AQC107/AQC108 network devices.
>>
>> v1: Initial version
>> v2: o Make necessary drivers/net/ethernet changes to integrate software
>>      o Drop intermediate atlantic directory
>>      o Remove Makefile things only appropriate to out of tree module building
> Every patch series must be fully bisectable, this means that at each step
> of the series, the kernel tree must fully build and work properly.
>
> You break that already at the first patch, which makes the Kconfig options
> available, which if enabled will cause a build failure.
>
> make[4]: *** No rule to make target 'drivers/net/ethernet/aquantia/aq_main.o', needed by 'drivers/net/ethernet/aquantia/atlantic.o'.  Stop.
> make[4]: *** Waiting for unfinished jobs....
> scripts/Makefile.build:551: recipe for target 'drivers/net/ethernet/aquantia' failed
> make[3]: *** [drivers/net/ethernet/aquantia] Error 2
> make[3]: *** Waiting for unfinished jobs....
> scripts/Makefile.build:551: recipe for target 'drivers/net/ethernet' failed
> make[2]: *** [drivers/net/ethernet] Error 2
> scripts/Makefile.build:551: recipe for target 'drivers/net' failed
> make[1]: *** [drivers/net] Error 2
> make[1]: *** Waiting for unfinished jobs....
> Makefile:988: recipe for target 'drivers' failed
> make: *** [drivers] Error 2
>
> The way to do this is to add the pieces of source code infrastructure,
> one piece at a time.  And then at the very very end, enable the code
> into the build.

Thanks, will fix shortly.

-- 
David VL

^ permalink raw reply

* Re: [PATCH net-next 1/2] net: dsa: make "label" property optional for dsa2
From: Andrew Lunn @ 2017-01-06 22:20 UTC (permalink / raw)
  To: Vivien Didelot
  Cc: netdev, linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Uwe Kleine-König
In-Reply-To: <20170106220043.21280-2-vivien.didelot@savoirfairelinux.com>

> If one wants to rename an interface, udev rules can be used as usual.

Hi Vivien

Do you have some examples?

A quick look at udevadm info suggests we can use

ATTR{phys_port_id} and ATTR{phys_switch_id}

Humm, it would be nice to know why the second switch has a
phys_switch_id of 01000000.

How is systemd naming them without udev rules?

    Andrew

^ permalink raw reply

* Re: [net-next PATCH] net: reduce cycles spend on ICMP replies that gets rate limited
From: Eric Dumazet @ 2017-01-06 22:08 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: netdev
In-Reply-To: <1483731621.9712.31.camel@edumazet-glaptop3.roam.corp.google.com>

On Fri, 2017-01-06 at 11:40 -0800, Eric Dumazet wrote:
> On Fri, 2017-01-06 at 18:39 +0100, Jesper Dangaard Brouer wrote:
> 
> 
> > @@ -648,13 +668,17 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
> >  		}
> >  	}
> >  
> > -	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
> > -	if (!icmp_param)
> > -		return;
> > -
> >  	sk = icmp_xmit_lock(net);
> >  	if (!sk)
> > -		goto out_free;
> > +		goto out;
> > +
> > +	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
> > +	if (!icmpv4_global_allow(net, type, code))
> > +		goto out_unlock;
> > +
> > +	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
> > +	if (!icmp_param)
> > +		goto out_unlock;
> 

You could call icmp_xmit_lock() _after_ checking global limit perhaps. 

That would remove one atomic op.

if (!icmpv4_global_allow(net, type, code))
    goto out;

sk = icmp_xmit_lock(net);
if (!sk)
    goto out;

^ permalink raw reply

* [PATCH net-next 0/2] net: dsa: make "label" property optional
From: Vivien Didelot @ 2017-01-06 22:00 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Uwe Kleine-König, Vivien Didelot

Patch 1/2 makes the "label" property in new DSA bindings optional. This
doesn't change the current behavior with existing DTS files.

As Linux considers the Ethernet switch ports as normal NICs by default,
not providing a "label" property for user ports results in using the
standard "ethX" network device name. Giving a "label" overwrites this.

Patch 2/2 removes the labels for the ZII Rev B board as an example.

Vivien Didelot (2):
  net: dsa: make "label" property optional for dsa2
  arm: dts: vf610-zii-dev-rev-b: remove ports label

 Documentation/devicetree/bindings/net/dsa/dsa.txt | 20 ++++++++-----------
 arch/arm/boot/dts/vf610-zii-dev-rev-b.dts         | 16 ---------------
 net/dsa/dsa2.c                                    | 24 ++++-------------------
 3 files changed, 12 insertions(+), 48 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH net-next 2/2] arm: dts: vf610-zii-dev-rev-b: remove ports label
From: Vivien Didelot @ 2017-01-06 22:00 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Uwe Kleine-König, Vivien Didelot
In-Reply-To: <20170106220043.21280-1-vivien.didelot@savoirfairelinux.com>

Now that the "label" property is optional for Ethernet switch ports,
remove them in the ZII Dev Rev B board DTS.

On a Rev B board, once eth1 is up, this DTS now exposes to userspace:

    # ip link | grep ': ' | cut -d: -f2
     lo
     eth0
     eth1
     eth2@eth1
     eth3@eth1
     eth4@eth1
     eth5@eth1
     eth6@eth1
     eth7@eth1
     eth8@eth1
     eth9@eth1
     eth10@eth1
     eth11@eth1
     eth12@eth1

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 arch/arm/boot/dts/vf610-zii-dev-rev-b.dts | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts b/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts
index 7ea617e47fe4..f9c8810aed7c 100644
--- a/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts
+++ b/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts
@@ -104,25 +104,21 @@
 					#size-cells = <0>;
 					port@0 {
 						reg = <0>;
-						label = "lan0";
 						phy-handle = <&switch0phy0>;
 					};
 
 					port@1 {
 						reg = <1>;
-						label = "lan1";
 						phy-handle = <&switch0phy1>;
 					};
 
 					port@2 {
 						reg = <2>;
-						label = "lan2";
 						phy-handle = <&switch0phy2>;
 					};
 
 					switch0port5: port@5 {
 						reg = <5>;
-						label = "dsa";
 						phy-mode = "rgmii-txid";
 						link = <&switch1port6
 							&switch2port9>;
@@ -134,7 +130,6 @@
 
 					port@6 {
 						reg = <6>;
-						label = "cpu";
 						ethernet = <&fec1>;
 						fixed-link {
 							speed = <100>;
@@ -186,25 +181,21 @@
 					#size-cells = <0>;
 					port@0 {
 						reg = <0>;
-						label = "lan3";
 						phy-handle = <&switch1phy0>;
 					};
 
 					port@1 {
 						reg = <1>;
-						label = "lan4";
 						phy-handle = <&switch1phy1>;
 					};
 
 					port@2 {
 						reg = <2>;
-						label = "lan5";
 						phy-handle = <&switch1phy2>;
 					};
 
 					switch1port5: port@5 {
 						reg = <5>;
-						label = "dsa";
 						link = <&switch2port9>;
 						phy-mode = "rgmii-txid";
 						fixed-link {
@@ -215,7 +206,6 @@
 
 					switch1port6: port@6 {
 						reg = <6>;
-						label = "dsa";
 						phy-mode = "rgmii-txid";
 						link = <&switch0port5>;
 						fixed-link {
@@ -263,22 +253,18 @@
 					#size-cells = <0>;
 					port@0 {
 						reg = <0>;
-						label = "lan6";
 					};
 
 					port@1 {
 						reg = <1>;
-						label = "lan7";
 					};
 
 					port@2 {
 						reg = <2>;
-						label = "lan8";
 					};
 
 					port@3 {
 						reg = <3>;
-						label = "optical3";
 						fixed-link {
 							speed = <1000>;
 							full-duplex;
@@ -289,7 +275,6 @@
 
 					port@4 {
 						reg = <4>;
-						label = "optical4";
 						fixed-link {
 							speed = <1000>;
 							full-duplex;
@@ -300,7 +285,6 @@
 
 					switch2port9: port@9 {
 						reg = <9>;
-						label = "dsa";
 						phy-mode = "rgmii-txid";
 						link = <&switch1port5
 							&switch0port5>;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/2] net: dsa: make "label" property optional for dsa2
From: Vivien Didelot @ 2017-01-06 22:00 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Uwe Kleine-König, Vivien Didelot
In-Reply-To: <20170106220043.21280-1-vivien.didelot@savoirfairelinux.com>

In the new DTS bindings for DSA (dsa2), the "ethernet" and "link"
phandles are respectively mandatory and exclusive to CPU port and DSA
link device tree nodes.

Simplify dsa2.c a bit by checking the presence of such phandle instead
of checking the redundant "label" property.

Then the Linux philosophy for Ethernet switch ports is to expose them to
userspace as standard NICs by default. Thus use the standard enumerated
"eth%d" device name if no "label" property is provided for a user port.
This allows to save DTS files from subjective net device names.

If one wants to rename an interface, udev rules can be used as usual.
The sysfs phys_port_id and phys_switch_id also provide physical data.

Of course the current behavior is unchanged, and the optional "label"
property for user ports has precedence over the enumerated name.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 Documentation/devicetree/bindings/net/dsa/dsa.txt | 20 ++++++++-----------
 net/dsa/dsa2.c                                    | 24 ++++-------------------
 2 files changed, 12 insertions(+), 32 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt
index a4a570fb2494..cfe8f64eca4f 100644
--- a/Documentation/devicetree/bindings/net/dsa/dsa.txt
+++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt
@@ -34,13 +34,9 @@ Required properties:
 
 Each port children node must have the following mandatory properties:
 - reg			: Describes the port address in the switch
-- label			: Describes the label associated with this port, which
-                          will become the netdev name. Special labels are
-			  "cpu" to indicate a CPU port and "dsa" to
-			  indicate an uplink/downlink port between switches in
-			  the cluster.
 
-A port labelled "dsa" has the following mandatory property:
+An uplink/downlink port between switches in the cluster has the following
+mandatory property:
 
 - link			: Should be a list of phandles to other switch's DSA
 			  port. This port is used as the outgoing port
@@ -48,12 +44,17 @@ A port labelled "dsa" has the following mandatory property:
 			  information must be given, not just the one hop
 			  routes to neighbouring switches.
 
-A port labelled "cpu" has the following mandatory property:
+A CPU port has the following mandatory property:
 
 - ethernet		: Should be a phandle to a valid Ethernet device node.
                           This host device is what the switch port is
 			  connected to.
 
+A user port has the following optional property:
+
+- label			: Describes the label associated with this port, which
+                          will become the netdev name.
+
 Port child nodes may also contain the following optional standardised
 properties, described in binding documents:
 
@@ -107,7 +108,6 @@ linked into one DSA cluster.
 
 			switch0port5: port@5 {
 				reg = <5>;
-				label = "dsa";
 				phy-mode = "rgmii-txid";
 				link = <&switch1port6
 					&switch2port9>;
@@ -119,7 +119,6 @@ linked into one DSA cluster.
 
 			port@6 {
 				reg = <6>;
-				label = "cpu";
 				ethernet = <&fec1>;
 				fixed-link {
 					speed = <100>;
@@ -165,7 +164,6 @@ linked into one DSA cluster.
 
 			switch1port5: port@5 {
 				reg = <5>;
-				label = "dsa";
 				link = <&switch2port9>;
 				phy-mode = "rgmii-txid";
 				fixed-link {
@@ -176,7 +174,6 @@ linked into one DSA cluster.
 
 			switch1port6: port@6 {
 				reg = <6>;
-				label = "dsa";
 				phy-mode = "rgmii-txid";
 				link = <&switch0port5>;
 				fixed-link {
@@ -255,7 +252,6 @@ linked into one DSA cluster.
 
 			switch2port9: port@9 {
 				reg = <9>;
-				label = "dsa";
 				phy-mode = "rgmii-txid";
 				link = <&switch1port5
 					&switch0port5>;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index bad119cee2a3..9526bdf2a34a 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -81,30 +81,12 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
 
 static bool dsa_port_is_dsa(struct device_node *port)
 {
-	const char *name;
-
-	name = of_get_property(port, "label", NULL);
-	if (!name)
-		return false;
-
-	if (!strcmp(name, "dsa"))
-		return true;
-
-	return false;
+	return !!of_parse_phandle(port, "link", 0);
 }
 
 static bool dsa_port_is_cpu(struct device_node *port)
 {
-	const char *name;
-
-	name = of_get_property(port, "label", NULL);
-	if (!name)
-		return false;
-
-	if (!strcmp(name, "cpu"))
-		return true;
-
-	return false;
+	return !!of_parse_phandle(port, "ethernet", 0);
 }
 
 static bool dsa_ds_find_port(struct dsa_switch *ds,
@@ -268,6 +250,8 @@ static int dsa_user_port_apply(struct device_node *port, u32 index,
 	int err;
 
 	name = of_get_property(port, "label", NULL);
+	if (!name)
+		name = "eth%d";
 
 	err = dsa_slave_create(ds, ds->dev, index, name);
 	if (err) {
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2] PCI: lock each enable/disable num_vfs operation in sysfs
From: Emil Tantilov @ 2017-01-06 21:59 UTC (permalink / raw)
  To: linux-pci, intel-wired-lan; +Cc: alexander.h.duyck, netdev, linux-kernel

Enabling/disabling SRIOV via sysfs by echo-ing multiple values
simultaneously:

echo 63 > /sys/class/net/ethX/device/sriov_numvfs&
echo 63 > /sys/class/net/ethX/device/sriov_numvfs

sleep 5

echo 0 > /sys/class/net/ethX/device/sriov_numvfs&
echo 0 > /sys/class/net/ethX/device/sriov_numvfs

Results in the following bug:

kernel BUG at drivers/pci/iov.c:495!
invalid opcode: 0000 [#1] SMP
CPU: 1 PID: 8050 Comm: bash Tainted: G   W   4.9.0-rc7-net-next #2092
RIP: 0010:[<ffffffff813b1647>]
	  [<ffffffff813b1647>] pci_iov_release+0x57/0x60

Call Trace:
 [<ffffffff81391726>] pci_release_dev+0x26/0x70
 [<ffffffff8155be6e>] device_release+0x3e/0xb0
 [<ffffffff81365ee7>] kobject_cleanup+0x67/0x180
 [<ffffffff81365d9d>] kobject_put+0x2d/0x60
 [<ffffffff8155bc27>] put_device+0x17/0x20
 [<ffffffff8139c08a>] pci_dev_put+0x1a/0x20
 [<ffffffff8139cb6b>] pci_get_dev_by_id+0x5b/0x90
 [<ffffffff8139cca5>] pci_get_subsys+0x35/0x40
 [<ffffffff8139ccc8>] pci_get_device+0x18/0x20
 [<ffffffff8139ccfb>] pci_get_domain_bus_and_slot+0x2b/0x60
 [<ffffffff813b09e7>] pci_iov_remove_virtfn+0x57/0x180
 [<ffffffff813b0b95>] pci_disable_sriov+0x65/0x140
 [<ffffffffa00a1af7>] ixgbe_disable_sriov+0xc7/0x1d0 [ixgbe]
 [<ffffffffa00a1e9d>] ixgbe_pci_sriov_configure+0x3d/0x170 [ixgbe]
 [<ffffffff8139d28c>] sriov_numvfs_store+0xdc/0x130
...
RIP  [<ffffffff813b1647>] pci_iov_release+0x57/0x60

Use the existing mutex lock to protect each enable/disable operation.

-v2: move the existing lock from protecting the config of the IOV bus
to protecting the writes to sriov_numvfs in sysfs without maintaining
a "locked" version of pci_iov_add/remove_virtfn().
As suggested by Gavin Shan <gwshan@linux.vnet.ibm.com>

CC: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
---
 drivers/pci/iov.c       |    7 -------
 drivers/pci/pci-sysfs.c |   23 ++++++++++++++++-------
 drivers/pci/pci.h       |    2 +-
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 4722782..2479ae8 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -124,7 +124,6 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset)
 	struct pci_sriov *iov = dev->sriov;
 	struct pci_bus *bus;
 
-	mutex_lock(&iov->dev->sriov->lock);
 	bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
 	if (!bus)
 		goto failed;
@@ -162,7 +161,6 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset)
 		__pci_reset_function(virtfn);
 
 	pci_device_add(virtfn, virtfn->bus);
-	mutex_unlock(&iov->dev->sriov->lock);
 
 	pci_bus_add_device(virtfn);
 	sprintf(buf, "virtfn%u", id);
@@ -181,12 +179,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset)
 	sysfs_remove_link(&dev->dev.kobj, buf);
 failed1:
 	pci_dev_put(dev);
-	mutex_lock(&iov->dev->sriov->lock);
 	pci_stop_and_remove_bus_device(virtfn);
 failed0:
 	virtfn_remove_bus(dev->bus, bus);
 failed:
-	mutex_unlock(&iov->dev->sriov->lock);
 
 	return rc;
 }
@@ -195,7 +191,6 @@ void pci_iov_remove_virtfn(struct pci_dev *dev, int id, int reset)
 {
 	char buf[VIRTFN_ID_LEN];
 	struct pci_dev *virtfn;
-	struct pci_sriov *iov = dev->sriov;
 
 	virtfn = pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus),
 					     pci_iov_virtfn_bus(dev, id),
@@ -218,10 +213,8 @@ void pci_iov_remove_virtfn(struct pci_dev *dev, int id, int reset)
 	if (virtfn->dev.kobj.sd)
 		sysfs_remove_link(&virtfn->dev.kobj, "physfn");
 
-	mutex_lock(&iov->dev->sriov->lock);
 	pci_stop_and_remove_bus_device(virtfn);
 	virtfn_remove_bus(dev->bus, virtfn->bus);
-	mutex_unlock(&iov->dev->sriov->lock);
 
 	/* balance pci_get_domain_bus_and_slot() */
 	pci_dev_put(virtfn);
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 0666287..25d010d 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -472,6 +472,7 @@ static ssize_t sriov_numvfs_store(struct device *dev,
 				  const char *buf, size_t count)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_sriov *iov = pdev->sriov;
 	int ret;
 	u16 num_vfs;
 
@@ -482,38 +483,46 @@ static ssize_t sriov_numvfs_store(struct device *dev,
 	if (num_vfs > pci_sriov_get_totalvfs(pdev))
 		return -ERANGE;
 
+	mutex_lock(&iov->dev->sriov->lock);
+
 	if (num_vfs == pdev->sriov->num_VFs)
-		return count;		/* no change */
+		goto exit;
 
 	/* is PF driver loaded w/callback */
 	if (!pdev->driver || !pdev->driver->sriov_configure) {
 		dev_info(&pdev->dev, "Driver doesn't support SRIOV configuration via sysfs\n");
-		return -ENOSYS;
+		ret = -ENOENT;
+		goto exit;
 	}
 
 	if (num_vfs == 0) {
 		/* disable VFs */
 		ret = pdev->driver->sriov_configure(pdev, 0);
-		if (ret < 0)
-			return ret;
-		return count;
+		goto exit;
 	}
 
 	/* enable VFs */
 	if (pdev->sriov->num_VFs) {
 		dev_warn(&pdev->dev, "%d VFs already enabled. Disable before enabling %d VFs\n",
 			 pdev->sriov->num_VFs, num_vfs);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto exit;
 	}
 
 	ret = pdev->driver->sriov_configure(pdev, num_vfs);
 	if (ret < 0)
-		return ret;
+		goto exit;
 
 	if (ret != num_vfs)
 		dev_warn(&pdev->dev, "%d VFs requested; only %d enabled\n",
 			 num_vfs, ret);
 
+exit:
+	mutex_unlock(&iov->dev->sriov->lock);
+
+	if (ret < 0)
+		return ret;
+
 	return count;
 }
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index cb17db2..8dd38e6 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -270,7 +270,7 @@ struct pci_sriov {
 	u16 driver_max_VFs;	/* max num VFs driver supports */
 	struct pci_dev *dev;	/* lowest numbered PF */
 	struct pci_dev *self;	/* this PF */
-	struct mutex lock;	/* lock for VF bus */
+	struct mutex lock;	/* lock for setting sriov_numvfs in sysfs */
 	resource_size_t barsz[PCI_SRIOV_NUM_BARS];	/* VF BAR size */
 };
 

^ permalink raw reply related

* Re: [PATCH net-next] net: dsa: move HWMON support to its own file
From: Andrew Lunn @ 2017-01-06 21:58 UTC (permalink / raw)
  To: Vivien Didelot
  Cc: netdev, linux-kernel, kernel, David S. Miller, Florian Fainelli
In-Reply-To: <20170106214200.15633-1-vivien.didelot@savoirfairelinux.com>

On Fri, Jan 06, 2017 at 04:42:00PM -0500, Vivien Didelot wrote:
> Isolate the HWMON support in DSA in its own file. Currently only the
> legacy DSA code is concerned.
> 
> Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

Seems like a good step towards removing it completely and letting the
PHY do it all.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH 2/3] xen: modify xenstore watch event interface
From: Boris Ostrovsky @ 2017-01-06 21:57 UTC (permalink / raw)
  To: Juergen Gross, linux-kernel, xen-devel
  Cc: netdev, roger.pau, wei.liu2, paul.durrant
In-Reply-To: <20170106150544.10836-3-jgross@suse.com>

On 01/06/2017 10:05 AM, Juergen Gross wrote:
> Today a Xenstore watch event is delivered via a callback function
> declared as:
>
> void (*callback)(struct xenbus_watch *,
>                  const char **vec, unsigned int len);
>
> As all watch events only ever come with two parameters (path and token)
> changing the prototype to:
>
> void (*callback)(struct xenbus_watch *,
>                  const char *path, const char *token);
>
> is the natural thing to do.
>
> Apply this change and adapt all users.
>
> Cc: konrad.wilk@oracle.com
> Cc: roger.pau@citrix.com
> Cc: wei.liu2@citrix.com
> Cc: paul.durrant@citrix.com
> Cc: netdev@vger.kernel.org
>
> Signed-off-by: Juergen Gross <jgross@suse.com>


>  
> @@ -903,24 +902,24 @@ static int process_msg(void)
>  	body[msg->hdr.len] = '\0';
>  
>  	if (msg->hdr.type == XS_WATCH_EVENT) {
> -		msg->u.watch.vec = split(body, msg->hdr.len,
> -					 &msg->u.watch.vec_size);
> -		if (IS_ERR(msg->u.watch.vec)) {
> -			err = PTR_ERR(msg->u.watch.vec);
> +		if (count_strings(body, msg->hdr.len) != 2) {
> +			err = -EINVAL;

xenbus_write_watch() returns -EILSEQ when this type of error is
encountered so perhaps we should return the same error here.

Either way

Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply

* Re: [PATCH 2/2] ARM: dts: dra72-evm-revc: enable irqs for dp83867 eth phys
From: Tony Lindgren @ 2017-01-06 21:54 UTC (permalink / raw)
  To: Grygorii Strashko; +Cc: Mugunthan V N, linux-omap, Sekhar Nori, netdev
In-Reply-To: <20170106205543.4748-1-grygorii.strashko@ti.com>

* Grygorii Strashko <grygorii.strashko@ti.com> [170106 12:56]:
> TI DRA72-EVM Rev C has two DP83867 ethernet phys which support IRQ
> generation in case of phy/link status changes. The INT/PWDN lines from both
> DP83867 phys are wired to DRA7 gpio6.16, so reflect the same in DT.

Hmm not seeing the patch 1/2 here.. Can this one be queued separately?
Is it for v4.11 or a fix?

Regards,

Tony

> Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
> ---
>  arch/arm/boot/dts/dra72-evm-revc.dts | 6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm/boot/dts/dra72-evm-revc.dts b/arch/arm/boot/dts/dra72-evm-revc.dts
> index c3d939c..3ecac56 100644
> --- a/arch/arm/boot/dts/dra72-evm-revc.dts
> +++ b/arch/arm/boot/dts/dra72-evm-revc.dts
> @@ -68,6 +68,8 @@
>  		ti,tx-internal-delay = <DP83867_RGMIIDCTL_250_PS>;
>  		ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_8_B_NIB>;
>  		ti,min-output-impedance;
> +		interrupt-parent = <&gpio6>;
> +		interrupts = <16 IRQ_TYPE_EDGE_FALLING>;
>  	};
>  
>  	dp83867_1: ethernet-phy@3 {
> @@ -75,6 +77,8 @@
>  		ti,rx-internal-delay = <DP83867_RGMIIDCTL_2_25_NS>;
>  		ti,tx-internal-delay = <DP83867_RGMIIDCTL_250_PS>;
>  		ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_8_B_NIB>;
> -		ti,min-output-imepdance;
> +		ti,min-output-impedance;
> +		interrupt-parent = <&gpio6>;
> +		interrupts = <16 IRQ_TYPE_EDGE_FALLING>;
>  	};
>  };
> -- 
> 2.10.1.dirty
> 

^ permalink raw reply

* [PATCH] [v2] net: qcom/emac: add ethtool support
From: Timur Tabi @ 2017-01-06 21:43 UTC (permalink / raw)
  To: David Miller, Florian Fainelli, netdev, Alok Chauhan

Add support for some ethtool methods: get/set link settings, get/set
message level, get statistics, get link status, get ring params, get
pause params, and restart autonegotiation.

The code to collect the hardware statistics is moved into its own
function so that it can be used by "get statistics" method.

Signed-off-by: Timur Tabi <timur@codeaurora.org>
---

Notes:
    I don't trust my implementation of emac_get_pauseparam.  I feel like
    I'm missing something.
    
    v2: added emac_get_pauseparam and emac_get_ringparam

 drivers/net/ethernet/qualcomm/emac/Makefile       |   2 +-
 drivers/net/ethernet/qualcomm/emac/emac-ethtool.c | 185 ++++++++++++++++++++++
 drivers/net/ethernet/qualcomm/emac/emac.c         |  51 +++---
 drivers/net/ethernet/qualcomm/emac/emac.h         |   3 +
 4 files changed, 220 insertions(+), 21 deletions(-)
 create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-ethtool.c

diff --git a/drivers/net/ethernet/qualcomm/emac/Makefile b/drivers/net/ethernet/qualcomm/emac/Makefile
index 7a66879..fc57ced 100644
--- a/drivers/net/ethernet/qualcomm/emac/Makefile
+++ b/drivers/net/ethernet/qualcomm/emac/Makefile
@@ -4,6 +4,6 @@
 
 obj-$(CONFIG_QCOM_EMAC) += qcom-emac.o
 
-qcom-emac-objs := emac.o emac-mac.o emac-phy.o emac-sgmii.o \
+qcom-emac-objs := emac.o emac-mac.o emac-phy.o emac-sgmii.o emac-ethtool.o \
 		  emac-sgmii-fsm9900.o emac-sgmii-qdf2432.o \
 		  emac-sgmii-qdf2400.o
diff --git a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c
new file mode 100644
index 0000000..cfc57d2
--- /dev/null
+++ b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c
@@ -0,0 +1,185 @@
+/* Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+
+#include "emac.h"
+
+static const char * const emac_ethtool_stat_strings[] = {
+	"rx_ok",
+	"rx_bcast",
+	"rx_mcast",
+	"rx_pause",
+	"rx_ctrl",
+	"rx_fcs_err",
+	"rx_len_err",
+	"rx_byte_cnt",
+	"rx_runt",
+	"rx_frag",
+	"rx_sz_64",
+	"rx_sz_65_127",
+	"rx_sz_128_255",
+	"rx_sz_256_511",
+	"rx_sz_512_1023",
+	"rx_sz_1024_1518",
+	"rx_sz_1519_max",
+	"rx_sz_ov",
+	"rx_rxf_ov",
+	"rx_align_err",
+	"rx_bcast_byte_cnt",
+	"rx_mcast_byte_cnt",
+	"rx_err_addr",
+	"rx_crc_align",
+	"rx_jabbers",
+	"tx_ok",
+	"tx_bcast",
+	"tx_mcast",
+	"tx_pause",
+	"tx_exc_defer",
+	"tx_ctrl",
+	"tx_defer",
+	"tx_byte_cnt",
+	"tx_sz_64",
+	"tx_sz_65_127",
+	"tx_sz_128_255",
+	"tx_sz_256_511",
+	"tx_sz_512_1023",
+	"tx_sz_1024_1518",
+	"tx_sz_1519_max",
+	"tx_1_col",
+	"tx_2_col",
+	"tx_late_col",
+	"tx_abort_col",
+	"tx_underrun",
+	"tx_rd_eop",
+	"tx_len_err",
+	"tx_trunc",
+	"tx_bcast_byte",
+	"tx_mcast_byte",
+	"tx_col",
+};
+
+#define EMAC_STATS_LEN	ARRAY_SIZE(emac_ethtool_stat_strings)
+
+static u32 emac_get_msglevel(struct net_device *netdev)
+{
+	struct emac_adapter *adpt = netdev_priv(netdev);
+
+	return adpt->msg_enable;
+}
+
+static void emac_set_msglevel(struct net_device *netdev, u32 data)
+{
+	struct emac_adapter *adpt = netdev_priv(netdev);
+
+	adpt->msg_enable = data;
+}
+
+static int emac_get_sset_count(struct net_device *netdev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return EMAC_STATS_LEN;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void emac_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
+{
+	unsigned int i;
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		for (i = 0; i < EMAC_STATS_LEN; i++) {
+			strlcpy(data, emac_ethtool_stat_strings[i],
+				ETH_GSTRING_LEN);
+			data += ETH_GSTRING_LEN;
+		}
+		break;
+	}
+}
+
+static void emac_get_ethtool_stats(struct net_device *netdev,
+				   struct ethtool_stats *stats,
+				   u64 *data)
+{
+	struct emac_adapter *adpt = netdev_priv(netdev);
+
+	spin_lock(&adpt->stats.lock);
+
+	emac_update_hw_stats(adpt);
+	memcpy(data, &adpt->stats, EMAC_STATS_LEN * sizeof(u64));
+
+	spin_unlock(&adpt->stats.lock);
+}
+
+static int emac_nway_reset(struct net_device *netdev)
+{
+	struct phy_device *phydev = netdev->phydev;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return genphy_restart_aneg(phydev);
+}
+
+static void emac_get_ringparam(struct net_device *netdev,
+			       struct ethtool_ringparam *ring)
+{
+	struct emac_adapter *adpt = netdev_priv(netdev);
+
+	ring->rx_max_pending = EMAC_MAX_RX_DESCS;
+	ring->tx_max_pending = EMAC_MAX_TX_DESCS;
+	ring->rx_pending = adpt->rx_desc_cnt;
+	ring->tx_pending = adpt->tx_desc_cnt;
+}
+
+static void emac_get_pauseparam(struct net_device *netdev,
+				struct ethtool_pauseparam *pause)
+{
+	struct phy_device *phydev = netdev->phydev;
+
+	if (phydev) {
+		if (phydev->autoneg)
+			pause->autoneg = 1;
+		if (phydev->pause)
+			pause->rx_pause = 1;
+		if (phydev->pause != phydev->asym_pause)
+			pause->tx_pause = 1;
+	}
+}
+
+static const struct ethtool_ops emac_ethtool_ops = {
+	.get_link_ksettings = phy_ethtool_get_link_ksettings,
+	.set_link_ksettings = phy_ethtool_set_link_ksettings,
+
+	.get_msglevel    = emac_get_msglevel,
+	.set_msglevel    = emac_set_msglevel,
+
+	.get_sset_count  = emac_get_sset_count,
+	.get_strings = emac_get_strings,
+	.get_ethtool_stats = emac_get_ethtool_stats,
+
+	.get_ringparam = emac_get_ringparam,
+	.get_pauseparam = emac_get_pauseparam,
+
+	.nway_reset = emac_nway_reset,
+
+	.get_link = ethtool_op_get_link,
+};
+
+void emac_set_ethtool_ops(struct net_device *netdev)
+{
+	netdev->ethtool_ops = &emac_ethtool_ops;
+}
diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index 422289c..1ab4478 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -311,45 +311,55 @@ static int emac_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 	return phy_mii_ioctl(netdev->phydev, ifr, cmd);
 }
 
-/* Provide network statistics info for the interface */
-static struct rtnl_link_stats64 *emac_get_stats64(struct net_device *netdev,
-						  struct rtnl_link_stats64 *net_stats)
+/**
+ * emac_update_hw_stats - read the EMAC stat registers
+ *
+ * Reads the stats registers and writes the values to adpt->stats.
+ *
+ * adpt->stats.lock must be held while calling this function.
+ */
+void emac_update_hw_stats(struct emac_adapter *adpt)
 {
-	struct emac_adapter *adpt = netdev_priv(netdev);
-	unsigned int addr = REG_MAC_RX_STATUS_BIN;
 	struct emac_stats *stats = &adpt->stats;
 	u64 *stats_itr = &adpt->stats.rx_ok;
-	u32 val;
-
-	spin_lock(&stats->lock);
+	void __iomem *base = adpt->base;
+	unsigned int addr;
 
+	addr = REG_MAC_RX_STATUS_BIN;
 	while (addr <= REG_MAC_RX_STATUS_END) {
-		val = readl_relaxed(adpt->base + addr);
-		*stats_itr += val;
+		*stats_itr += readl_relaxed(base + addr);
 		stats_itr++;
 		addr += sizeof(u32);
 	}
 
 	/* additional rx status */
-	val = readl_relaxed(adpt->base + EMAC_RXMAC_STATC_REG23);
-	adpt->stats.rx_crc_align += val;
-	val = readl_relaxed(adpt->base + EMAC_RXMAC_STATC_REG24);
-	adpt->stats.rx_jabbers += val;
+	stats->rx_crc_align += readl_relaxed(base + EMAC_RXMAC_STATC_REG23);
+	stats->rx_jabbers += readl_relaxed(base + EMAC_RXMAC_STATC_REG24);
 
 	/* update tx status */
 	addr = REG_MAC_TX_STATUS_BIN;
-	stats_itr = &adpt->stats.tx_ok;
+	stats_itr = &stats->tx_ok;
 
 	while (addr <= REG_MAC_TX_STATUS_END) {
-		val = readl_relaxed(adpt->base + addr);
-		*stats_itr += val;
-		++stats_itr;
+		*stats_itr += readl_relaxed(base + addr);
+		stats_itr++;
 		addr += sizeof(u32);
 	}
 
 	/* additional tx status */
-	val = readl_relaxed(adpt->base + EMAC_TXMAC_STATC_REG25);
-	adpt->stats.tx_col += val;
+	stats->tx_col += readl_relaxed(base + EMAC_TXMAC_STATC_REG25);
+}
+
+/* Provide network statistics info for the interface */
+static struct rtnl_link_stats64 *
+emac_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *net_stats)
+{
+	struct emac_adapter *adpt = netdev_priv(netdev);
+	struct emac_stats *stats = &adpt->stats;
+
+	spin_lock(&stats->lock);
+
+	emac_update_hw_stats(adpt);
 
 	/* return parsed statistics */
 	net_stats->rx_packets = stats->rx_ok;
@@ -620,6 +630,7 @@ static int emac_probe(struct platform_device *pdev)
 
 	dev_set_drvdata(&pdev->dev, netdev);
 	SET_NETDEV_DEV(netdev, &pdev->dev);
+	emac_set_ethtool_ops(netdev);
 
 	adpt = netdev_priv(netdev);
 	adpt->netdev = netdev;
diff --git a/drivers/net/ethernet/qualcomm/emac/emac.h b/drivers/net/ethernet/qualcomm/emac/emac.h
index 0c76e6c..4b8483c 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.h
+++ b/drivers/net/ethernet/qualcomm/emac/emac.h
@@ -332,4 +332,7 @@ struct emac_adapter {
 void emac_reg_update32(void __iomem *addr, u32 mask, u32 val);
 irqreturn_t emac_isr(int irq, void *data);
 
+void emac_set_ethtool_ops(struct net_device *netdev);
+void emac_update_hw_stats(struct emac_adapter *adpt);
+
 #endif /* _EMAC_H_ */
-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply related

* [PATCH net-next] net: dsa: move HWMON support to its own file
From: Vivien Didelot @ 2017-01-06 21:42 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot

Isolate the HWMON support in DSA in its own file. Currently only the
legacy DSA code is concerned.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 net/dsa/Makefile   |   1 +
 net/dsa/dsa.c      | 131 +----------------------------------------------
 net/dsa/dsa_priv.h |   9 ++++
 net/dsa/hwmon.c    | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 159 insertions(+), 129 deletions(-)
 create mode 100644 net/dsa/hwmon.c

diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index a3380ed0e0be..560b6747c276 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,7 @@
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
 dsa_core-y += dsa.o slave.o dsa2.o
+dsa_core-$(CONFIG_NET_DSA_HWMON) += hwmon.o
 
 # tagging formats
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 3f85be0aae34..cda787ebad15 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -9,9 +9,7 @@
  * (at your option) any later version.
  */
 
-#include <linux/ctype.h>
 #include <linux/device.h>
-#include <linux/hwmon.h>
 #include <linux/list.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
@@ -108,105 +106,6 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
 	return ret;
 }
 
-/* hwmon support ************************************************************/
-
-#ifdef CONFIG_NET_DSA_HWMON
-
-static ssize_t temp1_input_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = ds->ops->get_temp(ds, &temp);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", temp * 1000);
-}
-static DEVICE_ATTR_RO(temp1_input);
-
-static ssize_t temp1_max_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = ds->ops->get_temp_limit(ds, &temp);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", temp * 1000);
-}
-
-static ssize_t temp1_max_store(struct device *dev,
-			       struct device_attribute *attr, const char *buf,
-			       size_t count)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	int temp, ret;
-
-	ret = kstrtoint(buf, 0, &temp);
-	if (ret < 0)
-		return ret;
-
-	ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
-	if (ret < 0)
-		return ret;
-
-	return count;
-}
-static DEVICE_ATTR_RW(temp1_max);
-
-static ssize_t temp1_max_alarm_show(struct device *dev,
-				    struct device_attribute *attr, char *buf)
-{
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	bool alarm;
-	int ret;
-
-	ret = ds->ops->get_temp_alarm(ds, &alarm);
-	if (ret < 0)
-		return ret;
-
-	return sprintf(buf, "%d\n", alarm);
-}
-static DEVICE_ATTR_RO(temp1_max_alarm);
-
-static struct attribute *dsa_hwmon_attrs[] = {
-	&dev_attr_temp1_input.attr,	/* 0 */
-	&dev_attr_temp1_max.attr,	/* 1 */
-	&dev_attr_temp1_max_alarm.attr,	/* 2 */
-	NULL
-};
-
-static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
-				       struct attribute *attr, int index)
-{
-	struct device *dev = container_of(kobj, struct device, kobj);
-	struct dsa_switch *ds = dev_get_drvdata(dev);
-	struct dsa_switch_ops *ops = ds->ops;
-	umode_t mode = attr->mode;
-
-	if (index == 1) {
-		if (!ops->get_temp_limit)
-			mode = 0;
-		else if (!ops->set_temp_limit)
-			mode &= ~S_IWUSR;
-	} else if (index == 2 && !ops->get_temp_alarm) {
-		mode = 0;
-	}
-	return mode;
-}
-
-static const struct attribute_group dsa_hwmon_group = {
-	.attrs = dsa_hwmon_attrs,
-	.is_visible = dsa_hwmon_attrs_visible,
-};
-__ATTRIBUTE_GROUPS(dsa_hwmon);
-
-#endif /* CONFIG_NET_DSA_HWMON */
-
 /* basic switch operations **************************************************/
 int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev,
 		      struct device_node *port_dn, int port)
@@ -415,30 +314,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
 	if (ret)
 		return ret;
 
-#ifdef CONFIG_NET_DSA_HWMON
-	/* If the switch provides a temperature sensor,
-	 * register with hardware monitoring subsystem.
-	 * Treat registration error as non-fatal and ignore it.
-	 */
-	if (ops->get_temp) {
-		const char *netname = netdev_name(dst->master_netdev);
-		char hname[IFNAMSIZ + 1];
-		int i, j;
-
-		/* Create valid hwmon 'name' attribute */
-		for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) {
-			if (isalnum(netname[i]))
-				hname[j++] = netname[i];
-		}
-		hname[j] = '\0';
-		scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d",
-			  hname, index);
-		ds->hwmon_dev = hwmon_device_register_with_groups(NULL,
-					ds->hwmon_name, ds, dsa_hwmon_groups);
-		if (IS_ERR(ds->hwmon_dev))
-			ds->hwmon_dev = NULL;
-	}
-#endif /* CONFIG_NET_DSA_HWMON */
+	dsa_hwmon_register(ds);
 
 	return 0;
 }
@@ -498,10 +374,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
 {
 	int port;
 
-#ifdef CONFIG_NET_DSA_HWMON
-	if (ds->hwmon_dev)
-		hwmon_device_unregister(ds->hwmon_dev);
-#endif
+	dsa_hwmon_unregister(ds);
 
 	/* Destroy network devices for physical switch ports. */
 	for (port = 0; port < DSA_MAX_PORTS; port++) {
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 63ae1484abae..7e3385ec73f4 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -56,6 +56,15 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
 int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds);
 void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds);
 
+/* hwmon.c */
+#ifdef CONFIG_NET_DSA_HWMON
+void dsa_hwmon_register(struct dsa_switch *ds);
+void dsa_hwmon_unregister(struct dsa_switch *ds);
+#else
+static inline void dsa_hwmon_register(struct dsa_switch *ds) { }
+static inline void dsa_hwmon_unregister(struct dsa_switch *ds) { }
+#endif
+
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
diff --git a/net/dsa/hwmon.c b/net/dsa/hwmon.c
new file mode 100644
index 000000000000..3a9cdf0b22b8
--- /dev/null
+++ b/net/dsa/hwmon.c
@@ -0,0 +1,147 @@
+/*
+ * net/dsa/hwmon.c - HWMON subsystem support
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/ctype.h>
+#include <linux/hwmon.h>
+#include <net/dsa.h>
+
+#include "dsa_priv.h"
+
+static ssize_t temp1_input_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct dsa_switch *ds = dev_get_drvdata(dev);
+	int temp, ret;
+
+	ret = ds->ops->get_temp(ds, &temp);
+	if (ret < 0)
+		return ret;
+
+	return sprintf(buf, "%d\n", temp * 1000);
+}
+static DEVICE_ATTR_RO(temp1_input);
+
+static ssize_t temp1_max_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct dsa_switch *ds = dev_get_drvdata(dev);
+	int temp, ret;
+
+	ret = ds->ops->get_temp_limit(ds, &temp);
+	if (ret < 0)
+		return ret;
+
+	return sprintf(buf, "%d\n", temp * 1000);
+}
+
+static ssize_t temp1_max_store(struct device *dev,
+			       struct device_attribute *attr, const char *buf,
+			       size_t count)
+{
+	struct dsa_switch *ds = dev_get_drvdata(dev);
+	int temp, ret;
+
+	ret = kstrtoint(buf, 0, &temp);
+	if (ret < 0)
+		return ret;
+
+	ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+static DEVICE_ATTR_RW(temp1_max);
+
+static ssize_t temp1_max_alarm_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct dsa_switch *ds = dev_get_drvdata(dev);
+	bool alarm;
+	int ret;
+
+	ret = ds->ops->get_temp_alarm(ds, &alarm);
+	if (ret < 0)
+		return ret;
+
+	return sprintf(buf, "%d\n", alarm);
+}
+static DEVICE_ATTR_RO(temp1_max_alarm);
+
+static struct attribute *dsa_hwmon_attrs[] = {
+	&dev_attr_temp1_input.attr,	/* 0 */
+	&dev_attr_temp1_max.attr,	/* 1 */
+	&dev_attr_temp1_max_alarm.attr,	/* 2 */
+	NULL
+};
+
+static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
+				       struct attribute *attr, int index)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct dsa_switch *ds = dev_get_drvdata(dev);
+	struct dsa_switch_ops *ops = ds->ops;
+	umode_t mode = attr->mode;
+
+	if (index == 1) {
+		if (!ops->get_temp_limit)
+			mode = 0;
+		else if (!ops->set_temp_limit)
+			mode &= ~S_IWUSR;
+	} else if (index == 2 && !ops->get_temp_alarm) {
+		mode = 0;
+	}
+	return mode;
+}
+
+static const struct attribute_group dsa_hwmon_group = {
+	.attrs = dsa_hwmon_attrs,
+	.is_visible = dsa_hwmon_attrs_visible,
+};
+__ATTRIBUTE_GROUPS(dsa_hwmon);
+
+void dsa_hwmon_register(struct dsa_switch *ds)
+{
+	const char *netname = netdev_name(ds->dst->master_netdev);
+	char hname[IFNAMSIZ + 1];
+	int i, j;
+
+	/* If the switch provides temperature accessors, register with hardware
+	 * monitoring subsystem. Treat registration error as non-fatal.
+	 */
+	if (!ds->ops->get_temp)
+		return;
+
+	/* Create valid hwmon 'name' attribute */
+	for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) {
+		if (isalnum(netname[i]))
+			hname[j++] = netname[i];
+	}
+	hname[j] = '\0';
+	scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d", hname,
+		  ds->index);
+	ds->hwmon_dev = hwmon_device_register_with_groups(NULL, ds->hwmon_name,
+							  ds, dsa_hwmon_groups);
+	if (IS_ERR(ds->hwmon_dev)) {
+		pr_warn("DSA: failed to register HWMON subsystem for switch %d\n",
+			ds->index);
+		ds->hwmon_dev = NULL;
+	} else {
+		pr_info("DSA: registered HWMON subsystem for switch %d\n",
+			ds->index);
+	}
+}
+
+void dsa_hwmon_unregister(struct dsa_switch *ds)
+{
+	if (ds->hwmon_dev) {
+		hwmon_device_unregister(ds->hwmon_dev);
+		ds->hwmon_dev = NULL;
+	}
+}
-- 
2.11.0

^ permalink raw reply related

* Re: [next PATCH 00/11] ixgbe: Add support for writable pages and build_skb
From: David Miller @ 2017-01-06 21:41 UTC (permalink / raw)
  To: alexander.duyck; +Cc: intel-wired-lan, jeffrey.t.kirsher, netdev
In-Reply-To: <20170106155448.1501.31298.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.duyck@gmail.com>
Date: Fri, 06 Jan 2017 08:06:16 -0800

> The testing matrix for all of these patches is going to be pretty
> extensive.  Basically we want to test these patches on as many platforms
> and architectures as possible with as many features being toggled as
> possible including RSC, FCoE, SR-IOV, and Jumbo Frames all while receiving
> traffic.

Overall looks very nice to me.

I am assuming that I will get a formal submission once the necessary
amount of testing is performed.

^ permalink raw reply

* Re: pull-request: mac80211 2017-01-06
From: David Miller @ 2017-01-06 21:27 UTC (permalink / raw)
  To: johannes; +Cc: netdev, linux-wireless
In-Reply-To: <20170106123721.10970-1-johannes@sipsolutions.net>

From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri,  6 Jan 2017 13:37:20 +0100

> Here's another fix for something I noticed while reviewing the code in
> a new suggested patch that added another netlink socket destroy path.
> 
> Since the new patch would otherwise cause conflicts, it might be good
> to pull net or Linus's next RC containing it into net-next, if you can.
> 
> Please pull and let me know if there's any problem.

Pulled, I'll try to get this moving into net-next over the weekend.

Remind me about this early next week if that ends up slipping through
the cracks.

Thanks.

^ permalink raw reply

* [PATCH net] tg3: Fix race condition in tg3_get_stats64().
From: Michael Chan @ 2017-01-06 21:18 UTC (permalink / raw)
  To: davem; +Cc: netdev, wangyufen

The driver's ndo_get_stats64() method is not always called under RTNL.
So it can race with driver close or ethtool reconfigurations.  Fix the
race condition by taking tp->lock spinlock in tg3_free_consistent()
when freeing the tp->hw_stats memory block.  tg3_get_stats64() is
already taking tp->lock.

Reported-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/tg3.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 185e9e0..ae42de4 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -8720,11 +8720,14 @@ static void tg3_free_consistent(struct tg3 *tp)
 	tg3_mem_rx_release(tp);
 	tg3_mem_tx_release(tp);

+	/* Protect tg3_get_stats64() from reading freed tp->hw_stats. */
+	tg3_full_lock(tp, 0);
 	if (tp->hw_stats) {
 		dma_free_coherent(&tp->pdev->dev, sizeof(struct tg3_hw_stats),
 				  tp->hw_stats, tp->stats_mapping);
 		tp->hw_stats = NULL;
 	}
+	tg3_full_unlock(tp);
 }

 /*
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net-next] cxgb4: Add port description for new cards.
From: David Miller @ 2017-01-06 21:24 UTC (permalink / raw)
  To: ganeshgr; +Cc: netdev, nirranjan, hariprasad
In-Reply-To: <1483701730-11926-1-git-send-email-ganeshgr@chelsio.com>

From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Fri,  6 Jan 2017 16:52:10 +0530

> Add port description for 25G and 100G cards, and also
> change a few port descriptions in compliance with the new
> naming convention.
> 
> Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] cxgb4/cxgb4vf: Display 25G and 100G link speed
From: David Miller @ 2017-01-06 21:23 UTC (permalink / raw)
  To: ganeshgr; +Cc: netdev, nirranjan, hariprasad
In-Reply-To: <1483701706-11882-1-git-send-email-ganeshgr@chelsio.com>

From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Fri,  6 Jan 2017 16:51:46 +0530

> Add support to report 25G and 100G links, which was missed
> as part of commit "eb97ad99f9ed".
> 
> Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>

Applied.

^ permalink raw reply

* Re: [PATCH v2 00/12] net: ethernet: aquantia: Add AQtion 2.5/5 GB NIC driver
From: David Miller @ 2017-01-06 21:02 UTC (permalink / raw)
  To: Alexander.Loktionov
  Cc: netdev, vomlehn, Simon.Edelhaus, Dmitrii.Tarakanov, Pavel.Belous
In-Reply-To: <cover.1483689029.git.vomlehn@texas.net>

From: Alexander Loktionov <Alexander.Loktionov@aquantia.com>
Date: Fri,  6 Jan 2017 00:06:01 -0800

> This series introduced the AQtion NIC driver for the aQuantia
> AQC107/AQC108 network devices.
> 
> v1: Initial version
> v2: o Make necessary drivers/net/ethernet changes to integrate software
>     o Drop intermediate atlantic directory
>     o Remove Makefile things only appropriate to out of tree module building

Every patch series must be fully bisectable; this means that at each step
of the series, the kernel tree must fully build and work properly.

You break that already at the first patch, which makes the Kconfig options
available, which if enabled will cause a build failure.

make[4]: *** No rule to make target 'drivers/net/ethernet/aquantia/aq_main.o', needed by 'drivers/net/ethernet/aquantia/atlantic.o'.  Stop.
make[4]: *** Waiting for unfinished jobs....
scripts/Makefile.build:551: recipe for target 'drivers/net/ethernet/aquantia' failed
make[3]: *** [drivers/net/ethernet/aquantia] Error 2
make[3]: *** Waiting for unfinished jobs....
scripts/Makefile.build:551: recipe for target 'drivers/net/ethernet' failed
make[2]: *** [drivers/net/ethernet] Error 2
scripts/Makefile.build:551: recipe for target 'drivers/net' failed
make[1]: *** [drivers/net] Error 2
make[1]: *** Waiting for unfinished jobs....
Makefile:988: recipe for target 'drivers' failed
make: *** [drivers] Error 2

The way to do this is to add the pieces of source code infrastructure,
one piece at a time.  And then at the very very end, enable the code
into the build.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox