Netdev List
 help / color / mirror / Atom feed
* [PATCH net] macsonic: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-03  4:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-m68k, netdev, linux-kernel

Set the device's coherent_dma_mask to avoid a WARNING splat.
Please see commit 205e1b7f51e4 ("dma-mapping: warn when there is
no coherent_dma_mask").

Cc: linux-m68k@lists.linux-m68k.org
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
---
 drivers/net/ethernet/natsemi/macsonic.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/natsemi/macsonic.c b/drivers/net/ethernet/natsemi/macsonic.c
index 0937fc2a928e..37b1ffa8bb61 100644
--- a/drivers/net/ethernet/natsemi/macsonic.c
+++ b/drivers/net/ethernet/natsemi/macsonic.c
@@ -523,6 +523,10 @@ static int mac_sonic_platform_probe(struct platform_device *pdev)
 	struct sonic_local *lp;
 	int err;
 
+	err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+	if (err)
+		return err;
+
 	dev = alloc_etherdev(sizeof(struct sonic_local));
 	if (!dev)
 		return -ENOMEM;
-- 
2.16.1

^ permalink raw reply related

* [PATCH net] macsonic: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-03  4:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-m68k, netdev, linux-kernel

Set the device's coherent_dma_mask to avoid a WARNING splat.
Please see commit 205e1b7f51e4 ("dma-mapping: warn when there is
no coherent_dma_mask").

Cc: linux-m68k@lists.linux-m68k.org
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
---
 drivers/net/ethernet/natsemi/macsonic.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/natsemi/macsonic.c b/drivers/net/ethernet/natsemi/macsonic.c
index 0937fc2a928e..37b1ffa8bb61 100644
--- a/drivers/net/ethernet/natsemi/macsonic.c
+++ b/drivers/net/ethernet/natsemi/macsonic.c
@@ -523,6 +523,10 @@ static int mac_sonic_platform_probe(struct platform_device *pdev)
 	struct sonic_local *lp;
 	int err;
 
+	err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+	if (err)
+		return err;
+
 	dev = alloc_etherdev(sizeof(struct sonic_local));
 	if (!dev)
 		return -ENOMEM;
-- 
2.16.1

^ permalink raw reply related

* [PATCH net] macmace: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-03  4:23 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-m68k, netdev, linux-kernel

Set the device's coherent_dma_mask to avoid a WARNING splat.
Please see commit 205e1b7f51e4 ("dma-mapping: warn when there is
no coherent_dma_mask").

Cc: linux-m68k@lists.linux-m68k.org
Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
---
 drivers/net/ethernet/apple/macmace.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/apple/macmace.c b/drivers/net/ethernet/apple/macmace.c
index 137cbb470af2..98292c49ecf0 100644
--- a/drivers/net/ethernet/apple/macmace.c
+++ b/drivers/net/ethernet/apple/macmace.c
@@ -203,6 +203,10 @@ static int mace_probe(struct platform_device *pdev)
 	unsigned char checksum = 0;
 	int err;
 
+	err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+	if (err)
+		return err;
+
 	dev = alloc_etherdev(PRIV_BYTES);
 	if (!dev)
 		return -ENOMEM;
-- 
2.16.1

^ permalink raw reply related

* [PATCH net] macmace: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-03  4:23 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-m68k, netdev, linux-kernel

Set the device's coherent_dma_mask to avoid a WARNING splat.
Please see commit 205e1b7f51e4 ("dma-mapping: warn when there is
no coherent_dma_mask").

Cc: linux-m68k@lists.linux-m68k.org
Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
---
 drivers/net/ethernet/apple/macmace.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/apple/macmace.c b/drivers/net/ethernet/apple/macmace.c
index 137cbb470af2..98292c49ecf0 100644
--- a/drivers/net/ethernet/apple/macmace.c
+++ b/drivers/net/ethernet/apple/macmace.c
@@ -203,6 +203,10 @@ static int mace_probe(struct platform_device *pdev)
 	unsigned char checksum = 0;
 	int err;
 
+	err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+	if (err)
+		return err;
+
 	dev = alloc_etherdev(PRIV_BYTES);
 	if (!dev)
 		return -ENOMEM;
-- 
2.16.1

^ permalink raw reply related

* [PATCH net] macmace: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-03  4:23 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-m68k, netdev, linux-kernel

Set the device's coherent_dma_mask to avoid a WARNING splat.
Please see commit 205e1b7f51e4 ("dma-mapping: warn when there is
no coherent_dma_mask").

Cc: linux-m68k@lists.linux-m68k.org
Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
---
 drivers/net/ethernet/apple/macmace.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/apple/macmace.c b/drivers/net/ethernet/apple/macmace.c
index 137cbb470af2..98292c49ecf0 100644
--- a/drivers/net/ethernet/apple/macmace.c
+++ b/drivers/net/ethernet/apple/macmace.c
@@ -203,6 +203,10 @@ static int mace_probe(struct platform_device *pdev)
 	unsigned char checksum = 0;
 	int err;
 
+	err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+	if (err)
+		return err;
+
 	dev = alloc_etherdev(PRIV_BYTES);
 	if (!dev)
 		return -ENOMEM;
-- 
2.16.1

^ permalink raw reply related

* [PATCH net] macmace: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-03  4:23 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-m68k, netdev, linux-kernel

Set the device's coherent_dma_mask to avoid a WARNING splat.
Please see commit 205e1b7f51e4 ("dma-mapping: warn when there is
no coherent_dma_mask").

Cc: linux-m68k@lists.linux-m68k.org
Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
---
 drivers/net/ethernet/apple/macmace.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/apple/macmace.c b/drivers/net/ethernet/apple/macmace.c
index 137cbb470af2..98292c49ecf0 100644
--- a/drivers/net/ethernet/apple/macmace.c
+++ b/drivers/net/ethernet/apple/macmace.c
@@ -203,6 +203,10 @@ static int mace_probe(struct platform_device *pdev)
 	unsigned char checksum = 0;
 	int err;
 
+	err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+	if (err)
+		return err;
+
 	dev = alloc_etherdev(PRIV_BYTES);
 	if (!dev)
 		return -ENOMEM;
-- 
2.16.1

^ permalink raw reply related

* [PATCH 2/2] drivers core: multi-threading device shutdown
From: Pavel Tatashin @ 2018-05-03  3:59 UTC (permalink / raw)
  To: pasha.tatashin, steven.sistare, daniel.m.jordan, linux-kernel,
	jeffrey.t.kirsher, intel-wired-lan, netdev, gregkh
In-Reply-To: <20180503035931.22439-1-pasha.tatashin@oracle.com>

When system is rebooted, halted or kexeced device_shutdown() is
called.

This function shuts down every single device by calling either:
	dev->bus->shutdown(dev)
	dev->driver->shutdown(dev)

Even on a machine with just a moderate number of devices, device_shutdown()
may take multiple seconds to complete, because many devices require
specific delays to perform this operation.

Here is sample analysis of time it takes to call device_shutdown() on
two socket Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz machine.

device_shutdown		2.95s
 mlx4_shutdown		1.14s
 megasas_shutdown	0.24s
 ixgbe_shutdown		0.37s x 4 (four ixgbe devices on my machine).
 the rest		0.09s

In mlx4 we spent the most time, but that is because there is a 1 second
sleep:
mlx4_shutdown
 mlx4_unload_one
  mlx4_free_ownership
   msleep(1000)

With megasas we spend a quarter of a second, but sometimes longer (up to 0.5s)
in this path:

    megasas_shutdown
      megasas_flush_cache
        megasas_issue_blocked_cmd
          wait_event_timeout

Finally, with ixgbe_shutdown() it takes 0.37 for each device, but that time
is spread all over the place, with bigger offenders:

    ixgbe_shutdown
      __ixgbe_shutdown
        ixgbe_close_suspend
          ixgbe_down
            ixgbe_init_hw_generic
              ixgbe_reset_hw_X540
                msleep(100);                        0.104483472
                ixgbe_get_san_mac_addr_generic      0.048414851
                ixgbe_get_wwn_prefix_generic        0.048409893
              ixgbe_start_hw_X540
                ixgbe_start_hw_generic
                  ixgbe_clear_hw_cntrs_generic      0.048581502
                  ixgbe_setup_fc_generic            0.024225800

    All the ixgbe_*generic functions end-up calling:
    ixgbe_read_eerd_X540()
      ixgbe_acquire_swfw_sync_X540
        usleep_range(5000, 6000);
      ixgbe_release_swfw_sync_X540
        usleep_range(5000, 6000);

While these are short sleeps, they end up being called over 24 times!
24 * 0.0055s = 0.132s. Adding-up to 0.528s for four devices.

While we should keep optimizing the individual device drivers, in some
cases this is simply a hardware property that forces a specific delay, and
we must wait.

So, the solution for this problem is to shutdown devices in parallel.
However, we must shutdown children before shutting down parents, so parent
device must wait for its children to finish.

With this patch, on the same machine device_shutdown() takes 1.142s, and
without the one-second mlx4 delay only 0.38s.

Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
---
 drivers/base/core.c | 238 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 189 insertions(+), 49 deletions(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index b610816eb887..f370369a303b 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -25,6 +25,7 @@
 #include <linux/netdevice.h>
 #include <linux/sched/signal.h>
 #include <linux/sysfs.h>
+#include <linux/kthread.h>
 
 #include "base.h"
 #include "power/power.h"
@@ -2102,6 +2103,59 @@ const char *device_get_devnode(struct device *dev,
 	return *tmp = s;
 }
 
+/**
+ * device_children_count - device children count
+ * @parent: parent struct device.
+ *
+ * Returns number of children for this device or 0 if none.
+ */
+static int device_children_count(struct device *parent)
+{
+	struct klist_iter i;
+	int children = 0;
+
+	if (!parent->p)
+		return 0;
+
+	klist_iter_init(&parent->p->klist_children, &i);
+	while (next_device(&i))
+		children++;
+	klist_iter_exit(&i);
+
+	return children;
+}
+
+/**
+ * device_get_child_by_index - Return child using the provided index.
+ * @parent: parent struct device.
+ * @index:  Index of the child, where 0 is the first child in the children list,
+ * and so on.
+ *
+ * Returns child or NULL if child with this index is not present.
+ */
+static struct device *
+device_get_child_by_index(struct device *parent, int index)
+{
+	struct klist_iter i;
+	struct device *dev = NULL, *d;
+	int child_index = 0;
+
+	if (!parent->p || index < 0)
+		return NULL;
+
+	klist_iter_init(&parent->p->klist_children, &i);
+	while ((d = next_device(&i)) != NULL) {
+		if (child_index == index) {
+			dev = d;
+			break;
+		}
+		child_index++;
+	}
+	klist_iter_exit(&i);
+
+	return dev;
+}
+
 /**
  * device_for_each_child - device child iterator.
  * @parent: parent struct device.
@@ -2765,71 +2819,157 @@ int device_move(struct device *dev, struct device *new_parent,
 }
 EXPORT_SYMBOL_GPL(device_move);
 
+/*
+ * device_shutdown_one - call ->shutdown() for the device passed as
+ * argument.
+ */
+static void device_shutdown_one(struct device *dev)
+{
+	/* Don't allow any more runtime suspends */
+	pm_runtime_get_noresume(dev);
+	pm_runtime_barrier(dev);
+
+	if (dev->class && dev->class->shutdown_pre) {
+		if (initcall_debug)
+			dev_info(dev, "shutdown_pre\n");
+		dev->class->shutdown_pre(dev);
+	}
+	if (dev->bus && dev->bus->shutdown) {
+		if (initcall_debug)
+			dev_info(dev, "shutdown\n");
+		dev->bus->shutdown(dev);
+	} else if (dev->driver && dev->driver->shutdown) {
+		if (initcall_debug)
+			dev_info(dev, "shutdown\n");
+		dev->driver->shutdown(dev);
+	}
+
+	/* Release device lock, and decrement the reference counter */
+	device_unlock(dev);
+	put_device(dev);
+}
+
+static DECLARE_COMPLETION(device_root_tasks_complete);
+static void device_shutdown_tree(struct device *dev);
+static atomic_t device_root_tasks;
+
+/*
+ * Passed as an argument to device_shutdown_task().
+ * child_next_index	the next available child index.
+ * tasks_running	number of tasks still running. Each task decrements it
+ *			when its job is finished and the last task signals that
+ *			the job is complete.
+ * complete		Used to signal job completion.
+ * parent		Parent device.
+ */
+struct device_shutdown_task_data {
+	atomic_t		child_next_index;
+	atomic_t		tasks_running;
+	struct completion	complete;
+	struct device		*parent;
+};
+
+static int device_shutdown_task(void *data)
+{
+	struct device_shutdown_task_data *tdata =
+		(struct device_shutdown_task_data *)data;
+	int child_idx = atomic_inc_return(&tdata->child_next_index) - 1;
+	struct device *dev = device_get_child_by_index(tdata->parent,
+						       child_idx);
+
+	if (dev)
+		device_shutdown_tree(dev);
+	if (atomic_dec_return(&tdata->tasks_running) == 0)
+		complete(&tdata->complete);
+	return 0;
+}
+
+/*
+ * Shutdown device tree with root started in dev. If dev has no children
+ * simply shutdown only this device. If dev has children recursively shutdown
+ * children first, and only then the parent. For performance reasons children
+ * are shutdown in parallel using kernel threads.
+ */
+static void device_shutdown_tree(struct device *dev)
+{
+	int children_count = device_children_count(dev);
+
+	if (children_count) {
+		struct device_shutdown_task_data tdata;
+		int i;
+
+		init_completion(&tdata.complete);
+		atomic_set(&tdata.child_next_index, 0);
+		atomic_set(&tdata.tasks_running, children_count);
+		tdata.parent = dev;
+
+		for (i = 0; i < children_count; i++) {
+			kthread_run(device_shutdown_task,
+				    &tdata, "device_shutdown.%s",
+				    dev_name(dev));
+		}
+		wait_for_completion(&tdata.complete);
+	}
+	device_shutdown_one(dev);
+}
+
+/*
+ * On shutdown each root device (the one that does not have a parent) goes
+ * through this function.
+ */
+static int
+device_shutdown_root_task(void *data)
+{
+	struct device *dev = (struct device *)data;
+
+	device_shutdown_tree(dev);
+	if (atomic_dec_return(&device_root_tasks) == 0)
+		complete(&device_root_tasks_complete);
+	return 0;
+}
+
 /**
  * device_shutdown - call ->shutdown() on each device to shutdown.
  */
 void device_shutdown(void)
 {
-	struct device *dev, *parent;
+	struct list_head *pos, *next;
+	int root_devices = 0;
+	struct device *dev;
 
 	spin_lock(&devices_kset->list_lock);
 	/*
-	 * Walk the devices list backward, shutting down each in turn.
-	 * Beware that device unplug events may also start pulling
-	 * devices offline, even as the system is shutting down.
+	 * Prepare devices for shutdown: lock, and increment references in every
+	 * devices. Remove child devices from the list, and count number of root
+	 * devices.
 	 */
-	while (!list_empty(&devices_kset->list)) {
-		dev = list_entry(devices_kset->list.prev, struct device,
-				kobj.entry);
+	list_for_each_safe(pos, next, &devices_kset->list) {
+		dev = list_entry(pos, struct device, kobj.entry);
 
-		/*
-		 * hold reference count of device's parent to
-		 * prevent it from being freed because parent's
-		 * lock is to be held
-		 */
-		parent = get_device(dev->parent);
 		get_device(dev);
-		/*
-		 * Make sure the device is off the kset list, in the
-		 * event that dev->*->shutdown() doesn't remove it.
-		 */
-		list_del_init(&dev->kobj.entry);
-		spin_unlock(&devices_kset->list_lock);
-
-		/* hold lock to avoid race with probe/release */
-		if (parent)
-			device_lock(parent);
 		device_lock(dev);
 
-		/* Don't allow any more runtime suspends */
-		pm_runtime_get_noresume(dev);
-		pm_runtime_barrier(dev);
-
-		if (dev->class && dev->class->shutdown_pre) {
-			if (initcall_debug)
-				dev_info(dev, "shutdown_pre\n");
-			dev->class->shutdown_pre(dev);
-		}
-		if (dev->bus && dev->bus->shutdown) {
-			if (initcall_debug)
-				dev_info(dev, "shutdown\n");
-			dev->bus->shutdown(dev);
-		} else if (dev->driver && dev->driver->shutdown) {
-			if (initcall_debug)
-				dev_info(dev, "shutdown\n");
-			dev->driver->shutdown(dev);
-		}
-
-		device_unlock(dev);
-		if (parent)
-			device_unlock(parent);
-
-		put_device(dev);
-		put_device(parent);
-
+		if (!dev->parent)
+			root_devices++;
+		else
+			list_del_init(&dev->kobj.entry);
+	}
+	atomic_set(&device_root_tasks, root_devices);
+	/*
+	 * Shutdown the root devices in parallel. The children are going to be
+	 * shutdown first.
+	 */
+	list_for_each_safe(pos, next, &devices_kset->list) {
+		dev = list_entry(pos, struct device, kobj.entry);
+		list_del_init(&dev->kobj.entry);
+		spin_unlock(&devices_kset->list_lock);
+		kthread_run(device_shutdown_root_task,
+			    dev, "device_root_shutdown.%s",
+			    dev_name(dev));
 		spin_lock(&devices_kset->list_lock);
 	}
 	spin_unlock(&devices_kset->list_lock);
+	wait_for_completion(&device_root_tasks_complete);
 }
 
 /*
-- 
2.17.0

^ permalink raw reply related

* [PATCH 1/2] ixgbe: release lock for the duration of ixgbe_suspend_close()
From: Pavel Tatashin @ 2018-05-03  3:59 UTC (permalink / raw)
  To: pasha.tatashin, steven.sistare, daniel.m.jordan, linux-kernel,
	jeffrey.t.kirsher, intel-wired-lan, netdev, gregkh
In-Reply-To: <20180503035931.22439-1-pasha.tatashin@oracle.com>

Currently, during device_shutdown() ixgbe holds rtnl_lock for the duration
of lengthy ixgbe_close_suspend(). On machines with multiple ixgbe cards
this lock prevents scaling if device_shutdown() function is multi-threaded.

It is not necessary to hold this lock during ixgbe_close_suspend(), as it
is not held when ixgbe_close() is called, which also happens during
shutdown in the kexec case.

Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index afadba99f7b8..e7875b58854b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -6748,8 +6748,15 @@ static int __ixgbe_shutdown(struct pci_dev *pdev, bool *enable_wake)
 	rtnl_lock();
 	netif_device_detach(netdev);
 
-	if (netif_running(netdev))
+	if (netif_running(netdev)) {
+		/* Suspend takes a long time and device_shutdown() may
+		 * call this function in parallel, so drop the lock for
+		 * the duration of this call.
+		 */
+		rtnl_unlock();
 		ixgbe_close_suspend(adapter);
+		rtnl_lock();
+	}
 
 	ixgbe_clear_interrupt_scheme(adapter);
 	rtnl_unlock();
-- 
2.17.0

^ permalink raw reply related

* [PATCH 0/2] multi-threading device shutdown
From: Pavel Tatashin @ 2018-05-03  3:59 UTC (permalink / raw)
  To: pasha.tatashin, steven.sistare, daniel.m.jordan, linux-kernel,
	jeffrey.t.kirsher, intel-wired-lan, netdev, gregkh

Do a faster shutdown by calling dev->*->shutdown(dev) in parallel.
device_shutdown() calls these functions for every single device but
only using one thread.

Since nothing else is running on the machine by the time device_shutdown()
is called, there is no reason not to utilize all the available CPU
resources.

Pavel Tatashin (2):
  ixgbe: release lock for the duration of ixgbe_suspend_close()
  drivers core: multi-threading device shutdown

 drivers/base/core.c                           | 238 ++++++++++++++----
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   9 +-
 2 files changed, 197 insertions(+), 50 deletions(-)

-- 
2.17.0

^ permalink raw reply

* [bpf-next v1 9/9] samples/bpf: Add example of ipv4 and ipv6 forwarding in XDP
From: David Ahern @ 2018-05-03  3:53 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>

Simple example of fast-path forwarding. It has a serious flaw
in not verifying that the egress device index supports XDP forwarding.
If the egress device does not, packets are dropped.

Take this only as a simple example of fast-path forwarding.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_fwd_kern.c                | 113 +++++++++++++++++++++++++
 samples/bpf/xdp_fwd_user.c                | 136 ++++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 4 files changed, 256 insertions(+)
 create mode 100644 samples/bpf/xdp_fwd_kern.c
 create mode 100644 samples/bpf/xdp_fwd_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 5e31770ac087..393dac1c43f4 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -45,6 +45,7 @@ hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
 hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
+hostprogs-y += xdp_fwd
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -98,6 +99,7 @@ xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
 xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
+xdp_fwd-objs := bpf_load.o $(LIBBPF) xdp_fwd_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -151,6 +153,7 @@ always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
 always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
+always += xdp_fwd_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -197,6 +200,7 @@ HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 HOSTLOADLIBES_cpustat += -lelf
 HOSTLOADLIBES_xdp_adjust_tail += -lelf
+HOSTLOADLIBES_xdp_fwd += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
new file mode 100644
index 000000000000..7eeaa32538b1
--- /dev/null
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include "bpf_helpers.h"
+
+#define IPV6_FLOWINFO_MASK              cpu_to_be32(0x0FFFFFFF)
+
+struct bpf_map_def SEC("maps") tx_port = {
+	.type = BPF_MAP_TYPE_DEVMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 64,
+};
+
+static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	struct bpf_fib_lookup fib_params;
+	struct ethhdr *eth = data;
+	int out_index;
+	u16 h_proto;
+	u64 nh_off;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return XDP_DROP;
+
+	__builtin_memset(&fib_params, 0, sizeof(fib_params));
+
+	h_proto = eth->h_proto;
+	if (h_proto == htons(ETH_P_IP)) {
+		struct iphdr *iph = data + nh_off;
+
+		if (iph + 1 > data_end)
+			return XDP_DROP;
+
+		fib_params.family	= AF_INET;
+		fib_params.tos		= iph->tos;
+		fib_params.l4_protocol	= iph->protocol;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= ntohs(iph->tot_len);
+		fib_params.ipv4_src	= iph->saddr;
+		fib_params.ipv4_dst	= iph->daddr;
+	} else if (h_proto == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *iph = data + nh_off;
+
+		if (iph + 1 > data_end)
+			return XDP_DROP;
+
+		fib_params.family	= AF_INET6;
+		fib_params.flowlabel	= *(__be32 *)iph & IPV6_FLOWINFO_MASK;
+		fib_params.l4_protocol	= iph->nexthdr;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= ntohs(iph->payload_len);
+		fib_params.ipv6_src	= iph->saddr;
+		fib_params.ipv6_dst	= iph->daddr;
+	} else {
+		return XDP_PASS;
+	}
+
+	fib_params.ifindex = ctx->ingress_ifindex;
+
+	out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+
+	/* verify egress index has xdp support
+	 * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
+	 *       cannot pass map_type 14 into func bpf_map_lookup_elem#1:
+	 * NOTE: without verification that egress index supports XDP
+	 *       forwarding packets are dropped.
+	 */
+	if (out_index > 0) {
+		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+		return bpf_redirect_map(&tx_port, out_index, 0);
+	}
+
+	return XDP_PASS;
+}
+
+SEC("xdp_fwd")
+int xdp_fwd_prog(struct xdp_md *ctx)
+{
+	return xdp_fwd_flags(ctx, 0);
+}
+
+SEC("xdp_fwd_direct")
+int xdp_fwd_direct_prog(struct xdp_md *ctx)
+{
+	return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
new file mode 100644
index 000000000000..9c6606f57126
--- /dev/null
+++ b/samples/bpf/xdp_fwd_user.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/limits.h>
+#include <net/if.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libgen.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "libbpf.h"
+
+
+static int do_attach(int idx, int fd, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, fd, 0);
+	if (err < 0)
+		printf("ERROR: failed to attach program to %s\n", name);
+
+	return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, -1, 0);
+	if (err < 0)
+		printf("ERROR: failed to detach program from %s\n", name);
+
+	return err;
+}
+
+static void usage(const char *prog)
+{
+	fprintf(stderr,
+		"usage: %s [OPTS] interface-list\n"
+		"\nOPTS:\n"
+		"    -d    detach program\n"
+		"    -D    direct table lookups (skip fib rules)\n",
+		prog);
+}
+
+int main(int argc, char **argv)
+{
+	char filename[PATH_MAX];
+	int opt, i, idx, err;
+	int prog_id = 0;
+	int attach = 1;
+	int ret = 0;
+
+	while ((opt = getopt(argc, argv, ":dD")) != -1) {
+		switch (opt) {
+		case 'd':
+			attach = 0;
+			break;
+		case 'D':
+			prog_id = 1;
+			break;
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (optind == argc) {
+		usage(basename(argv[0]));
+		return 1;
+	}
+
+	if (attach) {
+		snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+		if (access(filename, O_RDONLY) < 0) {
+			printf("error accessing file %s: %s\n",
+				filename, strerror(errno));
+			return 1;
+		}
+
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+
+		if (!prog_fd[prog_id]) {
+			printf("load_bpf_file: %s\n", strerror(errno));
+			return 1;
+		}
+	}
+	if (attach) {
+		for (i = 1; i < 64; ++i)
+			bpf_map_update_elem(map_fd[0], &i, &i, 0);
+	}
+
+	for (i = optind; i < argc; ++i) {
+		idx = if_nametoindex(argv[i]);
+		if (!idx)
+			idx = strtoul(argv[i], NULL, 0);
+
+		if (!idx) {
+			fprintf(stderr, "Invalid arg\n");
+			return 1;
+		}
+		if (!attach) {
+			err = do_detach(idx, argv[i]);
+			if (err)
+				ret = err;
+		} else {
+			err = do_attach(idx, prog_fd[prog_id], argv[i]);
+			if (err)
+				ret = err;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 265f8e0e8ada..2375d06c706b 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -103,6 +103,9 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 	(void *) BPF_FUNC_skb_get_xfrm_state;
 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
 	(void *) BPF_FUNC_get_stack;
+static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params,
+			     int plen, __u32 flags) =
+	(void *) BPF_FUNC_fib_lookup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v1 8/9] bpf: Provide helper to do lookups in kernel FIB table
From: David Ahern @ 2018-05-03  3:53 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>

Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet continues up the stack.

If it is to be forwarded, the forwarding can be done directly if the
neighbor is already known. If the neighbor does not exist, the first
few packets go up the stack for neighbor resolution. Once resolved, the
xdp program provides the fast path.

On successful lookup the nexthop dmac, current device smac and egress
device index are returned.

The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
are implemented in this patch. The API includes layer 4 parameters if
the XDP program chooses to do deep packet inspection to allow compare
against ACLs implemented as FIB rules.

Header rewrite is left to the XDP program.

The lookup takes 2 flags:
- BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
  straight to the table associated with the device (expert setting for
  those looking to maximize throughput)

- BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
  Default is an ingress lookup.

Initial performance numbers collected by Jesper, forwarded packets/sec:

       Full stack    XDP FIB lookup    XDP Direct lookup
IPv4   1,947,969       7,074,156          7,415,333
IPv6   1,728,000       6,165,504          7,262,720

These numbers are single CPU core forwarding on a Broadwell
E5-1650 v4 @ 3.60GHz.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/uapi/linux/bpf.h |  83 ++++++++++++++-
 net/core/filter.c        | 263 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 345 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8daef7326bb7..360a1168c353 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -10,6 +10,8 @@
 
 #include <linux/types.h>
 #include <linux/bpf_common.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
 
 /* Extended instruction set based on top of classic BPF */
 
@@ -1801,6 +1803,33 @@ union bpf_attr {
  * 	Return
  * 		a non-negative value equal to or less than size on success, or
  * 		a negative error in case of failure.
+ *
+ * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
+ *	Description
+ *		Do FIB lookup in kernel tables using parameters in *params*.
+ *		If lookup is successful and result shows packet is to be
+ *		forwarded, the neighbor tables are searched for the nexthop.
+ *		If successful (i.e., FIB lookup shows forwarding and nexthop
+ *		is resolved), the nexthop address is returned in ipv4_dst,
+ *		ipv6_dst or mpls_out based on family, smac is set to mac
+ *		address of egress device, dmac is set to nexthop mac address,
+ *		rt_metric is set to metric from route.
+ *
+ *		*plen* argument is the size of the passed in struct.
+ *		*flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *
+ *		**BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
+ *		full lookup using FIB rules
+ *		**BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
+ *		perspective (default is ingress)
+ *
+ *		*ctx* is either **struct xdp_md** for XDP programs or
+ *		**struct sk_buff** for tc cls_act programs.
+ *
+ *	Return
+ *		Egress device index on success, 0 if packet needs to continue
+ *		up the stack for further processing or a negative error in case
+ *		of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1870,7 +1899,8 @@ union bpf_attr {
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
-	FN(get_stack),
+	FN(get_stack),			\
+	FN(fib_lookup),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2278,4 +2308,55 @@ struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
 
+/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT:  Do lookup from egress perspective; default is ingress
+ */
+#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
+
+struct bpf_fib_lookup {
+	/* input */
+	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+
+	/* set if lookup is to consider L4 data - e.g., FIB rules */
+	__u8	l4_protocol;
+	__be16	sport;
+	__be16	dport;
+
+	/* total length of packet from network header - used for MTU check */
+	__u16	tot_len;
+	__u32	ifindex;  /* L3 device index for lookup */
+
+	union {
+		/* inputs to lookup */
+		__u8	tos;		/* AF_INET  */
+		__be32	flowlabel;	/* AF_INET6 */
+
+		/* output: metric of fib result */
+		__u32 rt_metric;
+	};
+
+	union {
+		__be32		mpls_in;
+		__be32		ipv4_src;
+		struct in6_addr	ipv6_src;
+	};
+
+	/* input to bpf_fib_lookup, *dst is destination address.
+	 * output: bpf_fib_lookup sets to gateway address
+	 */
+	union {
+		/* return for MPLS lookups */
+		__be32		mpls_out[4];  /* support up to 4 labels */
+		__be32		ipv4_dst;
+		struct in6_addr	ipv6_dst;
+	};
+
+	/* output */
+	__be16	h_vlan_proto;
+	__be16	h_vlan_TCI;
+	__u8	smac[ETH_ALEN];
+	__u8	dmac[ETH_ALEN];
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index d3781daa26ab..c34ba2675a98 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -59,6 +59,10 @@
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <linux/bpf_trace.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -3788,6 +3792,261 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+				  const struct neighbour *neigh,
+				  const struct net_device *dev)
+{
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+	params->h_vlan_TCI = 0;
+	params->h_vlan_proto = 0;
+
+	return dev->ifindex;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags)
+{
+	struct in_device *in_dev;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib_result res;
+	struct fib_nh *nh;
+	struct flowi4 fl4;
+	int err;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	/* verify forwarding is enabled on this interface */
+	in_dev = __in_dev_get_rcu(dev);
+	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl4.flowi4_iif = 1;
+		fl4.flowi4_oif = params->ifindex;
+	} else {
+		fl4.flowi4_iif = params->ifindex;
+		fl4.flowi4_oif = 0;
+	}
+	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_flags = 0;
+
+	fl4.flowi4_proto = params->l4_protocol;
+	fl4.daddr = params->ipv4_dst;
+	fl4.saddr = params->ipv4_src;
+	fl4.fl4_sport = params->sport;
+	fl4.fl4_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib_table *tb;
+
+		tb = fib_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+	} else {
+		fl4.flowi4_mark = 0;
+		fl4.flowi4_secid = 0;
+		fl4.flowi4_tun_key.tun_id = 0;
+		fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+	}
+
+	if (err || res.type != RTN_UNICAST)
+		return 0;
+
+	if (res.fi->fib_nhs > 1)
+		fib_select_path(net, &res, &fl4, NULL);
+
+	nh = &res.fi->fib_nh[res.nh_sel];
+
+	/* do not handle lwt encaps right now */
+	if (nh->nh_lwtstate)
+		return 0;
+
+	dev = nh->nh_dev;
+	if (unlikely(!dev))
+		return 0;
+
+	if (nh->nh_gw)
+		params->ipv4_dst = nh->nh_gw;
+
+	params->rt_metric = res.fi->fib_priority;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so
+	 * rcu_read_lock_bh is not needed here
+	 */
+	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags)
+{
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib6_info *f6i;
+	struct flowi6 fl6;
+	int strict = 0;
+	int oif;
+
+	/* link local addresses are never forwarded */
+	if (rt6_need_strict(&params->ipv6_dst) ||
+	    rt6_need_strict(&params->ipv6_src))
+		return 0;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl6.flowi6_iif = 1;
+		oif = fl6.flowi6_oif = params->ifindex;
+	} else {
+		oif = fl6.flowi6_iif = params->ifindex;
+		fl6.flowi6_oif = 0;
+		strict = RT6_LOOKUP_F_HAS_SADDR;
+	}
+	fl6.flowlabel = params->flowlabel;
+	fl6.flowi6_scope = 0;
+	fl6.flowi6_flags = 0;
+	fl6.mp_hash = 0;
+
+	fl6.flowi6_proto = params->l4_protocol;
+	fl6.daddr = params->ipv6_dst;
+	fl6.saddr = params->ipv6_src;
+	fl6.fl6_sport = params->sport;
+	fl6.fl6_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib6_table *tb;
+
+		tb = ipv6_stub->fib6_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+	} else {
+		fl6.flowi6_mark = 0;
+		fl6.flowi6_secid = 0;
+		fl6.flowi6_tun_key.tun_id = 0;
+		fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+	}
+
+	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+		return 0;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
+	    f6i->fib6_type != RTN_UNICAST))
+		return 0;
+
+	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
+		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+						       fl6.flowi6_oif, NULL,
+						       strict);
+
+	if (f6i->fib6_nh.nh_lwtstate)
+		return 0;
+
+	if (f6i->fib6_flags & RTF_GATEWAY)
+		params->ipv6_dst = f6i->fib6_nh.nh_gw;
+
+	dev = f6i->fib6_nh.nh_dev;
+	params->rt_metric = f6i->fib6_metric;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
+	 * because we need to get nd_tbl via the stub
+	 */
+	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+				      ndisc_hashfn, &params->ipv6_dst, dev);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags);
+#endif
+	}
+	return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
+	.func		= bpf_xdp_fib_lookup,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags);
+#endif
+	}
+	return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
+	.func		= bpf_skb_fib_lookup,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3933,6 +4192,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
+	case BPF_FUNC_fib_lookup:
+		return &bpf_skb_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -3958,6 +4219,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_map_proto;
 	case BPF_FUNC_xdp_adjust_tail:
 		return &bpf_xdp_adjust_tail_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_xdp_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v1 7/9] net/ipv6: Add fib lookup stubs for use in bpf helper
From: David Ahern @ 2018-05-03  3:53 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>

Add stubs to retrieve a handle to an IPv6 FIB table, fib6_get_table,
a stub to do a lookup in a specific table, fib6_table_lookup, and
a stub for a full route lookup.

The stubs are needed for core bpf code to handle the case when the
IPv6 module is not builtin.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/addrconf.h   | 14 ++++++++++++++
 net/ipv6/addrconf_core.c | 33 ++++++++++++++++++++++++++++++++-
 net/ipv6/af_inet6.c      |  6 +++++-
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 8312cc25a3af..ff766ab207e0 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -223,6 +223,20 @@ struct ipv6_stub {
 				 const struct in6_addr *addr);
 	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
 			       struct dst_entry **dst, struct flowi6 *fl6);
+
+	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
+	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
+					 struct flowi6 *fl6, int flags);
+	struct fib6_info *(*fib6_table_lookup)(struct net *net,
+					      struct fib6_table *table,
+					      int oif, struct flowi6 *fl6,
+					      int flags);
+	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
+						   struct fib6_info *f6i,
+						   struct flowi6 *fl6, int oif,
+						   const struct sk_buff *skb,
+						   int strict);
+
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 32b564dfd02a..2fe754fd4f5e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,8 +134,39 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
 	return -EAFNOSUPPORT;
 }
 
+static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6, int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			 int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
+				   struct flowi6 *fl6, int oif,
+				   const struct sk_buff *skb, int strict)
+{
+	return f6i;
+}
+
 const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
-	.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+	.ipv6_dst_lookup   = eafnosupport_ipv6_dst_lookup,
+	.fib6_get_table    = eafnosupport_fib6_get_table,
+	.fib6_table_lookup = eafnosupport_fib6_table_lookup,
+	.fib6_lookup       = eafnosupport_fib6_lookup,
+	.fib6_multipath_select = eafnosupport_fib6_multipath_select,
 };
 EXPORT_SYMBOL_GPL(ipv6_stub);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 36d622c477b1..c0e8255d50bb 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -887,7 +887,11 @@ static struct pernet_operations inet6_net_ops = {
 static const struct ipv6_stub ipv6_stub_impl = {
 	.ipv6_sock_mc_join = ipv6_sock_mc_join,
 	.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
-	.ipv6_dst_lookup = ip6_dst_lookup,
+	.ipv6_dst_lookup   = ip6_dst_lookup,
+	.fib6_get_table	   = fib6_get_table,
+	.fib6_table_lookup = fib6_table_lookup,
+	.fib6_lookup       = fib6_lookup,
+	.fib6_multipath_select = fib6_multipath_select,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
 	.nd_tbl	= &nd_tbl,
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v1 6/9] net/ipv6: Update fib6 tracepoint to take fib6_info
From: David Ahern @ 2018-05-03  3:53 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>

Similar to IPv4, IPv6 should use the FIB lookup result in the
tracepoint.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/trace/events/fib6.h | 14 +++++++-------
 net/ipv6/route.c            | 14 ++++++--------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 7e8d48a81b91..1b8d951e3c12 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -12,10 +12,10 @@
 
 TRACE_EVENT(fib6_table_lookup,
 
-	TP_PROTO(const struct net *net, const struct rt6_info *rt,
+	TP_PROTO(const struct net *net, const struct fib6_info *f6i,
 		 struct fib6_table *table, const struct flowi6 *flp),
 
-	TP_ARGS(net, rt, table, flp),
+	TP_ARGS(net, f6i, table, flp),
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
@@ -48,20 +48,20 @@ TRACE_EVENT(fib6_table_lookup,
 		in6 = (struct in6_addr *)__entry->dst;
 		*in6 = flp->daddr;
 
-		if (rt->rt6i_idev) {
-			__assign_str(name, rt->rt6i_idev->dev->name);
+		if (f6i->fib6_nh.nh_dev) {
+			__assign_str(name, f6i->fib6_nh.nh_dev);
 		} else {
 			__assign_str(name, "");
 		}
-		if (rt == net->ipv6.ip6_null_entry) {
+		if (f6i == net->ipv6.fib6_null_entry) {
 			struct in6_addr in6_zero = {};
 
 			in6 = (struct in6_addr *)__entry->gw;
 			*in6 = in6_zero;
 
-		} else if (rt) {
+		} else if (f6i) {
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = rt->rt6i_gateway;
+			*in6 = f6i->fib6_nh.nh_gw;
 		}
 	),
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d0ace0c5c3e9..cf8de6899581 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1078,6 +1078,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 			goto restart;
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	/* Search through exception table */
 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
 	if (rt) {
@@ -1096,8 +1098,6 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, rt, table, fl6);
-
 	return rt;
 }
 
@@ -1827,6 +1827,8 @@ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 		}
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	return f6i;
 }
 
@@ -1853,7 +1855,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
 		dst_hold(&rt->dst);
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	}
 
@@ -1864,7 +1865,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_use_noref(&rt->dst, jiffies);
 
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
@@ -1890,9 +1890,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_hold(&uncached_rt->dst);
 		}
 
-		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
 		return uncached_rt;
-
 	} else {
 		/* Get a percpu copy */
 
@@ -1906,7 +1904,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 		local_bh_enable();
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
+
 		return pcpu_rt;
 	}
 }
@@ -2486,7 +2484,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, ret, table, fl6);
+	trace_fib6_table_lookup(net, rt, table, fl6);
 	return ret;
 };
 
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v1 5/9] net/ipv6: Add fib6_lookup
From: David Ahern @ 2018-05-03  3:53 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>

Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules,
but returns a FIB entry, fib6_info, rather than a dst based rt6_info.
fib6_lookup is anywhere from 140% (MULTIPLE_TABLES config disabled)
to 60% faster than any of the dst based lookup methods (without custom
rules) and 25% faster with custom rules (e.g., l3mdev rule).

Since the lookup function has a completely different signature,
fib6_rule_action is split into 2 paths: the existing one is
renamed __fib6_rule_action and a new one for the fib6_info path
is added. fib6_rule_action decides which to call based on the
lookup_ptr. If it is fib6_table_lookup then the new path is taken.

Caller must hold rcu lock as no reference is taken on the returned
fib entry.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  6 ++++
 net/ipv6/fib6_rules.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv6/ip6_fib.c    |  7 +++++
 3 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 4f7b8f59ea6d..d920dd00139b 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; can return error pointer
+ * caller needs to select path
+ */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags);
+
 /* called with rcu lock held; caller needs to select path */
 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 				    int oif, struct flowi6 *fl6, int strict);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d040c4bff3a0..f590446595d8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
 	return fib_rules_seq_read(net, AF_INET6);
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	struct fib6_info *f6i;
+	int err;
+
+	if (net->ipv6.fib6_has_custom_rules) {
+		struct fib_lookup_arg arg = {
+			.lookup_ptr = fib6_table_lookup,
+			.lookup_data = &oif,
+			.flags = FIB_LOOKUP_NOREF,
+		};
+
+		l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+		err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+				       flowi6_to_flowi(fl6), flags, &arg);
+		if (err)
+			return ERR_PTR(err);
+
+		f6i = arg.result ? : net->ipv6.fib6_null_entry;
+	} else {
+		f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
+					oif, fl6, flags);
+		if (!f6i || f6i == net->ipv6.fib6_null_entry)
+			f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+						oif, fl6, flags);
+	}
+
+	return f6i;
+}
+
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup)
@@ -121,8 +154,48 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
 	return 0;
 }
 
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
-			    int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+				int flags, struct fib_lookup_arg *arg)
+{
+	struct flowi6 *flp6 = &flp->u.ip6;
+	struct net *net = rule->fr_net;
+	struct fib6_table *table;
+	struct fib6_info *f6i;
+	int err = -EAGAIN, *oif;
+	u32 tb_id;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+	case FR_ACT_UNREACHABLE:
+		return -ENETUNREACH;
+	case FR_ACT_PROHIBIT:
+		return -EACCES;
+	case FR_ACT_BLACKHOLE:
+	default:
+		return -EINVAL;
+	}
+
+	tb_id = fib_rule_get_table(rule, arg);
+	table = fib6_get_table(net, tb_id);
+	if (!table)
+		return -EAGAIN;
+
+	oif = (int *)arg->lookup_data;
+	f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
+	if (f6i != net->ipv6.fib6_null_entry) {
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      fib6_info_nh_dev(f6i));
+
+		if (likely(!err))
+			arg->result = f6i;
+	}
+
+	return err;
+}
+
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			      int flags, struct fib_lookup_arg *arg)
 {
 	struct flowi6 *flp6 = &flp->u.ip6;
 	struct rt6_info *rt = NULL;
@@ -182,6 +255,15 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 	return err;
 }
 
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	if (arg->lookup_ptr == fib6_table_lookup)
+		return fib6_rule_action_alt(rule, flp, flags, arg);
+
+	return __fib6_rule_action(rule, flp, flags, arg);
+}
+
 static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
 	struct rt6_info *rt = (struct rt6_info *) arg->result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4cfffa0f676e..0b94c0a631cb 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -354,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &rt->dst;
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+}
+
 static void __net_init fib6_tables_init(struct net *net)
 {
 	fib6_link_table(net, net->ipv6.fib6_main_tbl);
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v1 4/9] net/ipv6: Refactor fib6_rule_action
From: David Ahern @ 2018-05-03  3:53 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>

Move source address lookup from fib6_rule_action to a helper. It will be
used in a later patch by a second variant for fib6_rule_action.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/fib6_rules.c | 52 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 6547fc6491a6..d040c4bff3a0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -96,6 +96,31 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &net->ipv6.ip6_null_entry->dst;
 }
 
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+			   struct flowi6 *flp6, const struct net_device *dev)
+{
+	struct fib6_rule *r = (struct fib6_rule *)rule;
+
+	/* If we need to find a source address for this traffic,
+	 * we check the result if it meets requirement of the rule.
+	 */
+	if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+	    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+		struct in6_addr saddr;
+
+		if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+				       rt6_flags2srcprefs(flags), &saddr))
+			return -EAGAIN;
+
+		if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+			return -EAGAIN;
+
+		flp6->saddr = saddr;
+	}
+
+	return 0;
+}
+
 static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 			    int flags, struct fib_lookup_arg *arg)
 {
@@ -134,27 +159,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 
 	rt = lookup(net, table, flp6, arg->lookup_data, flags);
 	if (rt != net->ipv6.ip6_null_entry) {
-		struct fib6_rule *r = (struct fib6_rule *)rule;
-
-		/*
-		 * If we need to find a source address for this traffic,
-		 * we check the result if it meets requirement of the rule.
-		 */
-		if ((rule->flags & FIB_RULE_FIND_SADDR) &&
-		    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
-			struct in6_addr saddr;
-
-			if (ipv6_dev_get_saddr(net,
-					       ip6_dst_idev(&rt->dst)->dev,
-					       &flp6->daddr,
-					       rt6_flags2srcprefs(flags),
-					       &saddr))
-				goto again;
-			if (!ipv6_prefix_equal(&saddr, &r->src.addr,
-					       r->src.plen))
-				goto again;
-			flp6->saddr = saddr;
-		}
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      ip6_dst_idev(&rt->dst)->dev);
+
+		if (err == -EAGAIN)
+			goto again;
+
 		err = rt->dst.error;
 		if (err != -EAGAIN)
 			goto out;
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v1 3/9] net/ipv6: Extract table lookup from ip6_pol_route
From: David Ahern @ 2018-05-03  3:53 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>

ip6_pol_route is used for ingress and egress FIB lookups. Refactor it
moving the table lookup into a separate fib6_table_lookup that can be
invoked separately and export the new function.

ip6_pol_route now calls fib6_table_lookup and uses the result to generate
a dst based rt6_info.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  4 ++++
 net/ipv6/route.c      | 39 +++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 80d76d8dc683..4f7b8f59ea6d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; caller needs to select path */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict);
+
 struct fib6_info *fib6_multipath_select(const struct net *net,
 					struct fib6_info *match,
 					struct flowi6 *fl6, int oif,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 58af969f3a2c..d0ace0c5c3e9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1800,21 +1800,12 @@ void rt6_age_exceptions(struct fib6_info *rt,
 	rcu_read_unlock_bh();
 }
 
-struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
-			       int oif, struct flowi6 *fl6,
-			       const struct sk_buff *skb, int flags)
+/* must be called with rcu lock held */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict)
 {
 	struct fib6_node *fn, *saved_fn;
 	struct fib6_info *f6i;
-	struct rt6_info *rt;
-	int strict = 0;
-
-	strict |= flags & RT6_LOOKUP_F_IFACE;
-	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
-	if (net->ipv6.devconf_all->forwarding == 0)
-		strict |= RT6_LOOKUP_F_REACHABLE;
-
-	rcu_read_lock();
 
 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
@@ -1824,8 +1815,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
-	if (f6i->fib6_nsiblings)
-		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
@@ -1838,6 +1827,28 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		}
 	}
 
+	return f6i;
+}
+
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6,
+			       const struct sk_buff *skb, int flags)
+{
+	struct fib6_info *f6i;
+	struct rt6_info *rt;
+	int strict = 0;
+
+	strict |= flags & RT6_LOOKUP_F_IFACE;
+	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
+	if (net->ipv6.devconf_all->forwarding == 0)
+		strict |= RT6_LOOKUP_F_REACHABLE;
+
+	rcu_read_lock();
+
+	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
+	if (f6i->fib6_nsiblings)
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
+
 	if (f6i == net->ipv6.fib6_null_entry) {
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v1 2/9] net/ipv6: Rename rt6_multipath_select
From: David Ahern @ 2018-05-03  3:53 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>

Rename rt6_multipath_select to fib6_multipath_select and export it.
A later patch wants access to it similar to IPv4's fib_select_path.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  5 +++++
 net/ipv6/route.c      | 17 +++++++++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5a16630179cb..80d76d8dc683 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+struct fib6_info *fib6_multipath_select(const struct net *net,
+					struct fib6_info *match,
+					struct flowi6 *fl6, int oif,
+					const struct sk_buff *skb, int strict);
+
 struct fib6_node *fib6_node_lookup(struct fib6_node *root,
 				   const struct in6_addr *daddr,
 				   const struct in6_addr *saddr);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d903db30dfff..58af969f3a2c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -419,11 +419,11 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
-static struct fib6_info *rt6_multipath_select(const struct net *net,
-					      struct fib6_info *match,
-					     struct flowi6 *fl6, int oif,
-					     const struct sk_buff *skb,
-					     int strict)
+struct fib6_info *fib6_multipath_select(const struct net *net,
+					struct fib6_info *match,
+					struct flowi6 *fl6, int oif,
+					const struct sk_buff *skb,
+					int strict)
 {
 	struct fib6_info *sibling, *next_sibling;
 
@@ -1068,8 +1068,9 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
 				      fl6->flowi6_oif, flags);
 		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
-			f6i = rt6_multipath_select(net, f6i, fl6,
-						   fl6->flowi6_oif, skb, flags);
+			f6i = fib6_multipath_select(net, f6i, fl6,
+						    fl6->flowi6_oif, skb,
+						    flags);
 	}
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
@@ -1824,7 +1825,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
 	if (f6i->fib6_nsiblings)
-		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v1 1/9] net/ipv6: Rename fib6_lookup to fib6_node_lookup
From: David Ahern @ 2018-05-03  3:53 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180503035319.18290-1-dsahern@gmail.com>

Rename fib6_lookup to fib6_node_lookup to better reflect what it
returns. The fib6_lookup name will be used in a later patch for
an IPv6 equivalent to IPv4's fib_lookup.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  6 +++---
 net/ipv6/ip6_fib.c    | 14 ++++++++------
 net/ipv6/route.c      |  8 ++++----
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1af450d4e923..5a16630179cb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,9 +376,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
-struct fib6_node *fib6_lookup(struct fib6_node *root,
-			      const struct in6_addr *daddr,
-			      const struct in6_addr *saddr);
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr);
 
 struct fib6_node *fib6_locate(struct fib6_node *root,
 			      const struct in6_addr *daddr, int dst_len,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6421c893466e..4cfffa0f676e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1354,8 +1354,8 @@ struct lookup_args {
 	const struct in6_addr	*addr;		/* search key			*/
 };
 
-static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
-				       struct lookup_args *args)
+static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
+					    struct lookup_args *args)
 {
 	struct fib6_node *fn;
 	__be32 dir;
@@ -1400,7 +1400,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 #ifdef CONFIG_IPV6_SUBTREES
 				if (subtree) {
 					struct fib6_node *sfn;
-					sfn = fib6_lookup_1(subtree, args + 1);
+					sfn = fib6_node_lookup_1(subtree,
+								 args + 1);
 					if (!sfn)
 						goto backtrack;
 					fn = sfn;
@@ -1422,8 +1423,9 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 
 /* called with rcu_read_lock() held
  */
-struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
-			      const struct in6_addr *saddr)
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr)
 {
 	struct fib6_node *fn;
 	struct lookup_args args[] = {
@@ -1442,7 +1444,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
 		}
 	};
 
-	fn = fib6_lookup_1(root, daddr ? args : args + 1);
+	fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
 	if (!fn || fn->fn_flags & RTN_TL_ROOT)
 		fn = root;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7ee0a34fba46..d903db30dfff 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1006,7 +1006,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
 		pn = rcu_dereference(fn->parent);
 		sn = FIB6_SUBTREE(pn);
 		if (sn && sn != fn)
-			fn = fib6_lookup(sn, NULL, saddr);
+			fn = fib6_node_lookup(sn, NULL, saddr);
 		else
 			fn = pn;
 		if (fn->fn_flags & RTN_RTINFO)
@@ -1059,7 +1059,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		flags &= ~RT6_LOOKUP_F_IFACE;
 
 	rcu_read_lock();
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	f6i = rcu_dereference(fn->leaf);
 	if (!f6i) {
@@ -1815,7 +1815,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 	rcu_read_lock();
 
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
 
 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
@@ -2420,7 +2420,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	 */
 
 	rcu_read_lock();
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	for_each_fib6_node_rt_rcu(fn) {
 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
-- 
2.11.0

^ permalink raw reply related

* [bpf-next v1 0/9] bpf: Add helper to do FIB lookups
From: David Ahern @ 2018-05-03  3:53 UTC (permalink / raw)
  To: netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern

Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet is expected to continue up the stack
for full processing.

The response from a FIB and neighbor lookup is either the egress index
with the bpf_fib_lookup struct filled in with dmac and gateway or
0 meaning the packet should continue up the stack. In time we can
revisit this to return the FIB lookup result errno if it is one of the
special RTN_'s such as RTN_BLACKHOLE (-EINVAL) so that the XDP
programs can do an early drop if desired.

Patches 1-6 do some more refactoring to IPv6 with the end goal of
extracting a FIB lookup function that aligns with fib_lookup for IPv4,
basically returning a fib6_info without creating a dst based entry.

Patch 7 adds lookup functions to the ipv6 stub. These are needed since
bpf is built into the kernel and ipv6 may not be built or loaded.

Patch 8 adds the bpf helper and 9 adds a sample program.

v1
- updated commit messages and cover letter
- added comment to sample program noting lack of verification on
  egress device supporting XDP

RFC v2
- fixed use of forward helper from cls_act as noted by Daniel
- in patch 1 rename fib6_lookup_1 as well for consistency


David Ahern (9):
  net/ipv6: Rename fib6_lookup to fib6_node_lookup
  net/ipv6: Rename rt6_multipath_select
  net/ipv6: Extract table lookup from ip6_pol_route
  net/ipv6: Refactor fib6_rule_action
  net/ipv6: Add fib6_lookup
  net/ipv6: Update fib6 tracepoint to take fib6_info
  net/ipv6: Add fib lookup stubs for use in bpf helper
  bpf: Provide helper to do lookups in kernel FIB table
  samples/bpf: Add example of ipv4 and ipv6 forwarding in XDP

 include/net/addrconf.h                    |  14 ++
 include/net/ip6_fib.h                     |  21 ++-
 include/trace/events/fib6.h               |  14 +-
 include/uapi/linux/bpf.h                  |  83 +++++++++-
 net/core/filter.c                         | 263 ++++++++++++++++++++++++++++++
 net/ipv6/addrconf_core.c                  |  33 +++-
 net/ipv6/af_inet6.c                       |   6 +-
 net/ipv6/fib6_rules.c                     | 138 +++++++++++++---
 net/ipv6/ip6_fib.c                        |  21 ++-
 net/ipv6/route.c                          |  76 +++++----
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_fwd_kern.c                | 113 +++++++++++++
 samples/bpf/xdp_fwd_user.c                | 136 +++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 14 files changed, 850 insertions(+), 75 deletions(-)
 create mode 100644 samples/bpf/xdp_fwd_kern.c
 create mode 100644 samples/bpf/xdp_fwd_user.c

-- 
2.11.0

^ permalink raw reply

* [PATCH net] tcp: restore autocorking
From: Eric Dumazet @ 2018-05-03  3:25 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Michael Wenig, Eric Dumazet

When adding rb-tree for TCP retransmit queue, we inadvertently broke
TCP autocorking.

tcp_should_autocork() should really check if the rtx queue is not empty.

Tested:

Before the fix :
$ nstat -n;./netperf -H 10.246.7.152 -Cc -- -m 500;nstat | grep AutoCork
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

540000 262144    500    10.00      2682.85   2.47     1.59     3.618   2.329
TcpExtTCPAutoCorking            33                 0.0

// Same test, but forcing TCP_NODELAY
$ nstat -n;./netperf -H 10.246.7.152 -Cc -- -D -m 500;nstat | grep AutoCork
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET : nodelay
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

540000 262144    500    10.00      1408.75   2.44     2.96     6.802   8.259
TcpExtTCPAutoCorking            1                  0.0

After the fix :
$ nstat -n;./netperf -H 10.246.7.152 -Cc -- -m 500;nstat | grep AutoCork
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

540000 262144    500    10.00      5472.46   2.45     1.43     1.761   1.027
TcpExtTCPAutoCorking            361293             0.0

// With TCP_NODELAY option
$ nstat -n;./netperf -H 10.246.7.152 -Cc -- -D -m 500;nstat | grep AutoCork
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.246.7.152 () port 0 AF_INET : nodelay
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

540000 262144    500    10.00      5454.96   2.46     1.63     1.775   1.174
TcpExtTCPAutoCorking            315448             0.0

Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Michael Wenig <mwenig@vmware.com>
Tested-by: Michael Wenig <mwenig@vmware.com>
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 44be7f43455e4aefde8db61e2d941a69abcc642a..c9d00ef54deca15d5760bcbe154001a96fa1e2a7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -697,7 +697,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
 {
 	return skb->len < size_goal &&
 	       sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
-	       skb != tcp_write_queue_head(sk) &&
+	       !tcp_rtx_queue_empty(sk) &&
 	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
 }
 
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* Re: [PATCH net] ipv4: fix fnhe usage by non-cached routes
From: David Miller @ 2018-05-03  2:55 UTC (permalink / raw)
  To: ja; +Cc: netdev, kafai, kernel-team, dsahern, lucien.xin
In-Reply-To: <20180502064119.4552-1-ja@ssi.bg>

From: Julian Anastasov <ja@ssi.bg>
Date: Wed,  2 May 2018 09:41:19 +0300

> Allow some non-cached routes to use non-expired fnhe:
> 
> 1. ip_del_fnhe: moved above and now called by find_exception.
> The 4.5+ commit deed49df7390 expires fnhe only when caching
> routes. Change that to:
> 
> 1.1. use fnhe for non-cached local output routes, with the help
> from (2)
> 
> 1.2. allow __mkroute_input to detect expired fnhe (outdated
> fnhe_gw, for example) when do_cache is false, eg. when itag!=0
> for unicast destinations.
> 
> 2. __mkroute_output: keep fi to allow local routes with orig_oif != 0
> to use fnhe info even when the new route will not be cached into fnhe.
> After commit 839da4d98960 ("net: ipv4: set orig_oif based on fib
> result for local traffic") it means all local routes will be affected
> because they are not cached. This change is used to solve a PMTU
> problem with IPVS (and probably Netfilter DNAT) setups that redirect
> local clients from target local IP (local route to Virtual IP)
> to new remote IP target, eg. IPVS TUN real server. Loopback has
> 64K MTU and we need to create fnhe on the local route that will
> keep the reduced PMTU for the Virtual IP. Without this change
> fnhe_pmtu is updated from ICMP but never exposed to non-cached
> local routes. This includes routes with flowi4_oif!=0 for 4.6+ and
> with flowi4_oif=any for 4.14+).
> 
> 3. update_or_create_fnhe: make sure fnhe_expires is not 0 for
> new entries
> 
> Fixes: 839da4d98960 ("net: ipv4: set orig_oif based on fib result for local traffic")
> Fixes: d6d5e999e5df ("route: do not cache fib route info on local routes with oif")
> Fixes: deed49df7390 ("route: check and remove route cache when we get route")
> Cc: David Ahern <dsahern@gmail.com>
> Cc: Xin Long <lucien.xin@gmail.com>
> Signed-off-by: Julian Anastasov <ja@ssi.bg>

Applied and queued up for -stable, thanks Julian.

^ permalink raw reply

* Re: pull-request: bpf 2018-05-03
From: David Miller @ 2018-05-03  2:47 UTC (permalink / raw)
  To: daniel; +Cc: ast, netdev
In-Reply-To: <20180503003712.749-1-daniel@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu,  3 May 2018 02:37:12 +0200

> The following pull-request contains BPF updates for your *net* tree.
> 
> The main changes are:
 ...
> Please consider pulling these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git

Pulled, thanks Daniel.

^ permalink raw reply

* Re: [RFC v3 4/5] virtio_ring: add event idx support in packed ring
From: Tiwei Bie @ 2018-05-03  2:09 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180503044218-mutt-send-email-mst@kernel.org>

On Thu, May 03, 2018 at 04:44:39AM +0300, Michael S. Tsirkin wrote:
> On Thu, May 03, 2018 at 09:11:16AM +0800, Tiwei Bie wrote:
> > On Wed, May 02, 2018 at 06:42:57PM +0300, Michael S. Tsirkin wrote:
> > > On Wed, May 02, 2018 at 11:12:55PM +0800, Tiwei Bie wrote:
> > > > On Wed, May 02, 2018 at 04:51:01PM +0300, Michael S. Tsirkin wrote:
> > > > > On Wed, May 02, 2018 at 03:28:19PM +0800, Tiwei Bie wrote:
> > > > > > On Wed, May 02, 2018 at 10:51:06AM +0800, Jason Wang wrote:
> > > > > > > On 2018年04月25日 13:15, Tiwei Bie wrote:
> > > > > > > > This commit introduces the event idx support in packed
> > > > > > > > ring. This feature is temporarily disabled, because the
> > > > > > > > implementation in this patch may not work as expected,
> > > > > > > > and some further discussions on the implementation are
> > > > > > > > needed, e.g. do we have to check the wrap counter when
> > > > > > > > checking whether a kick is needed?
> > > > > > > > 
> > > > > > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > > > > > ---
> > > > > > > >   drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++++++----
> > > > > > > >   1 file changed, 49 insertions(+), 4 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > index 0181e93897be..b1039c2985b9 100644
> > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > @@ -986,7 +986,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > > >   static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > > >   {
> > > > > > > >   	struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > -	u16 flags;
> > > > > > > > +	u16 new, old, off_wrap, flags;
> > > > > > > >   	bool needs_kick;
> > > > > > > >   	u32 snapshot;
> > > > > > > > @@ -995,7 +995,12 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > > >   	 * suppressions. */
> > > > > > > >   	virtio_mb(vq->weak_barriers);
> > > > > > > > +	old = vq->next_avail_idx - vq->num_added;
> > > > > > > > +	new = vq->next_avail_idx;
> > > > > > > > +	vq->num_added = 0;
> > > > > > > > +
> > > > > > > >   	snapshot = *(u32 *)vq->vring_packed.device;
> > > > > > > > +	off_wrap = virtio16_to_cpu(_vq->vdev, snapshot & 0xffff);
> > > > > > > >   	flags = cpu_to_virtio16(_vq->vdev, snapshot >> 16) & 0x3;
> > > > > > > >   #ifdef DEBUG
> > > > > > > > @@ -1006,7 +1011,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > > >   	vq->last_add_time_valid = false;
> > > > > > > >   #endif
> > > > > > > > -	needs_kick = (flags != VRING_EVENT_F_DISABLE);
> > > > > > > > +	if (flags == VRING_EVENT_F_DESC)
> > > > > > > > +		needs_kick = vring_need_event(off_wrap & ~(1<<15), new, old);
> > > > > > > 
> > > > > > > I wonder whether or not the math is correct. Both new and event are in the
> > > > > > > unit of descriptor ring size, but old looks not.
> > > > > > 
> > > > > > What vring_need_event() cares is the distance between
> > > > > > `new` and `old`, i.e. vq->num_added. So I think there
> > > > > > is nothing wrong with `old`. But the calculation of the
> > > > > > distance between `new` and `event_idx` isn't right when
> > > > > > `new` wraps. How do you think about the below code:
> > > > > > 
> > > > > > 	wrap_counter = off_wrap >> 15;
> > > > > > 	event_idx = off_wrap & ~(1<<15);
> > > > > > 	if (wrap_counter != vq->wrap_counter)
> > > > > > 		event_idx -= vq->vring_packed.num;
> > > > > > 	
> > > > > > 	needs_kick = vring_need_event(event_idx, new, old);
> > > > > 
> > > > > I suspect this hack won't work for non power of 2 ring.
> > > > 
> > > > Above code doesn't require the ring size to be a power of 2.
> > > > 
> > > > For (__u16)(new_idx - old), what we want to get is vq->num_added.
> > > > 
> > > > old = vq->next_avail_idx - vq->num_added;
> > > > new = vq->next_avail_idx;
> > > > 
> > > > When vq->next_avail_idx >= vq->num_added, it's obvious that,
> > > > (__u16)(new_idx - old) is vq->num_added.
> > > > 
> > > > And when vq->next_avail_idx < vq->num_added, new will be smaller
> > > > than old (old will be a big unsigned number), but (__u16)(new_idx
> > > > - old) is still vq->num_added.
> > > > 
> > > > For (__u16)(new_idx - event_idx - 1), when new wraps and event_idx
> > > > doesn't wrap, the most straightforward way to calculate it is:
> > > > (new + vq->vring_packed.num) - event_idx - 1.
> > > 
> > > So how about we use the straightforward way then?
> > 
> > You mean we do new += vq->vring_packed.num instead
> > of event_idx -= vq->vring_packed.num before calling
> > vring_need_event()?
> > 
> > The problem is that, the second param (new_idx) of
> > vring_need_event() will be used for:
> > 
> > (__u16)(new_idx - event_idx - 1)
> > (__u16)(new_idx - old)
> > 
> > So if we change new, we will need to change old too.
> 
> I think that since we have a branch there anyway,
> we are better off just special-casing if (wrap_counter != vq->wrap_counter).
> Treat is differenty and avoid casts.
> 
> > And that would be an ugly hack..
> > 
> > Best regards,
> > Tiwei Bie
> 
> I consider casts and huge numbers with two's complement
> games even uglier.

The dependency on the two's complement game has been there
since the split ring.

In packed ring, old is calculated via:

old = vq->next_avail_idx - vq->num_added;

In split ring, old is calculated via:

old = vq->avail_idx_shadow - vq->num_added;

In both cases, when vq->num_added is bigger than the index it
is subtracted from, old will be a big number.

Best regards,
Tiwei Bie

> 
> > > 
> > > > But we can also calculate it in this way:
> > > > 
> > > > event_idx -= vq->vring_packed.num;
> > > > (event_idx will be a big unsigned number)
> > > > 
> > > > Then (__u16)(new_idx - event_idx - 1) will be the value we want.
> > > > 
> > > > Best regards,
> > > > Tiwei Bie
> > > 
> > > 
> > > > > 
> > > > > 
> > > > > > Best regards,
> > > > > > Tiwei Bie
> > > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > Thanks
> > > > > > > 
> > > > > > > > +	else
> > > > > > > > +		needs_kick = (flags != VRING_EVENT_F_DISABLE);
> > > > > > > >   	END_USE(vq);
> > > > > > > >   	return needs_kick;
> > > > > > > >   }
> > > > > > > > @@ -1116,6 +1124,15 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > > >   	if (vq->last_used_idx >= vq->vring_packed.num)
> > > > > > > >   		vq->last_used_idx -= vq->vring_packed.num;
> > > > > > > > +	/* If we expect an interrupt for the next entry, tell host
> > > > > > > > +	 * by writing event index and flush out the write before
> > > > > > > > +	 * the read in the next get_buf call. */
> > > > > > > > +	if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> > > > > > > > +		virtio_store_mb(vq->weak_barriers,
> > > > > > > > +				&vq->vring_packed.driver->off_wrap,
> > > > > > > > +				cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> > > > > > > > +						(vq->wrap_counter << 15)));
> > > > > > > > +
> > > > > > > >   #ifdef DEBUG
> > > > > > > >   	vq->last_add_time_valid = false;
> > > > > > > >   #endif
> > > > > > > > @@ -1143,10 +1160,17 @@ static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > > > > > > >   	/* We optimistically turn back on interrupts, then check if there was
> > > > > > > >   	 * more to do. */
> > > > > > > > +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > > > > > > > +	 * either clear the flags bit or point the event index at the next
> > > > > > > > +	 * entry. Always update the event index to keep code simple. */
> > > > > > > > +
> > > > > > > > +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > > > > > > > +			vq->last_used_idx | (vq->wrap_counter << 15));
> > > > > > > >   	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > > >   		virtio_wmb(vq->weak_barriers);
> > > > > > > > -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > > > > > > > +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > > > > > > > +						     VRING_EVENT_F_ENABLE;
> > > > > > > >   		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > > >   							vq->event_flags_shadow);
> > > > > > > >   	}
> > > > > > > > @@ -1172,15 +1196,34 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> > > > > > > >   static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> > > > > > > >   {
> > > > > > > >   	struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > +	u16 bufs, used_idx, wrap_counter;
> > > > > > > >   	START_USE(vq);
> > > > > > > >   	/* We optimistically turn back on interrupts, then check if there was
> > > > > > > >   	 * more to do. */
> > > > > > > > +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > > > > > > > +	 * either clear the flags bit or point the event index at the next
> > > > > > > > +	 * entry. Always update the event index to keep code simple. */
> > > > > > > > +
> > > > > > > > +	/* TODO: tune this threshold */
> > > > > > > > +	bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> > > > > > > > +
> > > > > > > > +	used_idx = vq->last_used_idx + bufs;
> > > > > > > > +	wrap_counter = vq->wrap_counter;
> > > > > > > > +
> > > > > > > > +	if (used_idx >= vq->vring_packed.num) {
> > > > > > > > +		used_idx -= vq->vring_packed.num;
> > > > > > > > +		wrap_counter ^= 1;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > > > > > > > +			used_idx | (wrap_counter << 15));
> > > > > > > >   	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > > >   		virtio_wmb(vq->weak_barriers);
> > > > > > > > -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > > > > > > > +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > > > > > > > +						     VRING_EVENT_F_ENABLE;
> > > > > > > >   		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > > >   							vq->event_flags_shadow);
> > > > > > > >   	}
> > > > > > > > @@ -1822,8 +1865,10 @@ void vring_transport_features(struct virtio_device *vdev)
> > > > > > > >   		switch (i) {
> > > > > > > >   		case VIRTIO_RING_F_INDIRECT_DESC:
> > > > > > > >   			break;
> > > > > > > > +#if 0
> > > > > > > >   		case VIRTIO_RING_F_EVENT_IDX:
> > > > > > > >   			break;
> > > > > > > > +#endif
> > > > > > > >   		case VIRTIO_F_VERSION_1:
> > > > > > > >   			break;
> > > > > > > >   		case VIRTIO_F_IOMMU_PLATFORM:
> > > > > > > 
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH] sctp: fix a potential missing-check bug
From: Marcelo Ricardo Leitner @ 2018-05-03  1:48 UTC (permalink / raw)
  To: Wenwen Wang
  Cc: Kangjie Lu, Vlad Yasevich, Neil Horman, David S. Miller,
	open list:SCTP PROTOCOL, open list:NETWORKING [GENERAL],
	open list
In-Reply-To: <CAAa=b7f+dfDUZR7tHvmSQgTcNDmGjvCn9ZK9eevKGF+bNd2Aqg@mail.gmail.com>

On Wed, May 02, 2018 at 08:27:05PM -0500, Wenwen Wang wrote:
> On Wed, May 2, 2018 at 8:24 PM, Marcelo Ricardo Leitner
> <marcelo.leitner@gmail.com> wrote:
> > On Wed, May 02, 2018 at 08:15:45PM -0500, Wenwen Wang wrote:
> >> In sctp_setsockopt_maxseg(), the integer 'val' is compared against min_len
> >> and max_len to check whether it is in the appropriate range. If it is not,
> >> an error code -EINVAL will be returned. This is enforced by a security
> >> check. But, this check is only executed when 'val' is not 0. In fact, if
> >> 'val' is 0, it will be assigned with a new value (if the return value of
> >> the function sctp_id2assoc() is not 0) in the following execution. However,
> >> this new value of 'val' is not checked before it is used to assigned to
> >> asoc->user_frag. That means it is possible that the new value of 'val'
> >> could be out of the expected range. This can cause security issues
> >> such as buffer overflows, e.g., the new value of 'val' is used as an index
> >> to access a buffer.
> >>
> >> This patch inserts a check for the new value of 'val' to see if it is in
> >> the expected range. If it is not, an error code -EINVAL will be returned.
> >>
> >> Signed-off-by: Wenwen Wang <wang6495@umn.edu>
> >> ---
> >>  net/sctp/socket.c | 22 +++++++++++-----------
> >>  1 file changed, 11 insertions(+), 11 deletions(-)
> >
> > ?
> > This patch is the same as previous one. git send-email <old file>
> > maybe?
> >
> >   Marcelo
>
> Thanks for your suggestion, Marcelo. I can send the old file. But, I
> have added a line of comment in this patch.

I meant if you had sent the old patch again by accident, because you
said you worked on an old version of the tree, but then posted a patch
that also doesn't use the new MTU function I mentioned.

  Marcelo

^ permalink raw reply

* Re: [RFC v3 4/5] virtio_ring: add event idx support in packed ring
From: Michael S. Tsirkin @ 2018-05-03  1:44 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: Jason Wang, virtualization, linux-kernel, netdev, wexu, jfreimann
In-Reply-To: <20180503011116.qvoyblcpklinrk26@debian>

On Thu, May 03, 2018 at 09:11:16AM +0800, Tiwei Bie wrote:
> On Wed, May 02, 2018 at 06:42:57PM +0300, Michael S. Tsirkin wrote:
> > On Wed, May 02, 2018 at 11:12:55PM +0800, Tiwei Bie wrote:
> > > On Wed, May 02, 2018 at 04:51:01PM +0300, Michael S. Tsirkin wrote:
> > > > On Wed, May 02, 2018 at 03:28:19PM +0800, Tiwei Bie wrote:
> > > > > On Wed, May 02, 2018 at 10:51:06AM +0800, Jason Wang wrote:
> > > > > > On 2018年04月25日 13:15, Tiwei Bie wrote:
> > > > > > > This commit introduces the event idx support in packed
> > > > > > > ring. This feature is temporarily disabled, because the
> > > > > > > implementation in this patch may not work as expected,
> > > > > > > and some further discussions on the implementation are
> > > > > > > needed, e.g. do we have to check the wrap counter when
> > > > > > > checking whether a kick is needed?
> > > > > > > 
> > > > > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > > > > ---
> > > > > > >   drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++++++----
> > > > > > >   1 file changed, 49 insertions(+), 4 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > index 0181e93897be..b1039c2985b9 100644
> > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > @@ -986,7 +986,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > > > >   static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > >   {
> > > > > > >   	struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > -	u16 flags;
> > > > > > > +	u16 new, old, off_wrap, flags;
> > > > > > >   	bool needs_kick;
> > > > > > >   	u32 snapshot;
> > > > > > > @@ -995,7 +995,12 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > >   	 * suppressions. */
> > > > > > >   	virtio_mb(vq->weak_barriers);
> > > > > > > +	old = vq->next_avail_idx - vq->num_added;
> > > > > > > +	new = vq->next_avail_idx;
> > > > > > > +	vq->num_added = 0;
> > > > > > > +
> > > > > > >   	snapshot = *(u32 *)vq->vring_packed.device;
> > > > > > > +	off_wrap = virtio16_to_cpu(_vq->vdev, snapshot & 0xffff);
> > > > > > >   	flags = cpu_to_virtio16(_vq->vdev, snapshot >> 16) & 0x3;
> > > > > > >   #ifdef DEBUG
> > > > > > > @@ -1006,7 +1011,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > > > >   	vq->last_add_time_valid = false;
> > > > > > >   #endif
> > > > > > > -	needs_kick = (flags != VRING_EVENT_F_DISABLE);
> > > > > > > +	if (flags == VRING_EVENT_F_DESC)
> > > > > > > +		needs_kick = vring_need_event(off_wrap & ~(1<<15), new, old);
> > > > > > 
> > > > > > I wonder whether or not the math is correct. Both new and event are in the
> > > > > > unit of descriptor ring size, but old looks not.
> > > > > 
> > > > > What vring_need_event() cares is the distance between
> > > > > `new` and `old`, i.e. vq->num_added. So I think there
> > > > > is nothing wrong with `old`. But the calculation of the
> > > > > distance between `new` and `event_idx` isn't right when
> > > > > `new` wraps. How do you think about the below code:
> > > > > 
> > > > > 	wrap_counter = off_wrap >> 15;
> > > > > 	event_idx = off_wrap & ~(1<<15);
> > > > > 	if (wrap_counter != vq->wrap_counter)
> > > > > 		event_idx -= vq->vring_packed.num;
> > > > > 	
> > > > > 	needs_kick = vring_need_event(event_idx, new, old);
> > > > 
> > > > I suspect this hack won't work for a non-power-of-2 ring.
> > > 
> > > Above code doesn't require the ring size to be a power of 2.
> > > 
> > > For (__u16)(new_idx - old), what we want to get is vq->num_added.
> > > 
> > > old = vq->next_avail_idx - vq->num_added;
> > > new = vq->next_avail_idx;
> > > 
> > > When vq->next_avail_idx >= vq->num_added, it's obvious that,
> > > (__u16)(new_idx - old) is vq->num_added.
> > > 
> > > And when vq->next_avail_idx < vq->num_added, new will be smaller
> > > than old (old will be a big unsigned number), but (__u16)(new_idx
> > > - old) is still vq->num_added.
> > > 
> > > For (__u16)(new_idx - event_idx - 1), when new wraps and event_idx
> > > doesn't wrap, the most straightforward way to calculate it is:
> > > (new + vq->vring_packed.num) - event_idx - 1.
> > 
> > So how about we use the straightforward way then?
> 
> You mean we do new += vq->vring_packed.num instead
> of event_idx -= vq->vring_packed.num before calling
> vring_need_event()?
> 
> The problem is that, the second param (new_idx) of
> vring_need_event() will be used for:
> 
> (__u16)(new_idx - event_idx - 1)
> (__u16)(new_idx - old)
> 
> So if we change new, we will need to change old too.

I think that since we have a branch there anyway,
we are better off just special-casing if (wrap_counter != vq->wrap_counter).
Treat it differently and avoid casts.

> And that would be an ugly hack..
> 
> Best regards,
> Tiwei Bie

I consider casts and huge numbers with two's complement
games even uglier.

> > 
> > > But we can also calculate it in this way:
> > > 
> > > event_idx -= vq->vring_packed.num;
> > > (event_idx will be a big unsigned number)
> > > 
> > > Then (__u16)(new_idx - event_idx - 1) will be the value we want.
> > > 
> > > Best regards,
> > > Tiwei Bie
> > 
> > 
> > > > 
> > > > 
> > > > > Best regards,
> > > > > Tiwei Bie
> > > > > 
> > > > > 
> > > > > > 
> > > > > > Thanks
> > > > > > 
> > > > > > > +	else
> > > > > > > +		needs_kick = (flags != VRING_EVENT_F_DISABLE);
> > > > > > >   	END_USE(vq);
> > > > > > >   	return needs_kick;
> > > > > > >   }
> > > > > > > @@ -1116,6 +1124,15 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> > > > > > >   	if (vq->last_used_idx >= vq->vring_packed.num)
> > > > > > >   		vq->last_used_idx -= vq->vring_packed.num;
> > > > > > > +	/* If we expect an interrupt for the next entry, tell host
> > > > > > > +	 * by writing event index and flush out the write before
> > > > > > > +	 * the read in the next get_buf call. */
> > > > > > > +	if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> > > > > > > +		virtio_store_mb(vq->weak_barriers,
> > > > > > > +				&vq->vring_packed.driver->off_wrap,
> > > > > > > +				cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> > > > > > > +						(vq->wrap_counter << 15)));
> > > > > > > +
> > > > > > >   #ifdef DEBUG
> > > > > > >   	vq->last_add_time_valid = false;
> > > > > > >   #endif
> > > > > > > @@ -1143,10 +1160,17 @@ static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > > > > > >   	/* We optimistically turn back on interrupts, then check if there was
> > > > > > >   	 * more to do. */
> > > > > > > +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > > > > > > +	 * either clear the flags bit or point the event index at the next
> > > > > > > +	 * entry. Always update the event index to keep code simple. */
> > > > > > > +
> > > > > > > +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > > > > > > +			vq->last_used_idx | (vq->wrap_counter << 15));
> > > > > > >   	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > >   		virtio_wmb(vq->weak_barriers);
> > > > > > > -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > > > > > > +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > > > > > > +						     VRING_EVENT_F_ENABLE;
> > > > > > >   		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > >   							vq->event_flags_shadow);
> > > > > > >   	}
> > > > > > > @@ -1172,15 +1196,34 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> > > > > > >   static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> > > > > > >   {
> > > > > > >   	struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > +	u16 bufs, used_idx, wrap_counter;
> > > > > > >   	START_USE(vq);
> > > > > > >   	/* We optimistically turn back on interrupts, then check if there was
> > > > > > >   	 * more to do. */
> > > > > > > +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > > > > > > +	 * either clear the flags bit or point the event index at the next
> > > > > > > +	 * entry. Always update the event index to keep code simple. */
> > > > > > > +
> > > > > > > +	/* TODO: tune this threshold */
> > > > > > > +	bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> > > > > > > +
> > > > > > > +	used_idx = vq->last_used_idx + bufs;
> > > > > > > +	wrap_counter = vq->wrap_counter;
> > > > > > > +
> > > > > > > +	if (used_idx >= vq->vring_packed.num) {
> > > > > > > +		used_idx -= vq->vring_packed.num;
> > > > > > > +		wrap_counter ^= 1;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > > > > > > +			used_idx | (wrap_counter << 15));
> > > > > > >   	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > >   		virtio_wmb(vq->weak_barriers);
> > > > > > > -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > > > > > > +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > > > > > > +						     VRING_EVENT_F_ENABLE;
> > > > > > >   		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > >   							vq->event_flags_shadow);
> > > > > > >   	}
> > > > > > > @@ -1822,8 +1865,10 @@ void vring_transport_features(struct virtio_device *vdev)
> > > > > > >   		switch (i) {
> > > > > > >   		case VIRTIO_RING_F_INDIRECT_DESC:
> > > > > > >   			break;
> > > > > > > +#if 0
> > > > > > >   		case VIRTIO_RING_F_EVENT_IDX:
> > > > > > >   			break;
> > > > > > > +#endif
> > > > > > >   		case VIRTIO_F_VERSION_1:
> > > > > > >   			break;
> > > > > > >   		case VIRTIO_F_IOMMU_PLATFORM:
> > > > > > 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox