Netdev List
 help / color / mirror / Atom feed
* [PATCH v2] Optimize lookup of /0 xfrm policies
From: Yannick Brosseau @ 2018-08-31 22:18 UTC (permalink / raw)
  To: steffen.klassert, herbert, davem, netdev
  Cc: linux-kernel, kernel-team, Yannick Brosseau

Currently, all the xfrm policies that are not /32 end up in
the inexact policies linked list which take a long time to lookup.

We can optimize the case where we have a /0 prefix in the policy, which
means we can match any address to that part.
We do this by putting those policies in the direct hash table after
zeroing the address part.
At lookup time, we do an additional lookup with the packet address
and either the destination or source address zeroed out.
We still call xfrm_policy_match to validate that the packet match the
selector.

In our tests, with this optimization we reduce softirq cpu utilisation
from about 40% to 7% with 3k policies.

Signed-off-by: Yannick Brosseau <scientist@fb.com>
---
 net/xfrm/xfrm_hash.h   | 10 +++++
 net/xfrm/xfrm_policy.c | 87 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h
index 61be810389d8..40997fb5336d 100644
--- a/net/xfrm/xfrm_hash.h
+++ b/net/xfrm/xfrm_hash.h
@@ -145,6 +145,16 @@ static inline unsigned int __sel_hash(const struct xfrm_selector *sel,
 	const xfrm_address_t *saddr = &sel->saddr;
 	unsigned int h = 0;
 
+	/* A selector with a prefixlen of zero can basically be ignored in
+	 * the matching. To speed up the lookup, let's hash it without those
+	 * component. In the lookup, we'll do an additional check for a zero
+	 * daddr and a zero saddr.
+	 */
+	if (sel->prefixlen_d == 0)
+		dbits = 0;
+	if (sel->prefixlen_s == 0)
+		sbits = 0;
+
 	switch (family) {
 	case AF_INET:
 		if (sel->prefixlen_d < dbits ||
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 3110c3fbee20..a1ca78900ffc 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1096,8 +1096,10 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 	int err;
 	struct xfrm_policy *pol, *ret;
 	const xfrm_address_t *daddr, *saddr;
+	static const xfrm_address_t zero_addr = {0};
+
 	struct hlist_head *chain;
-	unsigned int sequence;
+	unsigned int sequence, first_sequence;
 	u32 priority;
 
 	daddr = xfrm_flowi_daddr(fl, family);
@@ -1112,6 +1114,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 		chain = policy_hash_direct(net, daddr, saddr, family, dir);
 	} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
 
+	first_sequence = sequence;
 	priority = ~0U;
 	ret = NULL;
 	hlist_for_each_entry_rcu(pol, chain, bydst) {
@@ -1129,6 +1132,86 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 			break;
 		}
 	}
+
+	/* Do an additional lookup for saddr == 0, since we stored source
+	 * selector with a prefix len of 0 that way in the bydst hash
+	 */
+	do {
+		sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+		chain = policy_hash_direct(net, daddr, &zero_addr, family, dir);
+	} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+	hlist_for_each_entry_rcu(pol, chain, bydst) {
+		if ((pol->priority >= priority) && ret)
+			break;
+
+		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
+		if (err) {
+			if (err == -ESRCH)
+				continue;
+			else {
+				ret = ERR_PTR(err);
+				goto fail;
+			}
+		} else {
+			ret = pol;
+			priority = ret->priority;
+			break;
+		}
+	}
+
+	/* Do an additional lookup for daddr == 0, since we stored dest
+	 * selector with a prefix len of 0 that way in the bydst hash
+	 */
+	do {
+		sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+		chain = policy_hash_direct(net, &zero_addr, saddr, family, dir);
+	} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+	hlist_for_each_entry_rcu(pol, chain, bydst) {
+		if ((pol->priority >= priority) && ret)
+			break;
+
+		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
+		if (err) {
+			if (err == -ESRCH)
+				continue;
+			else {
+				ret = ERR_PTR(err);
+				goto fail;
+			}
+		} else {
+			ret = pol;
+			priority = ret->priority;
+			break;
+		}
+	}
+
+	/* Do an additional lookup for both saddr and daddr == 0 */
+	do {
+		sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+		chain = policy_hash_direct(net, &zero_addr, &zero_addr, family, dir);
+	} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+	hlist_for_each_entry_rcu(pol, chain, bydst) {
+		if ((pol->priority >= priority) && ret)
+			break;
+
+		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
+		if (err) {
+			if (err == -ESRCH)
+				continue;
+			else {
+				ret = ERR_PTR(err);
+				goto fail;
+			}
+		} else {
+			ret = pol;
+			priority = ret->priority;
+			break;
+		}
+	}
+
 	chain = &net->xfrm.policy_inexact[dir];
 	hlist_for_each_entry_rcu(pol, chain, bydst) {
 		if ((pol->priority >= priority) && ret)
@@ -1148,7 +1231,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 		}
 	}
 
-	if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
+	if (read_seqcount_retry(&xfrm_policy_hash_generation, first_sequence))
 		goto retry;
 
 	if (ret && !xfrm_pol_hold_rcu(ret))
-- 
2.18.0

^ permalink raw reply related

* [PATCH v6 1/4] gpiolib: Pass bitmaps, not integer arrays, to get/set array
From: Janusz Krzysztofik @ 2018-08-31 22:56 UTC (permalink / raw)
  To: Linus Walleij
  Cc: Jonathan Corbet, Miguel Ojeda Sandonis, Peter Korsgaard,
	Peter Rosin, Ulf Hansson, Andrew Lunn, Florian Fainelli,
	David S. Miller, Dominik Brodowski, Greg Kroah-Hartman,
	Kishon Vijay Abraham I, Lars-Peter Clausen, Michael Hennerich,
	Jonathan Cameron, Hartmut Knaack, Peter Meerwald-Stadler,
	Jiri Slaby, Willy Tarreau, Geert Uytterhoeven
In-Reply-To: <20180831225616.29221-1-jmkrzyszt@gmail.com>

Most users of get/set array functions iterate consecutive bits of data,
usually a single integer, while processing array of results obtained
from, or building an array of values to be passed to those functions.
Save time wasted on those iterations by changing the functions' API to
accept bitmaps.

All current users are updated as well.

More benefits from the change are expected as soon as planned support
for accepting/passing those bitmaps directly from/to respective GPIO
chip callbacks if applicable is implemented.

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
Cc: Peter Korsgaard <peter.korsgaard@barco.com>
Cc: Peter Rosin <peda@axentia.se>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Kishon Vijay Abraham I <kishon@ti.com>
Cc: Lars-Peter Clausen <lars@metafoo.de>
Cc: Michael Hennerich <Michael.Hennerich@analog.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: Hartmut Knaack <knaack.h@gmx.de>
Cc: Peter Meerwald-Stadler <pmeerw@pmeerw.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jiri Slaby <jslaby@suse.com>
Signed-off-by: Janusz Krzysztofik <jmkrzyszt@gmail.com>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 Documentation/driver-api/gpio/consumer.rst  | 22 ++++----
 drivers/auxdisplay/hd44780.c                | 62 ++++++++-------------
 drivers/bus/ts-nbus.c                       | 21 ++-----
 drivers/gpio/gpio-max3191x.c                | 17 +++---
 drivers/gpio/gpiolib.c                      | 86 +++++++++++++++--------------
 drivers/gpio/gpiolib.h                      |  4 +-
 drivers/i2c/muxes/i2c-mux-gpio.c            | 14 ++---
 drivers/mmc/core/pwrseq_simple.c            | 13 ++---
 drivers/mux/gpio.c                          | 15 ++---
 drivers/net/phy/mdio-mux-gpio.c             | 11 ++--
 drivers/pcmcia/soc_common.c                 | 11 ++--
 drivers/phy/motorola/phy-mapphone-mdm6600.c | 17 +++---
 drivers/staging/iio/adc/ad7606.c            |  9 +--
 drivers/tty/serial/serial_mctrl_gpio.c      |  7 ++-
 include/linux/gpio/consumer.h               | 18 +++---
 15 files changed, 145 insertions(+), 182 deletions(-)

diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst
index aa03f389d41d..ed68042ddccf 100644
--- a/Documentation/driver-api/gpio/consumer.rst
+++ b/Documentation/driver-api/gpio/consumer.rst
@@ -323,29 +323,29 @@ The following functions get or set the values of an array of GPIOs::
 
 	int gpiod_get_array_value(unsigned int array_size,
 				  struct gpio_desc **desc_array,
-				  int *value_array);
+				  unsigned long *value_bitmap);
 	int gpiod_get_raw_array_value(unsigned int array_size,
 				      struct gpio_desc **desc_array,
-				      int *value_array);
+				      unsigned long *value_bitmap);
 	int gpiod_get_array_value_cansleep(unsigned int array_size,
 					   struct gpio_desc **desc_array,
-					   int *value_array);
+					   unsigned long *value_bitmap);
 	int gpiod_get_raw_array_value_cansleep(unsigned int array_size,
 					   struct gpio_desc **desc_array,
-					   int *value_array);
+					   unsigned long *value_bitmap);
 
 	void gpiod_set_array_value(unsigned int array_size,
 				   struct gpio_desc **desc_array,
-				   int *value_array)
+				   unsigned long *value_bitmap)
 	void gpiod_set_raw_array_value(unsigned int array_size,
 				       struct gpio_desc **desc_array,
-				       int *value_array)
+				       unsigned long *value_bitmap)
 	void gpiod_set_array_value_cansleep(unsigned int array_size,
 					    struct gpio_desc **desc_array,
-					    int *value_array)
+					    unsigned long *value_bitmap)
 	void gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 						struct gpio_desc **desc_array,
-						int *value_array)
+						unsigned long *value_bitmap)
 
 The array can be an arbitrary set of GPIOs. The functions will try to access
 GPIOs belonging to the same bank or chip simultaneously if supported by the
@@ -356,8 +356,8 @@ accessed sequentially.
 The functions take three arguments:
 	* array_size	- the number of array elements
 	* desc_array	- an array of GPIO descriptors
-	* value_array	- an array to store the GPIOs' values (get) or
-			  an array of values to assign to the GPIOs (set)
+	* value_bitmap	- a bitmap to store the GPIOs' values (get) or
+			  a bitmap of values to assign to the GPIOs (set)
 
 The descriptor array can be obtained using the gpiod_get_array() function
 or one of its variants. If the group of descriptors returned by that function
@@ -366,7 +366,7 @@ the struct gpio_descs returned by gpiod_get_array()::
 
 	struct gpio_descs *my_gpio_descs = gpiod_get_array(...);
 	gpiod_set_array_value(my_gpio_descs->ndescs, my_gpio_descs->desc,
-			      my_gpio_values);
+			      my_gpio_value_bitmap);
 
 It is also possible to access a completely arbitrary array of descriptors. The
 descriptors may be obtained using any combination of gpiod_get() and
diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c
index f1a42f0f1ded..7ee4f9a65bfc 100644
--- a/drivers/auxdisplay/hd44780.c
+++ b/drivers/auxdisplay/hd44780.c
@@ -62,20 +62,15 @@ static void hd44780_strobe_gpio(struct hd44780 *hd)
 /* write to an LCD panel register in 8 bit GPIO mode */
 static void hd44780_write_gpio8(struct hd44780 *hd, u8 val, unsigned int rs)
 {
-	int values[10];	/* for DATA[0-7], RS, RW */
-	unsigned int i, n;
-
-	for (i = 0; i < 8; i++)
-		values[PIN_DATA0 + i] = !!(val & BIT(i));
-	values[PIN_CTRL_RS] = rs;
-	n = 9;
-	if (hd->pins[PIN_CTRL_RW]) {
-		values[PIN_CTRL_RW] = 0;
-		n++;
-	}
+	DECLARE_BITMAP(value_bitmap, 10);	/* for DATA[0-7], RS, RW */
+	unsigned int n;
+
+	*value_bitmap = val;
+	__assign_bit(8, value_bitmap, rs);
+	n = hd->pins[PIN_CTRL_RW] ? 10 : 9;
 
 	/* Present the data to the port */
-	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA0], values);
+	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA0], value_bitmap);
 
 	hd44780_strobe_gpio(hd);
 }
@@ -83,32 +78,25 @@ static void hd44780_write_gpio8(struct hd44780 *hd, u8 val, unsigned int rs)
 /* write to an LCD panel register in 4 bit GPIO mode */
 static void hd44780_write_gpio4(struct hd44780 *hd, u8 val, unsigned int rs)
 {
-	int values[10];	/* for DATA[0-7], RS, RW, but DATA[0-3] is unused */
-	unsigned int i, n;
+	DECLARE_BITMAP(value_bitmap, 6);	/* for DATA[4-7], RS, RW */
+	unsigned int n;
 
 	/* High nibble + RS, RW */
-	for (i = 4; i < 8; i++)
-		values[PIN_DATA0 + i] = !!(val & BIT(i));
-	values[PIN_CTRL_RS] = rs;
-	n = 5;
-	if (hd->pins[PIN_CTRL_RW]) {
-		values[PIN_CTRL_RW] = 0;
-		n++;
-	}
+	*value_bitmap = val >> 4;
+	__assign_bit(4, value_bitmap, rs);
+	n = hd->pins[PIN_CTRL_RW] ? 6 : 5;
 
 	/* Present the data to the port */
-	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4],
-				       &values[PIN_DATA4]);
+	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4], value_bitmap);
 
 	hd44780_strobe_gpio(hd);
 
 	/* Low nibble */
-	for (i = 0; i < 4; i++)
-		values[PIN_DATA4 + i] = !!(val & BIT(i));
+	*value_bitmap &= ~0x0f;
+	*value_bitmap |= val & 0x0f;
 
 	/* Present the data to the port */
-	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4],
-				       &values[PIN_DATA4]);
+	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4], value_bitmap);
 
 	hd44780_strobe_gpio(hd);
 }
@@ -155,23 +143,17 @@ static void hd44780_write_cmd_gpio4(struct charlcd *lcd, int cmd)
 /* Send 4-bits of a command to the LCD panel in raw 4 bit GPIO mode */
 static void hd44780_write_cmd_raw_gpio4(struct charlcd *lcd, int cmd)
 {
-	int values[10];	/* for DATA[0-7], RS, RW, but DATA[0-3] is unused */
+	/* for DATA[0-7], RS, RW, but DATA[0-3] is unused */
+	DECLARE_BITMAP(value_bitmap, 6);
 	struct hd44780 *hd = lcd->drvdata;
-	unsigned int i, n;
+	unsigned int n;
 
 	/* Command nibble + RS, RW */
-	for (i = 0; i < 4; i++)
-		values[PIN_DATA4 + i] = !!(cmd & BIT(i));
-	values[PIN_CTRL_RS] = 0;
-	n = 5;
-	if (hd->pins[PIN_CTRL_RW]) {
-		values[PIN_CTRL_RW] = 0;
-		n++;
-	}
+	*value_bitmap = cmd & 0x0f;
+	n = hd->pins[PIN_CTRL_RW] ? 6 : 5;
 
 	/* Present the data to the port */
-	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4],
-				       &values[PIN_DATA4]);
+	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4], value_bitmap);
 
 	hd44780_strobe_gpio(hd);
 }
diff --git a/drivers/bus/ts-nbus.c b/drivers/bus/ts-nbus.c
index 073fd9011154..2ac5239df865 100644
--- a/drivers/bus/ts-nbus.c
+++ b/drivers/bus/ts-nbus.c
@@ -110,13 +110,11 @@ static void ts_nbus_set_direction(struct ts_nbus *ts_nbus, int direction)
  */
 static void ts_nbus_reset_bus(struct ts_nbus *ts_nbus)
 {
-	int i;
-	int values[8];
-
-	for (i = 0; i < 8; i++)
-		values[i] = 0;
+	DECLARE_BITMAP(value_bitmap, 8);
+	
+	*value_bitmap = 0;
 
-	gpiod_set_array_value_cansleep(8, ts_nbus->data->desc, values);
+	gpiod_set_array_value_cansleep(8, ts_nbus->data->desc, value_bitmap);
 	gpiod_set_value_cansleep(ts_nbus->csn, 0);
 	gpiod_set_value_cansleep(ts_nbus->strobe, 0);
 	gpiod_set_value_cansleep(ts_nbus->ale, 0);
@@ -157,16 +155,9 @@ static int ts_nbus_read_byte(struct ts_nbus *ts_nbus, u8 *val)
 static void ts_nbus_write_byte(struct ts_nbus *ts_nbus, u8 byte)
 {
 	struct gpio_descs *gpios = ts_nbus->data;
-	int i;
-	int values[8];
-
-	for (i = 0; i < 8; i++)
-		if (byte & BIT(i))
-			values[i] = 1;
-		else
-			values[i] = 0;
+	DECLARE_BITMAP(value_bitmap, 8) = { byte, };
 
-	gpiod_set_array_value_cansleep(8, gpios->desc, values);
+	gpiod_set_array_value_cansleep(8, gpios->desc, value_bitmap);
 }
 
 /*
diff --git a/drivers/gpio/gpio-max3191x.c b/drivers/gpio/gpio-max3191x.c
index b5b9cb1fda50..c4ec1c82af27 100644
--- a/drivers/gpio/gpio-max3191x.c
+++ b/drivers/gpio/gpio-max3191x.c
@@ -315,17 +315,20 @@ static void gpiod_set_array_single_value_cansleep(unsigned int ndescs,
 						  struct gpio_desc **desc,
 						  int value)
 {
-	int i, *values;
+	unsigned long *value_bitmap;
 
-	values = kmalloc_array(ndescs, sizeof(*values), GFP_KERNEL);
-	if (!values)
+	value_bitmap = kmalloc_array(BITS_TO_LONGS(ndescs),
+				     sizeof(*value_bitmap), GFP_KERNEL);
+	if (!value_bitmap)
 		return;
 
-	for (i = 0; i < ndescs; i++)
-		values[i] = value;
+	if (value)
+		bitmap_fill(value_bitmap, ndescs);
+	else
+		bitmap_zero(value_bitmap, ndescs);
 
-	gpiod_set_array_value_cansleep(ndescs, desc, values);
-	kfree(values);
+	gpiod_set_array_value_cansleep(ndescs, desc, value_bitmap);
+	kfree(value_bitmap);
 }
 
 static struct gpio_descs *devm_gpiod_get_array_optional_count(
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index e8f8a1999393..f0e9ffa8cab6 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -427,7 +427,7 @@ static long linehandle_ioctl(struct file *filep, unsigned int cmd,
 	struct linehandle_state *lh = filep->private_data;
 	void __user *ip = (void __user *)arg;
 	struct gpiohandle_data ghd;
-	int vals[GPIOHANDLES_MAX];
+	unsigned long value_bitmap[BITS_TO_LONGS(GPIOHANDLES_MAX)];
 	int i;
 
 	if (cmd == GPIOHANDLE_GET_LINE_VALUES_IOCTL) {
@@ -436,13 +436,13 @@ static long linehandle_ioctl(struct file *filep, unsigned int cmd,
 							true,
 							lh->numdescs,
 							lh->descs,
-							vals);
+							value_bitmap);
 		if (ret)
 			return ret;
 
 		memset(&ghd, 0, sizeof(ghd));
 		for (i = 0; i < lh->numdescs; i++)
-			ghd.values[i] = vals[i];
+			ghd.values[i] = test_bit(i, value_bitmap);
 
 		if (copy_to_user(ip, &ghd, sizeof(ghd)))
 			return -EFAULT;
@@ -461,14 +461,14 @@ static long linehandle_ioctl(struct file *filep, unsigned int cmd,
 
 		/* Clamp all values to [0,1] */
 		for (i = 0; i < lh->numdescs; i++)
-			vals[i] = !!ghd.values[i];
+			__assign_bit(i, value_bitmap, !!ghd.values[i]);
 
 		/* Reuse the array setting function */
 		return gpiod_set_array_value_complex(false,
 					      true,
 					      lh->numdescs,
 					      lh->descs,
-					      vals);
+					      value_bitmap);
 	}
 	return -EINVAL;
 }
@@ -2784,7 +2784,7 @@ static int gpio_chip_get_multiple(struct gpio_chip *chip,
 int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 				  unsigned int array_size,
 				  struct gpio_desc **desc_array,
-				  int *value_array)
+				  unsigned long *value_bitmap)
 {
 	int i = 0;
 
@@ -2835,7 +2835,7 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 
 			if (!raw && test_bit(FLAG_ACTIVE_LOW, &desc->flags))
 				value = !value;
-			value_array[j] = value;
+			__assign_bit(j, value_bitmap, value);
 			trace_gpio_value(desc_to_gpio(desc), 1, value);
 		}
 
@@ -2895,9 +2895,9 @@ EXPORT_SYMBOL_GPL(gpiod_get_value);
 
 /**
  * gpiod_get_raw_array_value() - read raw values from an array of GPIOs
- * @array_size: number of elements in the descriptor / value arrays
+ * @array_size: number of elements in the descriptor array / value bitmap
  * @desc_array: array of GPIO descriptors whose values will be read
- * @value_array: array to store the read values
+ * @value_bitmap: bitmap to store the read values
  *
  * Read the raw values of the GPIOs, i.e. the values of the physical lines
  * without regard for their ACTIVE_LOW status.  Return 0 in case of success,
@@ -2907,20 +2907,21 @@ EXPORT_SYMBOL_GPL(gpiod_get_value);
  * and it will complain if the GPIO chip functions potentially sleep.
  */
 int gpiod_get_raw_array_value(unsigned int array_size,
-			      struct gpio_desc **desc_array, int *value_array)
+			      struct gpio_desc **desc_array,
+			      unsigned long *value_bitmap)
 {
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_get_array_value_complex(true, false, array_size,
-					     desc_array, value_array);
+					     desc_array, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value);
 
 /**
  * gpiod_get_array_value() - read values from an array of GPIOs
- * @array_size: number of elements in the descriptor / value arrays
+ * @array_size: number of elements in the descriptor array / value bitmap
  * @desc_array: array of GPIO descriptors whose values will be read
- * @value_array: array to store the read values
+ * @value_bitnap: bitmap to store the read values
  *
  * Read the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status
  * into account.  Return 0 in case of success, else an error code.
@@ -2929,12 +2930,13 @@ EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value);
  * and it will complain if the GPIO chip functions potentially sleep.
  */
 int gpiod_get_array_value(unsigned int array_size,
-			  struct gpio_desc **desc_array, int *value_array)
+			  struct gpio_desc **desc_array,
+			  unsigned long *value_bitmap)
 {
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_get_array_value_complex(false, false, array_size,
-					     desc_array, value_array);
+					     desc_array, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_get_array_value);
 
@@ -3027,7 +3029,7 @@ static void gpio_chip_set_multiple(struct gpio_chip *chip,
 int gpiod_set_array_value_complex(bool raw, bool can_sleep,
 				   unsigned int array_size,
 				   struct gpio_desc **desc_array,
-				   int *value_array)
+				   unsigned long *value_bitmap)
 {
 	int i = 0;
 
@@ -3056,7 +3058,7 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep,
 		do {
 			struct gpio_desc *desc = desc_array[i];
 			int hwgpio = gpio_chip_hwgpio(desc);
-			int value = value_array[i];
+			int value = test_bit(i, value_bitmap);
 
 			if (!raw && test_bit(FLAG_ACTIVE_LOW, &desc->flags))
 				value = !value;
@@ -3152,9 +3154,9 @@ EXPORT_SYMBOL_GPL(gpiod_set_value);
 
 /**
  * gpiod_set_raw_array_value() - assign values to an array of GPIOs
- * @array_size: number of elements in the descriptor / value arrays
+ * @array_size: number of elements in the descriptor array / value bitmap
  * @desc_array: array of GPIO descriptors whose values will be assigned
- * @value_array: array of values to assign
+ * @value_bitmap: bitmap of values to assign
  *
  * Set the raw values of the GPIOs, i.e. the values of the physical lines
  * without regard for their ACTIVE_LOW status.
@@ -3163,20 +3165,21 @@ EXPORT_SYMBOL_GPL(gpiod_set_value);
  * complain if the GPIO chip functions potentially sleep.
  */
 int gpiod_set_raw_array_value(unsigned int array_size,
-			 struct gpio_desc **desc_array, int *value_array)
+			 struct gpio_desc **desc_array,
+			 unsigned long *value_bitmap)
 {
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_set_array_value_complex(true, false, array_size,
-					desc_array, value_array);
+					desc_array, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value);
 
 /**
  * gpiod_set_array_value() - assign values to an array of GPIOs
- * @array_size: number of elements in the descriptor / value arrays
+ * @array_size: number of elements in the descriptor array / value bitmap
  * @desc_array: array of GPIO descriptors whose values will be assigned
- * @value_array: array of values to assign
+ * @value_bitmap: bitmap of values to assign
  *
  * Set the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status
  * into account.
@@ -3185,12 +3188,13 @@ EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value);
  * complain if the GPIO chip functions potentially sleep.
  */
 void gpiod_set_array_value(unsigned int array_size,
-			   struct gpio_desc **desc_array, int *value_array)
+			   struct gpio_desc **desc_array,
+			   unsigned long *value_bitmap)
 {
 	if (!desc_array)
 		return;
 	gpiod_set_array_value_complex(false, false, array_size, desc_array,
-				      value_array);
+				      value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_array_value);
 
@@ -3410,9 +3414,9 @@ EXPORT_SYMBOL_GPL(gpiod_get_value_cansleep);
 
 /**
  * gpiod_get_raw_array_value_cansleep() - read raw values from an array of GPIOs
- * @array_size: number of elements in the descriptor / value arrays
+ * @array_size: number of elements in the descriptor array / value bitmap
  * @desc_array: array of GPIO descriptors whose values will be read
- * @value_array: array to store the read values
+ * @value_bitmap: bitmap to store the read values
  *
  * Read the raw values of the GPIOs, i.e. the values of the physical lines
  * without regard for their ACTIVE_LOW status.  Return 0 in case of success,
@@ -3422,21 +3426,21 @@ EXPORT_SYMBOL_GPL(gpiod_get_value_cansleep);
  */
 int gpiod_get_raw_array_value_cansleep(unsigned int array_size,
 				       struct gpio_desc **desc_array,
-				       int *value_array)
+				       unsigned long *value_bitmap)
 {
 	might_sleep_if(extra_checks);
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_get_array_value_complex(true, true, array_size,
-					     desc_array, value_array);
+					     desc_array, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value_cansleep);
 
 /**
  * gpiod_get_array_value_cansleep() - read values from an array of GPIOs
- * @array_size: number of elements in the descriptor / value arrays
+ * @array_size: number of elements in the descriptor array / value bitmap
  * @desc_array: array of GPIO descriptors whose values will be read
- * @value_array: array to store the read values
+ * @value_bitmap: bitmap to store the read values
  *
  * Read the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status
  * into account.  Return 0 in case of success, else an error code.
@@ -3445,13 +3449,13 @@ EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value_cansleep);
  */
 int gpiod_get_array_value_cansleep(unsigned int array_size,
 				   struct gpio_desc **desc_array,
-				   int *value_array)
+				   unsigned long *value_bitmap)
 {
 	might_sleep_if(extra_checks);
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_get_array_value_complex(false, true, array_size,
-					     desc_array, value_array);
+					     desc_array, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_get_array_value_cansleep);
 
@@ -3493,9 +3497,9 @@ EXPORT_SYMBOL_GPL(gpiod_set_value_cansleep);
 
 /**
  * gpiod_set_raw_array_value_cansleep() - assign values to an array of GPIOs
- * @array_size: number of elements in the descriptor / value arrays
+ * @array_size: number of elements in the descriptor array / value bitmap
  * @desc_array: array of GPIO descriptors whose values will be assigned
- * @value_array: array of values to assign
+ * @value_bitmap: bitmap of values to assign
  *
  * Set the raw values of the GPIOs, i.e. the values of the physical lines
  * without regard for their ACTIVE_LOW status.
@@ -3504,13 +3508,13 @@ EXPORT_SYMBOL_GPL(gpiod_set_value_cansleep);
  */
 int gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 					struct gpio_desc **desc_array,
-					int *value_array)
+					unsigned long *value_bitmap)
 {
 	might_sleep_if(extra_checks);
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_set_array_value_complex(true, true, array_size, desc_array,
-				      value_array);
+				      value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value_cansleep);
 
@@ -3533,9 +3537,9 @@ void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n)
 
 /**
  * gpiod_set_array_value_cansleep() - assign values to an array of GPIOs
- * @array_size: number of elements in the descriptor / value arrays
+ * @array_size: number of elements in the descriptor array / value bitmap
  * @desc_array: array of GPIO descriptors whose values will be assigned
- * @value_array: array of values to assign
+ * @value_bitmap: bitmap of values to assign
  *
  * Set the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status
  * into account.
@@ -3544,13 +3548,13 @@ void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n)
  */
 void gpiod_set_array_value_cansleep(unsigned int array_size,
 				    struct gpio_desc **desc_array,
-				    int *value_array)
+				    unsigned long *value_bitmap)
 {
 	might_sleep_if(extra_checks);
 	if (!desc_array)
 		return;
 	gpiod_set_array_value_complex(false, true, array_size, desc_array,
-				      value_array);
+				      value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_array_value_cansleep);
 
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index a7e49fef73d4..11e83d2eef89 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -187,11 +187,11 @@ struct gpio_desc *gpiochip_get_desc(struct gpio_chip *chip, u16 hwnum);
 int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 				  unsigned int array_size,
 				  struct gpio_desc **desc_array,
-				  int *value_array);
+				  unsigned long *value_bitmap);
 int gpiod_set_array_value_complex(bool raw, bool can_sleep,
 				   unsigned int array_size,
 				   struct gpio_desc **desc_array,
-				   int *value_array);
+				   unsigned long *value_bitmap);
 
 /* This is just passed between gpiolib and devres */
 struct gpio_desc *gpiod_get_from_of_node(struct device_node *node,
diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c
index 401308e3d036..e28ddc20000d 100644
--- a/drivers/i2c/muxes/i2c-mux-gpio.c
+++ b/drivers/i2c/muxes/i2c-mux-gpio.c
@@ -22,18 +22,16 @@ struct gpiomux {
 	struct i2c_mux_gpio_platform_data data;
 	unsigned gpio_base;
 	struct gpio_desc **gpios;
-	int *values;
 };
 
 static void i2c_mux_gpio_set(const struct gpiomux *mux, unsigned val)
 {
-	int i;
-
-	for (i = 0; i < mux->data.n_gpios; i++)
-		mux->values[i] = (val >> i) & 1;
+	DECLARE_BITMAP(value_bitmap, mux->data.n_gpios);
+	
+	*value_bitmap = val;
 
 	gpiod_set_array_value_cansleep(mux->data.n_gpios,
-				       mux->gpios, mux->values);
+				       mux->gpios, value_bitmap);
 }
 
 static int i2c_mux_gpio_select(struct i2c_mux_core *muxc, u32 chan)
@@ -182,15 +180,13 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 		return -EPROBE_DEFER;
 
 	muxc = i2c_mux_alloc(parent, &pdev->dev, mux->data.n_values,
-			     mux->data.n_gpios * sizeof(*mux->gpios) +
-			     mux->data.n_gpios * sizeof(*mux->values), 0,
+			     mux->data.n_gpios * sizeof(*mux->gpios), 0,
 			     i2c_mux_gpio_select, NULL);
 	if (!muxc) {
 		ret = -ENOMEM;
 		goto alloc_failed;
 	}
 	mux->gpios = muxc->priv;
-	mux->values = (int *)(mux->gpios + mux->data.n_gpios);
 	muxc->priv = mux;
 
 	platform_set_drvdata(pdev, muxc);
diff --git a/drivers/mmc/core/pwrseq_simple.c b/drivers/mmc/core/pwrseq_simple.c
index a8b9fee4d62a..fd7791e072b3 100644
--- a/drivers/mmc/core/pwrseq_simple.c
+++ b/drivers/mmc/core/pwrseq_simple.c
@@ -40,18 +40,13 @@ static void mmc_pwrseq_simple_set_gpios_value(struct mmc_pwrseq_simple *pwrseq,
 	struct gpio_descs *reset_gpios = pwrseq->reset_gpios;
 
 	if (!IS_ERR(reset_gpios)) {
-		int i, *values;
 		int nvalues = reset_gpios->ndescs;
+		DECLARE_BITMAP(value_bitmap, nvalues);
 
-		values = kmalloc_array(nvalues, sizeof(int), GFP_KERNEL);
-		if (!values)
-			return;
+		*value_bitmap = value;
 
-		for (i = 0; i < nvalues; i++)
-			values[i] = value;
-
-		gpiod_set_array_value_cansleep(nvalues, reset_gpios->desc, values);
-		kfree(values);
+		gpiod_set_array_value_cansleep(nvalues, reset_gpios->desc,
+					       value_bitmap);
 	}
 }
 
diff --git a/drivers/mux/gpio.c b/drivers/mux/gpio.c
index 6fdd9316db8b..17b7b1c21d90 100644
--- a/drivers/mux/gpio.c
+++ b/drivers/mux/gpio.c
@@ -17,20 +17,17 @@
 
 struct mux_gpio {
 	struct gpio_descs *gpios;
-	int *val;
 };
 
 static int mux_gpio_set(struct mux_control *mux, int state)
 {
 	struct mux_gpio *mux_gpio = mux_chip_priv(mux->chip);
-	int i;
-
-	for (i = 0; i < mux_gpio->gpios->ndescs; i++)
-		mux_gpio->val[i] = (state >> i) & 1;
+	DECLARE_BITMAP(value_bitmap, mux_gpio->gpios->ndescs);
+	
+	*value_bitmap = state;
 
 	gpiod_set_array_value_cansleep(mux_gpio->gpios->ndescs,
-				       mux_gpio->gpios->desc,
-				       mux_gpio->val);
+				       mux_gpio->gpios->desc, value_bitmap);
 
 	return 0;
 }
@@ -58,13 +55,11 @@ static int mux_gpio_probe(struct platform_device *pdev)
 	if (pins < 0)
 		return pins;
 
-	mux_chip = devm_mux_chip_alloc(dev, 1, sizeof(*mux_gpio) +
-				       pins * sizeof(*mux_gpio->val));
+	mux_chip = devm_mux_chip_alloc(dev, 1, sizeof(*mux_gpio));
 	if (IS_ERR(mux_chip))
 		return PTR_ERR(mux_chip);
 
 	mux_gpio = mux_chip_priv(mux_chip);
-	mux_gpio->val = (int *)(mux_gpio + 1);
 	mux_chip->ops = &mux_gpio_ops;
 
 	mux_gpio->gpios = devm_gpiod_get_array(dev, "mux", GPIOD_OUT_LOW);
diff --git a/drivers/net/phy/mdio-mux-gpio.c b/drivers/net/phy/mdio-mux-gpio.c
index bc90764a8b8d..3907e9528949 100644
--- a/drivers/net/phy/mdio-mux-gpio.c
+++ b/drivers/net/phy/mdio-mux-gpio.c
@@ -20,23 +20,21 @@
 struct mdio_mux_gpio_state {
 	struct gpio_descs *gpios;
 	void *mux_handle;
-	int values[];
 };
 
 static int mdio_mux_gpio_switch_fn(int current_child, int desired_child,
 				   void *data)
 {
 	struct mdio_mux_gpio_state *s = data;
-	unsigned int n;
+	DECLARE_BITMAP(value_bitmap, s->gpios->ndescs);
 
 	if (current_child == desired_child)
 		return 0;
 
-	for (n = 0; n < s->gpios->ndescs; n++)
-		s->values[n] = (desired_child >> n) & 1;
+	*value_bitmap = desired_child;
 
 	gpiod_set_array_value_cansleep(s->gpios->ndescs, s->gpios->desc,
-				       s->values);
+				       value_bitmap);
 
 	return 0;
 }
@@ -51,8 +49,7 @@ static int mdio_mux_gpio_probe(struct platform_device *pdev)
 	if (IS_ERR(gpios))
 		return PTR_ERR(gpios);
 
-	s = devm_kzalloc(&pdev->dev, struct_size(s, values, gpios->ndescs),
-			 GFP_KERNEL);
+	s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL);
 	if (!s) {
 		gpiod_put_array(gpios);
 		return -ENOMEM;
diff --git a/drivers/pcmcia/soc_common.c b/drivers/pcmcia/soc_common.c
index c5f2344c189b..44288b1a57ee 100644
--- a/drivers/pcmcia/soc_common.c
+++ b/drivers/pcmcia/soc_common.c
@@ -351,19 +351,22 @@ static int soc_common_pcmcia_config_skt(
 
 	if (ret == 0) {
 		struct gpio_desc *descs[2];
-		int values[2], n = 0;
+		DECLARE_BITMAP(value_bitmap, 2);
+		int n = 0;
 
 		if (skt->gpio_reset) {
 			descs[n] = skt->gpio_reset;
-			values[n++] = !!(state->flags & SS_RESET);
+			__assign_bit(n++, value_bitmap,
+				     !!(state->flags & SS_RESET));
 		}
 		if (skt->gpio_bus_enable) {
 			descs[n] = skt->gpio_bus_enable;
-			values[n++] = !!(state->flags & SS_OUTPUT_ENA);
+			__assign_bit(n++, value_bitmap,
+				     !!(state->flags & SS_OUTPUT_ENA));
 		}
 
 		if (n)
-			gpiod_set_array_value_cansleep(n, descs, values);
+			gpiod_set_array_value_cansleep(n, descs, value_bitmap);
 
 		/*
 		 * This really needs a better solution.  The IRQ
diff --git a/drivers/phy/motorola/phy-mapphone-mdm6600.c b/drivers/phy/motorola/phy-mapphone-mdm6600.c
index 0075fb0bef8c..b348272f7f1a 100644
--- a/drivers/phy/motorola/phy-mapphone-mdm6600.c
+++ b/drivers/phy/motorola/phy-mapphone-mdm6600.c
@@ -157,15 +157,12 @@ static const struct phy_ops gpio_usb_ops = {
  */
 static void phy_mdm6600_cmd(struct phy_mdm6600 *ddata, int val)
 {
-	int values[PHY_MDM6600_NR_CMD_LINES];
-	int i;
+	DECLARE_BITMAP(value_bitmap, PHY_MDM6600_NR_CMD_LINES);
 
-	val &= (1 << PHY_MDM6600_NR_CMD_LINES) - 1;
-	for (i = 0; i < PHY_MDM6600_NR_CMD_LINES; i++)
-		values[i] = (val & BIT(i)) >> i;
+	*value_bitmap = val & ((1 << PHY_MDM6600_NR_CMD_LINES) - 1);
 
 	gpiod_set_array_value_cansleep(PHY_MDM6600_NR_CMD_LINES,
-				       ddata->cmd_gpios->desc, values);
+				       ddata->cmd_gpios->desc, value_bitmap);
 }
 
 /**
@@ -176,7 +173,7 @@ static void phy_mdm6600_status(struct work_struct *work)
 {
 	struct phy_mdm6600 *ddata;
 	struct device *dev;
-	int values[PHY_MDM6600_NR_STATUS_LINES];
+	DECLARE_BITMAP(value_bitmap, PHY_MDM6600_NR_STATUS_LINES);
 	int error, i, val = 0;
 
 	ddata = container_of(work, struct phy_mdm6600, status_work.work);
@@ -184,14 +181,14 @@ static void phy_mdm6600_status(struct work_struct *work)
 
 	error = gpiod_get_array_value_cansleep(PHY_MDM6600_NR_STATUS_LINES,
 					       ddata->status_gpios->desc,
-					       values);
+					       value_bitmap);
 	if (error)
 		return;
 
 	for (i = 0; i < PHY_MDM6600_NR_STATUS_LINES; i++) {
-		val |= values[i] << i;
+		val |= test_bit(i, value_bitmap) << i;
 		dev_dbg(ddata->dev, "XXX %s: i: %i values[i]: %i val: %i\n",
-			__func__, i, values[i], val);
+			__func__, i, test_bit(i, value_bitmap), val);
 	}
 	ddata->status = val;
 
diff --git a/drivers/staging/iio/adc/ad7606.c b/drivers/staging/iio/adc/ad7606.c
index 25b9fcd5e3a4..18b6d10b0176 100644
--- a/drivers/staging/iio/adc/ad7606.c
+++ b/drivers/staging/iio/adc/ad7606.c
@@ -202,7 +202,7 @@ static int ad7606_write_raw(struct iio_dev *indio_dev,
 			    long mask)
 {
 	struct ad7606_state *st = iio_priv(indio_dev);
-	int values[3];
+	DECLARE_BITMAP(value_bitmap, 3);
 	int ret, i;
 
 	switch (mask) {
@@ -227,13 +227,10 @@ static int ad7606_write_raw(struct iio_dev *indio_dev,
 		if (ret < 0)
 			return ret;
 
-		values[0] = (ret >> 0) & 1;
-		values[1] = (ret >> 1) & 1;
-		values[2] = (ret >> 2) & 1;
+		*value_bitmap = ret;
 
 		mutex_lock(&st->lock);
-		gpiod_set_array_value(ARRAY_SIZE(values), st->gpio_os->desc,
-				      values);
+		gpiod_set_array_value(3, st->gpio_os->desc, value_bitmap);
 		st->oversampling = val;
 		mutex_unlock(&st->lock);
 
diff --git a/drivers/tty/serial/serial_mctrl_gpio.c b/drivers/tty/serial/serial_mctrl_gpio.c
index 1c06325beaca..edfe8c688479 100644
--- a/drivers/tty/serial/serial_mctrl_gpio.c
+++ b/drivers/tty/serial/serial_mctrl_gpio.c
@@ -40,7 +40,7 @@ void mctrl_gpio_set(struct mctrl_gpios *gpios, unsigned int mctrl)
 {
 	enum mctrl_gpio_idx i;
 	struct gpio_desc *desc_array[UART_GPIO_MAX];
-	int value_array[UART_GPIO_MAX];
+	DECLARE_BITMAP(value_bitmap, UART_GPIO_MAX);
 	unsigned int count = 0;
 
 	if (gpios == NULL)
@@ -49,10 +49,11 @@ void mctrl_gpio_set(struct mctrl_gpios *gpios, unsigned int mctrl)
 	for (i = 0; i < UART_GPIO_MAX; i++)
 		if (gpios->gpio[i] && mctrl_gpios_desc[i].dir_out) {
 			desc_array[count] = gpios->gpio[i];
-			value_array[count] = !!(mctrl & mctrl_gpios_desc[i].mctrl);
+			__assign_bit(count, value_bitmap,
+				     !!(mctrl & mctrl_gpios_desc[i].mctrl));
 			count++;
 		}
-	gpiod_set_array_value(count, desc_array, value_array);
+	gpiod_set_array_value(count, desc_array, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(mctrl_gpio_set);
 
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 21ddbe440030..1b21dc7b0fad 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -104,36 +104,38 @@ int gpiod_direction_output_raw(struct gpio_desc *desc, int value);
 /* Value get/set from non-sleeping context */
 int gpiod_get_value(const struct gpio_desc *desc);
 int gpiod_get_array_value(unsigned int array_size,
-			  struct gpio_desc **desc_array, int *value_array);
+			  struct gpio_desc **desc_array,
+			  unsigned long *value_bitmap);
 void gpiod_set_value(struct gpio_desc *desc, int value);
 void gpiod_set_array_value(unsigned int array_size,
-			   struct gpio_desc **desc_array, int *value_array);
+			   struct gpio_desc **desc_array,
+			   unsigned long *value_bitmap);
 int gpiod_get_raw_value(const struct gpio_desc *desc);
 int gpiod_get_raw_array_value(unsigned int array_size,
 			      struct gpio_desc **desc_array,
-			      int *value_array);
+			      unsigned long *value_bitmap);
 void gpiod_set_raw_value(struct gpio_desc *desc, int value);
 int gpiod_set_raw_array_value(unsigned int array_size,
 			       struct gpio_desc **desc_array,
-			       int *value_array);
+			       unsigned long *value_bitmap);
 
 /* Value get/set from sleeping context */
 int gpiod_get_value_cansleep(const struct gpio_desc *desc);
 int gpiod_get_array_value_cansleep(unsigned int array_size,
 				   struct gpio_desc **desc_array,
-				   int *value_array);
+				   unsigned long *value_bitmap);
 void gpiod_set_value_cansleep(struct gpio_desc *desc, int value);
 void gpiod_set_array_value_cansleep(unsigned int array_size,
 				    struct gpio_desc **desc_array,
-				    int *value_array);
+				    unsigned long *value_bitmap);
 int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc);
 int gpiod_get_raw_array_value_cansleep(unsigned int array_size,
 				       struct gpio_desc **desc_array,
-				       int *value_array);
+				       unsigned long *value_bitmap);
 void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value);
 int gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 					struct gpio_desc **desc_array,
-					int *value_array);
+					unsigned long *value_bitmap);
 
 int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce);
 int gpiod_set_transitory(struct gpio_desc *desc, bool transitory);
-- 
2.16.4

^ permalink raw reply related

* [PATCH v6 3/4] gpiolib: Pass array info to get/set array functions
From: Janusz Krzysztofik @ 2018-08-31 22:56 UTC (permalink / raw)
  To: Linus Walleij
  Cc: Jonathan Corbet, Miguel Ojeda Sandonis, Peter Korsgaard,
	Peter Rosin, Ulf Hansson, Andrew Lunn, Florian Fainelli,
	David S. Miller, Dominik Brodowski, Greg Kroah-Hartman,
	Kishon Vijay Abraham I, Lars-Peter Clausen, Michael Hennerich,
	Jonathan Cameron, Hartmut Knaack, Peter Meerwald-Stadler,
	Jiri Slaby, Willy Tarreau, Geert Uytterhoeven
In-Reply-To: <20180831225616.29221-1-jmkrzyszt@gmail.com>

In order to make use of array info obtained from gpiod_get_array() and
speed up processing of arrays matching single GPIO chip layout, that
information must be passed to get/set array functions.  Extend the
functions' API with that additional parameter and update all users.
Pass NULL if a user bulids an array itself from single GPIOs.

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
Cc: Peter Korsgaard <peter.korsgaard@barco.com>
Cc: Peter Rosin <peda@axentia.se>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Kishon Vijay Abraham I <kishon@ti.com>
Cc: Lars-Peter Clausen <lars@metafoo.de>
Cc: Michael Hennerich <Michael.Hennerich@analog.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: Hartmut Knaack <knaack.h@gmx.de>
Cc: Peter Meerwald-Stadler <pmeerw@pmeerw.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jiri Slaby <jslaby@suse.com>
Signed-off-by: Janusz Krzysztofik <jmkrzyszt@gmail.com>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 Documentation/driver-api/gpio/consumer.rst  | 14 ++++++++++--
 drivers/auxdisplay/hd44780.c                | 12 ++++++----
 drivers/bus/ts-nbus.c                       |  6 +++--
 drivers/gpio/gpio-max3191x.c                |  6 +++--
 drivers/gpio/gpiolib.c                      | 34 ++++++++++++++++++++---------
 drivers/gpio/gpiolib.h                      |  2 ++
 drivers/i2c/muxes/i2c-mux-gpio.c            |  2 +-
 drivers/mmc/core/pwrseq_simple.c            |  2 +-
 drivers/mux/gpio.c                          |  3 ++-
 drivers/net/phy/mdio-mux-gpio.c             |  2 +-
 drivers/pcmcia/soc_common.c                 |  3 ++-
 drivers/phy/motorola/phy-mapphone-mdm6600.c |  4 +++-
 drivers/staging/iio/adc/ad7606.c            |  3 ++-
 drivers/tty/serial/serial_mctrl_gpio.c      |  2 +-
 include/linux/gpio/consumer.h               |  8 +++++++
 15 files changed, 75 insertions(+), 28 deletions(-)

diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst
index 7e0298b9a7b9..0afd95a12b10 100644
--- a/Documentation/driver-api/gpio/consumer.rst
+++ b/Documentation/driver-api/gpio/consumer.rst
@@ -325,28 +325,36 @@ The following functions get or set the values of an array of GPIOs::
 
 	int gpiod_get_array_value(unsigned int array_size,
 				  struct gpio_desc **desc_array,
+				  struct gpio_array *array_info,
 				  unsigned long *value_bitmap);
 	int gpiod_get_raw_array_value(unsigned int array_size,
 				      struct gpio_desc **desc_array,
+				      struct gpio_array *array_info,
 				      unsigned long *value_bitmap);
 	int gpiod_get_array_value_cansleep(unsigned int array_size,
 					   struct gpio_desc **desc_array,
+					   struct gpio_array *array_info,
 					   unsigned long *value_bitmap);
 	int gpiod_get_raw_array_value_cansleep(unsigned int array_size,
 					   struct gpio_desc **desc_array,
+					   struct gpio_array *array_info,
 					   unsigned long *value_bitmap);
 
 	void gpiod_set_array_value(unsigned int array_size,
 				   struct gpio_desc **desc_array,
+				   struct gpio_array *array_info,
 				   unsigned long *value_bitmap)
 	void gpiod_set_raw_array_value(unsigned int array_size,
 				       struct gpio_desc **desc_array,
+				       struct gpio_array *array_info,
 				       unsigned long *value_bitmap)
 	void gpiod_set_array_value_cansleep(unsigned int array_size,
 					    struct gpio_desc **desc_array,
+					    struct gpio_array *array_info,
 					    unsigned long *value_bitmap)
 	void gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 						struct gpio_desc **desc_array,
+						struct gpio_array *array_info,
 						unsigned long *value_bitmap)
 
 The array can be an arbitrary set of GPIOs. The functions will try to access
@@ -358,6 +366,7 @@ accessed sequentially.
 The functions take three arguments:
 	* array_size	- the number of array elements
 	* desc_array	- an array of GPIO descriptors
+	* array_info	- optional information obtained from gpiod_array_get()
 	* value_bitmap	- a bitmap to store the GPIOs' values (get) or
 			  a bitmap of values to assign to the GPIOs (set)
 
@@ -368,12 +377,13 @@ the struct gpio_descs returned by gpiod_get_array()::
 
 	struct gpio_descs *my_gpio_descs = gpiod_get_array(...);
 	gpiod_set_array_value(my_gpio_descs->ndescs, my_gpio_descs->desc,
-			      my_gpio_value_bitmap);
+			      my_gpio_descs->info, my_gpio_value_bitmap);
 
 It is also possible to access a completely arbitrary array of descriptors. The
 descriptors may be obtained using any combination of gpiod_get() and
 gpiod_get_array(). Afterwards the array of descriptors has to be setup
-manually before it can be passed to one of the above functions.
+manually before it can be passed to one of the above functions.  In that case,
+array_info should be set to NULL.
 
 Note that for optimal performance GPIOs belonging to the same chip should be
 contiguous within the array of descriptors.
diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c
index 7ee4f9a65bfc..4134fd090206 100644
--- a/drivers/auxdisplay/hd44780.c
+++ b/drivers/auxdisplay/hd44780.c
@@ -70,7 +70,8 @@ static void hd44780_write_gpio8(struct hd44780 *hd, u8 val, unsigned int rs)
 	n = hd->pins[PIN_CTRL_RW] ? 10 : 9;
 
 	/* Present the data to the port */
-	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA0], value_bitmap);
+	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA0], NULL,
+				       value_bitmap);
 
 	hd44780_strobe_gpio(hd);
 }
@@ -87,7 +88,8 @@ static void hd44780_write_gpio4(struct hd44780 *hd, u8 val, unsigned int rs)
 	n = hd->pins[PIN_CTRL_RW] ? 6 : 5;
 
 	/* Present the data to the port */
-	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4], value_bitmap);
+	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4], NULL,
+				       value_bitmap);
 
 	hd44780_strobe_gpio(hd);
 
@@ -96,7 +98,8 @@ static void hd44780_write_gpio4(struct hd44780 *hd, u8 val, unsigned int rs)
 	*value_bitmap |= val & 0x0f;
 
 	/* Present the data to the port */
-	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4], value_bitmap);
+	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4], NULL,
+				       value_bitmap);
 
 	hd44780_strobe_gpio(hd);
 }
@@ -153,7 +156,8 @@ static void hd44780_write_cmd_raw_gpio4(struct charlcd *lcd, int cmd)
 	n = hd->pins[PIN_CTRL_RW] ? 6 : 5;
 
 	/* Present the data to the port */
-	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4], value_bitmap);
+	gpiod_set_array_value_cansleep(n, &hd->pins[PIN_DATA4], NULL,
+				       value_bitmap);
 
 	hd44780_strobe_gpio(hd);
 }
diff --git a/drivers/bus/ts-nbus.c b/drivers/bus/ts-nbus.c
index 2ac5239df865..0a8fa64a4050 100644
--- a/drivers/bus/ts-nbus.c
+++ b/drivers/bus/ts-nbus.c
@@ -114,7 +114,8 @@ static void ts_nbus_reset_bus(struct ts_nbus *ts_nbus)
 	
 	*value_bitmap = 0;
 
-	gpiod_set_array_value_cansleep(8, ts_nbus->data->desc, value_bitmap);
+	gpiod_set_array_value_cansleep(8, ts_nbus->data->desc,
+				       ts_nbus->data->info, value_bitmap);
 	gpiod_set_value_cansleep(ts_nbus->csn, 0);
 	gpiod_set_value_cansleep(ts_nbus->strobe, 0);
 	gpiod_set_value_cansleep(ts_nbus->ale, 0);
@@ -157,7 +158,8 @@ static void ts_nbus_write_byte(struct ts_nbus *ts_nbus, u8 byte)
 	struct gpio_descs *gpios = ts_nbus->data;
 	DECLARE_BITMAP(value_bitmap, 8) = { byte, };
 
-	gpiod_set_array_value_cansleep(8, gpios->desc, value_bitmap);
+	gpiod_set_array_value_cansleep(8, gpios->desc, gpios->info,
+				       value_bitmap);
 }
 
 /*
diff --git a/drivers/gpio/gpio-max3191x.c b/drivers/gpio/gpio-max3191x.c
index c4ec1c82af27..4b43b5dabfd2 100644
--- a/drivers/gpio/gpio-max3191x.c
+++ b/drivers/gpio/gpio-max3191x.c
@@ -313,6 +313,7 @@ static int max3191x_set_config(struct gpio_chip *gpio, unsigned int offset,
 
 static void gpiod_set_array_single_value_cansleep(unsigned int ndescs,
 						  struct gpio_desc **desc,
+						  struct gpio_array *info,
 						  int value)
 {
 	unsigned long *value_bitmap;
@@ -327,7 +328,7 @@ static void gpiod_set_array_single_value_cansleep(unsigned int ndescs,
 	else
 		bitmap_zero(value_bitmap, ndescs);
 
-	gpiod_set_array_value_cansleep(ndescs, desc, value_bitmap);
+	gpiod_set_array_value_cansleep(ndescs, desc, info, value_bitmap);
 	kfree(value_bitmap);
 }
 
@@ -400,7 +401,8 @@ static int max3191x_probe(struct spi_device *spi)
 	if (max3191x->modesel_pins)
 		gpiod_set_array_single_value_cansleep(
 				 max3191x->modesel_pins->ndescs,
-				 max3191x->modesel_pins->desc, max3191x->mode);
+				 max3191x->modesel_pins->desc,
+				 max3191x->modesel_pins->info, max3191x->mode);
 
 	max3191x->ignore_uv = device_property_read_bool(dev,
 						  "maxim,ignore-undervoltage");
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index c1ed1c759345..4d26cdbdb7cf 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -435,7 +435,7 @@ static long linehandle_ioctl(struct file *filep, unsigned int cmd,
 		int ret = gpiod_get_array_value_complex(false,
 							true,
 							lh->numdescs,
-							lh->descs,
+							lh->descs, NULL,
 							value_bitmap);
 		if (ret)
 			return ret;
@@ -467,7 +467,7 @@ static long linehandle_ioctl(struct file *filep, unsigned int cmd,
 		return gpiod_set_array_value_complex(false,
 					      true,
 					      lh->numdescs,
-					      lh->descs,
+					      lh->descs, NULL,
 					      value_bitmap);
 	}
 	return -EINVAL;
@@ -2784,6 +2784,7 @@ static int gpio_chip_get_multiple(struct gpio_chip *chip,
 int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 				  unsigned int array_size,
 				  struct gpio_desc **desc_array,
+				  struct gpio_array *array_info,
 				  unsigned long *value_bitmap)
 {
 	int i = 0;
@@ -2908,12 +2909,14 @@ EXPORT_SYMBOL_GPL(gpiod_get_value);
  */
 int gpiod_get_raw_array_value(unsigned int array_size,
 			      struct gpio_desc **desc_array,
+			      struct gpio_array *array_info,
 			      unsigned long *value_bitmap)
 {
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_get_array_value_complex(true, false, array_size,
-					     desc_array, value_bitmap);
+					     desc_array, array_info,
+					     value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value);
 
@@ -2931,12 +2934,14 @@ EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value);
  */
 int gpiod_get_array_value(unsigned int array_size,
 			  struct gpio_desc **desc_array,
+			  struct gpio_array *array_info,
 			  unsigned long *value_bitmap)
 {
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_get_array_value_complex(false, false, array_size,
-					     desc_array, value_bitmap);
+					     desc_array, array_info,
+					     value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_get_array_value);
 
@@ -3029,6 +3034,7 @@ static void gpio_chip_set_multiple(struct gpio_chip *chip,
 int gpiod_set_array_value_complex(bool raw, bool can_sleep,
 				   unsigned int array_size,
 				   struct gpio_desc **desc_array,
+				   struct gpio_array *array_info,
 				   unsigned long *value_bitmap)
 {
 	int i = 0;
@@ -3166,12 +3172,13 @@ EXPORT_SYMBOL_GPL(gpiod_set_value);
  */
 int gpiod_set_raw_array_value(unsigned int array_size,
 			 struct gpio_desc **desc_array,
+			 struct gpio_array *array_info,
 			 unsigned long *value_bitmap)
 {
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_set_array_value_complex(true, false, array_size,
-					desc_array, value_bitmap);
+					desc_array, array_info, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value);
 
@@ -3189,12 +3196,13 @@ EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value);
  */
 void gpiod_set_array_value(unsigned int array_size,
 			   struct gpio_desc **desc_array,
+			   struct gpio_array *array_info,
 			   unsigned long *value_bitmap)
 {
 	if (!desc_array)
 		return;
 	gpiod_set_array_value_complex(false, false, array_size, desc_array,
-				      value_bitmap);
+				      array_info, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_array_value);
 
@@ -3426,13 +3434,15 @@ EXPORT_SYMBOL_GPL(gpiod_get_value_cansleep);
  */
 int gpiod_get_raw_array_value_cansleep(unsigned int array_size,
 				       struct gpio_desc **desc_array,
+				       struct gpio_array *array_info,
 				       unsigned long *value_bitmap)
 {
 	might_sleep_if(extra_checks);
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_get_array_value_complex(true, true, array_size,
-					     desc_array, value_bitmap);
+					     desc_array, array_info,
+					     value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value_cansleep);
 
@@ -3449,13 +3459,15 @@ EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value_cansleep);
  */
 int gpiod_get_array_value_cansleep(unsigned int array_size,
 				   struct gpio_desc **desc_array,
+				   struct gpio_array *array_info,
 				   unsigned long *value_bitmap)
 {
 	might_sleep_if(extra_checks);
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_get_array_value_complex(false, true, array_size,
-					     desc_array, value_bitmap);
+					     desc_array, array_info,
+					     value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_get_array_value_cansleep);
 
@@ -3508,13 +3520,14 @@ EXPORT_SYMBOL_GPL(gpiod_set_value_cansleep);
  */
 int gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 					struct gpio_desc **desc_array,
+					struct gpio_array *array_info,
 					unsigned long *value_bitmap)
 {
 	might_sleep_if(extra_checks);
 	if (!desc_array)
 		return -EINVAL;
 	return gpiod_set_array_value_complex(true, true, array_size, desc_array,
-				      value_bitmap);
+				      array_info, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value_cansleep);
 
@@ -3548,13 +3561,14 @@ void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n)
  */
 void gpiod_set_array_value_cansleep(unsigned int array_size,
 				    struct gpio_desc **desc_array,
+				    struct gpio_array *array_info,
 				    unsigned long *value_bitmap)
 {
 	might_sleep_if(extra_checks);
 	if (!desc_array)
 		return;
 	gpiod_set_array_value_complex(false, true, array_size, desc_array,
-				      value_bitmap);
+				      array_info, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(gpiod_set_array_value_cansleep);
 
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index b60905d558b1..b65ca896b24d 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -196,10 +196,12 @@ struct gpio_desc *gpiochip_get_desc(struct gpio_chip *chip, u16 hwnum);
 int gpiod_get_array_value_complex(bool raw, bool can_sleep,
 				  unsigned int array_size,
 				  struct gpio_desc **desc_array,
+				  struct gpio_array *array_info,
 				  unsigned long *value_bitmap);
 int gpiod_set_array_value_complex(bool raw, bool can_sleep,
 				   unsigned int array_size,
 				   struct gpio_desc **desc_array,
+				   struct gpio_array *array_info,
 				   unsigned long *value_bitmap);
 
 /* This is just passed between gpiolib and devres */
diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c
index e28ddc20000d..ac77519d4a82 100644
--- a/drivers/i2c/muxes/i2c-mux-gpio.c
+++ b/drivers/i2c/muxes/i2c-mux-gpio.c
@@ -31,7 +31,7 @@ static void i2c_mux_gpio_set(const struct gpiomux *mux, unsigned val)
 	*value_bitmap = val;
 
 	gpiod_set_array_value_cansleep(mux->data.n_gpios,
-				       mux->gpios, value_bitmap);
+				       mux->gpios, NULL, value_bitmap);
 }
 
 static int i2c_mux_gpio_select(struct i2c_mux_core *muxc, u32 chan)
diff --git a/drivers/mmc/core/pwrseq_simple.c b/drivers/mmc/core/pwrseq_simple.c
index fd7791e072b3..a57f8a7d5266 100644
--- a/drivers/mmc/core/pwrseq_simple.c
+++ b/drivers/mmc/core/pwrseq_simple.c
@@ -46,7 +46,7 @@ static void mmc_pwrseq_simple_set_gpios_value(struct mmc_pwrseq_simple *pwrseq,
 		*value_bitmap = value;
 
 		gpiod_set_array_value_cansleep(nvalues, reset_gpios->desc,
-					       value_bitmap);
+					       reset_gpios->info, value_bitmap);
 	}
 }
 
diff --git a/drivers/mux/gpio.c b/drivers/mux/gpio.c
index 17b7b1c21d90..728323207b8c 100644
--- a/drivers/mux/gpio.c
+++ b/drivers/mux/gpio.c
@@ -27,7 +27,8 @@ static int mux_gpio_set(struct mux_control *mux, int state)
 	*value_bitmap = state;
 
 	gpiod_set_array_value_cansleep(mux_gpio->gpios->ndescs,
-				       mux_gpio->gpios->desc, value_bitmap);
+				       mux_gpio->gpios->desc,
+				       mux_gpio->gpios->info, value_bitmap);
 
 	return 0;
 }
diff --git a/drivers/net/phy/mdio-mux-gpio.c b/drivers/net/phy/mdio-mux-gpio.c
index 3907e9528949..42f5c253b62c 100644
--- a/drivers/net/phy/mdio-mux-gpio.c
+++ b/drivers/net/phy/mdio-mux-gpio.c
@@ -34,7 +34,7 @@ static int mdio_mux_gpio_switch_fn(int current_child, int desired_child,
 	*value_bitmap = desired_child;
 
 	gpiod_set_array_value_cansleep(s->gpios->ndescs, s->gpios->desc,
-				       value_bitmap);
+				       s->gpios->info, value_bitmap);
 
 	return 0;
 }
diff --git a/drivers/pcmcia/soc_common.c b/drivers/pcmcia/soc_common.c
index 44288b1a57ee..75c3e0bb2737 100644
--- a/drivers/pcmcia/soc_common.c
+++ b/drivers/pcmcia/soc_common.c
@@ -366,7 +366,8 @@ static int soc_common_pcmcia_config_skt(
 		}
 
 		if (n)
-			gpiod_set_array_value_cansleep(n, descs, value_bitmap);
+			gpiod_set_array_value_cansleep(n, descs, NULL,
+						       value_bitmap);
 
 		/*
 		 * This really needs a better solution.  The IRQ
diff --git a/drivers/phy/motorola/phy-mapphone-mdm6600.c b/drivers/phy/motorola/phy-mapphone-mdm6600.c
index b348272f7f1a..5611a04bc286 100644
--- a/drivers/phy/motorola/phy-mapphone-mdm6600.c
+++ b/drivers/phy/motorola/phy-mapphone-mdm6600.c
@@ -162,7 +162,8 @@ static void phy_mdm6600_cmd(struct phy_mdm6600 *ddata, int val)
 	*value_bitmap = val & ((1 << PHY_MDM6600_NR_CMD_LINES) - 1);
 
 	gpiod_set_array_value_cansleep(PHY_MDM6600_NR_CMD_LINES,
-				       ddata->cmd_gpios->desc, value_bitmap);
+				       ddata->cmd_gpios->desc,
+				       ddata->cmd_gpios->info, value_bitmap);
 }
 
 /**
@@ -181,6 +182,7 @@ static void phy_mdm6600_status(struct work_struct *work)
 
 	error = gpiod_get_array_value_cansleep(PHY_MDM6600_NR_STATUS_LINES,
 					       ddata->status_gpios->desc,
+					       ddata->status_gpios->info,
 					       value_bitmap);
 	if (error)
 		return;
diff --git a/drivers/staging/iio/adc/ad7606.c b/drivers/staging/iio/adc/ad7606.c
index 18b6d10b0176..9115b5ee3601 100644
--- a/drivers/staging/iio/adc/ad7606.c
+++ b/drivers/staging/iio/adc/ad7606.c
@@ -230,7 +230,8 @@ static int ad7606_write_raw(struct iio_dev *indio_dev,
 		*value_bitmap = ret;
 
 		mutex_lock(&st->lock);
-		gpiod_set_array_value(3, st->gpio_os->desc, value_bitmap);
+		gpiod_set_array_value(3, st->gpio_os->desc, st->gpio_os->info,
+				      value_bitmap);
 		st->oversampling = val;
 		mutex_unlock(&st->lock);
 
diff --git a/drivers/tty/serial/serial_mctrl_gpio.c b/drivers/tty/serial/serial_mctrl_gpio.c
index edfe8c688479..66408108a951 100644
--- a/drivers/tty/serial/serial_mctrl_gpio.c
+++ b/drivers/tty/serial/serial_mctrl_gpio.c
@@ -53,7 +53,7 @@ void mctrl_gpio_set(struct mctrl_gpios *gpios, unsigned int mctrl)
 				     !!(mctrl & mctrl_gpios_desc[i].mctrl));
 			count++;
 		}
-	gpiod_set_array_value(count, desc_array, value_bitmap);
+	gpiod_set_array_value(count, desc_array, NULL, value_bitmap);
 }
 EXPORT_SYMBOL_GPL(mctrl_gpio_set);
 
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 8dede3e886af..bf037ebe2ed8 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -114,36 +114,44 @@ int gpiod_direction_output_raw(struct gpio_desc *desc, int value);
 int gpiod_get_value(const struct gpio_desc *desc);
 int gpiod_get_array_value(unsigned int array_size,
 			  struct gpio_desc **desc_array,
+			  struct gpio_array *array_info,
 			  unsigned long *value_bitmap);
 void gpiod_set_value(struct gpio_desc *desc, int value);
 void gpiod_set_array_value(unsigned int array_size,
 			   struct gpio_desc **desc_array,
+			   struct gpio_array *array_info,
 			   unsigned long *value_bitmap);
 int gpiod_get_raw_value(const struct gpio_desc *desc);
 int gpiod_get_raw_array_value(unsigned int array_size,
 			      struct gpio_desc **desc_array,
+			      struct gpio_array *array_info,
 			      unsigned long *value_bitmap);
 void gpiod_set_raw_value(struct gpio_desc *desc, int value);
 int gpiod_set_raw_array_value(unsigned int array_size,
 			       struct gpio_desc **desc_array,
+			       struct gpio_array *array_info,
 			       unsigned long *value_bitmap);
 
 /* Value get/set from sleeping context */
 int gpiod_get_value_cansleep(const struct gpio_desc *desc);
 int gpiod_get_array_value_cansleep(unsigned int array_size,
 				   struct gpio_desc **desc_array,
+				   struct gpio_array *array_info,
 				   unsigned long *value_bitmap);
 void gpiod_set_value_cansleep(struct gpio_desc *desc, int value);
 void gpiod_set_array_value_cansleep(unsigned int array_size,
 				    struct gpio_desc **desc_array,
+				    struct gpio_array *array_info,
 				    unsigned long *value_bitmap);
 int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc);
 int gpiod_get_raw_array_value_cansleep(unsigned int array_size,
 				       struct gpio_desc **desc_array,
+				       struct gpio_array *array_info,
 				       unsigned long *value_bitmap);
 void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value);
 int gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 					struct gpio_desc **desc_array,
+					struct gpio_array *array_info,
 					unsigned long *value_bitmap);
 
 int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce);
-- 
2.16.4

^ permalink raw reply related

* [PATCH net-next] net: bgmac: remove set but not used variable 'err'
From: YueHaibing @ 2018-09-01  3:13 UTC (permalink / raw)
  To: David S. Miller, Florian Fainelli, Andrew Lunn, Colin Ian King
  Cc: YueHaibing, netdev, linux-kernel, kernel-janitors

Fixes gcc '-Wunused-but-set-variable' warning:

drivers/net/ethernet/broadcom/bgmac.c: In function 'bgmac_dma_alloc':
drivers/net/ethernet/broadcom/bgmac.c:619:6: warning:
 variable 'err' set but not used [-Wunused-but-set-variable]

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/ethernet/broadcom/bgmac.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c
index 4c94d92..cabc8e4 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -616,7 +616,6 @@ static int bgmac_dma_alloc(struct bgmac *bgmac)
 	static const u16 ring_base[] = { BGMAC_DMA_BASE0, BGMAC_DMA_BASE1,
 					 BGMAC_DMA_BASE2, BGMAC_DMA_BASE3, };
 	int size; /* ring size: different for Tx and Rx */
-	int err;
 	int i;
 
 	BUILD_BUG_ON(BGMAC_MAX_TX_RINGS > ARRAY_SIZE(ring_base));
@@ -666,7 +665,6 @@ static int bgmac_dma_alloc(struct bgmac *bgmac)
 		if (!ring->cpu_base) {
 			dev_err(bgmac->dev, "Allocation of RX ring 0x%X failed\n",
 				ring->mmio_base);
-			err = -ENOMEM;
 			goto err_dma_free;
 		}

^ permalink raw reply related

* Re: [PATCH v2] rsi: remove set but not used variable 'header_size'
From: Siva Rebbagondla @ 2018-09-01  3:41 UTC (permalink / raw)
  To: yuehaibing
  Cc: Kalle Valo, davem, linux-kernel, netdev, Amit karwar,
	pavani.muthyala, keescook, Linux Wireless
In-Reply-To: <20180831111325.16984-1-yuehaibing@huawei.com>

On Fri, Aug 31, 2018 at 4:45 PM YueHaibing <yuehaibing@huawei.com> wrote:
>
> Fixes gcc '-Wunused-but-set-variable' warning:
>
> drivers/net/wireless/rsi/rsi_91x_hal.c: In function 'rsi_send_data_pkt':
> drivers/net/wireless/rsi/rsi_91x_hal.c:288:5: warning:
>  variable 'header_size' set but not used [-Wunused-but-set-variable]
>
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
> ---
> v2: remove unused 'tx_params'
> ---
>  drivers/net/wireless/rsi/rsi_91x_hal.c | 4 ----
>  1 file changed, 4 deletions(-)
>
> diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c
> index 01edf96..182b066 100644
> --- a/drivers/net/wireless/rsi/rsi_91x_hal.c
> +++ b/drivers/net/wireless/rsi/rsi_91x_hal.c
> @@ -282,10 +282,8 @@ int rsi_send_data_pkt(struct rsi_common *common, struct sk_buff *skb)
>         struct rsi_hw *adapter = common->priv;
>         struct ieee80211_vif *vif;
>         struct ieee80211_tx_info *info;
> -       struct skb_info *tx_params;
>         struct ieee80211_bss_conf *bss;
>         int status = -EINVAL;
> -       u8 header_size;
>
>         if (!skb)
>                 return 0;
> @@ -297,8 +295,6 @@ int rsi_send_data_pkt(struct rsi_common *common, struct sk_buff *skb)
>                 goto err;
>         vif = info->control.vif;
>         bss = &vif->bss_conf;
> -       tx_params = (struct skb_info *)info->driver_data;
> -       header_size = tx_params->internal_hdr_size;
Yes, These redundant variables shall be removed.
>
>         if (((vif->type == NL80211_IFTYPE_STATION) ||
>              (vif->type == NL80211_IFTYPE_P2P_CLIENT)) &&
> --
> 2.7.0
>
>
Also, Patch title also can be changed to "[v2] rsi: remove unused variables"

Thanks,
Siva Rebbagondla

^ permalink raw reply

* Re: [PATCH bpf-next] adding selftest for bpf's (set|get)_sockopt for SAVE_SYN
From: Daniel Borkmann @ 2018-08-31 23:41 UTC (permalink / raw)
  To: Nikita V. Shirokov, Alexei Starovoitov; +Cc: netdev
In-Reply-To: <20180831164347.507308-1-tehnerd@fb.com>

On 08/31/2018 06:43 PM, Nikita V. Shirokov wrote:
> adding selftest for feature, introduced in
> commit 9452048c79404 ("bpf: add TCP_SAVE_SYN/TCP_SAVED_SYN options for
> bpf_(set|get)sockopt")
> 
> Signed-off-by: Nikita V. Shirokov <tehnerd@fb.com>

Applied to bpf-next. Nikita, please also add a proper subsystem prefix to
your patches in future, I've fixed them all up manually this time.

^ permalink raw reply

* Re: [PATCH bpf-next v2 0/2] xsk: misc code cleanup
From: Daniel Borkmann @ 2018-08-31 23:43 UTC (permalink / raw)
  To: Magnus Karlsson, bjorn.topel, ast, netdev, jeffrey.t.kirsher
In-Reply-To: <1535715602-29056-1-git-send-email-magnus.karlsson@intel.com>

On 08/31/2018 01:40 PM, Magnus Karlsson wrote:
> This patch set cleans up two code style issues with the xsk zero-copy
> code. The resulting code is smaller and simpler.
> 
> Changes from v1:
> 
> * Fixed bisecatbility problem reported by Daniel Borkmann by squashing
>   the two last patches into one.
> 
> Patch 1: Removes a potential compiler warning reported by the Intel
>          0-DAY kernel test infrastructure.
> Patch 2: Removes the xdp_umem_props structure. At some point, it
>          was used to break a dependency, but the members are these
>          days much better off in the xdp_umem since the dependency
>          does not exist anymore. Also adapts the i40e driver to this
> 	 new interface.
> 
> I based this patch set on bpf-next commit 9c4f39811db8 ("samples/bpf:
> xdpsock, minor fixes")
> 
> Thanks: Magnus

Applied to bpf-next, thanks Magnus!

^ permalink raw reply

* pull-request: bpf-next 2018-09-01
From: Daniel Borkmann @ 2018-09-01  0:05 UTC (permalink / raw)
  To: davem; +Cc: daniel, ast, netdev

Hi David,

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Add AF_XDP zero-copy support for i40e driver (!), from Björn and Magnus.

2) BPF verifier improvements by giving each register its own liveness
   chain which allows to simplify and getting rid of skip_callee() logic,
   from Edward.

3) Add bpf fs pretty print support for percpu arraymap, percpu hashmap
   and percpu lru hashmap. Also add generic percpu formatted print on
   bpftool so the same can be dumped there, from Yonghong.

4) Add bpf_{set,get}sockopt() helper support for TCP_SAVE_SYN and
   TCP_SAVED_SYN options to allow reflection of tos/tclass from received
   SYN packet, from Nikita.

5) Misc improvements to the BPF sockmap test cases in terms of cgroup v2
   interaction and removal of incorrect shutdown() calls, from John.

6) Few cleanups in xdp_umem_assign_dev() and xdpsock samples, from Prashant.

Please consider pulling these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git

Thanks a lot!

----------------------------------------------------------------

The following changes since commit 817e60a7a2bb1f22052f18562990d675cb3a3762:

  Merge branch 'nfp-add-NFP5000-support' (2018-08-28 16:01:48 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git 

for you to fetch changes up to 93ee30f3e8b412c5fc2d2f7d9d002529d9a209ad:

  xsk: i40e: get rid of useless struct xdp_umem_props (2018-09-01 01:38:16 +0200)

----------------------------------------------------------------
Alexei Starovoitov (2):
      Merge branch 'AF_XDP-zerocopy-for-i40e'
      Merge branch 'verifier-liveness-simplification'

Björn Töpel (9):
      xdp: implement convert_to_xdp_frame for MEM_TYPE_ZERO_COPY
      xdp: export xdp_rxq_info_unreg_mem_model
      xsk: expose xdp_umem_get_{data,dma} to drivers
      i40e: added queue pair disable/enable functions
      i40e: refactor Rx path for re-use
      i40e: move common Rx functions to i40e_txrx_common.h
      i40e: add AF_XDP zero-copy Rx support
      samples/bpf: add -c/--copy -z/--zero-copy flags to xdpsock
      xsk: include XDP meta data in AF_XDP frames

Colin Ian King (1):
      xdp: remove redundant variable 'headroom'

Daniel Borkmann (1):
      Merge branch 'bpf-bpffs-bpftool-dump-with-btf'

Edward Cree (2):
      bpf/verifier: per-register parent pointers
      bpf/verifier: display non-spill stack slot types in print_verifier_state

John Fastabend (2):
      bpf: sockmap test remove shutdown() calls
      bpf: use --cgroup in test_suite if supplied

Magnus Karlsson (5):
      net: add napi_if_scheduled_mark_missed
      i40e: move common Tx functions to i40e_txrx_common.h
      i40e: add AF_XDP zero-copy Tx support
      i40e: fix possible compiler warning in xsk TX path
      xsk: i40e: get rid of useless struct xdp_umem_props

Nikita V. Shirokov (3):
      bpf: add TCP_SAVE_SYN/TCP_SAVED_SYN options for bpf_(set|get)sockopt
      bpf: add TCP_SAVE_SYN/TCP_SAVED_SYN sample program
      bpf: add selftest for bpf's (set|get)_sockopt for SAVE_SYN

Prashant Bhole (2):
      xsk: remove unnecessary assignment
      samples/bpf: xdpsock, minor fixes

Yonghong Song (3):
      bpf: add bpffs pretty print for percpu arraymap/hash/lru_hash
      tools/bpf: add bpffs percpu map pretty print tests in test_btf
      tools/bpf: bpftool: add btf percpu map formated dump

YueHaibing (1):
      bpf: remove duplicated include from syscall.c

 drivers/net/ethernet/intel/i40e/Makefile           |   3 +-
 drivers/net/ethernet/intel/i40e/i40e.h             |  19 +
 drivers/net/ethernet/intel/i40e/i40e_main.c        | 307 +++++++-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        | 182 +++--
 drivers/net/ethernet/intel/i40e/i40e_txrx.h        |  20 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx_common.h |  90 +++
 drivers/net/ethernet/intel/i40e/i40e_xsk.c         | 832 +++++++++++++++++++++
 drivers/net/ethernet/intel/i40e/i40e_xsk.h         |  25 +
 include/linux/bpf_verifier.h                       |   8 +-
 include/linux/netdevice.h                          |  26 +
 include/net/xdp.h                                  |   6 +-
 include/net/xdp_sock.h                             |  51 +-
 kernel/bpf/arraymap.c                              |  24 +
 kernel/bpf/hashtab.c                               |  31 +
 kernel/bpf/syscall.c                               |   1 -
 kernel/bpf/verifier.c                              | 216 ++----
 net/core/filter.c                                  |  25 +-
 net/core/xdp.c                                     |  53 +-
 net/xdp/xdp_umem.c                                 |   6 +-
 net/xdp/xdp_umem.h                                 |  10 -
 net/xdp/xdp_umem_props.h                           |  14 -
 net/xdp/xsk.c                                      |  34 +-
 net/xdp/xsk_queue.c                                |   5 +-
 net/xdp/xsk_queue.h                                |  13 +-
 samples/bpf/Makefile                               |   1 +
 samples/bpf/tcp_tos_reflect_kern.c                 |  87 +++
 samples/bpf/xdpsock_kern.c                         |   2 +-
 samples/bpf/xdpsock_user.c                         |  15 +-
 tools/bpf/bpftool/map.c                            |  33 +-
 tools/testing/selftests/bpf/test_btf.c             | 179 ++++-
 tools/testing/selftests/bpf/test_sockmap.c         |  56 +-
 tools/testing/selftests/bpf/test_tcpbpf_kern.c     |  38 +-
 tools/testing/selftests/bpf/test_tcpbpf_user.c     |  31 +-
 33 files changed, 2067 insertions(+), 376 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
 create mode 100644 drivers/net/ethernet/intel/i40e/i40e_xsk.c
 create mode 100644 drivers/net/ethernet/intel/i40e/i40e_xsk.h
 delete mode 100644 net/xdp/xdp_umem_props.h
 create mode 100644 samples/bpf/tcp_tos_reflect_kern.c

^ permalink raw reply

* [PATCH RFC net-next 03/18] net/ipv4: export fib_info_update_nh_saddr
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add scope as input argument versus relying on fib_info reference in
fib_nh and export fib_info_update_nh_saddr.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip_fib.h     | 5 +++--
 net/ipv4/fib_semantics.c | 9 ++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f1c053cf9489..a4a129344098 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -173,13 +173,14 @@ struct fib_result_nl {
 #define FIB_TABLE_HASHSZ 2
 #endif
 
-__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
+__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh,
+				unsigned char scope);
 
 #define FIB_RES_SADDR(net, res)				\
 	((FIB_RES_NH(res).nh_saddr_genid ==		\
 	  atomic_read(&(net)->ipv4.dev_addr_genid)) ?	\
 	 FIB_RES_NH(res).nh_saddr :			\
-	 fib_info_update_nh_saddr((net), &FIB_RES_NH(res)))
+	 fib_info_update_nh_saddr((net), &FIB_RES_NH(res), (res).fi->fib_scope))
 #define FIB_RES_GW(res)			(FIB_RES_NH(res).nh_gw)
 #define FIB_RES_DEV(res)		(FIB_RES_NH(res).nh_dev)
 #define FIB_RES_OIF(res)		(FIB_RES_NH(res).nh_oif)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 7bead7c03e1b..c034d0adf590 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -984,11 +984,10 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
 	fib_info_hash_free(old_laddrhash, bytes);
 }
 
-__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
+__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh,
+				unsigned char scope)
 {
-	nh->nh_saddr = inet_select_addr(nh->nh_dev,
-					nh->nh_gw,
-					nh->nh_parent->fib_scope);
+	nh->nh_saddr = inet_select_addr(nh->nh_dev, nh->nh_gw, scope);
 	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
 
 	return nh->nh_saddr;
@@ -1238,7 +1237,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 	}
 
 	change_nexthops(fi) {
-		fib_info_update_nh_saddr(net, nexthop_nh);
+		fib_info_update_nh_saddr(net, nexthop_nh, fi->fib_scope);
 	} endfor_nexthops(fi)
 
 	fib_rebalance(fi);
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 02/18] net: ipv4: export fib_good_nh and fib_flush
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Export fib_good_nh for use by the nexthop code when selecting a path
within a multipath nexthop.

As nexthops are deleted, fib entries referencing it are marked dead.
Export fib_flush so those entries can be removed in a timely
manner.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip_fib.h     | 2 ++
 net/ipv4/fib_frontend.c  | 2 +-
 net/ipv4/fib_semantics.c | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 69c91d1934c1..f1c053cf9489 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -399,6 +399,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 		       const struct sk_buff *skb, struct flow_keys *flkeys);
 #endif
+bool fib_good_nh(const struct fib_nh *nh);
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
 		     struct flowi4 *fl4, const struct sk_buff *skb);
@@ -423,6 +424,7 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
 #endif
 }
 
+void fib_flush(struct net *net);
 void free_fib_info(struct fib_info *fi);
 
 static inline void fib_info_hold(struct fib_info *fi)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2998b0e47d4b..b0910d8c8bd4 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -192,7 +192,7 @@ int fib_unmerge(struct net *net)
 	return 0;
 }
 
-static void fib_flush(struct net *net)
+void fib_flush(struct net *net)
 {
 	int flushed = 0;
 	unsigned int h;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 93524a746ca8..7bead7c03e1b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1682,7 +1682,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-static bool fib_good_nh(const struct fib_nh *nh)
+bool fib_good_nh(const struct fib_nh *nh)
 {
 	int state = NUD_REACHABLE;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 09/18] net/ipv6: Create init and release helpers for fib6_nh
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Refactor initialization and cleanup of fib6_nh to helpers similar to
what was done for IPv4. Add fib6_nh_init to the ipv6 stubs for use by
core code when ipv6 is built as a module.

The replace helper is small enough, so make an inline rather than
requiring it to go through ipv6 stubs.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/addrconf.h   |   5 +
 include/net/ip6_fib.h    |  11 +++
 net/ipv6/addrconf_core.c |   9 ++
 net/ipv6/af_inet6.c      |   1 +
 net/ipv6/ip6_fib.c       |   5 +-
 net/ipv6/route.c         | 239 +++++++++++++++++++++++++----------------------
 6 files changed, 153 insertions(+), 117 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 6def0351bcc3..7748b8300ca0 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -2,6 +2,8 @@
 #ifndef _ADDRCONF_H
 #define _ADDRCONF_H
 
+#include <net/ip6_fib.h>
+
 #define MAX_RTR_SOLICITATIONS		-1		/* unlimited */
 #define RTR_SOLICITATION_INTERVAL	(4*HZ)
 #define RTR_SOLICITATION_MAX_INTERVAL	(3600*HZ)	/* 1 hour */
@@ -253,6 +255,9 @@ struct ipv6_stub {
 	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
 				 struct in6_addr *saddr);
 
+	int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh,
+			    struct fib6_config *cfg,
+			    struct netlink_ext_ack *extack);
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 3d4930528db0..2a1fae1247a9 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -22,6 +22,7 @@
 #include <net/netlink.h>
 #include <net/inetpeer.h>
 #include <net/fib_notifier.h>
+#include <net/lwtunnel.h>
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 #define FIB6_TABLE_HASHSZ 256
@@ -413,6 +414,16 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	     struct nl_info *info, struct netlink_ext_ack *extack);
 int fib6_del(struct fib6_info *rt, struct nl_info *info);
 
+int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+		 struct fib6_config *cfg, struct netlink_ext_ack *extack);
+static inline void fib6_nh_release(struct fib6_nh *fib6_nh)
+{
+	if (fib6_nh->nh_dev)
+		dev_put(fib6_nh->nh_dev);
+
+	lwtstate_put(fib6_nh->nh_lwtstate);
+}
+
 static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 {
 	return f6i->fib6_nh.nh_dev;
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 5cd0029d930e..f5c712136408 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -168,6 +168,14 @@ eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
 	return 0;
 }
 
+static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+				     struct fib6_config *cfg,
+				     struct netlink_ext_ack *extack)
+{
+	NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
+	return -EAFNOSUPPORT;
+}
+
 const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
 	.ipv6_dst_lookup   = eafnosupport_ipv6_dst_lookup,
 	.fib6_get_table    = eafnosupport_fib6_get_table,
@@ -175,6 +183,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
 	.fib6_lookup       = eafnosupport_fib6_lookup,
 	.fib6_multipath_select = eafnosupport_fib6_multipath_select,
 	.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
+	.fib6_nh_init	   = eafnosupport_fib6_nh_init,
 };
 EXPORT_SYMBOL_GPL(ipv6_stub);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 673bba31eb18..a5809bf7c229 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -895,6 +895,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
 	.fib6_lookup       = fib6_lookup,
 	.fib6_multipath_select = fib6_multipath_select,
 	.ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
+	.fib6_nh_init	   = fib6_nh_init,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
 	.nd_tbl	= &nd_tbl,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index c861a6d4671d..c1c23427a81e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -198,10 +198,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 		}
 	}
 
-	lwtstate_put(f6i->fib6_nh.nh_lwtstate);
-
-	if (f6i->fib6_nh.nh_dev)
-		dev_put(f6i->fib6_nh.nh_dev);
+	fib6_nh_release(&f6i->fib6_nh);
 
 	m = f6i->fib6_metrics;
 	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 07ed7812c6b4..aa44cd5b3217 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2844,9 +2844,11 @@ static int ip6_route_check_nh(struct net *net,
 		}
 	} else {
 		*_dev = dev = grt->dst.dev;
-		*idev = grt->rt6i_idev;
 		dev_hold(dev);
-		in6_dev_hold(grt->rt6i_idev);
+		if (idev) {
+			*idev = grt->rt6i_idev;
+			in6_dev_hold(grt->rt6i_idev);
+		}
 	}
 
 	if (!(grt->rt6i_flags & RTF_GATEWAY))
@@ -2931,16 +2933,128 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
 	return err;
 }
 
+int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+		 struct fib6_config *cfg, struct netlink_ext_ack *extack)
+{
+	struct net_device *dev = NULL;
+	struct inet6_dev *idev = NULL;
+	int addr_type;
+	int err;
+
+	err = -ENODEV;
+	if (cfg->fc_ifindex) {
+		dev = dev_get_by_index(net, cfg->fc_ifindex);
+		if (!dev)
+			goto out;
+		idev = in6_dev_get(dev);
+		if (!idev)
+			goto out;
+	}
+
+	if (cfg->fc_flags & RTNH_F_ONLINK) {
+		if (!dev) {
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop device required for onlink");
+			goto out;
+		}
+
+		if (!(dev->flags & IFF_UP)) {
+			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+			err = -ENETDOWN;
+			goto out;
+		}
+
+		fib6_nh->nh_flags |= RTNH_F_ONLINK;
+	}
+
+	if (cfg->fc_encap) {
+		struct lwtunnel_state *lwtstate;
+
+		err = lwtunnel_build_state(cfg->fc_encap_type,
+					   cfg->fc_encap, AF_INET6, cfg,
+					   &lwtstate, extack);
+		if (err)
+			goto out;
+
+		fib6_nh->nh_lwtstate = lwtstate_get(lwtstate);
+	}
+
+	fib6_nh->nh_weight = 1;
+
+	/* We cannot add true routes via loopback here,
+	 * they would result in kernel looping; promote them to reject routes
+	 */
+	addr_type = ipv6_addr_type(&cfg->fc_dst);
+	if ((cfg->fc_flags & RTF_REJECT) ||
+	    (dev && (dev->flags & IFF_LOOPBACK) &&
+	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
+	     !(cfg->fc_flags & RTF_LOCAL))) {
+		/* hold loopback dev/idev if we haven't done so. */
+		if (dev != net->loopback_dev) {
+			if (dev) {
+				dev_put(dev);
+				in6_dev_put(idev);
+			}
+			dev = net->loopback_dev;
+			dev_hold(dev);
+			idev = in6_dev_get(dev);
+			if (!idev) {
+				err = -ENODEV;
+				goto out;
+			}
+		}
+		cfg->fc_flags = RTF_REJECT | RTF_NONEXTHOP;
+		err = 0;
+		goto out;
+	}
+
+	if (cfg->fc_flags & RTF_GATEWAY) {
+		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
+		if (err)
+			goto out;
+
+		fib6_nh->nh_gw = cfg->fc_gateway;
+	}
+
+	err = -ENODEV;
+	if (!dev)
+		goto out;
+
+	if (idev->cnf.disable_ipv6) {
+		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
+		err = -EACCES;
+		goto out;
+	}
+
+	if (!(dev->flags & IFF_UP)) {
+		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+		err = -ENETDOWN;
+		goto out;
+	}
+
+	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
+	    !netif_carrier_ok(dev))
+		fib6_nh->nh_flags |= RTNH_F_LINKDOWN;
+
+	fib6_nh->nh_dev = dev;
+
+out:
+	if (idev)
+		in6_dev_put(idev);
+
+	if (err && dev)
+		dev_put(dev);
+
+	return err;
+}
+
 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 					      gfp_t gfp_flags,
 					      struct netlink_ext_ack *extack)
 {
 	struct net *net = cfg->fc_nlinfo.nl_net;
 	struct fib6_info *rt = NULL;
-	struct net_device *dev = NULL;
-	struct inet6_dev *idev = NULL;
 	struct fib6_table *table;
-	int addr_type;
 	int err = -EINVAL;
 
 	/* RTF_PCPU is an internal flag; can not be set by userspace */
@@ -2975,34 +3089,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 	}
 #endif
-	if (cfg->fc_ifindex) {
-		err = -ENODEV;
-		dev = dev_get_by_index(net, cfg->fc_ifindex);
-		if (!dev)
-			goto out;
-		idev = in6_dev_get(dev);
-		if (!idev)
-			goto out;
-	}
-
 	if (cfg->fc_metric == 0)
 		cfg->fc_metric = IP6_RT_PRIO_USER;
 
-	if (cfg->fc_flags & RTNH_F_ONLINK) {
-		if (!dev) {
-			NL_SET_ERR_MSG(extack,
-				       "Nexthop device required for onlink");
-			err = -ENODEV;
-			goto out;
-		}
-
-		if (!(dev->flags & IFF_UP)) {
-			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
-			err = -ENETDOWN;
-			goto out;
-		}
-	}
-
 	err = -ENOBUFS;
 	if (cfg->fc_nlinfo.nlh &&
 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
@@ -3040,18 +3129,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 		cfg->fc_protocol = RTPROT_BOOT;
 	rt->fib6_protocol = cfg->fc_protocol;
 
-	addr_type = ipv6_addr_type(&cfg->fc_dst);
-
-	if (cfg->fc_encap) {
-		struct lwtunnel_state *lwtstate;
-
-		err = lwtunnel_build_state(cfg->fc_encap_type,
-					   cfg->fc_encap, AF_INET6, cfg,
-					   &lwtstate, extack);
-		if (err)
-			goto out;
-		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
-	}
+	rt->fib6_table = table;
+	rt->fib6_metric = cfg->fc_metric;
+	rt->fib6_type = cfg->fc_type;
 
 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
 	rt->fib6_dst.plen = cfg->fc_dst_len;
@@ -3062,62 +3142,13 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
 	rt->fib6_src.plen = cfg->fc_src_len;
 #endif
-
-	rt->fib6_metric = cfg->fc_metric;
-	rt->fib6_nh.nh_weight = 1;
-
-	rt->fib6_type = cfg->fc_type;
-
-	/* We cannot add true routes via loopback here,
-	   they would result in kernel looping; promote them to reject routes
-	 */
-	if ((cfg->fc_flags & RTF_REJECT) ||
-	    (dev && (dev->flags & IFF_LOOPBACK) &&
-	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
-	     !(cfg->fc_flags & RTF_LOCAL))) {
-		/* hold loopback dev/idev if we haven't done so. */
-		if (dev != net->loopback_dev) {
-			if (dev) {
-				dev_put(dev);
-				in6_dev_put(idev);
-			}
-			dev = net->loopback_dev;
-			dev_hold(dev);
-			idev = in6_dev_get(dev);
-			if (!idev) {
-				err = -ENODEV;
-				goto out;
-			}
-		}
-		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
-		goto install_route;
-	}
-
-	if (cfg->fc_flags & RTF_GATEWAY) {
-		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
-		if (err)
-			goto out;
-
-		rt->fib6_nh.nh_gw = cfg->fc_gateway;
-	}
-
-	err = -ENODEV;
-	if (!dev)
+	err = fib6_nh_init(net, &rt->fib6_nh, cfg, extack);
+	if (err)
 		goto out;
 
-	if (idev->cnf.disable_ipv6) {
-		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
-		err = -EACCES;
-		goto out;
-	}
-
-	if (!(dev->flags & IFF_UP)) {
-		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
-		err = -ENETDOWN;
-		goto out;
-	}
-
 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
+		struct net_device *dev = fib6_info_nh_dev(rt);
+
 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
 			NL_SET_ERR_MSG(extack, "Invalid source address");
 			err = -EINVAL;
@@ -3130,26 +3161,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 
 	rt->fib6_flags = cfg->fc_flags;
 
-install_route:
-	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
-	    !netif_carrier_ok(dev))
-		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
-	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
-	rt->fib6_nh.nh_dev = dev;
-	rt->fib6_table = table;
-
-	cfg->fc_nlinfo.nl_net = dev_net(dev);
-
-	if (idev)
-		in6_dev_put(idev);
-
 	return rt;
 out:
-	if (dev)
-		dev_put(dev);
-	if (idev)
-		in6_dev_put(idev);
-
 	fib6_info_release(rt);
 	return ERR_PTR(err);
 }
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 11/18] net: Initial nexthop code
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Initial import of nexthop code.
- Add new RTM commands for nexthop objects.
- Add new uapi attributes for creating nexthops. Attributes are similar
  to the current nexthop attributes for routes.
- Add basic helpers for ipv4 and ipv6 references to nexthop data

Similar to routes nexthops are configured per namespace, so add
netns_nexthop struct and add it to struct net.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/net_namespace.h    |    2 +
 include/net/netns/nexthop.h    |   18 +
 include/net/nexthop.h          |  121 +++++
 include/uapi/linux/nexthop.h   |   56 +++
 include/uapi/linux/rtnetlink.h |    7 +
 net/ipv4/Makefile              |    2 +-
 net/ipv4/nexthop.c             | 1080 ++++++++++++++++++++++++++++++++++++++++
 security/selinux/nlmsgtab.c    |    5 +-
 8 files changed, 1289 insertions(+), 2 deletions(-)
 create mode 100644 include/net/netns/nexthop.h
 create mode 100644 include/net/nexthop.h
 create mode 100644 include/uapi/linux/nexthop.h
 create mode 100644 net/ipv4/nexthop.c

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 9b5fdc50519a..d3d678814b93 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -19,6 +19,7 @@
 #include <net/netns/packet.h>
 #include <net/netns/ipv4.h>
 #include <net/netns/ipv6.h>
+#include <net/netns/nexthop.h>
 #include <net/netns/ieee802154_6lowpan.h>
 #include <net/netns/sctp.h>
 #include <net/netns/dccp.h>
@@ -105,6 +106,7 @@ struct net {
 	struct netns_mib	mib;
 	struct netns_packet	packet;
 	struct netns_unix	unx;
+	struct netns_nexthop	nexthop;
 	struct netns_ipv4	ipv4;
 #if IS_ENABLED(CONFIG_IPV6)
 	struct netns_ipv6	ipv6;
diff --git a/include/net/netns/nexthop.h b/include/net/netns/nexthop.h
new file mode 100644
index 000000000000..91627c35e9d3
--- /dev/null
+++ b/include/net/netns/nexthop.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * nexthops in net namespaces
+ */
+
+#ifndef __NETNS_NEXTHOP_H__
+#define __NETNS_NEXTHOP_H__
+
+#include <linux/rbtree.h>
+
+struct netns_nexthop {
+	struct rb_root		root;         /* tree of nexthops by id */
+	struct hlist_head	*devhash;     /* nexthops by device */
+
+	unsigned int		seq;		/* protected by rtnl_mutex */
+	u32			last_id_allocated;
+};
+#endif
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
new file mode 100644
index 000000000000..1c59d04d1da6
--- /dev/null
+++ b/include/net/nexthop.h
@@ -0,0 +1,121 @@
+/*
+ * Generic nexthop implementation
+ *
+ * Copyright (C) 2017-18 Cumulus Networks
+ * Copyright (c) 2017-18 David Ahern <dsa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_NEXTHOP_H
+#define __LINUX_NEXTHOP_H
+
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <net/ip_fib.h>
+#include <net/ip6_fib.h>
+#include <net/netlink.h>
+
+#define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK
+
+struct nexthop;
+
+struct nh_info {
+	struct hlist_node	dev_hash;
+	struct net		*net;
+	struct nexthop		*nh_parent;
+
+	u8			family;
+	u8			reject_nh:1,
+				has_gw:1,
+				unused:6;
+
+	union {
+		/* fib_nh used for device only nexthops as well */
+		struct fib_nh	fib_nh;
+		struct fib6_nh	fib6_nh;
+	};
+};
+
+struct nexthop {
+	struct rb_node		rb_node;
+	struct list_head	fi_list;    /* v4 entries using nh */
+	struct list_head	f6i_list;   /* v6 entries using nh */
+
+	u32			id;
+
+	u8			protocol;
+	u8			nh_flags;
+
+	refcount_t		refcnt;
+	struct rcu_head		rcu;
+
+	union {
+		struct nh_info	__rcu *nh_info;
+	};
+};
+
+struct nh_config {
+	u8		nh_family;
+	u8		nh_scope;
+	u8		nh_protocol;
+	u8		nh_blackhole;
+	u32		nh_flags;
+
+	u32		nh_id;
+	u32		tclassid;
+
+	int		nh_ifindex;
+	struct net_device *dev;
+	u32		nh_table;
+	union {
+		__be32		ipv4;
+		struct in6_addr	ipv6;
+	} gw;
+
+	u32		nlflags;
+	struct nl_info	nlinfo;
+};
+
+void nexthop_get(struct nexthop *nh);
+void nexthop_put(struct nexthop *nh);
+
+/* caller is holding rtnl; no reference taken to nexthop */
+struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
+
+static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+{
+	return nh1 == nh2;
+}
+
+static inline int nexthop_num_path(struct nexthop *nh)
+{
+	return 1;
+}
+
+/* called with rcu lock */
+static inline bool nexthop_has_gw(struct nexthop *nh)
+{
+	struct nh_info *nhi;
+
+	nhi = rcu_dereference(nh->nh_info);
+	return !!nhi->has_gw;
+}
+
+/* called with rcu lock */
+static inline bool nexthop_is_blackhole(struct nexthop *nh)
+{
+	struct nh_info *nhi;
+
+	nhi = rcu_dereference(nh->nh_info);
+	return !!nhi->reject_nh;
+}
+#endif
diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
new file mode 100644
index 000000000000..f40ef188b529
--- /dev/null
+++ b/include/uapi/linux/nexthop.h
@@ -0,0 +1,56 @@
+#ifndef _UAPI_LINUX_NEXTHOP_H
+#define _UAPI_LINUX_NEXTHOP_H
+
+#include <linux/types.h>
+
+struct nhmsg {
+	unsigned char	nh_family;
+	unsigned char	nh_scope;     /* one of RT_SCOPE */
+	unsigned char	nh_protocol;  /* Routing protocol that installed nh */
+	unsigned char	resvd;
+	unsigned int	nh_flags;     /* RTNH_F flags */
+};
+
+struct nexthop_grp {
+	__u32	id;
+	__u32	weight;
+};
+
+enum {
+	NEXTHOP_GRP_TYPE_MPATH,  /* default type if not specified */
+	__NEXTHOP_GRP_TYPE_MAX,
+};
+
+#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
+
+
+/* NHA_ID	32-bit id for nexthop. id must be greater than 0.
+ *		id == 0 means assign an unused id.
+ */
+enum {
+	NHA_UNSPEC,
+	NHA_ID,		/* u32 */
+	NHA_GROUP,	/* array of nexthop_grp */
+	NHA_GROUP_TYPE,	/* u16 one of NEXTHOP_GRP_TYPE;
+			 * default is NEXTHOP_GRP_TYPE_MPATH */
+
+	/* if NHA_GROUP attribute is added, no other attributes can be set */
+
+	NHA_BLACKHOLE,	/* flag; nexthop used to blackhole packets */
+	NHA_OIF,	/* u32 */
+	NHA_FLOW,	/* u32 */
+
+	NHA_TABLE_ID,	/* u32 - table id to validate gateway */
+	NHA_GATEWAY,	/* be32 (IPv4) or in6_addr (IPv6) gw address */
+
+	/* Dump control attributes */
+	NHA_GROUPS,	/* flag; only return nexthop groups in dump */
+	NHA_MASTER,	/* u32; only return nexthops with given master dev */
+
+	NHA_SADDR,	/* return only: IPv4 or IPv6 source address */
+
+	__NHA_MAX,
+};
+
+#define NHA_MAX	(__NHA_MAX - 1)
+#endif
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 46399367627f..4a0615797e5e 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -157,6 +157,13 @@ enum {
 	RTM_GETCHAIN,
 #define RTM_GETCHAIN RTM_GETCHAIN
 
+	RTM_NEWNEXTHOP = 104,
+#define RTM_NEWNEXTHOP	RTM_NEWNEXTHOP
+	RTM_DELNEXTHOP,
+#define RTM_DELNEXTHOP	RTM_DELNEXTHOP
+	RTM_GETNEXTHOP,
+#define RTM_GETNEXTHOP	RTM_GETNEXTHOP
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7446b98661d8..2ee5129a070c 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -14,7 +14,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
-	     metrics.o netlink.o
+	     metrics.o netlink.o nexthop.o
 
 obj-$(CONFIG_BPFILTER) += bpfilter/
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
new file mode 100644
index 000000000000..24c4aa383c9d
--- /dev/null
+++ b/net/ipv4/nexthop.c
@@ -0,0 +1,1080 @@
+/* Generic nexthop implementation
+ *
+ * Copyright (C) 2017-18 Cumulus Networks
+ * Copyright (c) 2017-18 David Ahern <dsa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/inetdevice.h>
+#include <linux/nexthop.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/nexthop.h>
+#include <net/route.h>
+#include <net/ip6_route.h>
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+			   bool skip_fib, struct nl_info *nlinfo);
+
+#define NH_DEV_HASHBITS  8
+#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
+
+static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
+	[NHA_ID]		= { .type = NLA_U32 },
+	[NHA_OIF]		= { .type = NLA_U32 },
+	[NHA_FLOW]		= { .type = NLA_U32 },
+	[NHA_TABLE_ID]		= { .type = NLA_U32 },
+	[NHA_BLACKHOLE]		= { .type = NLA_FLAG },
+	[NHA_MASTER]		= { .type = NLA_U32 },
+};
+
+static unsigned int nh_dev_hashfn(unsigned int val)
+{
+	unsigned int mask = NH_DEV_HASHSIZE - 1;
+
+	return (val ^
+		(val >> NH_DEV_HASHBITS) ^
+		(val >> (NH_DEV_HASHBITS * 2))) & mask;
+}
+
+static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
+{
+	struct hlist_head *head;
+	struct net_device *dev;
+	unsigned int hash;
+
+	if (nhi->family == AF_INET6)
+		dev = nhi->fib6_nh.nh_dev;
+	else
+		dev = nhi->fib_nh.nh_dev;
+
+	WARN_ON(!dev);
+
+	hash = nh_dev_hashfn(dev->ifindex);
+	head = &net->nexthop.devhash[hash];
+	hlist_add_head(&nhi->dev_hash, head);
+}
+
+static void nexthop_free_rcu(struct rcu_head *head)
+{
+	struct nexthop *nh = container_of(head, struct nexthop, rcu);
+	struct nh_info *nhi;
+
+	nhi = rcu_dereference_raw(nh->nh_info);
+	switch (nhi->family) {
+	case AF_INET:
+	case AF_UNSPEC:
+		fib_nh_release(nhi->net, &nhi->fib_nh);
+		break;
+	case AF_INET6:
+		fib6_nh_release(&nhi->fib6_nh);
+		break;
+	}
+	kfree(nhi);
+
+	kfree(nh);
+}
+
+static struct nexthop *nexthop_alloc(void)
+{
+	return kzalloc(sizeof(struct nexthop), GFP_KERNEL);
+}
+
+static void nh_base_seq_inc(struct net *net)
+{
+	while (++net->nexthop.seq == 0)
+		;
+}
+
+void nexthop_put(struct nexthop *nh)
+{
+	if (refcount_dec_and_test(&nh->refcnt))
+		call_rcu(&nh->rcu, nexthop_free_rcu);
+}
+
+void nexthop_get(struct nexthop *nh)
+{
+	refcount_inc(&nh->refcnt);
+}
+
+/* no reference taken; rcu lock or rtnl must be held */
+struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
+{
+	struct rb_node **pp, *parent = NULL, *next;
+
+	pp = &net->nexthop.root.rb_node;
+	while (1) {
+		struct nexthop *nh;
+
+		next = rtnl_dereference(*pp);
+		if (!next)
+			break;
+		parent = next;
+
+		nh = rb_entry(parent, struct nexthop, rb_node);
+		if (id < nh->id)
+			pp = &next->rb_left;
+		else if (id > nh->id)
+			pp = &next->rb_right;
+		else
+			return nh;
+	}
+	return NULL;
+}
+
+/* find an unused id - used for auto id allocation
+ * called with rtnl lock held
+ */
+static u32 nh_find_unused_id(struct net *net)
+{
+	u32 id_start = net->nexthop.last_id_allocated;
+
+	while (1) {
+		net->nexthop.last_id_allocated++;
+		if (net->nexthop.last_id_allocated == id_start)
+			break;
+
+		if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
+			return net->nexthop.last_id_allocated;
+	}
+	return 0;
+}
+
+static size_t nh_nlmsg_size_ipv6(struct nh_info *nhi)
+{
+	size_t sz = 0;
+
+	sz = nla_total_size(sizeof(nhi->fib6_nh.nh_gw));
+
+	return sz;
+}
+
+static size_t nh_nlmsg_size_ipv4(struct nh_info *nhi)
+{
+	size_t sz;
+
+	sz = nla_total_size(4)     /* NHA_GATEWAY */
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	     + nla_total_size(4)   /* NHA_FLOW */
+#endif
+	     + nla_total_size(4);  /* NHA_SADDR_IPV4 */
+
+	return sz;
+}
+
+static size_t nh_nlmsg_size(struct nexthop *nh)
+{
+	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+	size_t sz = nla_total_size(4);    /* NHA_ID */
+
+	/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+	 * are mutually exclusive
+	 */
+	sz += nla_total_size(4);  /* NHA_OIF */
+
+	if (nhi->family == AF_INET)
+		sz += nh_nlmsg_size_ipv4(nhi);
+
+	else if (nhi->family == AF_INET6)
+		sz += nh_nlmsg_size_ipv6(nhi);
+
+	return sz;
+}
+
+static const struct net_device *nh_info_dev(const struct nh_info *nhi)
+{
+	switch (nhi->family) {
+	case AF_INET:
+	case AF_UNSPEC:  /* dev only re-uses IPv4 struct */
+		return nhi->fib_nh.nh_dev;
+	case AF_INET6:
+		return nhi->fib6_nh.nh_dev;
+	}
+	return NULL;
+}
+
+static bool nh_info_uses_dev(const struct nh_info *nhi,
+			     const struct net_device *dev)
+{
+	const struct net_device *nh_dev;
+
+	nh_dev = nh_info_dev(nhi);
+	if (nh_dev == dev || l3mdev_master_dev_rcu(nh_dev) == dev)
+		return true;
+
+	return false;
+}
+
+bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev)
+{
+	const struct nh_info *nhi;
+	bool dev_match = false;
+
+	nhi = rcu_dereference(nh->nh_info);
+	dev_match = nh_info_uses_dev(nhi, dev);
+
+	return dev_match;
+}
+
+static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
+			int event, u32 portid, u32 seq, unsigned int nlflags)
+{
+	const struct net_device *dev;
+	struct fib6_nh *fib6_nh;
+	struct fib_nh *fib_nh;
+	struct nlmsghdr *nlh;
+	struct nh_info *nhi;
+	struct nhmsg *nhm;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	nhm = nlmsg_data(nlh);
+	nhm->nh_family = AF_UNSPEC;
+	nhm->nh_flags = nh->nh_flags;
+	nhm->nh_protocol = nh->protocol;
+	nhm->nh_scope = 0;
+	nhm->resvd = 0;
+
+	if (nla_put_u32(skb, NHA_ID, nh->id))
+		goto nla_put_failure;
+
+	nhi = rtnl_dereference(nh->nh_info);
+	if (nhi->reject_nh && nla_put_flag(skb, NHA_BLACKHOLE))
+		goto nla_put_failure;
+
+	dev = nh_info_dev(nhi);
+	if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
+		goto nla_put_failure;
+
+	nhm->nh_family = nhi->family;
+	switch (nhi->family) {
+	case AF_INET:
+		fib_nh = &nhi->fib_nh;
+
+		nhm->nh_scope = fib_nh->nh_scope;
+		if (nla_put_u32(skb, NHA_GATEWAY, fib_nh->nh_gw))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, NHA_SADDR, fib_nh->nh_saddr))
+			goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		if (fib_nh->nh_tclassid &&
+		    nla_put_u32(skb, NHA_FLOW, fib_nh->nh_tclassid))
+			goto nla_put_failure;
+#endif
+		break;
+
+	case AF_INET6:
+		fib6_nh = &nhi->fib6_nh;
+		if (nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->nh_gw) < 0)
+			goto nla_put_failure;
+		break;
+	}
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
+{
+	unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
+	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
+	if (!skb)
+		goto errout;
+
+	err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in nh_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
+		    info->nlh, gfp_any());
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
+}
+
+/* called on insert failure too */
+static void __remove_nexthop(struct net *net, struct nexthop *nh,
+			     bool skip_fib, struct nl_info *nlinfo)
+{
+	const struct net_device *dev;
+	struct nh_info *nhi;
+
+	nhi = rtnl_dereference(nh->nh_info);
+	dev = nh_info_dev(nhi);
+	if (dev)
+		hlist_del(&nhi->dev_hash);
+}
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+			   bool skip_fib, struct nl_info *nlinfo)
+{
+	/* remove from the tree */
+	rb_erase(&nh->rb_node, &net->nexthop.root);
+
+	__remove_nexthop(net, nh, skip_fib, nlinfo);
+
+	nh_base_seq_inc(net);
+
+	nexthop_put(nh);
+
+	nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+}
+
+static int replace_nexthop(struct net *net, struct nexthop *old,
+			   struct nexthop *new, struct netlink_ext_ack *extack)
+{
+	struct nh_info *oldi, *newi;
+
+	oldi = rtnl_dereference(old->nh_info);
+	newi = rtnl_dereference(new->nh_info);
+	rcu_assign_pointer(old->nh_info, newi);
+	rcu_assign_pointer(new->nh_info, oldi);
+
+	newi->nh_parent = old;
+	oldi->nh_parent = new;
+
+	old->protocol = new->protocol;
+	old->nh_flags = new->nh_flags;
+
+	rt_cache_flush(net);
+	// TO-DO: ipv6 equiv
+
+	__remove_nexthop(net, new, true, NULL);
+	nexthop_put(new);
+
+	return 0;
+}
+
+/* called with rtnl_lock held */
+static int insert_nexthop(struct net *net, struct nexthop *new_nh,
+			  struct nh_config *cfg, struct netlink_ext_ack *extack)
+{
+	struct rb_node **pp, *parent = NULL, *next;
+	struct rb_root *root = &net->nexthop.root;
+	bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
+	bool create = !!(cfg->nlflags & NLM_F_CREATE);
+	u32 new_id = new_nh->id;
+	int rc = -EEXIST;
+
+	pp = &root->rb_node;
+	while (1) {
+		struct nexthop *nh;
+
+		next = rtnl_dereference(*pp);
+		if (!next)
+			break;
+
+		parent = next;
+
+		nh = rb_entry(parent, struct nexthop, rb_node);
+		if (new_id < nh->id) {
+			pp = &next->rb_left;
+		} else if (new_id > nh->id) {
+			pp = &next->rb_right;
+		} else if (replace) {
+			rc = replace_nexthop(net, nh, new_nh, extack);
+			if (!rc)
+				new_nh = nh; /* send notification with old nh */
+			goto out;
+		} else {
+			/* id already exists and not a replace */
+			goto out;
+		}
+	}
+
+	if (replace && !create) {
+		NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
+		rc = -ENOENT;
+		goto out;
+	}
+
+	rb_link_node_rcu(&new_nh->rb_node, parent, pp);
+	rb_insert_color(&new_nh->rb_node, root);
+	rc = 0;
+out:
+	if (!rc) {
+		nh_base_seq_inc(net);
+		nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
+	}
+
+	return rc;
+}
+
+/* rtnl */
+/* remove all nexthops tied to a device being deleted */
+static void nexthop_flush_dev(struct net_device *dev)
+{
+	unsigned int hash = nh_dev_hashfn(dev->ifindex);
+	struct net *net = dev_net(dev);
+	struct hlist_head *head = &net->nexthop.devhash[hash];
+	struct nl_info nlinfo = {
+		.nl_net = net,
+	};
+	struct hlist_node *n;
+	struct nh_info *nhi;
+
+	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+		if (nh_info_dev(nhi) != dev)
+			continue;
+
+		remove_nexthop(net, nhi->nh_parent, false, &nlinfo);
+	}
+}
+
+/* rtnl */
+static void flush_all_nexthops(struct net *net)
+{
+	struct rb_root *root = &net->nexthop.root;
+	struct rb_node *node;
+	struct nexthop *nh;
+	struct nl_info nlinfo = {
+		.nl_net = net,
+	};
+
+	while ((node = rb_first(root))) {
+		nh = rb_entry(node, struct nexthop, rb_node);
+		remove_nexthop(net, nh, true, &nlinfo);
+		cond_resched();
+	}
+}
+
+static int nh_check_attr(struct nhmsg *nhm, struct nlattr *tb[],
+			 struct net *net, struct netlink_ext_ack *extack)
+{
+	int err = -EINVAL;
+
+	if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid nexthop flags in ancillary header");
+		goto out;
+	}
+
+	err = 0;
+out:
+	return err;
+}
+
+static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
+			    struct nlmsghdr *nlh, struct nh_config *cfg,
+			    struct netlink_ext_ack *extack)
+{
+	struct nhmsg *nhm = nlmsg_data(nlh);
+	struct nlattr *tb[NHA_MAX + 1];
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX,
+			  rtm_nh_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (nhm->resvd) {
+		NL_SET_ERR_MSG(extack, "Invalid value in reserved field of ancillary header");
+		return -EINVAL;
+	}
+
+	err = nh_check_attr(nhm, tb, net, extack);
+	if (err < 0)
+		return err;
+
+	memset(cfg, 0, sizeof(*cfg));
+	cfg->nlflags = nlh->nlmsg_flags;
+	cfg->nlinfo.portid = NETLINK_CB(skb).portid;
+	cfg->nlinfo.nlh = nlh;
+	cfg->nlinfo.nl_net = net;
+
+	cfg->nh_family = nhm->nh_family;
+	cfg->nh_protocol = nhm->nh_protocol;
+	cfg->nh_flags = nhm->nh_flags;
+	cfg->nh_scope = nhm->nh_scope;
+
+	if (tb[NHA_ID])
+		cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+
+	if (tb[NHA_OIF]) {
+		cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+
+		if (cfg->nh_ifindex)
+			cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+
+		if (!cfg->dev) {
+			NL_SET_ERR_MSG(extack, "Invalid device index");
+			goto out;
+		} else if (!(cfg->dev->flags & IFF_UP)) {
+			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+			err = -ENETDOWN;
+			goto out;
+		} else if (!netif_carrier_ok(cfg->dev)) {
+			NL_SET_ERR_MSG(extack,
+				       "Carrier for nexthop device is down");
+			err = -ENETDOWN;
+			goto out;
+		}
+
+		cfg->nh_table = l3mdev_fib_table(cfg->dev);
+	}
+
+	if (tb[NHA_TABLE_ID])
+		cfg->nh_table = nla_get_u32(tb[NHA_TABLE_ID]);
+
+	err = -EINVAL;
+	if (tb[NHA_FLOW]) {
+#ifndef CONFIG_IP_ROUTE_CLASSID
+		NL_SET_ERR_MSG(extack, "Classid not enabled in kernel");
+		goto out;
+#else
+		cfg->tclassid = nla_get_u32(tb[NHA_FLOW]);
+#endif
+	}
+
+	if (tb[NHA_GATEWAY]) {
+		struct nlattr *gwa = tb[NHA_GATEWAY];
+
+		switch (cfg->nh_family) {
+		case AF_INET:
+			if (nla_len(gwa) != sizeof(u32)) {
+				NL_SET_ERR_MSG(extack, "Invalid gateway");
+				goto out;
+			}
+			cfg->gw.ipv4 = nla_get_be32(gwa);
+			break;
+		case AF_INET6:
+			if (nla_len(gwa) != sizeof(struct in6_addr)) {
+				NL_SET_ERR_MSG(extack, "Invalid gateway");
+				goto out;
+			}
+			cfg->gw.ipv6 = nla_get_in6_addr(gwa);
+			break;
+		default:
+			NL_SET_ERR_MSG(extack,
+				       "Unknown address family for gateway");
+			goto out;
+		}
+	} else {
+		cfg->nh_family = AF_UNSPEC;
+
+		/* device only nexthop (no gateway) */
+		if (cfg->nh_flags & RTNH_F_ONLINK) {
+			NL_SET_ERR_MSG(extack,
+				       "ONLINK flag can not be set for nexthop without a gateway");
+			goto out;
+		}
+		cfg->nh_scope = RT_SCOPE_LINK;
+	}
+
+	if (tb[NHA_BLACKHOLE]) {
+		if (tb[NHA_GATEWAY]) {
+			NL_SET_ERR_MSG(extack,
+				       "Blackhole attribute can not be used with gateway");
+			goto out;
+		}
+
+		cfg->nh_blackhole = 1;
+	}
+
+	if (cfg->tclassid && cfg->nh_family != AF_INET) {
+		NL_SET_ERR_MSG(extack,
+			       "FLOW attribute only relevant for IPv4 nexthops");
+		goto out;
+	}
+
+	err = 0;
+out:
+	return err;
+}
+
+/* rtnl */
+static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct nhmsg *nhm = nlmsg_data(nlh);
+	struct nlattr *tb[NHA_MAX + 1];
+	struct sk_buff *skb = NULL;
+	struct nexthop *nh;
+	int err;
+	u32 id;
+
+	err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX,
+			  rtm_nh_policy, extack);
+	if (err < 0)
+		goto out;
+
+	err = -EINVAL;
+	if (!tb[NHA_ID]) {
+		NL_SET_ERR_MSG(extack, "Nexthop id is missing");
+		goto out;
+	}
+	id = nla_get_u32(tb[NHA_ID]);
+	if (!id) {
+		NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+		goto out;
+	}
+
+	err = -ENOBUFS;
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		goto out;
+
+	err = -ENOENT;
+	nh = nexthop_find_by_id(net, id);
+	if (!nh)
+		goto errout_free;
+
+	err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
+			   nlh->nlmsg_seq, 0);
+	if (err < 0)
+		goto errout_free;
+
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+out:
+	return err;
+errout_free:
+	kfree_skb(skb);
+	goto out;
+}
+
+/* rtnl */
+static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[NHA_MAX + 1];
+	struct nl_info nlinfo = {
+		.nlh = nlh,
+		.nl_net = net,
+		.portid = NETLINK_CB(skb).portid,
+	};
+	struct nexthop *nh;
+	struct nhmsg *nhm;
+	int err, i;
+	u32 id;
+
+	err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX,
+			  rtm_nh_policy, extack);
+	if (err < 0)
+		return err;
+
+	nhm = nlmsg_data(nlh);
+
+	/* validate expected attribute and check for unexpected attributes */
+	for (i = 1; i < __NHA_MAX; ++i) {
+		switch (i) {
+		case NHA_ID:
+			if (!tb[NHA_ID]) {
+				NL_SET_ERR_MSG(extack, "Nexthop id is missing");
+				return -EINVAL;
+			}
+			break;
+		default:
+			if (!tb[i])
+				break;
+
+			NL_SET_ERR_MSG_ATTR(extack, tb[i],
+					    "Unexpected attribute in request");
+			return -EINVAL;
+		}
+	}
+
+	id = nla_get_u32(tb[NHA_ID]);
+	nh = nexthop_find_by_id(net, id);
+	if (!nh)
+		return -ENOENT;
+
+	remove_nexthop(net, nh, false, &nlinfo);
+
+	return 0;
+}
+
+static int nh_create_ipv4(struct net *net, struct nexthop *nh,
+			  struct nh_info *nhi, struct nh_config *cfg,
+			  struct netlink_ext_ack *extack)
+{
+	struct fib_nh *fib_nh = &nhi->fib_nh;
+	struct fib_config fib_cfg = {
+		.fc_oif   = cfg->nh_ifindex,
+		.fc_flow  = cfg->tclassid,
+		.fc_gw    = cfg->gw.ipv4,
+		.fc_flags = cfg->nh_flags,
+	};
+	int err;
+
+	err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
+	if (err)
+		goto out;
+
+	err = fib_check_nh(net, fib_nh, cfg->nh_table, cfg->nh_scope, extack);
+	if (!err) {
+		/* v4 code normally allows a nexthop device to have
+		 * carrier down; this code does not allow it
+		 */
+		if (fib_nh->nh_flags & RTNH_F_LINKDOWN) {
+			NL_SET_ERR_MSG(extack,
+				       "Carrier for nexthop device is down");
+			dev_put(fib_nh->nh_dev);
+			err = -ENETDOWN;
+			goto out;
+		}
+
+		nh->nh_flags = fib_nh->nh_flags;
+		fib_info_update_nh_saddr(net, fib_nh, fib_nh->nh_scope);
+	}
+
+	nhi->has_gw = !!fib_nh->nh_gw;
+out:
+	return err;
+}
+
+static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
+			  struct nh_info *nhi, struct nh_config *cfg,
+			  struct netlink_ext_ack *extack)
+{
+	struct fib6_nh *fib6_nh = &nhi->fib6_nh;
+	struct fib6_config fib6_cfg = {
+		.fc_table = cfg->nh_table,
+		.fc_ifindex = cfg->nh_ifindex,
+		.fc_gateway = cfg->gw.ipv6,
+		.fc_flags = cfg->nh_flags,
+	};
+	int err;
+
+	if (!ipv6_addr_any(&cfg->gw.ipv6)) {
+		fib6_cfg.fc_flags |= RTF_GATEWAY;
+		nhi->has_gw = true;
+	}
+
+	err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, extack);
+	if (!err) {
+		if (fib6_cfg.fc_flags & RTF_REJECT) {
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop can not use RTF_REJECT");
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (fib6_nh->nh_flags & RTNH_F_LINKDOWN) {
+			NL_SET_ERR_MSG(extack,
+				       "Carrier for nexthop device is down");
+			err = -ENETDOWN;
+			goto out;
+		}
+
+		nh->nh_flags = fib6_nh->nh_flags;
+	}
+
+out:
+	if (err && fib6_nh->nh_dev)
+		dev_put(fib6_nh->nh_dev);
+	return err;
+}
+
+static int nh_create_unspec(struct net *net, struct nexthop *nh,
+			    struct nh_info *nhi, struct nh_config *cfg,
+			    struct netlink_ext_ack *extack)
+{
+	struct net_device *dev = cfg->dev;
+	int err = 0;
+
+	if (cfg->nh_blackhole) {
+		nhi->reject_nh = 1;
+	} else if (!dev) {
+		NL_SET_ERR_MSG(extack, "No device for nexthop");
+		err = -ENODEV;
+	} else {
+		/* leverage ipv4 infra for non-gw nexthop */
+		err = nh_create_ipv4(net, nh, nhi, cfg, extack);
+	}
+
+	return err;
+}
+
+static void nexthop_init_common(struct nexthop *nh)
+{
+	INIT_LIST_HEAD(&nh->fi_list);
+	INIT_LIST_HEAD(&nh->f6i_list);
+}
+
+static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
+				      struct netlink_ext_ack *extack)
+{
+	struct nh_info *nhi;
+	struct nexthop *nh;
+	int err;
+
+	nh = nexthop_alloc();
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+
+	nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
+	if (!nhi) {
+		kfree(nh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nh->nh_flags = cfg->nh_flags;
+	nexthop_init_common(nh);
+
+	nhi->nh_parent = nh;
+	nhi->family = cfg->nh_family;
+	nhi->net = net;
+	switch (cfg->nh_family) {
+	case AF_INET:
+		err = nh_create_ipv4(net, nh, nhi, cfg, extack);
+		break;
+	case AF_INET6:
+		err = nh_create_ipv6(net, nh, nhi, cfg, extack);
+		break;
+	default:
+		err = nh_create_unspec(net, nh, nhi, cfg, extack);
+	}
+
+	if (err) {
+		kfree(nhi);
+		kfree(nh);
+		return ERR_PTR(err);
+	}
+
+	/* add the entry to the device based hash */
+	if (!nhi->reject_nh)
+		nexthop_devhash_add(net, nhi);
+
+	rcu_assign_pointer(nh->nh_info, nhi);
+
+	return nh;
+}
+
+/* called with rtnl lock held */
+static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
+				   struct netlink_ext_ack *extack)
+{
+	struct nexthop *nh;
+	int err;
+
+	if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
+		NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!cfg->nh_id) {
+		cfg->nh_id = nh_find_unused_id(net);
+		if (!cfg->nh_id) {
+			NL_SET_ERR_MSG(extack, "No unused id.");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	nh = nexthop_create(net, cfg, extack);
+	if (IS_ERR(nh))
+		return nh;
+
+	refcount_set(&nh->refcnt, 1);
+	nh->id = cfg->nh_id;
+	nh->protocol = cfg->nh_protocol;
+
+	err = insert_nexthop(net, nh, cfg, extack);
+	if (err)
+		goto out_err;
+
+	return nh;
+out_err:
+	__remove_nexthop(net, nh, true, NULL);
+	nexthop_put(nh);
+
+	return ERR_PTR(err);
+}
+
+/* rtnl */
+static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nh_config cfg;
+	struct nexthop *nh;
+	int err;
+
+	err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
+	if (!err) {
+		nh = nexthop_add(net, &cfg, extack);
+		if (IS_ERR(nh))
+			err = PTR_ERR(nh);
+	}
+
+	return err;
+}
+
+static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
+			     int master_idx, u8 family)
+{
+	const struct net_device *dev;
+	const struct nh_info *nhi;
+
+	if (dev_idx || master_idx || family)
+		return true;
+
+	nhi = rtnl_dereference(nh->nh_info);
+	if (family && nhi->family != family)
+		return true;
+
+	dev = nh_info_dev(nhi);
+	if (dev_idx && (!dev || dev->ifindex != dev_idx))
+		return true;
+
+	if (master_idx) {
+		struct net_device *master;
+
+		master = netdev_master_upper_dev_get((struct net_device *)dev);
+		if (!master || master->ifindex != master_idx)
+			return true;
+	}
+	return false;
+}
+
+/* rtnl */
+static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int dev_filter_idx = 0, master_idx = 0;
+	struct net *net = sock_net(skb->sk);
+	struct rb_root *root = &net->nexthop.root;
+	struct nlattr *tb[NHA_MAX + 1];
+	struct rb_node *node;
+	struct nhmsg *nhm;
+	int idx = 0;
+	int s_idx;
+	int err;
+
+	if (nlmsg_parse(cb->nlh, sizeof(*nhm), tb, NHA_MAX,
+			rtm_nh_policy, NULL) >= 0) {
+		if (tb[NHA_OIF])
+			dev_filter_idx = nla_get_u32(tb[NHA_OIF]);
+
+		if (tb[NHA_MASTER])
+			master_idx = nla_get_u32(tb[NHA_MASTER]);
+	}
+
+	nhm = nlmsg_data(cb->nlh);
+
+	s_idx = cb->args[0];
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		struct nexthop *nh;
+
+		if (idx < s_idx)
+			goto cont;
+
+		nh = rb_entry(node, struct nexthop, rb_node);
+		if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
+				     nhm->nh_family))
+			goto cont;
+
+		err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
+				   NETLINK_CB(cb->skb).portid,
+				   cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err < 0) {
+			if (likely(skb->len))
+				goto out;
+
+			goto out_err;
+		}
+cont:
+		idx++;
+	}
+
+out:
+	err = skb->len;
+out_err:
+	cb->args[0] = idx;
+	cb->seq = net->nexthop.seq;
+	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+
+	return err;
+}
+
+/* rtnl */
+static int nh_netdev_event(struct notifier_block *this,
+			   unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_DOWN:
+	case NETDEV_UNREGISTER:
+		nexthop_flush_dev(dev);
+		break;
+	case NETDEV_CHANGE:
+		if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
+			nexthop_flush_dev(dev);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nh_netdev_notifier = {
+	.notifier_call = nh_netdev_event,
+};
+
+static void __net_exit nexthop_net_exit(struct net *net)
+{
+	rtnl_lock();
+	flush_all_nexthops(net);
+	rtnl_unlock();
+	kfree(net->nexthop.devhash);
+}
+
+static int __net_init nexthop_net_init(struct net *net)
+{
+	size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
+
+	net->nexthop.root = RB_ROOT;
+	net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
+	if (!net->nexthop.devhash)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static struct pernet_operations nexthop_net_ops = {
+	.init = nexthop_net_init,
+	.exit = nexthop_net_exit,
+};
+
+static int __init nexthop_init(void)
+{
+	register_pernet_subsys(&nexthop_net_ops);
+
+	register_netdevice_notifier(&nh_netdev_notifier);
+
+	rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
+		      rtm_dump_nexthop, 0);
+
+	rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+	rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
+
+	rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
+	rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
+
+	return 0;
+}
+subsys_initcall(nexthop_init);
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 74b951f55608..7a852d5af14e 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -80,6 +80,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] =
 	{ RTM_NEWSTATS,		NETLINK_ROUTE_SOCKET__NLMSG_READ },
 	{ RTM_GETSTATS,		NETLINK_ROUTE_SOCKET__NLMSG_READ  },
 	{ RTM_NEWCACHEREPORT,	NETLINK_ROUTE_SOCKET__NLMSG_READ },
+	{ RTM_NEWNEXTHOP,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+	{ RTM_DELNEXTHOP,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+	{ RTM_GETNEXTHOP,	NETLINK_ROUTE_SOCKET__NLMSG_READ  },
 };
 
 static const struct nlmsg_perm nlmsg_tcpdiag_perms[] =
@@ -159,7 +162,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
 	switch (sclass) {
 	case SECCLASS_NETLINK_ROUTE_SOCKET:
 		/* RTM_MAX always point to RTM_SETxxxx, ie RTM_NEWxxx + 3 */
-		BUILD_BUG_ON(RTM_MAX != (RTM_NEWCHAIN + 3));
+		BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOP + 3));
 		err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
 				 sizeof(nlmsg_route_perms));
 		break;
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 05/18] net/ipv4: Define fib_get_nhs when CONFIG_IP_ROUTE_MULTIPATH is disabled
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Define fib_get_nhs to return EINVAL when CONFIG_IP_ROUTE_MULTIPATH is
not enabled and remove the ifdef check for CONFIG_IP_ROUTE_MULTIPATH.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv4/fib_semantics.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 9f8126debba5..9b2d8ba6bdb3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -603,6 +603,15 @@ static void fib_rebalance(struct fib_info *fi)
 }
 #else /* CONFIG_IP_ROUTE_MULTIPATH */
 
+static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
+		       int remaining, struct fib_config *cfg,
+		       struct netlink_ext_ack *extack)
+{
+	NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel");
+
+	return -EINVAL;
+}
+
 #define fib_rebalance(fi) do { } while (0)
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -1112,7 +1121,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		goto failure;
 
 	if (cfg->fc_mp) {
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
 		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
 		if (err != 0)
 			goto failure;
@@ -1133,11 +1141,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 			goto err_inval;
 		}
 #endif
-#else
-		NL_SET_ERR_MSG(extack,
-			       "Multipath support not enabled in kernel");
-		goto err_inval;
-#endif
 	} else {
 		struct fib_nh *nh = fi->fib_nh;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 06/18] net/ipv4: Create init and release helpers for fib_nh
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Consolidate the fib_nh initialization which is duplicated between
fib_create_info for single path and fib_get_nhs for multipath.

Move the fib_nh cleanup code from free_fib_info_rcu into a new helper,
fib_nh_release. Move classid accounting into fib_nh_release which is
called per fib_nh to make accounting symmetrical with fib_nh_init.

Export both new helpers to allow for use with nexthop objects.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip_fib.h     |   5 ++
 net/ipv4/fib_semantics.c | 185 +++++++++++++++++++++++++----------------------
 2 files changed, 104 insertions(+), 86 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 19012f3ed501..ce9b92485064 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -400,6 +400,11 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 		       const struct sk_buff *skb, struct flow_keys *flkeys);
 #endif
+
+int fib_nh_init(struct net *net, struct fib_nh *fib_nh,
+		struct fib_config *cfg, int nh_weight,
+		struct netlink_ext_ack *extack);
+void fib_nh_release(struct net *net, struct fib_nh *fib_nh);
 int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
 		 struct netlink_ext_ack *extack);
 bool fib_good_nh(const struct fib_nh *nh);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 9b2d8ba6bdb3..0d792666821a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -204,6 +204,21 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
 	free_percpu(rtp);
 }
 
+void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (fib_nh->nh_tclassid)
+		net->ipv4.fib_num_tclassid_users--;
+#endif
+	if (fib_nh->nh_dev)
+		dev_put(fib_nh->nh_dev);
+
+	lwtstate_put(fib_nh->nh_lwtstate);
+	free_nh_exceptions(fib_nh);
+	rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output);
+	rt_fibinfo_free(&fib_nh->nh_rth_input);
+}
+
 /* Release a nexthop info record */
 static void free_fib_info_rcu(struct rcu_head *head)
 {
@@ -211,12 +226,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
 	struct dst_metrics *m;
 
 	change_nexthops(fi) {
-		if (nexthop_nh->nh_dev)
-			dev_put(nexthop_nh->nh_dev);
-		lwtstate_put(nexthop_nh->nh_lwtstate);
-		free_nh_exceptions(nexthop_nh);
-		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
-		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
+		fib_nh_release(fi->fib_net, nexthop_nh);
 	} endfor_nexthops(fi);
 
 	m = fi->fib_metrics;
@@ -459,6 +469,52 @@ static int fib_detect_death(struct fib_info *fi, int order,
 	return 1;
 }
 
+int fib_nh_init(struct net *net, struct fib_nh *nh,
+		struct fib_config *cfg, int nh_weight,
+		struct netlink_ext_ack *extack)
+{
+	int err = -ENOMEM;
+
+	nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
+	if (!nh->nh_pcpu_rth_output)
+		goto failure;
+
+	if (cfg->fc_encap) {
+		struct lwtunnel_state *lwtstate;
+
+		err = -EINVAL;
+		if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) {
+			NL_SET_ERR_MSG(extack, "LWT encap type not specified");
+			goto failure;
+		}
+		err = lwtunnel_build_state(cfg->fc_encap_type,
+					   cfg->fc_encap, AF_INET, cfg,
+					   &lwtstate, extack);
+		if (err)
+			goto failure;
+
+		nh->nh_lwtstate = lwtstate_get(lwtstate);
+	}
+
+	nh->nh_oif   = cfg->fc_oif;
+	nh->nh_gw    = cfg->fc_gw;
+	nh->nh_flags = cfg->fc_flags;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	nh->nh_tclassid = cfg->fc_flow;
+	if (nh->nh_tclassid)
+		net->ipv4.fib_num_tclassid_users++;
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	nh->nh_weight = nh_weight;
+#endif
+
+	err = 0;
+
+failure:
+	return err;
+}
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
@@ -485,11 +541,15 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 		       int remaining, struct fib_config *cfg,
 		       struct netlink_ext_ack *extack)
 {
+	struct net *net = fi->fib_net;
+	struct fib_config fib_cfg;
 	int ret;
 
 	change_nexthops(fi) {
 		int attrlen;
 
+		memset(&fib_cfg, 0, sizeof(fib_cfg));
+
 		if (!rtnh_ok(rtnh, remaining)) {
 			NL_SET_ERR_MSG(extack,
 				       "Invalid nexthop configuration - extra data after nexthop");
@@ -502,51 +562,52 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			return -EINVAL;
 		}
 
-		nexthop_nh->nh_flags =
-			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
-		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
-		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
+		fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
+		fib_cfg.fc_oif = rtnh->rtnh_ifindex;
 
 		attrlen = rtnh_attrlen(rtnh);
 		if (attrlen > 0) {
 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
-			nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
+			if (nla)
+				fib_cfg.fc_gw = nla_get_in_addr(nla);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
-			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
-			if (nexthop_nh->nh_tclassid)
-				fi->fib_net->ipv4.fib_num_tclassid_users++;
+			if (nla)
+				fib_cfg.fc_flow = nla_get_u32(nla);
 #endif
-			nla = nla_find(attrs, attrlen, RTA_ENCAP);
-			if (nla) {
-				struct lwtunnel_state *lwtstate;
-				struct nlattr *nla_entype;
-
-				nla_entype = nla_find(attrs, attrlen,
-						      RTA_ENCAP_TYPE);
-				if (!nla_entype) {
-					NL_SET_BAD_ATTR(extack, nla);
-					NL_SET_ERR_MSG(extack,
-						       "Encap type is missing");
-					goto err_inval;
-				}
-
-				ret = lwtunnel_build_state(nla_get_u16(
-							   nla_entype),
-							   nla,  AF_INET, cfg,
-							   &lwtstate, extack);
-				if (ret)
-					goto errout;
-				nexthop_nh->nh_lwtstate =
-					lwtstate_get(lwtstate);
-			}
+			fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+			if (nla)
+				fib_cfg.fc_encap_type = nla_get_u16(nla);
 		}
 
+		ret = fib_nh_init(net, nexthop_nh, &fib_cfg,
+				  rtnh->rtnh_hops + 1, extack);
+		if (ret)
+			goto errout;
+
 		rtnh = rtnh_next(rtnh, &remaining);
 	} endfor_nexthops(fi);
 
+	if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) {
+		NL_SET_ERR_MSG(extack,
+			       "Nexthop device index does not match RTA_OIF");
+		goto err_inval;
+	}
+	if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) {
+		NL_SET_ERR_MSG(extack,
+			       "Nexthop gateway does not match RTA_GATEWAY");
+		goto err_inval;
+	}
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
+		NL_SET_ERR_MSG(extack,
+			       "Nexthop class id does not match RTA_FLOW");
+		goto err_inval;
+	}
+#endif
 	return 0;
 
 err_inval:
@@ -1111,9 +1172,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 	fi->fib_nhs = nhs;
 	change_nexthops(fi) {
 		nexthop_nh->nh_parent = fi;
-		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
-		if (!nexthop_nh->nh_pcpu_rth_output)
-			goto failure;
 	} endfor_nexthops(fi)
 
 	err = fib_convert_metrics(fi, cfg);
@@ -1124,53 +1182,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
 		if (err != 0)
 			goto failure;
-		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) {
-			NL_SET_ERR_MSG(extack,
-				       "Nexthop device index does not match RTA_OIF");
-			goto err_inval;
-		}
-		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) {
-			NL_SET_ERR_MSG(extack,
-				       "Nexthop gateway does not match RTA_GATEWAY");
-			goto err_inval;
-		}
-#ifdef CONFIG_IP_ROUTE_CLASSID
-		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
-			NL_SET_ERR_MSG(extack,
-				       "Nexthop class id does not match RTA_FLOW");
-			goto err_inval;
-		}
-#endif
 	} else {
-		struct fib_nh *nh = fi->fib_nh;
-
-		if (cfg->fc_encap) {
-			struct lwtunnel_state *lwtstate;
-
-			if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) {
-				NL_SET_ERR_MSG(extack,
-					       "LWT encap type not specified");
-				goto err_inval;
-			}
-			err = lwtunnel_build_state(cfg->fc_encap_type,
-						   cfg->fc_encap, AF_INET, cfg,
-						   &lwtstate, extack);
-			if (err)
-				goto failure;
-
-			nh->nh_lwtstate = lwtstate_get(lwtstate);
-		}
-		nh->nh_oif = cfg->fc_oif;
-		nh->nh_gw = cfg->fc_gw;
-		nh->nh_flags = cfg->fc_flags;
-#ifdef CONFIG_IP_ROUTE_CLASSID
-		nh->nh_tclassid = cfg->fc_flow;
-		if (nh->nh_tclassid)
-			fi->fib_net->ipv4.fib_num_tclassid_users++;
-#endif
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-		nh->nh_weight = 1;
-#endif
+		err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
 	}
 
 	if (fib_props[cfg->fc_type].error) {
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 04/18] net/ipv4: export fib_check_nh
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Change fib_check_nh to take net, table and scope as input arguments
over struct fib_config and export for use by nexthop code.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip_fib.h     |  2 ++
 net/ipv4/fib_semantics.c | 18 +++++++++---------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a4a129344098..19012f3ed501 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -400,6 +400,8 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 		       const struct sk_buff *skb, struct flow_keys *flkeys);
 #endif
+int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
+		 struct netlink_ext_ack *extack);
 bool fib_good_nh(const struct fib_nh *nh);
 void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c034d0adf590..9f8126debba5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -777,21 +777,19 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
  *					|
  *					|-> {local prefix} (terminal node)
  */
-static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
-			struct netlink_ext_ack *extack)
+int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
+		 struct netlink_ext_ack *extack)
 {
 	int err = 0;
-	struct net *net;
 	struct net_device *dev;
 
-	net = cfg->fc_nlinfo.nl_net;
 	if (nh->nh_gw) {
 		struct fib_result res;
 
 		if (nh->nh_flags & RTNH_F_ONLINK) {
 			unsigned int addr_type;
 
-			if (cfg->fc_scope >= RT_SCOPE_LINK) {
+			if (scope >= RT_SCOPE_LINK) {
 				NL_SET_ERR_MSG(extack,
 					       "Nexthop has invalid scope");
 				return -EINVAL;
@@ -822,7 +820,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
 			struct fib_table *tbl = NULL;
 			struct flowi4 fl4 = {
 				.daddr = nh->nh_gw,
-				.flowi4_scope = cfg->fc_scope + 1,
+				.flowi4_scope = scope + 1,
 				.flowi4_oif = nh->nh_oif,
 				.flowi4_iif = LOOPBACK_IFINDEX,
 			};
@@ -831,8 +829,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
 			if (fl4.flowi4_scope < RT_SCOPE_LINK)
 				fl4.flowi4_scope = RT_SCOPE_LINK;
 
-			if (cfg->fc_table)
-				tbl = fib_get_table(net, cfg->fc_table);
+			if (table)
+				tbl = fib_get_table(net, table);
 
 			if (tbl)
 				err = fib_table_lookup(tbl, &fl4, &res,
@@ -1221,7 +1219,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		int linkdown = 0;
 
 		change_nexthops(fi) {
-			err = fib_check_nh(cfg, nexthop_nh, extack);
+			err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh,
+					   cfg->fc_table, cfg->fc_scope,
+					   extack);
 			if (err != 0)
 				goto failure;
 			if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 12/18] net/ipv4: Add nexthop helpers for ipv4 integration
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add nexthop reference to fib_info along with a list_head for tracking
the association of nexthop back to the fib_info.

Add helpers to take a fib_info and return a fib_nh, a nexthop device
and nexthop gateway.

Add helper to validate a nexthop works with a fib_info.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip_fib.h  |  4 ++++
 include/net/nexthop.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/nexthop.c    | 39 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 0b40c59b8a5f..e39f55f3c3d8 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -103,9 +103,12 @@ struct fib_nh {
  * This structure contains data shared by many of routes.
  */
 
+struct nexthop;
+
 struct fib_info {
 	struct hlist_node	fib_hash;
 	struct hlist_node	fib_lhash;
+	struct list_head	nh_list;
 	struct net		*fib_net;
 	int			fib_treeref;
 	refcount_t		fib_clntref;
@@ -122,6 +125,7 @@ struct fib_info {
 #define fib_window fib_metrics->metrics[RTAX_WINDOW-1]
 #define fib_rtt fib_metrics->metrics[RTAX_RTT-1]
 #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1]
+	struct nexthop		*nh;
 	int			fib_nhs;
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 1c59d04d1da6..c149fe8394ab 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -118,4 +118,50 @@ static inline bool nexthop_is_blackhole(struct nexthop *nh)
 	nhi = rcu_dereference(nh->nh_info);
 	return !!nhi->reject_nh;
 }
+
+static inline struct fib_nh *nexthop_fib_nh(struct nexthop *nh, int nhsel)
+{
+	struct nh_info *nhi;
+
+	nhi = rcu_dereference(nh->nh_info);
+	if (nhi->family == AF_INET ||
+	    nhi->family == AF_UNSPEC)  /* dev only re-uses IPv4 struct */
+		return &nhi->fib_nh;
+
+	return NULL;
+}
+
+static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
+{
+	if (fi->nh)
+		return nexthop_fib_nh(fi->nh, 0);
+
+	WARN_ON(nhsel > fi->fib_nhs);
+	return &fi->fib_nh[nhsel];
+}
+
+/* return fib_nh for fib_info; for historical reasons
+ * returns first nexthop only
+ */
+static inline struct net_device *fib_info_nh_dev(struct fib_info *fi)
+{
+	struct fib_nh *fib_nh = fib_info_nh(fi, 0);
+
+	return fib_nh->nh_dev;
+}
+
+/* return gateway for fib_info; for historical reasons
+ * returns gateway for first nexthop if multipath
+ */
+static inline __be32 fib_info_nh_gw(struct fib_info *fi)
+{
+	struct fib_nh *fib_nh = fib_info_nh(fi, 0);
+
+	return fib_nh ? fib_nh->nh_gw : 0;
+}
+
+int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
+		      struct netlink_ext_ack *extack);
+
+bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev);
 #endif
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 24c4aa383c9d..d1fc3d21af86 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -315,6 +315,21 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
 		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
 }
 
+static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
+{
+	struct fib_info *fi;
+	bool do_flush;
+
+	do_flush = false;
+	list_for_each_entry(fi, &nh->fi_list, nh_list) {
+		fi->fib_flags |= RTNH_F_DEAD;
+		do_flush = true;
+	}
+
+	if (do_flush)
+		fib_flush(net);
+}
+
 /* called on insert failure too */
 static void __remove_nexthop(struct net *net, struct nexthop *nh,
 			     bool skip_fib, struct nl_info *nlinfo)
@@ -326,6 +341,8 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh,
 	dev = nh_info_dev(nhi);
 	if (dev)
 		hlist_del(&nhi->dev_hash);
+	if (!skip_fib)
+		__remove_nexthop_fib(net, nh);
 }
 
 static void remove_nexthop(struct net *net, struct nexthop *nh,
@@ -461,6 +478,28 @@ static void flush_all_nexthops(struct net *net)
 	}
 }
 
+/* invoked by fib add code to verify nexthop by id is ok with
+ * config for prefix; parts of fib_check_nh not done when nexthop
+ * is created
+ */
+int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
+		      struct netlink_ext_ack *extack)
+{
+	struct nexthop *nh = fi->nh;
+	struct nh_info *nhi;
+
+	nhi = rtnl_dereference(nh->nh_info);
+	if (nhi->family != AF_UNSPEC) {
+		if (nh->nh_flags & RTNH_F_ONLINK &&
+		    cfg->fc_scope >= RT_SCOPE_LINK) {
+			NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int nh_check_attr(struct nhmsg *nhm, struct nlattr *tb[],
 			 struct net *net, struct netlink_ext_ack *extack)
 {
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 18/18] net/ipv4: Optimization for fib_info lookup
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Be optimistic about re-using a fib_info when nexthop id is given and
the route does not use metrics. Avoids a memory allocation which in
most cases is expected to be freed anyways.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv4/fib_semantics.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 0ddf14512bb3..e4411cd5514b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -316,6 +316,19 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
 		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
 }
 
+static inline unsigned int fib_info_hashfn_cfg(const struct fib_config *cfg)
+{
+	unsigned int mask = (fib_info_hash_size - 1);
+	unsigned int val = 0;
+
+	val ^= (cfg->fc_protocol << 8) | cfg->fc_scope;
+	val ^= (__force u32)cfg->fc_prefsrc;
+	val ^= cfg->fc_priority;
+	val ^= fib_devindex_hashfn(cfg->fc_nh_id);
+
+	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
+}
+
 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
 {
 	unsigned int mask = (fib_info_hash_size - 1);
@@ -334,6 +347,35 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
 	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
 }
 
+/* no metrics, only nexthop id */
+static struct fib_info *fib_find_info_nh(struct net *net,
+					 const struct fib_config *cfg)
+{
+	struct hlist_head *head;
+	struct fib_info *fi;
+	unsigned int hash;
+
+	hash = fib_info_hashfn_cfg(cfg);
+	head = &fib_info_hash[hash];
+
+	hlist_for_each_entry(fi, head, fib_hash) {
+		if (!net_eq(fi->fib_net, net))
+			continue;
+		if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
+			continue;
+		if (cfg->fc_protocol == fi->fib_protocol &&
+		    cfg->fc_scope == fi->fib_scope &&
+		    cfg->fc_prefsrc == fi->fib_prefsrc &&
+		    cfg->fc_priority == fi->fib_priority &&
+		    cfg->fc_type == fi->fib_type &&
+		    cfg->fc_table == fi->fib_tb_id &&
+		    !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK))
+			return fi;
+	}
+
+	return NULL;
+}
+
 static struct fib_info *fib_find_info(const struct fib_info *nfi)
 {
 	struct hlist_head *head;
@@ -1154,6 +1196,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 			goto err_inval;
 		}
 
+		if (!cfg->fc_mx) {
+			fi = fib_find_info_nh(net, cfg);
+			if (fi)
+				return fi;
+		}
+
 		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
 		if (!nh) {
 			NL_SET_ERR_MSG(extack,
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 17/18] net: Add support for nexthop groups
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Allow the creation of nexthop groups which reference other nexthop
objects to create multipath routes.

TO-DO: Add mpath support to IPv6

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/nexthop.h    |  77 +++++--
 net/ipv4/fib_semantics.c |   5 +-
 net/ipv4/nexthop.c       | 511 ++++++++++++++++++++++++++++++++++++++++++-----
 net/ipv4/route.c         |  16 +-
 4 files changed, 540 insertions(+), 69 deletions(-)

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 759bb39e4ea7..654b67192337 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -28,6 +28,23 @@
 
 struct nexthop;
 
+struct nh_grp_entry {
+	struct nexthop	 *nh;
+	u32		 weight;
+	atomic_t	 upper_bound;
+
+	struct list_head nh_list;
+	struct nexthop	 *nh_parent;  /* nexthop of group with this entry */
+};
+
+struct nh_group {
+	u16			num_nh_set;
+	u16			num_nh;
+	u8			mpath:1,
+				unused:7;
+	struct nh_grp_entry	nh_entries[0];
+};
+
 struct nh_info {
 	struct hlist_node	dev_hash;
 	struct net		*net;
@@ -47,6 +64,7 @@ struct nh_info {
 
 struct nexthop {
 	struct rb_node		rb_node;
+	struct list_head	grp_list;  /* nh group entries using this nh */
 	struct list_head	fi_list;    /* v4 entries using nh */
 	struct list_head	f6i_list;   /* v6 entries using nh */
 
@@ -54,12 +72,15 @@ struct nexthop {
 
 	u8			protocol;
 	u8			nh_flags;
+	u8			is_group:1,
+				unused:7;
 
 	refcount_t		refcnt;
 	struct rcu_head		rcu;
 
 	union {
 		struct nh_info	__rcu *nh_info;
+		struct nh_group	__rcu *nh_grp;
 	};
 };
 
@@ -81,6 +102,9 @@ struct nh_config {
 		struct in6_addr	ipv6;
 	} gw;
 
+	struct nlattr	*nh_grp;
+	u16		nh_grp_type;
+
 	u32		nlflags;
 	struct nl_info	nlinfo;
 };
@@ -88,42 +112,61 @@ struct nh_config {
 void nexthop_get(struct nexthop *nh);
 void nexthop_put(struct nexthop *nh);
 
+static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+{
+	return nh1 == nh2;
+}
+
 /* caller is holding rtnl; no reference taken to nexthop */
 struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
 
-static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+/* called with rcu lock */
+static inline bool nexthop_is_multipath(const struct nexthop *nh)
 {
-	return nh1 == nh2;
+	if (nh->is_group) {
+		struct nh_group *nh_grp;
+
+		nh_grp = rcu_dereference(nh->nh_grp);
+		return !!nh_grp->mpath;
+	}
+	return false;
 }
 
+struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel);
+
+/* called with rcu lock */
 static inline int nexthop_num_path(struct nexthop *nh)
 {
+	if (nexthop_is_multipath(nh)) {
+		struct nh_group *nh_grp;
+
+		nh_grp = rcu_dereference(nh->nh_grp);
+		return nh_grp->num_nh_set;
+	}
+
 	return 1;
 }
 
-/* called with rcu lock */
+void nexthop_select_path(struct net *net, struct fib_result *res, int hash);
+
 static inline bool nexthop_has_gw(struct nexthop *nh)
 {
-	struct nh_info *nhi;
-
-	nhi = rcu_dereference(nh->nh_info);
-	return !!nhi->has_gw;
+	return !!nh->nh_info->has_gw;
 }
 
-/* called with rcu lock */
 static inline bool nexthop_is_blackhole(struct nexthop *nh)
 {
-	struct nh_info *nhi;
-
-	nhi = rcu_dereference(nh->nh_info);
-	return !!nhi->reject_nh;
+	return !nexthop_is_multipath(nh) && !!nh->nh_info->reject_nh;
 }
 
 static inline struct fib_nh *nexthop_fib_nh(struct nexthop *nh, int nhsel)
 {
 	struct nh_info *nhi;
 
-	nhi = rcu_dereference(nh->nh_info);
+	if (nexthop_is_multipath(nh))
+		nh = nexthop_mpath_select(nh, nhsel);
+
+	nhi = nh->nh_info;
 	if (nhi->family == AF_INET ||
 	    nhi->family == AF_UNSPEC)  /* dev only re-uses IPv4 struct */
 		return &nhi->fib_nh;
@@ -164,11 +207,11 @@ static inline __be32 fib_info_nh_gw(struct fib_info *fi)
  */
 static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
 {
-	struct nh_info *nhi;
+	if (nexthop_is_multipath(nh))
+		nh = nexthop_mpath_select(nh, 0);
 
-	nhi = rcu_dereference(nh->nh_info);
-	if (nhi->family == AF_INET6)
-		return &nhi->fib6_nh;
+	if (nh->nh_info->family == AF_INET6)
+		return &nh->nh_info->fib6_nh;
 
 	return NULL;
 }
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c91cdafd40ec..0ddf14512bb3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1821,7 +1821,10 @@ void fib_select_path(struct net *net, struct fib_result *res,
 		goto check_saddr;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res->fi->fib_nhs > 1) {
+	if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) {
+		h = fib_multipath_hash(net, fl4, skb, NULL);
+		nexthop_select_path(net, res, h);
+	} else if (res->fi->fib_nhs > 1) {
 		h = fib_multipath_hash(net, fl4, skb, NULL);
 		fib_select_multipath(res, h);
 	}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 1e77fa94e562..f0b4151c661a 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -35,6 +35,8 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
 	[NHA_TABLE_ID]		= { .type = NLA_U32 },
 	[NHA_BLACKHOLE]		= { .type = NLA_FLAG },
 	[NHA_MASTER]		= { .type = NLA_U32 },
+	[NHA_GROUP_TYPE]	= { .type = NLA_U16 },
+	[NHA_GROUPS]		= { .type = NLA_FLAG },
 };
 
 static unsigned int nh_dev_hashfn(unsigned int val)
@@ -67,19 +69,35 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
 static void nexthop_free_rcu(struct rcu_head *head)
 {
 	struct nexthop *nh = container_of(head, struct nexthop, rcu);
-	struct nh_info *nhi;
 
-	nhi = rcu_dereference_raw(nh->nh_info);
-	switch (nhi->family) {
-	case AF_INET:
-	case AF_UNSPEC:
-		fib_nh_release(nhi->net, &nhi->fib_nh);
-		break;
-	case AF_INET6:
-		fib6_nh_release(&nhi->fib6_nh);
-		break;
+	if (nh->is_group) {
+		struct nh_group *nh_grp;
+		int i;
+
+		nh_grp = rcu_dereference_raw(nh->nh_grp);
+		for (i = 0; i < nh_grp->num_nh; ++i) {
+			if (!nh_grp->nh_entries[i].nh)
+				continue;
+
+			list_del(&nh_grp->nh_entries[i].nh_list);
+			nexthop_put(nh_grp->nh_entries[i].nh);
+		}
+		kfree(nh_grp);
+	} else {
+		struct nh_info *nhi;
+
+		nhi = rcu_dereference_raw(nh->nh_info);
+		switch (nhi->family) {
+		case AF_INET:
+		case AF_UNSPEC:
+			fib_nh_release(nhi->net, &nhi->fib_nh);
+			break;
+		case AF_INET6:
+			fib6_nh_release(&nhi->fib6_nh);
+			break;
+		}
+		kfree(nhi);
 	}
-	kfree(nhi);
 
 	kfree(nh);
 }
@@ -89,6 +107,33 @@ static struct nexthop *nexthop_alloc(void)
 	return kzalloc(sizeof(struct nexthop), GFP_KERNEL);
 }
 
+/* nexthop for group has variable size and may not use the kmem_cache */
+static struct nexthop *nexthop_grp_alloc(u16 num_nh)
+{
+	size_t sz = offsetof(struct nexthop, nh_grp)
+		    + sizeof(struct nh_group)
+		    + sizeof(struct nh_grp_entry) * num_nh;
+	struct nh_group *nh_grp;
+	struct nexthop *nh;
+
+	nh = nexthop_alloc();
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+
+	nh_grp = kzalloc(sz, GFP_KERNEL);
+	if (!nh_grp) {
+		kfree(nh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nh->is_group = 1;
+	nh_grp->num_nh = num_nh;
+	nh_grp->num_nh_set = num_nh;
+	rcu_assign_pointer(nh->nh_grp, nh_grp);
+
+	return nh;
+}
+
 static void nh_base_seq_inc(struct net *net)
 {
 	while (++net->nexthop.seq == 0)
@@ -173,23 +218,166 @@ static size_t nh_nlmsg_size_ipv4(struct nh_info *nhi)
 
 static size_t nh_nlmsg_size(struct nexthop *nh)
 {
-	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 	size_t sz = nla_total_size(4);    /* NHA_ID */
 
-	/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
-	 * are mutually exclusive
-	 */
-	sz += nla_total_size(4);  /* NHA_OIF */
+	if (nh->is_group) {
+		struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+		size_t sz2 = sizeof(struct nh_group) * nh_grp->num_nh_set;
 
-	if (nhi->family == AF_INET)
-		sz += nh_nlmsg_size_ipv4(nhi);
+		sz += nla_total_size(sz2)
+		      + nla_total_size(2);  /* NHA_GROUP_TYPE */
+	} else {
+		struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 
-	else if (nhi->family == AF_INET6)
-		sz += nh_nlmsg_size_ipv6(nhi);
+		/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+		 * are mutually exclusive
+		 */
+		sz += nla_total_size(4);  /* NHA_OIF */
+
+		if (nhi->family == AF_INET)
+			sz += nh_nlmsg_size_ipv4(nhi);
+		else if (nhi->family == AF_INET6)
+			sz += nh_nlmsg_size_ipv6(nhi);
+	}
 
 	return sz;
 }
 
+static bool valid_group_nh(struct nexthop *nh, struct netlink_ext_ack *extack)
+{
+	if (nh->is_group) {
+		struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+
+		/* nested multipath (group within a group) is not
+		 * supported
+		 */
+		if (nh_grp->mpath) {
+			NL_SET_ERR_MSG(extack,
+				       "Multipath group can not be a nexthop within a group");
+			return false;
+		}
+	} else {
+		struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+		if (nhi->reject_nh) {
+			NL_SET_ERR_MSG(extack,
+				       "Blackhole nexthop can not be used in a group");
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
+			       struct netlink_ext_ack *extack)
+{
+	unsigned int len = nla_len(tb[NHA_GROUP]);
+	struct nexthop_grp *nhg;
+	int i;
+
+	if (len & (sizeof(struct nh_group) - 1)) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid length for nexthop group attribute");
+		return -EINVAL;
+	}
+
+	/* convert len to number of nexthop ids */
+	len /= sizeof(*nhg);
+
+	nhg = nla_data(tb[NHA_GROUP]);
+	for (i = 0; i < len; ++i) {
+		struct nexthop *nh;
+
+		nh = nexthop_find_by_id(net, nhg->id);
+		if (!nh) {
+			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+			return -EINVAL;
+		}
+		if (!valid_group_nh(nh, extack))
+			return -EINVAL;
+
+		nhg += 1;
+	}
+
+	for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
+		if (!tb[i])
+			continue;
+
+		NL_SET_ERR_MSG(extack,
+			       "No other attributes can be set in nexthop groups");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nh_grp)
+{
+	size_t len = nh_grp->num_nh_set * sizeof(struct nh_group);
+	struct nexthop_grp *p;
+	struct nlattr *nla;
+	u16 group_type = 0;
+	int i;
+
+	if (nh_grp->mpath)
+		group_type = NEXTHOP_GRP_TYPE_MPATH;
+
+	if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+		goto nla_put_failure;
+
+	nla = nla_reserve(skb, NHA_GROUP, len);
+	if (!nla)
+		goto nla_put_failure;
+
+	p = nla_data(nla);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		if (!nh_grp->nh_entries[i].nh)
+			continue;
+
+		p->id = nh_grp->nh_entries[i].nh->id;
+		p->weight = nh_grp->nh_entries[i].weight;
+		p += 1;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static void nh_group_rebalance(struct nh_group *nhg)
+{
+	struct nh_grp_entry *nhge;
+	int total = 0;
+	int w = 0;
+	int i;
+
+	for (i = 0; i < nhg->num_nh; ++i) {
+		nhge = &nhg->nh_entries[i];
+
+		if (!nhge->nh)
+			continue;
+
+		total += nhge->weight;
+	}
+
+	for (i = 0; i < nhg->num_nh; ++i) {
+		int upper_bound;
+
+		nhge = &nhg->nh_entries[i];
+		if (!nhge->nh) {
+			upper_bound = -1;
+		} else {
+			w += nhge->weight;
+			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+							    total) - 1;
+		}
+
+		atomic_set(&nhge->upper_bound, upper_bound);
+	}
+}
+
 static const struct net_device *nh_info_dev(const struct nh_info *nhi)
 {
 	switch (nhi->family) {
@@ -219,8 +407,25 @@ bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev)
 	const struct nh_info *nhi;
 	bool dev_match = false;
 
-	nhi = rcu_dereference(nh->nh_info);
-	dev_match = nh_info_uses_dev(nhi, dev);
+	if (nh->is_group) {
+		const struct nh_group *nh_grp;
+		int i;
+
+		nh_grp = rcu_dereference(nh->nh_grp);
+		for (i = 0; i < nh_grp->num_nh; ++i) {
+			const struct nh_grp_entry *nhge;
+
+			nhge = &nh_grp->nh_entries[i];
+			nhi = rcu_dereference(nhge->nh->nh_info);
+			dev_match = nh_info_uses_dev(nhi, dev);
+			if (dev_match)
+				break;
+		}
+
+	} else {
+		nhi = rcu_dereference(nh->nh_info);
+		dev_match = nh_info_uses_dev(nhi, dev);
+	}
 
 	return dev_match;
 }
@@ -249,6 +454,14 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 	if (nla_put_u32(skb, NHA_ID, nh->id))
 		goto nla_put_failure;
 
+	if (nh->is_group) {
+		struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+
+		if (nla_put_nh_group(skb, nh_grp))
+			goto nla_put_failure;
+		goto end;
+	}
+
 	nhi = rtnl_dereference(nh->nh_info);
 	if (nhi->reject_nh && nla_put_flag(skb, NHA_BLACKHOLE))
 		goto nla_put_failure;
@@ -281,6 +494,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 		break;
 	}
 
+end:
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -315,6 +529,50 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
 		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
 }
 
+static void remove_nh_grp_entry(struct nh_grp_entry *nhge, bool rebalance)
+{
+	struct nh_group *nh_grp;
+
+	list_del(&nhge->nh_list);
+	nexthop_put(nhge->nh);
+	nhge->nh = NULL;
+
+	nh_grp = rtnl_dereference(nhge->nh_parent->nh_grp);
+	nh_grp->num_nh_set--;
+	if (rebalance)
+		nh_group_rebalance(nh_grp);
+}
+
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+				       bool skip_fib, struct nl_info *nlinfo)
+{
+	struct nh_grp_entry *nhge, *tmp;
+
+	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) {
+		struct nh_group *nh_grp;
+
+		remove_nh_grp_entry(nhge, true);
+
+		/* if this group has no more entries then remove it */
+		nh_grp = rtnl_dereference(nhge->nh_parent->nh_grp);
+		if (!nh_grp->num_nh_set)
+			remove_nexthop(net, nhge->nh_parent, skip_fib,
+				       nlinfo);
+	}
+}
+
+static void remove_nexthop_group(struct nexthop *nh)
+{
+	struct nh_group *nh_grp;
+	int i;
+
+	nh_grp = rtnl_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		if (nh_grp->nh_entries[i].nh)
+			remove_nh_grp_entry(&nh_grp->nh_entries[i], false);
+	}
+}
+
 static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 {
 	struct fib6_info *f6i, *tmp;
@@ -339,13 +597,19 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 static void __remove_nexthop(struct net *net, struct nexthop *nh,
 			     bool skip_fib, struct nl_info *nlinfo)
 {
-	const struct net_device *dev;
-	struct nh_info *nhi;
+	if (nh->is_group) {
+		remove_nexthop_group(nh);
+	} else {
+		const struct net_device *dev;
+		struct nh_info *nhi;
 
-	nhi = rtnl_dereference(nh->nh_info);
-	dev = nh_info_dev(nhi);
-	if (dev)
-		hlist_del(&nhi->dev_hash);
+		nhi = rtnl_dereference(nh->nh_info);
+		dev = nh_info_dev(nhi);
+		if (dev)
+			hlist_del(&nhi->dev_hash);
+
+		remove_nexthop_from_groups(net, nh, skip_fib, nlinfo);
+	}
 	if (!skip_fib)
 		__remove_nexthop_fib(net, nh);
 }
@@ -362,21 +626,46 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
 
 	nexthop_put(nh);
 
-	nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+	if (nlinfo)
+		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
 }
 
 static int replace_nexthop(struct net *net, struct nexthop *old,
 			   struct nexthop *new, struct netlink_ext_ack *extack)
 {
-	struct nh_info *oldi, *newi;
+	if (old->is_group) {
+		struct nh_group *oldg, *newg;
+		int i;
 
-	oldi = rtnl_dereference(old->nh_info);
-	newi = rtnl_dereference(new->nh_info);
-	rcu_assign_pointer(old->nh_info, newi);
-	rcu_assign_pointer(new->nh_info, oldi);
+		if (!new->is_group) {
+			NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
+			return -EINVAL;
+		}
+		oldg = rtnl_dereference(old->nh_grp);
+		newg = rtnl_dereference(new->nh_grp);
+		rcu_assign_pointer(old->nh_grp, newg);
+		rcu_assign_pointer(new->nh_grp, oldg);
+
+		/* update parents - used by nexthop code for cleanup */
+		for (i = 0; i < newg->num_nh; ++i)
+			newg->nh_entries[i].nh_parent = old;
+		for (i = 0; i < oldg->num_nh; ++i)
+			oldg->nh_entries[i].nh_parent = new;
+	} else {
+		struct nh_info *oldi, *newi;
 
-	newi->nh_parent = old;
-	oldi->nh_parent = new;
+		if (new->is_group) {
+			NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
+			return -EINVAL;
+		}
+		oldi = rtnl_dereference(old->nh_info);
+		newi = rtnl_dereference(new->nh_info);
+		rcu_assign_pointer(old->nh_info, newi);
+		rcu_assign_pointer(new->nh_info, oldi);
+
+		newi->nh_parent = old;
+		oldi->nh_parent = new;
+	}
 
 	old->protocol = new->protocol;
 	old->nh_flags = new->nh_flags;
@@ -491,10 +780,16 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
 		      struct netlink_ext_ack *extack)
 {
 	struct nexthop *nh = fi->nh;
-	struct nh_info *nhi;
 
-	nhi = rtnl_dereference(nh->nh_info);
-	if (nhi->family != AF_UNSPEC) {
+	if (nh->is_group) {
+		if (cfg->fc_scope == RT_SCOPE_HOST) {
+			NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+			return -EINVAL;
+		}
+		return 0;
+	}
+
+	if (nh->nh_info->family != AF_UNSPEC) {
 		if (nh->nh_flags & RTNH_F_ONLINK &&
 		    cfg->fc_scope >= RT_SCOPE_LINK) {
 			NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
@@ -505,6 +800,57 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
 	return 0;
 }
 
+void nexthop_select_path(struct net *net, struct fib_result *res, int hash)
+{
+	struct fib_info *fi = res->fi;
+	struct nexthop *nh = fi->nh;
+	struct nh_group *nh_grp;
+	bool first = false;
+	int i;
+
+	WARN_ON(!nh->is_group);
+
+	nh_grp = rcu_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		struct nh_grp_entry *nhge = &nh_grp->nh_entries[i];
+		struct fib_nh *fib_nh;
+
+		if (hash > atomic_read(&nhge->upper_bound))
+			continue;
+
+		fib_nh = &nhge->nh->nh_info->fib_nh;
+
+		/* nexthops always check if it is good and does
+		 * not rely on a sysctl for this behavior
+		 */
+		if (fib_good_nh(fib_nh)) {
+			res->nh = fib_nh;
+			return;
+		}
+		if (!first) {
+			res->nh = fib_nh;
+			first = true;
+		}
+	}
+}
+
+struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel)
+{
+	struct nh_group *nh_grp;
+	int i, j = 0;
+
+	nh_grp = rcu_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		if (nh_grp->nh_entries[i].nh) {
+			if (nhsel == j)
+				return nh_grp->nh_entries[i].nh;
+			++j;
+		}
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_mpath_select);
+
 static int nh_check_attr(struct nhmsg *nhm, struct nlattr *tb[],
 			 struct net *net, struct netlink_ext_ack *extack)
 {
@@ -557,6 +903,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 	if (tb[NHA_ID])
 		cfg->nh_id = nla_get_u32(tb[NHA_ID]);
 
+	if (tb[NHA_GROUP]) {
+		cfg->nh_grp = tb[NHA_GROUP];
+
+		cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
+		if (tb[NHA_GROUP_TYPE])
+			cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+		if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+			NL_SET_ERR_MSG(extack, "Invalid group type");
+			goto out;
+		}
+	}
+
 	if (tb[NHA_OIF]) {
 		cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
 
@@ -644,6 +1003,14 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 		goto out;
 	}
 
+	if (tb[NHA_GROUP]) {
+		err = nh_check_attr_group(net, tb, extack);
+		if (err)
+			goto out;
+
+		return 0;
+	}
+
 	err = 0;
 out:
 	return err;
@@ -791,7 +1158,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
 	return err;
 }
 
-static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
+static int nh_create_ipv6(struct net *net, struct nexthop *nh,
 			  struct nh_info *nhi, struct nh_config *cfg,
 			  struct netlink_ext_ack *extack)
 {
@@ -856,10 +1223,47 @@ static int nh_create_unspec(struct net *net, struct nexthop *nh,
 
 static void nexthop_init_common(struct nexthop *nh)
 {
+	INIT_LIST_HEAD(&nh->grp_list);
 	INIT_LIST_HEAD(&nh->fi_list);
 	INIT_LIST_HEAD(&nh->f6i_list);
 }
 
+static struct nexthop *nexthop_create_group(struct net *net,
+					    struct nh_config *cfg)
+{
+	struct nlattr *grps_attr = cfg->nh_grp;
+	struct nexthop_grp *entry = nla_data(grps_attr);
+	struct nh_group *nh_grp;
+	struct nexthop *nh;
+	int i;
+
+	nh = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry));
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+
+	nexthop_init_common(nh);
+
+	nh_grp = rtnl_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		struct nexthop *nhe;
+
+		nhe = nexthop_find_by_id(net, entry[i].id);
+		nexthop_get(nhe);
+
+		nh_grp->nh_entries[i].nh = nhe;
+		nh_grp->nh_entries[i].weight = entry[i].weight ? : 1;
+		list_add(&nh_grp->nh_entries[i].nh_list, &nhe->grp_list);
+		nh_grp->nh_entries[i].nh_parent = nh;
+	}
+
+	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+		nh_grp->mpath = 1;
+		nh_group_rebalance(nh_grp);
+	}
+
+	return nh;
+}
+
 static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
 				      struct netlink_ext_ack *extack)
 {
@@ -929,7 +1333,11 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
 		}
 	}
 
-	nh = nexthop_create(net, cfg, extack);
+	if (cfg->nh_grp)
+		nh = nexthop_create_group(net, cfg);
+	else
+		nh = nexthop_create(net, cfg, extack);
+
 	if (IS_ERR(nh))
 		return nh;
 
@@ -968,19 +1376,25 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return err;
 }
 
-static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
+static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int group_filter,
 			     int master_idx, u8 family)
 {
 	const struct net_device *dev;
 	const struct nh_info *nhi;
 
-	if (dev_idx || master_idx || family)
+	if (group_filter && !nh->is_group)
+		return true;
+
+	if ((dev_idx || master_idx || family) && nh->is_group)
 		return true;
 
 	nhi = rtnl_dereference(nh->nh_info);
-	if (family && nhi->family != family)
+	if (family && !nh->is_group && nhi->family != family)
 		return true;
 
+	if (nh->is_group)
+		return false;
+
 	dev = nh_info_dev(nhi);
 	if (dev_idx && (!dev || dev->ifindex != dev_idx))
 		return true;
@@ -998,7 +1412,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
 /* rtnl */
 static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	int dev_filter_idx = 0, master_idx = 0;
+	int group_filter = 0, dev_filter_idx = 0, master_idx = 0;
 	struct net *net = sock_net(skb->sk);
 	struct rb_root *root = &net->nexthop.root;
 	struct nlattr *tb[NHA_MAX + 1];
@@ -1010,6 +1424,9 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (nlmsg_parse(cb->nlh, sizeof(*nhm), tb, NHA_MAX,
 			rtm_nh_policy, NULL) >= 0) {
+		if (tb[NHA_GROUPS])
+			group_filter = 1;
+
 		if (tb[NHA_OIF])
 			dev_filter_idx = nla_get_u32(tb[NHA_OIF]);
 
@@ -1027,8 +1444,8 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 			goto cont;
 
 		nh = rb_entry(node, struct nexthop, rb_node);
-		if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
-				     nhm->nh_family))
+		if (nh_dump_filtered(nh, dev_filter_idx, group_filter,
+				     master_idx, nhm->nh_family))
 			goto cont;
 
 		err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1297c7c934a8..4c16715607e0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,6 +112,7 @@
 #include <net/secure_seq.h>
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
+#include <net/nexthop.h>
 
 #include "fib_lookup.h"
 
@@ -1887,10 +1888,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
 			    struct flow_keys *hkeys)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res->fi && res->fi->fib_nhs > 1) {
-		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
-
-		fib_select_multipath(res, h);
+	if (res->fi) {
+		struct net *net = res->fi->fib_net;
+		int h;
+
+		if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) {
+			h = fib_multipath_hash(net, NULL, skb, hkeys);
+			nexthop_select_path(net, res, h);
+		} else if (res->fi->fib_nhs > 1) {
+			h = fib_multipath_hash(net, NULL, skb, hkeys);
+			fib_select_multipath(res, h);
+		}
 	}
 #endif
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 10/18] net/ipv6: Make fib6_nh optional at the end of fib6_info
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Move fib6_nh to the end of fib6_info and make an array of
size 0. Pass a flag to fib6_info_alloc indicating if the
allocation needs to add space for a fib6_nh.

The current code path always has a fib6_nh allocated; with
nexthop objects they will not.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h       |   8 +--
 include/net/ip6_route.h     |  10 ++-
 include/trace/events/fib6.h |  15 ++--
 net/core/filter.c           |   6 +-
 net/ipv6/addrconf.c         |   2 +-
 net/ipv6/ip6_fib.c          |  15 ++--
 net/ipv6/ndisc.c            |  13 ++--
 net/ipv6/route.c            | 165 ++++++++++++++++++++++----------------------
 8 files changed, 124 insertions(+), 110 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 2a1fae1247a9..9526eef711d5 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -170,8 +170,8 @@ struct fib6_info {
 					dst_host:1,
 					unused:3;
 
-	struct fib6_nh			fib6_nh;
 	struct rcu_head			rcu;
+	struct fib6_nh			fib6_nh[0];
 };
 
 struct rt6_info {
@@ -274,7 +274,7 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 	dst_release(&rt->dst);
 }
 
-struct fib6_info *fib6_info_alloc(gfp_t gfp_flags);
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh);
 void fib6_info_destroy_rcu(struct rcu_head *head);
 
 static inline void fib6_info_hold(struct fib6_info *f6i)
@@ -426,13 +426,13 @@ static inline void fib6_nh_release(struct fib6_nh *fib6_nh)
 
 static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 {
-	return f6i->fib6_nh.nh_dev;
+	return f6i->fib6_nh->nh_dev;
 }
 
 static inline
 struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i)
 {
-	return f6i->fib6_nh.nh_lwtstate;
+	return f6i->fib6_nh->nh_lwtstate;
 }
 
 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 7b9c82de11cc..b1ca637acb2a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -274,9 +274,13 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 
 static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
 {
-	return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev &&
-	       ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) &&
-	       !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate);
+// TO-DO:
+	//if (a->nh || b->nh)
+	//	return nexthop_cmp(a->nh, b->nh);
+
+	return a->fib6_nh->nh_dev == b->fib6_nh->nh_dev &&
+	       ipv6_addr_equal(&a->fib6_nh->nh_gw, &b->fib6_nh->nh_gw) &&
+	       !lwtunnel_cmp_encap(a->fib6_nh->nh_lwtstate, b->fib6_nh->nh_lwtstate);
 }
 
 static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index b088b54d699c..037df3d2be0b 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -12,7 +12,7 @@
 
 TRACE_EVENT(fib6_table_lookup,
 
-	TP_PROTO(const struct net *net, const struct fib6_info *f6i,
+	TP_PROTO(const struct net *net, struct fib6_info *f6i,
 		 struct fib6_table *table, const struct flowi6 *flp),
 
 	TP_ARGS(net, f6i, table, flp),
@@ -36,6 +36,7 @@ TRACE_EVENT(fib6_table_lookup,
 	),
 
 	TP_fast_assign(
+		struct fib6_nh *fib6_nh = f6i->fib6_nh;
 		struct in6_addr *in6;
 
 		__entry->tb_id = table->tb6_id;
@@ -62,20 +63,20 @@ TRACE_EVENT(fib6_table_lookup,
 			__entry->dport = 0;
 		}
 
-		if (f6i->fib6_nh.nh_dev) {
-			__assign_str(name, f6i->fib6_nh.nh_dev);
+		if (fib6_nh && fib6_nh->nh_dev) {
+			__assign_str(name, fib6_nh->nh_dev);
 		} else {
 			__assign_str(name, "-");
 		}
-		if (f6i == net->ipv6.fib6_null_entry) {
+
+		if (!fib6_nh) {
 			struct in6_addr in6_zero = {};
 
 			in6 = (struct in6_addr *)__entry->gw;
 			*in6 = in6_zero;
-
-		} else if (f6i) {
+		} else {
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = f6i->fib6_nh.nh_gw;
+			*in6 = fib6_nh->nh_gw;
 		}
 	),
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 0ba4c477415d..bc979edf06ca 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4428,13 +4428,13 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
 	}
 
-	if (f6i->fib6_nh.nh_lwtstate)
+	if (f6i->fib6_nh->nh_lwtstate)
 		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
 
 	if (f6i->fib6_flags & RTF_GATEWAY)
-		*dst = f6i->fib6_nh.nh_gw;
+		*dst = f6i->fib6_nh->nh_gw;
 
-	dev = f6i->fib6_nh.nh_dev;
+	dev = f6i->fib6_nh->nh_dev;
 	params->rt_metric = f6i->fib6_metric;
 
 	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d51a8c0b3372..da5102bff2a9 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2366,7 +2366,7 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
-		if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex)
+		if (rt->fib6_nh->nh_dev->ifindex != dev->ifindex)
 			continue;
 		if ((rt->fib6_flags & flags) != flags)
 			continue;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index c1c23427a81e..5b0ca5b3710d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -145,11 +145,15 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
 	       addr[fn_bit >> 5];
 }
 
-struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
 {
 	struct fib6_info *f6i;
+	size_t sz = sizeof(*f6i);
 
-	f6i = kzalloc(sizeof(*f6i), gfp_flags);
+	if (with_fib6_nh)
+		sz += sizeof(struct fib6_nh);
+
+	f6i = kzalloc(sz, gfp_flags);
 	if (!f6i)
 		return NULL;
 
@@ -198,7 +202,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 		}
 	}
 
-	fib6_nh_release(&f6i->fib6_nh);
+	fib6_nh_release(f6i->fib6_nh);
 
 	m = f6i->fib6_metrics;
 	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
@@ -2247,6 +2251,7 @@ void fib6_gc_cleanup(void)
 static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 {
 	struct fib6_info *rt = v;
+	struct fib6_nh *fib6_nh = rt->fib6_nh;
 	struct ipv6_route_iter *iter = seq->private;
 	const struct net_device *dev;
 
@@ -2258,11 +2263,11 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "00000000000000000000000000000000 00 ");
 #endif
 	if (rt->fib6_flags & RTF_GATEWAY)
-		seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
+		seq_printf(seq, "%pi6", &fib6_nh->nh_gw);
 	else
 		seq_puts(seq, "00000000000000000000000000000000");
 
-	dev = rt->fib6_nh.nh_dev;
+	dev = fib6_nh->nh_dev;
 	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
 		   rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
 		   rt->fib6_flags, dev ? dev->name : "");
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0ec273997d1d..4bc47b9db35b 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1156,6 +1156,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	struct neighbour *neigh = NULL;
 	struct inet6_dev *in6_dev;
 	struct fib6_info *rt = NULL;
+	struct fib6_nh *fib6_nh;
 	struct net *net;
 	int lifetime;
 	struct ndisc_options ndopts;
@@ -1276,9 +1277,9 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
 
 	if (rt) {
-		neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
-					 rt->fib6_nh.nh_dev, NULL,
-					  &ipv6_hdr(skb)->saddr);
+		fib6_nh = rt->fib6_nh;
+		neigh = ip6_neigh_lookup(&fib6_nh->nh_gw, fib6_nh->nh_dev, NULL,
+					 &ipv6_hdr(skb)->saddr);
 		if (!neigh) {
 			ND_PRINTK(0, err,
 				  "RA: %s got default router without neighbour\n",
@@ -1306,9 +1307,9 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 			return;
 		}
 
-		neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
-					 rt->fib6_nh.nh_dev, NULL,
-					  &ipv6_hdr(skb)->saddr);
+		fib6_nh = rt->fib6_nh;
+		neigh = ip6_neigh_lookup(&fib6_nh->nh_gw, fib6_nh->nh_dev, NULL,
+					 &ipv6_hdr(skb)->saddr);
 		if (!neigh) {
 			ND_PRINTK(0, err,
 				  "RA: %s got default router without neighbour\n",
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index aa44cd5b3217..5792f57fdb91 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -439,14 +439,14 @@ struct fib6_info *fib6_multipath_select(const struct net *net,
 	if (!fl6->mp_hash)
 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 
-	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
+	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->nh_upper_bound))
 		return match;
 
 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
 				 fib6_siblings) {
 		int nh_upper_bound;
 
-		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
+		nh_upper_bound = atomic_read(&sibling->fib6_nh->nh_upper_bound);
 		if (fl6->mp_hash > nh_upper_bound)
 			continue;
 		if (rt6_score_route(sibling, oif, strict) < 0)
@@ -471,13 +471,13 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
 	struct fib6_info *sprt;
 
 	if (!oif && ipv6_addr_any(saddr) &&
-	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
+	    !(rt->fib6_nh->nh_flags & RTNH_F_DEAD))
 		return rt;
 
 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
-		const struct net_device *dev = sprt->fib6_nh.nh_dev;
+		const struct net_device *dev = sprt->fib6_nh->nh_dev;
 
-		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
+		if (sprt->fib6_nh->nh_flags & RTNH_F_DEAD)
 			continue;
 
 		if (oif) {
@@ -493,7 +493,7 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
 	if (oif && flags & RT6_LOOKUP_F_IFACE)
 		return net->ipv6.fib6_null_entry;
 
-	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
+	return rt->fib6_nh->nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
 }
 
 #ifdef CONFIG_IPV6_ROUTER_PREF
@@ -533,8 +533,8 @@ static void rt6_probe(struct fib6_info *rt)
 	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
 		return;
 
-	nh_gw = &rt->fib6_nh.nh_gw;
-	dev = rt->fib6_nh.nh_dev;
+	nh_gw = &rt->fib6_nh->nh_gw;
+	dev = rt->fib6_nh->nh_dev;
 	rcu_read_lock_bh();
 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
 	if (neigh) {
@@ -580,7 +580,7 @@ static inline void rt6_probe(struct fib6_info *rt)
  */
 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
 {
-	const struct net_device *dev = rt->fib6_nh.nh_dev;
+	const struct net_device *dev = rt->fib6_nh->nh_dev;
 
 	if (!oif || dev->ifindex == oif)
 		return 2;
@@ -597,8 +597,8 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
 		return RT6_NUD_SUCCEED;
 
 	rcu_read_lock_bh();
-	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
-					  &rt->fib6_nh.nh_gw);
+	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh->nh_dev,
+					  &rt->fib6_nh->nh_gw);
 	if (neigh) {
 		read_lock(&neigh->lock);
 		if (neigh->nud_state & NUD_VALID)
@@ -638,6 +638,7 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
 }
 
 /* called with rc_read_lock held */
+// TO-DO: if (!f6i->nh)
 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
 {
 	const struct net_device *dev = fib6_info_nh_dev(f6i);
@@ -659,11 +660,11 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
 	int m;
 	bool match_do_rr = false;
 
-	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
+	if (rt->fib6_nh->nh_flags & RTNH_F_DEAD)
 		goto out;
 
 	if (fib6_ignore_linkdown(rt) &&
-	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
+	    rt->fib6_nh->nh_flags & RTNH_F_LINKDOWN &&
 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 		goto out;
 
@@ -868,7 +869,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 /* called with rcu_lock held */
 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
 {
-	struct net_device *dev = rt->fib6_nh.nh_dev;
+	struct net_device *dev = rt->fib6_nh->nh_dev;
 
 	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
 		/* for copies of local routes, dst->dev needs to be the
@@ -964,8 +965,8 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 		rt->dst.input = ip6_forward;
 	}
 
-	if (ort->fib6_nh.nh_lwtstate) {
-		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
+	if (ort->fib6_nh->nh_lwtstate) {
+		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh->nh_lwtstate);
 		lwtunnel_set_redirect(&rt->dst);
 	}
 
@@ -989,14 +990,14 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 
 	rt->rt6i_dst = ort->fib6_dst;
 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
-	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
+	rt->rt6i_gateway = ort->fib6_nh->nh_gw;
 	rt->rt6i_flags = ort->fib6_flags;
 	rt6_set_from(rt, ort);
 #ifdef CONFIG_IPV6_SUBTREES
 	rt->rt6i_src = ort->fib6_src;
 #endif
 	rt->rt6i_prefsrc = ort->fib6_prefsrc;
-	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
+	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh->nh_lwtstate);
 }
 
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
@@ -1038,7 +1039,7 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
 {
 	unsigned short flags = fib6_info_dst_flags(rt);
-	struct net_device *dev = rt->fib6_nh.nh_dev;
+	struct net_device *dev = rt->fib6_nh->nh_dev;
 	struct rt6_info *nrt;
 
 	if (!fib6_info_hold_safe(rt))
@@ -1409,7 +1410,7 @@ static unsigned int fib6_mtu(const struct fib6_info *rt)
 
 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
 
-	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
+	return mtu - lwtunnel_headroom(rt->fib6_nh->nh_lwtstate, mtu);
 }
 
 static int rt6_insert_exception(struct rt6_info *nrt,
@@ -2453,7 +2454,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	for_each_fib6_node_rt_rcu(fn) {
-		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
+		if (rt->fib6_nh->nh_flags & RTNH_F_DEAD)
 			continue;
 		if (fib6_check_expired(rt))
 			continue;
@@ -2461,14 +2462,14 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 			break;
 		if (!(rt->fib6_flags & RTF_GATEWAY))
 			continue;
-		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
+		if (fl6->flowi6_oif != rt->fib6_nh->nh_dev->ifindex)
 			continue;
 		/* rt_cache's gateway might be different from its 'parent'
 		 * in the case of an ip redirect.
 		 * So we keep searching in the exception table if the gateway
 		 * is different.
 		 */
-		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
+		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh->nh_gw)) {
 			rt_cache = rt6_find_cached_rt(rt,
 						      &fl6->daddr,
 						      &fl6->saddr);
@@ -3004,8 +3005,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 			}
 		}
 		cfg->fc_flags = RTF_REJECT | RTF_NONEXTHOP;
-		err = 0;
-		goto out;
+		goto set_dev;
 	}
 
 	if (cfg->fc_flags & RTF_GATEWAY) {
@@ -3036,7 +3036,9 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 	    !netif_carrier_ok(dev))
 		fib6_nh->nh_flags |= RTNH_F_LINKDOWN;
 
+set_dev:
 	fib6_nh->nh_dev = dev;
+	err = 0;
 
 out:
 	if (idev)
@@ -3108,7 +3110,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 
 	err = -ENOMEM;
-	rt = fib6_info_alloc(gfp_flags);
+	rt = fib6_info_alloc(gfp_flags, true);
 	if (!rt)
 		goto out;
 
@@ -3142,7 +3144,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
 	rt->fib6_src.plen = cfg->fc_src_len;
 #endif
-	err = fib6_nh_init(net, &rt->fib6_nh, cfg, extack);
+	err = fib6_nh_init(net, rt->fib6_nh, cfg, extack);
 	if (err)
 		goto out;
 
@@ -3318,11 +3320,11 @@ static int ip6_route_del(struct fib6_config *cfg,
 				continue;
 			}
 			if (cfg->fc_ifindex &&
-			    (!rt->fib6_nh.nh_dev ||
-			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
+			    (!rt->fib6_nh->nh_dev ||
+			     rt->fib6_nh->nh_dev->ifindex != cfg->fc_ifindex))
 				continue;
 			if (cfg->fc_flags & RTF_GATEWAY &&
-			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
+			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh->nh_gw))
 				continue;
 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
 				continue;
@@ -3493,11 +3495,11 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
-		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
+		if (rt->fib6_nh->nh_dev->ifindex != ifindex)
 			continue;
 		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
 			continue;
-		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
+		if (!ipv6_addr_equal(&rt->fib6_nh->nh_gw, gwaddr))
 			continue;
 		if (!fib6_info_hold_safe(rt))
 			continue;
@@ -3555,9 +3557,9 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
 
 	rcu_read_lock();
 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
-		if (dev == rt->fib6_nh.nh_dev &&
+		if (dev == rt->fib6_nh->nh_dev &&
 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
-		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
+		    ipv6_addr_equal(&rt->fib6_nh->nh_gw, addr))
 			break;
 	}
 	if (rt && !fib6_info_hold_safe(rt))
@@ -3763,7 +3765,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
 	struct net_device *dev = idev->dev;
 	struct fib6_info *f6i;
 
-	f6i = fib6_info_alloc(gfp_flags);
+	f6i = fib6_info_alloc(gfp_flags, true);
 	if (!f6i)
 		return ERR_PTR(-ENOMEM);
 
@@ -3779,9 +3781,9 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
 		f6i->fib6_flags |= RTF_LOCAL;
 	}
 
-	f6i->fib6_nh.nh_gw = *addr;
+	f6i->fib6_nh->nh_gw = *addr;
 	dev_hold(dev);
-	f6i->fib6_nh.nh_dev = dev;
+	f6i->fib6_nh->nh_dev = dev;
 	f6i->fib6_dst.addr = *addr;
 	f6i->fib6_dst.plen = 128;
 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
@@ -3803,7 +3805,7 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
 
-	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
+	if (((void *)rt->fib6_nh->nh_dev == dev || !dev) &&
 	    rt != net->ipv6.fib6_null_entry &&
 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
 		spin_lock_bh(&rt6_exception_lock);
@@ -3835,7 +3837,7 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
 	struct in6_addr *gateway = (struct in6_addr *)arg;
 
 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
-	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
+	    ipv6_addr_equal(gateway, &rt->fib6_nh->nh_gw)) {
 		return -1;
 	}
 
@@ -3883,8 +3885,8 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 
 static bool rt6_is_dead(const struct fib6_info *rt)
 {
-	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
-	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
+	if (rt->fib6_nh->nh_flags & RTNH_F_DEAD ||
+	    (rt->fib6_nh->nh_flags & RTNH_F_LINKDOWN &&
 	     fib6_ignore_linkdown(rt)))
 		return true;
 
@@ -3897,11 +3899,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt)
 	int total = 0;
 
 	if (!rt6_is_dead(rt))
-		total += rt->fib6_nh.nh_weight;
+		total += rt->fib6_nh->nh_weight;
 
 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
 		if (!rt6_is_dead(iter))
-			total += iter->fib6_nh.nh_weight;
+			total += iter->fib6_nh->nh_weight;
 	}
 
 	return total;
@@ -3912,11 +3914,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
 	int upper_bound = -1;
 
 	if (!rt6_is_dead(rt)) {
-		*weight += rt->fib6_nh.nh_weight;
+		*weight += rt->fib6_nh->nh_weight;
 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
 						    total) - 1;
 	}
-	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
+	atomic_set(&rt->fib6_nh->nh_upper_bound, upper_bound);
 }
 
 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
@@ -3959,8 +3961,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
 	const struct arg_netdev_event *arg = p_arg;
 	struct net *net = dev_net(arg->dev);
 
-	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
-		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
+	if (rt != net->ipv6.fib6_null_entry &&
+	    rt->fib6_nh->nh_dev == arg->dev) {
+		rt->fib6_nh->nh_flags &= ~arg->nh_flags;
 		fib6_update_sernum_upto_root(net, rt);
 		rt6_multipath_rebalance(rt);
 	}
@@ -3988,10 +3991,10 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
 {
 	struct fib6_info *iter;
 
-	if (rt->fib6_nh.nh_dev == dev)
+	if (rt->fib6_nh->nh_dev == dev)
 		return true;
 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
-		if (iter->fib6_nh.nh_dev == dev)
+		if (iter->fib6_nh->nh_dev == dev)
 			return true;
 
 	return false;
@@ -4012,12 +4015,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
 	struct fib6_info *iter;
 	unsigned int dead = 0;
 
-	if (rt->fib6_nh.nh_dev == down_dev ||
-	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
+	if (rt->fib6_nh->nh_dev == down_dev ||
+	    rt->fib6_nh->nh_flags & RTNH_F_DEAD)
 		dead++;
 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
-		if (iter->fib6_nh.nh_dev == down_dev ||
-		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
+		if (iter->fib6_nh->nh_dev == down_dev ||
+		    iter->fib6_nh->nh_flags & RTNH_F_DEAD)
 			dead++;
 
 	return dead;
@@ -4029,11 +4032,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
 {
 	struct fib6_info *iter;
 
-	if (rt->fib6_nh.nh_dev == dev)
-		rt->fib6_nh.nh_flags |= nh_flags;
+	if (rt->fib6_nh->nh_dev == dev)
+		rt->fib6_nh->nh_flags |= nh_flags;
 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
-		if (iter->fib6_nh.nh_dev == dev)
-			iter->fib6_nh.nh_flags |= nh_flags;
+		if (iter->fib6_nh->nh_dev == dev)
+			iter->fib6_nh->nh_flags |= nh_flags;
 }
 
 /* called with write lock held for table with rt */
@@ -4048,12 +4051,12 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
 
 	switch (arg->event) {
 	case NETDEV_UNREGISTER:
-		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
+		return rt->fib6_nh->nh_dev == dev ? -1 : 0;
 	case NETDEV_DOWN:
 		if (rt->should_flush)
 			return -1;
 		if (!rt->fib6_nsiblings)
-			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
+			return rt->fib6_nh->nh_dev == dev ? -1 : 0;
 		if (rt6_multipath_uses_dev(rt, dev)) {
 			unsigned int count;
 
@@ -4069,10 +4072,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
 		}
 		return -2;
 	case NETDEV_CHANGE:
-		if (rt->fib6_nh.nh_dev != dev ||
+		if (rt->fib6_nh->nh_dev != dev ||
 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
 			break;
-		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
+		rt->fib6_nh->nh_flags |= RTNH_F_LINKDOWN;
 		rt6_multipath_rebalance(rt);
 		break;
 	}
@@ -4124,7 +4127,7 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
 	   Since RFC 1981 doesn't include administrative MTU increase
 	   update PMTU increase is a MUST. (i.e. jumbo frame)
 	 */
-	if (rt->fib6_nh.nh_dev == arg->dev &&
+	if (rt->fib6_nh->nh_dev == arg->dev &&
 	    !fib6_metric_locked(rt, RTAX_MTU)) {
 		u32 mtu = rt->fib6_pmtu;
 
@@ -4426,7 +4429,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 			goto cleanup;
 		}
 
-		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
+		rt->fib6_nh->nh_weight = rtnh->rtnh_hops + 1;
 
 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
 					    rt, &r_cfg);
@@ -4589,7 +4592,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
 			    + NLA_ALIGN(sizeof(struct rtnexthop))
 			    + nla_total_size(16) /* RTA_GATEWAY */
-			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
+			    + lwtunnel_get_encap_size(rt->fib6_nh->nh_lwtstate);
 
 		nexthop_len *= rt->fib6_nsiblings;
 	}
@@ -4607,17 +4610,17 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
 	       + nla_total_size(sizeof(struct rta_cacheinfo))
 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
 	       + nla_total_size(1) /* RTA_PREF */
-	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
+	       + lwtunnel_get_encap_size(rt->fib6_nh->nh_lwtstate)
 	       + nexthop_len;
 }
 
 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
 			    unsigned int *flags, bool skip_oif)
 {
-	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
+	if (rt->fib6_nh->nh_flags & RTNH_F_DEAD)
 		*flags |= RTNH_F_DEAD;
 
-	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
+	if (rt->fib6_nh->nh_flags & RTNH_F_LINKDOWN) {
 		*flags |= RTNH_F_LINKDOWN;
 
 		rcu_read_lock();
@@ -4627,21 +4630,21 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
 	}
 
 	if (rt->fib6_flags & RTF_GATEWAY) {
-		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
+		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh->nh_gw) < 0)
 			goto nla_put_failure;
 	}
 
-	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
-	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
+	*flags |= (rt->fib6_nh->nh_flags & RTNH_F_ONLINK);
+	if (rt->fib6_nh->nh_flags & RTNH_F_OFFLOAD)
 		*flags |= RTNH_F_OFFLOAD;
 
 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
-	if (!skip_oif && rt->fib6_nh.nh_dev &&
-	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
+	if (!skip_oif && rt->fib6_nh->nh_dev &&
+	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh->nh_dev->ifindex))
 		goto nla_put_failure;
 
-	if (rt->fib6_nh.nh_lwtstate &&
-	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
+	if (rt->fib6_nh->nh_lwtstate &&
+	    lwtunnel_fill_encap(skb, rt->fib6_nh->nh_lwtstate) < 0)
 		goto nla_put_failure;
 
 	return 0;
@@ -4653,7 +4656,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
 /* add multipath next hop */
 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
 {
-	const struct net_device *dev = rt->fib6_nh.nh_dev;
+	const struct net_device *dev = rt->fib6_nh->nh_dev;
 	struct rtnexthop *rtnh;
 	unsigned int flags = 0;
 
@@ -4661,7 +4664,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
 	if (!rtnh)
 		goto nla_put_failure;
 
-	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
+	rtnh->rtnh_hops = rt->fib6_nh->nh_weight - 1;
 	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
 
 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
@@ -5017,7 +5020,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 		return NOTIFY_OK;
 
 	if (event == NETDEV_REGISTER) {
-		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
+		net->ipv6.fib6_null_entry->fib6_nh->nh_dev = dev;
 		net->ipv6.ip6_null_entry->dst.dev = dev;
 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -5194,11 +5197,11 @@ static int __net_init ip6_route_net_init(struct net *net)
 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
 		goto out_ip6_dst_ops;
 
-	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
-					    sizeof(*net->ipv6.fib6_null_entry),
-					    GFP_KERNEL);
+	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
 	if (!net->ipv6.fib6_null_entry)
 		goto out_ip6_dst_entries;
+	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
+	       sizeof(*net->ipv6.fib6_null_entry));
 
 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
 					   sizeof(*net->ipv6.ip6_null_entry),
@@ -5334,7 +5337,7 @@ void __init ip6_route_init_special_entries(void)
 	/* Registering of the loopback is done before this portion of code,
 	 * the loopback reference in rt6_info will not be taken, do it
 	 * manually for init_net */
-	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
+	init_net.ipv6.fib6_null_entry->fib6_nh->nh_dev = init_net.loopback_dev;
 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 16/18] net/ipv6: Allow routes to use nexthop objects
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Allow users to specify a nexthop id to use with a route.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_fib.h |  4 +++
 include/net/nexthop.h |  3 ++
 net/ipv4/nexthop.c    |  5 +++
 net/ipv6/addrconf.c   |  3 ++
 net/ipv6/ip6_fib.c    | 17 ++++++++---
 net/ipv6/ndisc.c      |  2 ++
 net/ipv6/route.c      | 85 +++++++++++++++++++++++++++++++++++++++++----------
 7 files changed, 98 insertions(+), 21 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1f04a26e4c65..170aadcd83b4 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -52,6 +52,7 @@ struct fib6_config {
 	u16		fc_type;        /* only 8 bits are used */
 	u16		fc_delete_all_nh : 1,
 			__unused : 15;
+	u32		fc_nh_id;
 
 	struct in6_addr	fc_dst;
 	struct in6_addr	fc_src;
@@ -139,6 +140,8 @@ struct fib6_info {
 	struct fib6_info __rcu		*fib6_next;
 	struct fib6_node __rcu		*fib6_node;
 
+	struct list_head		nh_list;
+
 	/* Multipath routes:
 	 * siblings is a list of fib6_info that have the the same metric/weight,
 	 * destination, but not the same gateway. nsiblings is just a cache
@@ -171,6 +174,7 @@ struct fib6_info {
 					unused:3;
 
 	struct rcu_head			rcu;
+	struct nexthop			*nh;
 	struct fib6_nh			fib6_nh[0];
 };
 
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index dae1518af3f3..759bb39e4ea7 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -175,6 +175,9 @@ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
 
 static inline struct fib6_nh *fib6_info_nh(struct fib6_info *f6i)
 {
+	if (f6i->nh)
+		return nexthop_fib6_nh(f6i->nh);
+
 	return f6i->fib6_nh;
 }
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index d1fc3d21af86..1e77fa94e562 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -317,6 +317,7 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
 
 static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 {
+	struct fib6_info *f6i, *tmp;
 	struct fib_info *fi;
 	bool do_flush;
 
@@ -328,6 +329,10 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 
 	if (do_flush)
 		fib_flush(net);
+
+	list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
+		ip6_del_rt(net, f6i);
+	}
 }
 
 /* called on insert failure too */
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index da5102bff2a9..8131cdd472cb 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2366,6 +2366,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
+		/* prefix routes do not use nexthop objects */
+		if (rt->nh)
+			continue;
 		if (rt->fib6_nh->nh_dev->ifindex != dev->ifindex)
 			continue;
 		if ((rt->fib6_flags & flags) != flags)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5b0ca5b3710d..b6dc644a55cf 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -202,7 +202,10 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 		}
 	}
 
-	fib6_nh_release(f6i->fib6_nh);
+	if (f6i->nh)
+		nexthop_put(f6i->nh);
+	else
+		fib6_nh_release(f6i->fib6_nh);
 
 	m = f6i->fib6_metrics;
 	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
@@ -1302,6 +1305,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	if (!err) {
 		__fib6_update_sernum_upto_root(rt, sernum);
 		fib6_start_gc(info->nl_net, rt);
+		if (rt->nh)
+			list_add(&rt->nh_list, &rt->nh->f6i_list);
 	}
 
 out:
@@ -1776,6 +1781,9 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 
 	fib6_purge_rt(rt, fn, net);
 
+	if (rt->nh)
+		list_del(&rt->nh_list);
+
 	call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
 	if (!info->skip_notify)
 		inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
@@ -2251,7 +2259,6 @@ void fib6_gc_cleanup(void)
 static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 {
 	struct fib6_info *rt = v;
-	struct fib6_nh *fib6_nh = rt->fib6_nh;
 	struct ipv6_route_iter *iter = seq->private;
 	const struct net_device *dev;
 
@@ -2262,12 +2269,12 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 #else
 	seq_puts(seq, "00000000000000000000000000000000 00 ");
 #endif
-	if (rt->fib6_flags & RTF_GATEWAY)
-		seq_printf(seq, "%pi6", &fib6_nh->nh_gw);
+	if (!rt->nh && rt->fib6_flags & RTF_GATEWAY)
+		seq_printf(seq, "%pi6", &rt->fib6_nh->nh_gw);
 	else
 		seq_puts(seq, "00000000000000000000000000000000");
 
-	dev = fib6_nh->nh_dev;
+	dev = rt->nh ? NULL : rt->fib6_nh->nh_dev;
 	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
 		   rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
 		   rt->fib6_flags, dev ? dev->name : "");
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 4bc47b9db35b..1a6b71873dd3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1277,6 +1277,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
 
 	if (rt) {
+		/* routes added from RAs do not use nexthop objects */
 		fib6_nh = rt->fib6_nh;
 		neigh = ip6_neigh_lookup(&fib6_nh->nh_gw, fib6_nh->nh_dev, NULL,
 					 &ipv6_hdr(skb)->saddr);
@@ -1307,6 +1308,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 			return;
 		}
 
+		/* routes added from RAs do not use nexthop objects */
 		fib6_nh = rt->fib6_nh;
 		neigh = ip6_neigh_lookup(&fib6_nh->nh_gw, fib6_nh->nh_dev, NULL,
 					 &ipv6_hdr(skb)->saddr);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2c140ce95eb4..217be2c72b69 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -439,6 +439,11 @@ struct fib6_info *fib6_multipath_select(const struct net *net,
 	if (!fl6->mp_hash)
 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 
+	if (match->nh) {
+		// TO-DO:
+		return match;
+	}
+
 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->nh_upper_bound))
 		return match;
 
@@ -661,13 +666,15 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
 	int m;
 	bool match_do_rr = false;
 
-	if (rt->fib6_nh->nh_flags & RTNH_F_DEAD)
-		goto out;
+	if (!rt->nh) {
+		if (rt->fib6_nh->nh_flags & RTNH_F_DEAD)
+			goto out;
 
-	if (fib6_ignore_linkdown(rt) &&
-	    rt->fib6_nh->nh_flags & RTNH_F_LINKDOWN &&
-	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
-		goto out;
+		if (fib6_ignore_linkdown(rt) &&
+		    rt->fib6_nh->nh_flags & RTNH_F_LINKDOWN &&
+		    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
+			goto out;
+	}
 
 	if (fib6_check_expired(rt))
 		goto out;
@@ -3064,6 +3071,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 {
 	struct net *net = cfg->fc_nlinfo.nl_net;
 	struct fib6_info *rt = NULL;
+	struct nexthop *nh = NULL;
 	struct fib6_table *table;
 	int err = -EINVAL;
 
@@ -3099,6 +3107,15 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 	}
 #endif
+	if (cfg->fc_nh_id) {
+		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+		if (!nh) {
+			NL_SET_ERR_MSG(extack,
+				       "Invalid nexthop id - nexthop does not exist");
+			goto out;
+		}
+	}
+
 	if (cfg->fc_metric == 0)
 		cfg->fc_metric = IP6_RT_PRIO_USER;
 
@@ -3118,7 +3135,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 
 	err = -ENOMEM;
-	rt = fib6_info_alloc(gfp_flags, true);
+	rt = fib6_info_alloc(gfp_flags, !nh);
 	if (!rt)
 		goto out;
 
@@ -3152,9 +3169,16 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
 	rt->fib6_src.plen = cfg->fc_src_len;
 #endif
-	err = fib6_nh_init(net, rt->fib6_nh, cfg, extack);
-	if (err)
-		goto out;
+	if (nh) {
+		nexthop_get(nh);
+		rt->nh = nh;
+		if (nexthop_has_gw(nh))
+			cfg->fc_flags |= RTF_GATEWAY;
+	} else {
+		err = fib6_nh_init(net, rt->fib6_nh, cfg, extack);
+		if (err)
+			goto out;
+	}
 
 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
 		struct net_device *dev = fib6_info_nh_dev(rt);
@@ -3327,6 +3351,9 @@ static int ip6_route_del(struct fib6_config *cfg,
 				}
 				continue;
 			}
+			if (rt->nh && rt->nh->id == cfg->fc_nh_id)
+				goto del_rt;
+
 			if (cfg->fc_ifindex &&
 			    (!rt->fib6_nh->nh_dev ||
 			     rt->fib6_nh->nh_dev->ifindex != cfg->fc_ifindex))
@@ -3340,6 +3367,7 @@ static int ip6_route_del(struct fib6_config *cfg,
 				continue;
 			if (!fib6_info_hold_safe(rt))
 				continue;
+del_rt:
 			rcu_read_unlock();
 
 			/* if gateway was specified only delete the one hop */
@@ -3482,6 +3510,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 }
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
+/* RA routes do not use nexthop objects */
 static struct fib6_info *rt6_get_route_info(struct net *net,
 					   const struct in6_addr *prefix, int prefixlen,
 					   const struct in6_addr *gwaddr,
@@ -3551,6 +3580,7 @@ static struct fib6_info *rt6_add_route_info(struct net *net,
 }
 #endif
 
+/* RA routes do not use nexthop objects */
 struct fib6_info *rt6_get_dflt_router(struct net *net,
 				     const struct in6_addr *addr,
 				     struct net_device *dev)
@@ -3892,6 +3922,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 	return NULL;
 }
 
+/* not called for rt->nh set */
 static bool rt6_is_dead(const struct fib6_info *rt)
 {
 	if (rt->fib6_nh->nh_flags & RTNH_F_DEAD ||
@@ -3970,7 +4001,7 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
 	const struct arg_netdev_event *arg = p_arg;
 	struct net *net = dev_net(arg->dev);
 
-	if (rt != net->ipv6.fib6_null_entry &&
+	if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
 	    rt->fib6_nh->nh_dev == arg->dev) {
 		rt->fib6_nh->nh_flags &= ~arg->nh_flags;
 		fib6_update_sernum_upto_root(net, rt);
@@ -4179,6 +4210,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
 	[RTA_SPORT]		= { .type = NLA_U16 },
 	[RTA_DPORT]		= { .type = NLA_U16 },
+	[RTA_NH_ID]		= { .type = NLA_U32 },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4224,6 +4256,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 	cfg->fc_nlinfo.nlh = nlh;
 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
 
+	if (tb[RTA_NH_ID])
+		cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
+
 	if (tb[RTA_GATEWAY]) {
 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
 		cfg->fc_flags |= RTF_GATEWAY;
@@ -4421,6 +4456,13 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
 			if (nla)
 				r_cfg.fc_encap_type = nla_get_u16(nla);
+			nla = nla_find(attrs, attrlen, RTA_NH_ID);
+			if (nla) {
+				err = -EINVAL;
+				NL_SET_ERR_MSG(extack,
+					       "Multipath API can not use nexthop objects.");
+				goto cleanup;
+			}
 		}
 
 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
@@ -4596,6 +4638,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 static size_t rt6_nlmsg_size(struct fib6_info *rt)
 {
 	int nexthop_len = 0;
+	size_t nh_len;
 
 	if (rt->fib6_nsiblings) {
 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
@@ -4606,23 +4649,29 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
 		nexthop_len *= rt->fib6_nsiblings;
 	}
 
+	if (rt->nh) {
+		nh_len = nla_total_size(4); /* RTA_NH_ID */
+	} else {
+		nh_len = lwtunnel_get_encap_size(rt->fib6_nh->nh_lwtstate)
+			 + nla_total_size(16) /* RTA_GATEWAY */
+			 + nla_total_size(4); /* RTA_OIF */
+	}
+
 	return NLMSG_ALIGN(sizeof(struct rtmsg))
 	       + nla_total_size(16) /* RTA_SRC */
 	       + nla_total_size(16) /* RTA_DST */
-	       + nla_total_size(16) /* RTA_GATEWAY */
 	       + nla_total_size(16) /* RTA_PREFSRC */
 	       + nla_total_size(4) /* RTA_TABLE */
 	       + nla_total_size(4) /* RTA_IIF */
-	       + nla_total_size(4) /* RTA_OIF */
 	       + nla_total_size(4) /* RTA_PRIORITY */
 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
 	       + nla_total_size(sizeof(struct rta_cacheinfo))
 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
 	       + nla_total_size(1) /* RTA_PREF */
-	       + lwtunnel_get_encap_size(rt->fib6_nh->nh_lwtstate)
-	       + nexthop_len;
+	       + nexthop_len + nh_len;
 }
 
+/* not called for rt->nh set */
 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
 			    unsigned int *flags, bool skip_oif)
 {
@@ -4777,10 +4826,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
 		goto nla_put_failure;
 
+	if (rt->nh) {
+		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
+			goto nla_put_failure;
+
 	/* For multipath routes, walk the siblings list and add
 	 * each as a nexthop within RTA_MULTIPATH.
 	 */
-	if (rt->fib6_nsiblings) {
+	} else if (rt->fib6_nsiblings) {
 		struct fib6_info *sibling, *next_sibling;
 		struct nlattr *mp;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 08/18] net/ipv4: Move device validation to helper
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Move the device matching check in __fib_validate_source to a helper.
Code move only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv4/fib_frontend.c | 44 +++++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2f9bf1ec2678..ec6ae186d4b0 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -315,6 +315,32 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 }
 
+static bool fib_info_nh_uses_dev(struct fib_info *fi,
+				 const struct net_device *dev)
+{
+	bool dev_match = false;
+	int ret;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	for (ret = 0; ret < fi->fib_nhs; ret++) {
+		struct fib_nh *nh = &fi->fib_nh[ret];
+
+		if (nh->nh_dev == dev) {
+			dev_match = true;
+			break;
+		} else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
+			dev_match = true;
+			break;
+		}
+	}
+#else
+	if (FIB_RES_DEV(res) == dev)
+		dev_match = true;
+#endif
+
+	return dev_match;
+}
+
 /* Given (packet source, input interface) and optional (dst, oif, tos):
  * - (main) check, that source is valid i.e. not broadcast or our local
  *   address.
@@ -361,24 +387,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	    (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
 		goto e_inval;
 	fib_combine_itag(itag, &res);
-	dev_match = false;
-
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
-		struct fib_nh *nh = &res.fi->fib_nh[ret];
 
-		if (nh->nh_dev == dev) {
-			dev_match = true;
-			break;
-		} else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
-			dev_match = true;
-			break;
-		}
-	}
-#else
-	if (FIB_RES_DEV(res) == dev)
-		dev_match = true;
-#endif
+	dev_match = fib_info_nh_uses_dev(res.fi, dev);
 	if (dev_match) {
 		ret = FIB_RES_NH(res)->nh_scope >= RT_SCOPE_HOST;
 		return ret;
-- 
2.11.0

^ permalink raw reply related

* [PATCH iproute2-next] ip: Add support for nexthop objects
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/uapi/linux/nexthop.h   |  56 ++++
 include/uapi/linux/rtnetlink.h |   8 +
 ip/Makefile                    |   3 +-
 ip/ip.c                        |   3 +-
 ip/ip_common.h                 |   7 +-
 ip/ipmonitor.c                 |   6 +
 ip/ipnexthop.c                 | 652 +++++++++++++++++++++++++++++++++++++++++
 ip/iproute.c                   |  19 +-
 8 files changed, 747 insertions(+), 7 deletions(-)
 create mode 100644 include/uapi/linux/nexthop.h
 create mode 100644 ip/ipnexthop.c

diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
new file mode 100644
index 000000000000..335182e8229a
--- /dev/null
+++ b/include/uapi/linux/nexthop.h
@@ -0,0 +1,56 @@
+#ifndef __LINUX_NEXTHOP_H
+#define __LINUX_NEXTHOP_H
+
+#include <linux/types.h>
+
+struct nhmsg {
+	unsigned char	nh_family;
+	unsigned char	nh_scope;     /* one of RT_SCOPE */
+	unsigned char	nh_protocol;  /* Routing protocol that installed nh */
+	unsigned char	resvd;
+	unsigned int	nh_flags;     /* RTNH_F flags */
+};
+
+struct nexthop_grp {
+	__u32	id;
+	__u32	weight;
+};
+
+enum {
+	NEXTHOP_GRP_TYPE_MPATH,  /* default type if not specified */
+	__NEXTHOP_GRP_TYPE_MAX,
+};
+
+#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
+
+
+/* NHA_ID	32-bit id for nexthop. id must be greater than 0.
+ *		id == 0 means assign an unused id.
+ */
+enum {
+	NHA_UNSPEC,
+	NHA_ID,		/* u32 */
+	NHA_GROUP,	/* array of nexthop_grp */
+	NHA_GROUP_TYPE,	/* u16 one of NEXTHOP_GRP_TYPE;
+			 * default is NEXTHOP_GRP_TYPE_MPATH */
+
+	/* if NHA_GROUP attribute is added, no other attributes can be set */
+
+	NHA_BLACKHOLE,	/* flag; nexthop used to blackhole packets */
+	NHA_OIF,	/* u32 */
+	NHA_FLOW,	/* u32 */
+
+	NHA_TABLE_ID,	/* u32 - table id to validate gateway */
+	NHA_GATEWAY,	/* be32 (IPv4) or in6_addr (IPv6) gw address */
+
+	/* Dump control attributes */
+	NHA_GROUPS,	/* flag; only return nexthop groups in dump */
+	NHA_MASTER,	/* u32; only return nexthops with given master dev */
+
+	NHA_SADDR,	/* return only: IPv4 or IPv6 source address */
+
+	__NHA_MAX,
+};
+
+#define NHA_MAX	(__NHA_MAX - 1)
+#endif
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 8c1d600bfa33..158114245b6c 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -157,6 +157,13 @@ enum {
 	RTM_GETCHAIN,
 #define RTM_GETCHAIN RTM_GETCHAIN
 
+	RTM_NEWNEXTHOP = 104,
+#define RTM_NEWNEXTHOP RTM_NEWNEXTHOP
+	RTM_DELNEXTHOP,
+#define RTM_DELNEXTHOP RTM_DELNEXTHOP
+	RTM_GETNEXTHOP,
+#define RTM_GETNEXTHOP RTM_GETNEXTHOP
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
@@ -342,6 +349,7 @@ enum rtattr_type_t {
 	RTA_IP_PROTO,
 	RTA_SPORT,
 	RTA_DPORT,
+	RTA_NH_ID,
 	__RTA_MAX
 };
 
diff --git a/ip/Makefile b/ip/Makefile
index a88f93665ee6..7df818dbe23a 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -10,7 +10,8 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \
     iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \
     iplink_geneve.o iplink_vrf.o iproute_lwtunnel.o ipmacsec.o ipila.o \
-    ipvrf.o iplink_xstats.o ipseg6.o iplink_netdevsim.o iplink_rmnet.o
+    ipvrf.o iplink_xstats.o ipseg6.o iplink_netdevsim.o iplink_rmnet.o \
+    ipnexthop.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/ip.c b/ip/ip.c
index 58c643df8a36..963ef140c7c4 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -51,7 +51,7 @@ static void usage(void)
 "where  OBJECT := { link | address | addrlabel | route | rule | neigh | ntable |\n"
 "                   tunnel | tuntap | maddress | mroute | mrule | monitor | xfrm |\n"
 "                   netns | l2tp | fou | macsec | tcp_metrics | token | netconf | ila |\n"
-"                   vrf | sr }\n"
+"                   vrf | sr | nexthop }\n"
 "       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n"
 "                    -h[uman-readable] | -iec | -j[son] | -p[retty] |\n"
 "                    -f[amily] { inet | inet6 | ipx | dnet | mpls | bridge | link } |\n"
@@ -101,6 +101,7 @@ static const struct cmd {
 	{ "netconf",	do_ipnetconf },
 	{ "vrf",	do_ipvrf},
 	{ "sr",		do_seg6 },
+	{ "nexthop",	do_ipnh },
 	{ "help",	do_help },
 	{ 0 }
 };
diff --git a/ip/ip_common.h b/ip/ip_common.h
index 200be5e23dd1..2971c1586c4e 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -56,6 +56,8 @@ int print_rule(const struct sockaddr_nl *who,
 int print_netconf(const struct sockaddr_nl *who,
 		  struct rtnl_ctrl_data *ctrl,
 		  struct nlmsghdr *n, void *arg);
+int print_nexthop(const struct sockaddr_nl *who,
+		  struct nlmsghdr *n, void *arg);
 void netns_map_init(void);
 void netns_nsid_socket_init(void);
 int print_nsid(const struct sockaddr_nl *who,
@@ -90,6 +92,7 @@ int do_ipvrf(int argc, char **argv);
 void vrf_reset(void);
 int netns_identify_pid(const char *pidstr, char *name, int len);
 int do_seg6(int argc, char **argv);
+int do_ipnh(int argc, char **argv);
 
 int iplink_get(unsigned int flags, char *name, __u32 filt_mask);
 int iplink_ifla_xstats(int argc, char **argv);
@@ -165,5 +168,7 @@ int name_is_vrf(const char *name);
 #endif
 
 void print_num(FILE *fp, unsigned int width, uint64_t count);
-
+void print_rta_flow(FILE *fp, const struct rtattr *rta);
+void print_rt_flags(FILE *fp, unsigned int flags);
+void print_rta_if(FILE *fp, const struct rtattr *rta, const char *prefix);
 #endif /* _IP_COMMON_H_ */
diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c
index a93b62cd6624..de129626683b 100644
--- a/ip/ipmonitor.c
+++ b/ip/ipmonitor.c
@@ -84,6 +84,12 @@ static int accept_msg(const struct sockaddr_nl *who,
 		}
 	}
 
+	case RTM_NEWNEXTHOP:
+	case RTM_DELNEXTHOP:
+		print_headers(fp, "[NEXTHOP]", ctrl);
+		print_nexthop(who, n, arg);
+		return 0;
+
 	case RTM_NEWLINK:
 	case RTM_DELLINK:
 		ll_remember_index(who, n, NULL);
diff --git a/ip/ipnexthop.c b/ip/ipnexthop.c
new file mode 100644
index 000000000000..9fa4b7292426
--- /dev/null
+++ b/ip/ipnexthop.c
@@ -0,0 +1,652 @@
+/*
+ * ip nexthop
+ *
+ * Copyright (C) 2017 Cumulus Networks
+ * Copyright (c) 2017 David Ahern <dsa@cumulusnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <errno.h>
+#include <linux/nexthop.h>
+#include <libmnl/libmnl.h>
+#include <rt_names.h>
+
+#include "utils.h"
+#include "ip_common.h"
+
+static struct
+{
+	unsigned int flushed;
+	unsigned int groups;
+	char *flushb;
+	int flushp;
+	int flushe;
+} filter;
+
+enum {
+	IPNH_LIST,
+	IPNH_FLUSH,
+};
+
+#define RTM_NHA(h)  ((struct rtattr *)(((char *)(h)) + \
+			NLMSG_ALIGN(sizeof(struct nhmsg))))
+
+static void usage(void) __attribute__((noreturn));
+
+static void usage(void)
+{
+	fprintf(stderr, "Usage: ip nexthop { list | flush } SELECTOR\n");
+	fprintf(stderr, "       ip nexthop get [ id ID ]\n");
+	fprintf(stderr, "       ip nexthop { add | del | change | replace } NH\n");
+	fprintf(stderr, "SELECTOR := [ id ID ] [ dev DEV ] [ table TABLE ] [ vrf NAME ]\n");
+	fprintf(stderr, "NH := [ encap ENCAPTYPE ENCAPHDR ] [ via [ FAMILY ] ADDRESS ]\n");
+	fprintf(stderr, "      [ id ID ] [ dev STRING ] [ weight NUMBER ]\n");
+	fprintf(stderr, "      [ table TABLE ] [ vrf VRF ] NHFLAGS\n");
+	fprintf(stderr, "NHFLAGS := [ onlink | pervasive ]\n");
+	fprintf(stderr, "ENCAPTYPE := [ mpls | ip | ip6 ]\n");
+	fprintf(stderr, "ENCAPHDR := [ MPLSLABEL ]\n");
+	exit(-1);
+}
+
+static int delete_nexthop(__u32 id)
+{
+	struct {
+		struct nlmsghdr	n;
+		struct nhmsg	nhm;
+		char		buf[64];
+	} req = {
+		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct nhmsg)),
+		.n.nlmsg_flags = NLM_F_REQUEST,
+		.n.nlmsg_type = RTM_DELNEXTHOP,
+		.nhm.nh_family = AF_UNSPEC,
+	};
+
+	req.n.nlmsg_seq = ++rth.seq;
+
+	addattr32(&req.n, sizeof(req), NHA_ID, id);
+
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
+		return -1;
+
+	filter.flushed++;
+	return 0;
+}
+
+struct nh_entry {
+	__u32 id;
+	unsigned int group;
+	struct nh_entry *next;
+};
+
+struct nh_entry *first, *last;
+
+static int flush_nexthop(const struct sockaddr_nl *who,
+			 struct nlmsghdr *nlh, void *arg)
+{
+	struct nhmsg *nhm = NLMSG_DATA(nlh);
+	struct rtattr *tb[NHA_MAX+1];
+	struct nh_entry *nh;
+	__u32 id = 0;
+	int len;
+
+	len = nlh->nlmsg_len - NLMSG_SPACE(sizeof(*nhm));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	parse_rtattr(tb, NHA_MAX, RTM_NHA(nhm), len);
+	if (tb[NHA_ID])
+		id = rta_getattr_u32(tb[NHA_ID]);
+
+	if (!id)
+		return 0;
+
+	nh = malloc(sizeof(*nh));
+	if (!nh)
+		return -1;
+
+	nh->id = id;
+	nh->group = tb[NHA_GROUP] != NULL;
+	nh->next = NULL;
+	if (!first)
+		first = nh;
+	else
+		last->next = nh;
+
+	last = nh;
+	return 0;
+}
+
+static int ipnh_flush(void *req, __u32 len, unsigned int all)
+{
+	struct nh_entry *nh;
+
+	if (send(rth.fd, req, len, 0) < 0) {
+		perror("Cannot send dump request");
+		return -2;
+	}
+
+	if (rtnl_dump_filter(&rth, flush_nexthop, stdout) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		return -2;
+	}
+
+	/* if deleting all, then remove groups first */
+	if (all) {
+		nh = first;
+		while (nh) {
+			if (nh->group)
+				delete_nexthop(nh->id);
+			nh = nh->next;
+		}
+	}
+
+	nh = first;
+	while (nh) {
+		if (!all || !nh->group)
+			delete_nexthop(nh->id);
+		nh = nh->next;
+	}
+
+	if (!filter.flushed)
+		printf("Nothing to flush\n");
+	else
+		printf("Flushed %d nexthops\n", filter.flushed);
+
+	return 0;
+}
+
+static char *nh_group_type_to_str(__u16 group_type, char *buf, size_t len)
+{
+	static const char *typestr[NEXTHOP_GRP_TYPE_MAX + 1] = {
+		"multipath",   /* NEXTHOP_GRP_TYPE_MPATH */
+	};
+
+	if (group_type < ARRAY_SIZE(typestr))
+		snprintf(buf, len-1, "%s", typestr[group_type]);
+	else
+		snprintf(buf, len-1, "<%u>", group_type);
+
+	buf[len-1] = '\0';
+
+	return buf;
+}
+
+static void print_nh_group(FILE *fp, const struct rtattr *grps_attr,
+			   const struct rtattr *gtype)
+{
+	struct nexthop_grp *nhg = RTA_DATA(grps_attr);
+	int num = RTA_PAYLOAD(grps_attr) / sizeof(*nhg);
+	__u16 group_type = NEXTHOP_GRP_TYPE_MPATH;
+	int i;
+
+	SPRINT_BUF(b1);
+
+	if (!num || num * sizeof(*nhg) != RTA_PAYLOAD(grps_attr)) {
+		fprintf(fp, "<invalid nexthop group>");
+		return;
+	}
+
+	if (gtype)
+		group_type = rta_getattr_u16(gtype);
+
+	if (is_json_context()) {
+		open_json_array(PRINT_JSON, "group");
+		for (i = 0; i < num; ++i) {
+			open_json_object(NULL);
+			print_uint(PRINT_ANY, "id", "id %u ", nhg[i].id);
+			print_uint(PRINT_ANY, "weight", "weight %u ", nhg[i].weight);
+			close_json_object();
+		}
+		close_json_array(PRINT_JSON, NULL);
+		print_string(PRINT_ANY, "type", "type %s ",
+			     nh_group_type_to_str(group_type, b1, sizeof(b1)));
+	} else {
+		fprintf(fp, "group ");
+		for (i = 0; i < num; ++i) {
+			if (i)
+				fprintf(fp, "/");
+			fprintf(fp, "%u", nhg[i].id);
+			if (num > 1 && nhg[i].weight > 1)
+				fprintf(fp, ",%u", nhg[i].weight);
+		}
+	}
+}
+
+static void print_nh_gateway(FILE *fp, const struct nhmsg *nhm,
+			      const struct rtattr *rta)
+{
+	const char *gateway = format_host_rta(nhm->nh_family, rta);
+
+	if (is_json_context())
+		print_string(PRINT_JSON, "gateway", NULL, gateway);
+	else {
+		fprintf(fp, "via ");
+		print_color_string(PRINT_FP, ifa_family_color(nhm->nh_family),
+				  NULL, "%s ", gateway);
+	}
+}
+
+int print_nexthop(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	struct nhmsg *nhm = NLMSG_DATA(n);
+	struct rtattr *tb[NHA_MAX+1];
+	FILE *fp = (FILE *)arg;
+	int len;
+
+	SPRINT_BUF(b1);
+
+	if (n->nlmsg_type != RTM_DELNEXTHOP &&
+	    n->nlmsg_type != RTM_NEWNEXTHOP) {
+		fprintf(stderr, "Not a nexthop: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+		return -1;
+	}
+
+	len = n->nlmsg_len - NLMSG_SPACE(sizeof(*nhm));
+	if (len < 0) {
+		close_json_object();
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	parse_rtattr(tb, NHA_MAX, RTM_NHA(nhm), len);
+
+	open_json_object(NULL);
+
+	if (n->nlmsg_type == RTM_DELROUTE)
+		print_bool(PRINT_ANY, "deleted", "Deleted ", true);
+
+	if (tb[NHA_ID])
+		print_uint(PRINT_ANY, "id", "id %u ",
+			   rta_getattr_u32(tb[NHA_ID]));
+
+	if (tb[NHA_GROUP])
+		print_nh_group(fp, tb[NHA_GROUP], tb[NHA_GROUP_TYPE]);
+
+	if (tb[NHA_GATEWAY])
+		print_nh_gateway(fp, nhm, tb[NHA_GATEWAY]);
+
+	if (tb[NHA_SADDR]) {
+		const char *psrc;
+
+		psrc = rt_addr_n2a_rta(nhm->nh_family, tb[NHA_SADDR]);
+		if (is_json_context())
+			print_string(PRINT_JSON, "src", NULL, psrc);
+		else {
+			fprintf(fp, "src ");
+			print_color_string(PRINT_FP,
+					   ifa_family_color(nhm->nh_family),
+					   NULL, "%s ", psrc);
+		}
+	}
+
+	if (tb[NHA_OIF])
+		print_rta_if(fp, tb[NHA_OIF], "dev");
+
+	if (tb[NHA_BLACKHOLE])
+		print_null(PRINT_ANY, "blackhole", "blackhole", NULL);
+
+	if (nhm->nh_protocol != RTPROT_UNSPEC || show_details > 0) {
+		print_string(PRINT_ANY, "protocol", "proto %s ",
+			     rtnl_rtprot_n2a(nhm->nh_protocol, b1, sizeof(b1)));
+	}
+
+	if (nhm->nh_scope != RT_SCOPE_UNIVERSE || show_details > 0) {
+		print_string(PRINT_ANY, "scope", "scope %s ",
+			     rtnl_rtscope_n2a(nhm->nh_scope, b1, sizeof(b1)));
+	}
+
+	if (tb[NHA_OIF])
+		print_rt_flags(fp, nhm->nh_flags);
+
+	if (tb[NHA_FLOW])
+		print_rta_flow(fp, tb[NHA_FLOW]);
+
+	print_string(PRINT_FP, NULL, "%s", "\n");
+	close_json_object();
+	fflush(fp);
+
+	return 0;
+}
+
+static int add_nh_group_attr(struct nlmsghdr *n, int maxlen, char *argv)
+{
+	struct nexthop_grp *grps;
+	int count = 0, i;
+	char *sep, *wsep;
+
+	if (*argv != '\0')
+		count = 1;
+
+	/* separator is '/' */
+	sep = strchr(argv, '/');
+	while (sep) {
+		count++;
+		sep = strchr(sep + 1, '/');
+	}
+
+	if (count == 0)
+		return -1;
+
+	grps = calloc(count, sizeof(*grps));
+	if (!grps)
+		return -1;
+
+	for (i = 0; i < count; ++i) {
+		sep = strchr(argv, '/');
+		if (sep)
+			*sep = '\0';
+
+		wsep = strchr(argv, ',');
+		if (wsep)
+			*wsep = '\0';
+
+		if (get_unsigned(&grps[i].id, argv, 0))
+			return -1;
+		if (wsep) {
+			wsep++;
+			if (get_unsigned(&grps[i].weight, wsep, 0))
+				return -1;
+		}
+
+		if (!sep)
+			break;
+
+		argv = sep + 1;
+	}
+
+	return addattr_l(n, maxlen, NHA_GROUP, grps, count * sizeof(*grps));
+}
+
+static int ipnh_modify(int cmd, unsigned int flags, int argc, char **argv)
+{
+	struct {
+		struct nlmsghdr	n;
+		struct nhmsg	nhm;
+		char		buf[1024];
+	} req = {
+		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct nhmsg)),
+		.n.nlmsg_flags = NLM_F_REQUEST | flags,
+		.n.nlmsg_type = cmd,
+		.nhm.nh_family = AF_UNSPEC,
+	};
+	__u32 nh_flags = 0;
+	__u16 gtype = NEXTHOP_GRP_TYPE_MAX + 1;
+
+	while (argc > 0) {
+		if (!strcmp(*argv, "id")) {
+			__u32 id;
+
+			NEXT_ARG();
+			if (get_unsigned(&id, *argv, 0))
+				invarg("invalid id value", *argv);
+			addattr32(&req.n, sizeof(req), NHA_ID, id);
+		} else if (!strcmp(*argv, "dev")) {
+			int ifindex;
+
+			NEXT_ARG();
+			ifindex = ll_name_to_index(*argv);
+			if (!ifindex)
+				invarg("Device does not exist\n", *argv);
+			addattr32(&req.n, sizeof(req), NHA_OIF, ifindex);
+		} else if (strcmp(*argv, "via") == 0) {
+			inet_prefix addr;
+			int family;
+
+			NEXT_ARG();
+			family = read_family(*argv);
+			if (family == AF_UNSPEC)
+				family = req.nhm.nh_family;
+			else
+				NEXT_ARG();
+			get_addr(&addr, *argv, family);
+			if (req.nhm.nh_family == AF_UNSPEC)
+				req.nhm.nh_family = addr.family;
+			else if (req.nhm.nh_family != addr.family)
+				invarg("address family mismatch\n", *argv);
+			addattr_l(&req.n, sizeof(req), NHA_GATEWAY,
+				  &addr.data, addr.bytelen);
+		} else if (!strcmp(*argv, "blackhole")) {
+			addattr_l(&req.n, sizeof(req), NHA_BLACKHOLE, NULL, 0);
+		} else if (!strcmp(*argv, "onlink")) {
+			nh_flags |= RTNH_F_ONLINK;
+		} else if (!strcmp(*argv, "realms")) {
+			__u32 realm;
+
+			NEXT_ARG();
+			if (get_rt_realms_or_raw(&realm, *argv))
+				invarg("\"realm\" value is invalid\n", *argv);
+			addattr32(&req.n, sizeof(req), NHA_FLOW, realm);
+		} else if (!strcmp(*argv, "group")) {
+			NEXT_ARG();
+
+			if (add_nh_group_attr(&req.n, sizeof(req), *argv))
+				invarg("\"group\" value is invalid\n", *argv);
+		} else if (!strcmp(*argv, "multipath") ||
+			   !strcmp(*argv, "mpath")) {
+			gtype = NEXTHOP_GRP_TYPE_MPATH;
+		} else if (!strcmp(*argv, "table")) {
+			__u32 tb_id;
+
+			NEXT_ARG();
+			if (get_unsigned(&tb_id, *argv, 0))
+				invarg("invalid table value", *argv);
+			addattr32(&req.n, sizeof(req), NHA_TABLE_ID, tb_id);
+		} else if (strcmp(*argv, "vrf") == 0) {
+			__u32 tb_id;
+
+			NEXT_ARG();
+			tb_id = ipvrf_get_table(*argv);
+			if (tb_id == 0)
+				invarg("Invalid VRF", *argv);
+			addattr32(&req.n, sizeof(req), NHA_TABLE_ID, tb_id);
+		} else if (strcmp(*argv, "help") == 0) {
+			usage();
+		} else {
+			invarg("", *argv);
+		}
+		argc--; argv++;
+	}
+
+	if (gtype <= NEXTHOP_GRP_TYPE_MAX)
+		addattr16(&req.n, sizeof(req), NHA_GROUP_TYPE, gtype);
+
+	req.nhm.nh_flags = nh_flags;
+
+	if (rtnl_talk(&rth, &req.n, NULL) < 0)
+		return -2;
+
+	return 0;
+}
+
+static int ipnh_get_id(__u32 id)
+{
+	struct {
+		struct nlmsghdr	n;
+		struct nhmsg	nhm;
+		char		buf[1024];
+	} req = {
+		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct nhmsg)),
+		.n.nlmsg_flags = NLM_F_REQUEST,
+		.n.nlmsg_type  = RTM_GETNEXTHOP,
+		.nhm.nh_family = preferred_family,
+	};
+	struct nlmsghdr *answer;
+
+	addattr32(&req.n, sizeof(req), NHA_ID, id);
+
+	if (rtnl_talk(&rth, &req.n, &answer) < 0)
+		return -2;
+
+	new_json_obj(json);
+
+	if (print_nexthop(NULL, answer, (void *)stdout) < 0) {
+		free(answer);
+		return -1;
+	}
+
+	delete_json_obj();
+	fflush(stdout);
+
+	free(answer);
+
+	return 0;
+}
+
+static int ipnh_list_flush(int argc, char **argv, int action)
+{
+	struct {
+		struct nlmsghdr	n;
+		struct nhmsg	nhm;
+		char		buf[256];
+	} req = {
+		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct nhmsg)),
+		.n.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		.n.nlmsg_type  = RTM_GETNEXTHOP,
+		.n.nlmsg_seq = ++rth.seq,
+		.nhm.nh_family = preferred_family,
+	};
+	unsigned int master = 0;
+	unsigned int all = (argc == 0);
+
+	rth.dump = req.n.nlmsg_seq;
+
+	while (argc > 0) {
+		if (!matches(*argv, "dev")) {
+			unsigned int ifindex;
+
+			NEXT_ARG();
+			ifindex = ll_name_to_index(*argv);
+			if (!ifindex)
+				invarg("Device does not exist\n", *argv);
+			addattr32(&req.n, sizeof(req), NHA_OIF, ifindex);
+		} else if (!matches(*argv, "groups")) {
+			addattr_l(&req.n, sizeof(req), NHA_GROUPS, NULL, 0);
+		} else if (!matches(*argv, "master")) {
+			NEXT_ARG();
+			master = ll_name_to_index(*argv);
+			if (!master)
+				invarg("Device does not exist\n", *argv);
+		} else if (matches(*argv, "vrf") == 0) {
+			NEXT_ARG();
+			master = ll_name_to_index(*argv);
+			if (!master)
+				invarg("VRF does not exist\n", *argv);
+			if (!name_is_vrf(*argv))
+				invarg("Invalid VRF\n", *argv);
+		} else if (!strcmp(*argv, "id")) {
+			__u32 id;
+
+			NEXT_ARG();
+			if (get_unsigned(&id, *argv, 0))
+				invarg("invalid id value", *argv);
+			return ipnh_get_id(id);
+		} else if (matches(*argv, "help") == 0) {
+			usage();
+		} else {
+			invarg("", *argv);
+		}
+		argc--; argv++;
+	}
+
+	if (master)
+		addattr32(&req.n, sizeof(req), NHA_MASTER, master);
+
+	if (action == IPNH_FLUSH)
+		return ipnh_flush(&req, req.n.nlmsg_len, all);
+
+	if (send(rth.fd, &req, req.n.nlmsg_len, 0) < 0) {
+		perror("Cannot send dump request");
+		return -2;
+	}
+
+	new_json_obj(json);
+
+	if (rtnl_dump_filter(&rth, print_nexthop, stdout) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		return -2;
+	}
+
+	delete_json_obj();
+	fflush(stdout);
+
+	return 0;
+}
+
+static int ipnh_get(int argc, char **argv)
+{
+	__u32 id = 0;
+
+	while (argc > 0) {
+		if (!strcmp(*argv, "id")) {
+			NEXT_ARG();
+			if (get_unsigned(&id, *argv, 0))
+				invarg("invalid id value", *argv);
+		}
+		if (matches(*argv, "help") == 0)
+			usage();
+
+		argc--; argv++;
+	}
+
+	if (!id) {
+		usage();
+		return -1;
+	}
+
+	return ipnh_get_id(id);
+}
+
+int do_ipnh(int argc, char **argv)
+{
+	if (argc < 1)
+		return ipnh_list_flush(0, NULL, IPNH_LIST);
+
+	if (!matches(*argv, "add"))
+		return ipnh_modify(RTM_NEWNEXTHOP, NLM_F_CREATE|NLM_F_EXCL,
+				   argc-1, argv+1);
+	if (!matches(*argv, "change") || !strcmp(*argv, "chg"))
+		return ipnh_modify(RTM_NEWNEXTHOP, NLM_F_REPLACE,
+				   argc-1, argv+1);
+	if (!matches(*argv, "replace"))
+		return ipnh_modify(RTM_NEWNEXTHOP, NLM_F_CREATE|NLM_F_REPLACE,
+				   argc-1, argv+1);
+	if (!matches(*argv, "delete"))
+		return ipnh_modify(RTM_DELNEXTHOP, 0, argc-1, argv+1);
+
+	if (!matches(*argv, "list") ||
+	    !matches(*argv, "show") ||
+	    !matches(*argv, "lst"))
+		return ipnh_list_flush(argc-1, argv+1, IPNH_LIST);
+
+	if (!matches(*argv, "get"))
+		return ipnh_get(argc-1, argv+1);
+
+	if (!matches(*argv, "flush"))
+		return ipnh_list_flush(argc-1, argv+1, IPNH_FLUSH);
+
+	if (!matches(*argv, "help"))
+		usage();
+
+	fprintf(stderr,
+		"Command \"%s\" is unknown, try \"ip nexthop help\".\n", *argv);
+	exit(-1);
+}
diff --git a/ip/iproute.c b/ip/iproute.c
index 30833414a3f7..0af72c2eccca 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -349,7 +349,7 @@ static void print_rtax_features(FILE *fp, unsigned int features)
 			    "features", "0x%x ", of);
 }
 
-static void print_rt_flags(FILE *fp, unsigned int flags)
+void print_rt_flags(FILE *fp, unsigned int flags)
 {
 	open_json_array(PRINT_JSON,
 			is_json_context() ?  "flags" : "");
@@ -394,8 +394,8 @@ static void print_rt_pref(FILE *fp, unsigned int pref)
 	}
 }
 
-static void print_rta_if(FILE *fp, const struct rtattr *rta,
-			const char *prefix)
+void print_rta_if(FILE *fp, const struct rtattr *rta,
+		  const char *prefix)
 {
 	const char *ifname = ll_index_to_name(rta_getattr_u32(rta));
 
@@ -492,7 +492,7 @@ static void print_rta_cacheinfo(FILE *fp, const struct rta_cacheinfo *ci)
 	}
 }
 
-static void print_rta_flow(FILE *fp, const struct rtattr *rta)
+void print_rta_flow(FILE *fp, const struct rtattr *rta)
 {
 	__u32 to = rta_getattr_u32(rta);
 	__u32 from = to >> 16;
@@ -823,6 +823,10 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 			     rtnl_dsfield_n2a(r->rtm_tos, b1, sizeof(b1)));
 	}
 
+	if (tb[RTA_NH_ID])
+		print_uint(PRINT_ANY, "nhid", "nhid %u ",
+			   rta_getattr_u32(tb[RTA_NH_ID]));
+
 	if (tb[RTA_GATEWAY] && filter.rvia.bitlen != host_len)
 		print_rta_gateway(fp, r, tb[RTA_GATEWAY]);
 
@@ -1351,6 +1355,13 @@ static int iproute_modify(int cmd, unsigned int flags, int argc, char **argv)
 		} else if (strcmp(*argv, "nexthop") == 0) {
 			nhs_ok = 1;
 			break;
+		} else if (!strcmp(*argv, "nhid")) {
+			__u32 id;
+
+			NEXT_ARG();
+			if (get_u32(&id, *argv, 0))
+				invarg("\"id\" value is invalid\n", *argv);
+			addattr32(&req.n, sizeof(req), RTA_NH_ID, id);
 		} else if (matches(*argv, "protocol") == 0) {
 			__u32 prot;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 13/18] net/ipv4: Convert existing use of fib_info to new helpers
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Remove direct accesses to fi->fib_nh in favor of the helpers added
in the previous patch.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c    |  4 +++-
 drivers/net/ethernet/rocker/rocker_ofdpa.c           | 20 ++++++++++++++------
 include/net/ip_fib.h                                 |  1 -
 net/ipv4/fib_frontend.c                              |  3 ++-
 net/ipv4/fib_rules.c                                 |  3 ++-
 net/ipv4/fib_semantics.c                             | 12 ++++++++----
 net/ipv4/fib_trie.c                                  | 19 +++++++++++--------
 7 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 2ab9cf25a08a..3fcac0b6fa92 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -28,6 +28,7 @@
 #include <net/ipv6.h>
 #include <net/fib_notifier.h>
 #include <net/switchdev.h>
+#include <net/nexthop.h>
 
 #include "spectrum.h"
 #include "core.h"
@@ -4121,12 +4122,13 @@ mlxsw_sp_fib4_entry_type_set(struct mlxsw_sp *mlxsw_sp,
 			     struct mlxsw_sp_fib_entry *fib_entry)
 {
 	union mlxsw_sp_l3addr dip = { .addr4 = htonl(fen_info->dst) };
-	struct net_device *dev = fen_info->fi->fib_dev;
 	struct mlxsw_sp_ipip_entry *ipip_entry;
 	struct fib_info *fi = fen_info->fi;
+	struct net_device *dev;
 
 	switch (fen_info->type) {
 	case RTN_LOCAL:
+		dev = fib_info_nh_dev(fi);
 		ipip_entry = mlxsw_sp_ipip_entry_find_by_decap(mlxsw_sp, dev,
 						 MLXSW_SP_L3_PROTO_IPV4, dip);
 		if (ipip_entry && ipip_entry->ol_dev->flags & IFF_UP) {
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 6473cc68c2d5..c05d35945ea7 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -23,6 +23,7 @@
 #include <net/switchdev.h>
 #include <net/ip_fib.h>
 #include <net/arp.h>
+#include <net/nexthop.h>
 
 #include "rocker.h"
 #include "rocker_tlv.h"
@@ -2286,8 +2287,8 @@ static int ofdpa_port_fib_ipv4(struct ofdpa_port *ofdpa_port,  __be32 dst,
 
 	/* XXX support ECMP */
 
-	nh = fi->fib_nh;
-	nh_on_port = (fi->fib_dev == ofdpa_port->dev);
+	nh = fib_info_nh(fi, 0);
+	nh_on_port = (nh->nh_dev == ofdpa_port->dev);
 	has_gw = !!nh->nh_gw;
 
 	if (has_gw && nh_on_port) {
@@ -2747,11 +2748,13 @@ static int ofdpa_fib4_add(struct rocker *rocker,
 {
 	struct ofdpa *ofdpa = rocker->wpriv;
 	struct ofdpa_port *ofdpa_port;
+	struct net_device *dev;
 	int err;
 
 	if (ofdpa->fib_aborted)
 		return 0;
-	ofdpa_port = ofdpa_port_dev_lower_find(fen_info->fi->fib_dev, rocker);
+	dev = fib_info_nh_dev(fen_info->fi);
+	ofdpa_port = ofdpa_port_dev_lower_find(dev, rocker);
 	if (!ofdpa_port)
 		return 0;
 	err = ofdpa_port_fib_ipv4(ofdpa_port, htonl(fen_info->dst),
@@ -2768,10 +2771,12 @@ static int ofdpa_fib4_del(struct rocker *rocker,
 {
 	struct ofdpa *ofdpa = rocker->wpriv;
 	struct ofdpa_port *ofdpa_port;
+	struct net_device *dev;
 
 	if (ofdpa->fib_aborted)
 		return 0;
-	ofdpa_port = ofdpa_port_dev_lower_find(fen_info->fi->fib_dev, rocker);
+	dev = fib_info_nh_dev(fen_info->fi);
+	ofdpa_port = ofdpa_port_dev_lower_find(dev, rocker);
 	if (!ofdpa_port)
 		return 0;
 	fen_info->fi->fib_nh->nh_flags &= ~RTNH_F_OFFLOAD;
@@ -2794,11 +2799,14 @@ static void ofdpa_fib4_abort(struct rocker *rocker)
 
 	spin_lock_irqsave(&ofdpa->flow_tbl_lock, flags);
 	hash_for_each_safe(ofdpa->flow_tbl, bkt, tmp, flow_entry, entry) {
+		struct net_device *dev;
+
 		if (flow_entry->key.tbl_id !=
 		    ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING)
 			continue;
-		ofdpa_port = ofdpa_port_dev_lower_find(flow_entry->fi->fib_dev,
-						       rocker);
+
+		dev = fib_info_nh_dev(flow_entry->fi);
+		ofdpa_port = ofdpa_port_dev_lower_find(dev, rocker);
 		if (!ofdpa_port)
 			continue;
 		flow_entry->fi->fib_nh->nh_flags &= ~RTNH_F_OFFLOAD;
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e39f55f3c3d8..c59e0f1ba59b 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -129,7 +129,6 @@ struct fib_info {
 	int			fib_nhs;
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
-#define fib_dev		fib_nh[0].nh_dev
 };
 
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index ec6ae186d4b0..c483453bf037 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/slab.h>
+#include <net/nexthop.h>
 
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -234,7 +235,7 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
 	if (table) {
 		ret = RTN_UNICAST;
 		if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
-			if (!dev || dev == res.fi->fib_dev)
+			if (!dev || dev == fib_info_nh_dev(res.fi))
 				ret = res.type;
 		}
 	}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index f8eb78d042a4..6808883af694 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -32,6 +32,7 @@
 #include <net/tcp.h>
 #include <net/ip_fib.h>
 #include <net/fib_rules.h>
+#include <net/nexthop.h>
 
 struct fib4_rule {
 	struct fib_rule		common;
@@ -146,7 +147,7 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
 	struct net_device *dev = NULL;
 
 	if (result->fi)
-		dev = result->fi->fib_dev;
+		dev = fib_info_nh_dev(result->fi);
 
 	/* do not accept result if the route does
 	 * not meet the required prefix length
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 53e38ecfdd58..0cd536ad1761 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -45,6 +45,7 @@
 #include <net/rtnh.h>
 #include <net/lwtunnel.h>
 #include <net/fib_notifier.h>
+#include <net/nexthop.h>
 
 #include "fib_lookup.h"
 
@@ -447,10 +448,11 @@ static int fib_detect_death(struct fib_info *fi, int order,
 			    struct fib_info **last_resort, int *last_idx,
 			    int dflt)
 {
+	struct fib_nh *fnh = fib_info_nh(fi, 0);
 	struct neighbour *n;
 	int state = NUD_NONE;
 
-	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+	n = neigh_lookup(&arp_tbl, &fnh->nh_gw, fnh->nh_dev);
 	if (n) {
 		state = n->nud_state;
 		neigh_release(n);
@@ -713,7 +715,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 	if (cfg->fc_oif || cfg->fc_gw) {
 		if (cfg->fc_encap) {
 			if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
-					    fi->fib_nh, cfg, extack))
+					    fib_info_nh(fi, 0), cfg, extack))
 				return 1;
 		}
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -1571,6 +1573,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 
 	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
+		struct fib_nh *fnh;
 
 		if (fa->fa_slen != slen)
 			continue;
@@ -1592,8 +1595,9 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 		if (next_fi->fib_scope != res->scope ||
 		    fa->fa_type != RTN_UNICAST)
 			continue;
-		if (!next_fi->fib_nh[0].nh_gw ||
-		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+
+		fnh = fib_info_nh(next_fi, 0);
+		if (!fnh->nh_gw || fnh->nh_scope != RT_SCOPE_LINK)
 			continue;
 
 		fib_alias_accessed(fa);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 51e7b38f3a7b..c6aab049a4ac 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -83,6 +83,7 @@
 #include <net/sock.h>
 #include <net/ip_fib.h>
 #include <net/fib_notifier.h>
+#include <net/nexthop.h>
 #include <trace/events/fib.h>
 #include "fib_lookup.h"
 
@@ -2621,13 +2622,13 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
 	rcu_read_unlock();
 }
 
-static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
 {
 	unsigned int flags = 0;
 
 	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
 		flags = RTF_REJECT;
-	if (fi && fi->fib_nh->nh_gw)
+	if (fi && fib_info_nh_gw(fi))
 		flags |= RTF_GATEWAY;
 	if (mask == htonl(0xFFFFFFFF))
 		flags |= RTF_HOST;
@@ -2659,7 +2660,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 	prefix = htonl(l->key);
 
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
-		const struct fib_info *fi = fa->fa_info;
+		struct fib_info *fi = fa->fa_info;
 		__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
 		unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
 
@@ -2672,26 +2673,28 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 
 		seq_setwidth(seq, 127);
 
-		if (fi)
+		if (fi) {
+			struct net_device *dev = fib_info_nh_dev(fi);
+
 			seq_printf(seq,
 				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
 				   "%d\t%08X\t%d\t%u\t%u",
-				   fi->fib_dev ? fi->fib_dev->name : "*",
+				   dev ? dev->name : "*",
 				   prefix,
-				   fi->fib_nh->nh_gw, flags, 0, 0,
+				   fib_info_nh_gw(fi), flags, 0, 0,
 				   fi->fib_priority,
 				   mask,
 				   (fi->fib_advmss ?
 				    fi->fib_advmss + 40 : 0),
 				   fi->fib_window,
 				   fi->fib_rtt >> 3);
-		else
+		} else {
 			seq_printf(seq,
 				   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
 				   "%d\t%08X\t%d\t%u\t%u",
 				   prefix, 0, flags, 0, 0, 0,
 				   mask, 0, 0, 0);
-
+		}
 		seq_pad(seq, '\n');
 	}
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 01/18] net: Rename net/nexthop.h net/rtnh.h
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

The header contains rtnh_ macros so rename the file accordingly.
Allows next patch to use the nexthop.h name.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/{nexthop.h => rtnh.h} | 4 ++--
 net/core/lwtunnel.c               | 2 +-
 net/decnet/dn_fib.c               | 2 +-
 net/ipv4/fib_semantics.c          | 2 +-
 net/ipv4/ipmr.c                   | 2 +-
 net/ipv6/route.c                  | 2 +-
 net/mpls/af_mpls.c                | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)
 rename include/net/{nexthop.h => rtnh.h} (94%)

diff --git a/include/net/nexthop.h b/include/net/rtnh.h
similarity index 94%
rename from include/net/nexthop.h
rename to include/net/rtnh.h
index 902ff382a6dc..aa2cfc508f7c 100644
--- a/include/net/nexthop.h
+++ b/include/net/rtnh.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __NET_NEXTHOP_H
-#define __NET_NEXTHOP_H
+#ifndef __NET_RTNH_H
+#define __NET_RTNH_H
 
 #include <linux/rtnetlink.h>
 #include <net/netlink.h>
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 0b171756453c..80c30cd5744a 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -26,7 +26,7 @@
 #include <net/lwtunnel.h>
 #include <net/rtnetlink.h>
 #include <net/ip6_fib.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
 
 #ifdef CONFIG_MODULES
 
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index f78fe58eafc8..3757a56bbcbd 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -42,7 +42,7 @@
 #include <net/dn_fib.h>
 #include <net/dn_neigh.h>
 #include <net/dn_dev.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
 
 #define RT_MIN_TABLE 1
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f3c89ccf14c5..93524a746ca8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -42,7 +42,7 @@
 #include <net/sock.h>
 #include <net/ip_fib.h>
 #include <net/netlink.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
 #include <net/lwtunnel.h>
 #include <net/fib_notifier.h>
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5660adcf7a04..564d4fd5a92b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -66,7 +66,7 @@
 #include <net/netlink.h>
 #include <net/fib_rules.h>
 #include <linux/netconf.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
 #include <net/switchdev.h>
 
 struct ipmr_rule {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c4ea13e8360b..07ed7812c6b4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -59,7 +59,7 @@
 #include <net/xfrm.h>
 #include <net/netevent.h>
 #include <net/netlink.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
 #include <net/lwtunnel.h>
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 7a4de6d618b1..d066e5e9b76c 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -23,7 +23,7 @@
 #include <net/ipv6.h>
 #endif
 #include <net/addrconf.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
 #include "internal.h"
 
 /* max memory we will use for mpls_route */
-- 
2.11.0

^ permalink raw reply related

* [PATCH RFC net-next 00/18] net: Improve route scalability via support for nexthop objects
From: dsahern @ 2018-09-01  0:49 UTC (permalink / raw)
  To: netdev; +Cc: roopa, sharpd, idosch, davem, David Ahern

From: David Ahern <dsahern@gmail.com>

As mentioned at netconf in Seoul, we would like to introduce nexthops as
independent objects from the routes to better align with both routing
daemons and hardware and to improve route insertion times into the kernel.

This series adds nexthop objects with their own lifecycle. The model
retains a lot of the established semantics from routes and re-uses some
of the data structures like fib_nh and fib6_nh to more easily align with
the existing code. One difference with nexthop objects is the behavior
better aligns with the target user - routing daemons and switch ASICs.
Specifically, with the exception of the blackhole nexthop, all nexthops
must reference a netdevice (or have a gateway that resolves to a device)
and the device must be admin up with carrier.

Prefixes are then installed pointing to the nexthop by id:
  { prefix } --> { nexthop }  --> { gateway, device }

The nexthop object contains the gateway and device reference.

Benchmarks
The following data shows the route insert time for 720,022 routes (a full
IPv4 internet feed from August 28th). "current" means the current code
where a route insert specifies the device and gateway inline with the
prefix; the "nexthop" columns mean use of the nexthop objects.

         1-hop          1-hop     |    2-hops       2-hops
        current        nexthop    |   current      nexthop
        --------------------------|-------------------------
real    0m21.872s      0m12.982s  |   0m28.723s    0m12.406s
user    0m2.929s       0m1.816s   |   0m3.966s     0m1.935s
sys     0m13.469s      0m6.010s   |   0m18.992s    0m5.913s

With nexthop objects the time to insert the routes is reduced by more
than 30% with the kernel time cut in half. The current model has a route
insertion rate of about 32,000 prefixes / second and with nexthop objects
that increases to a little over 55,000 prefixes/second.

For routes with multiple nexthops the install time is cut by more than
half with system time reduce by a factor of 3. Further, with nexthop
objects insert times for multipath routes drops down to the same as
single path routes since the multipath spec is given once (ie., with the
current model, the time to insert routes increases with the number of
paths in the route compared to nexthop objects where the number of paths
is handled once and the prefixes referencing it are installed in constant
time.

The difference between real and system times shows there is room for
improvement with the trie implementation. As an example, increasing the
sync_pages from 128 to 1024 delays the call to synchronize_rcu increasing
the insert rate to more than 78,000 prefixes/sec!

Some key features:
1. Allows atomic replace of any nexthop object - a nexthop or a group.
   This allows existing route entries to have their nexthop updated
   without the overhead of removing and re-inserting (or replacing)
   them. Instead, one update of the nexthop object implicitly updates
   all routes referencing it.

   One limitation with the atomic replace is that a nexthop group can
   only be replaced with a new group spec and similarly a nexthop can
   only be replaced by a nexthop spec. Specifically, a nexthop id can
   not move between a single nexthop and a group nexthop.

2. Blackhole nexthop: a nexthop object can be designated a blackhole
   which means any lookups that resolve to it, packets are dropped as
   if the lookup failed with the result RTN_BLACKHOLE. Blackhole nexthops
   can not be used with nexthop groups. Combined with atomic replace
   this allows routes to be installed pointing to a blackhole nexthop
   and then switched to an actual gateway with a single nexthop replace
   command (or vice versa, a gateway nexthop is flipped to a blackhole).

3. Nexthop groups for multipath routes. A nexthop group is a nexthop
   that references other nexthops. A multipath group can not be used
   as a nexthop in another nexthop group (ie., groups can not be nested).

4. Multipath routes for IPv6 with device only nexthops. There is a
   demonstrated need for this feature and the existing route semantics
   do not allow it. This series provides a means for that end - create a
   nexthop that has a device only specification.

5. Admin and carrier up are required. If the device goes down (admin or
   carrier) the nexthop is removed in which case routes referencing the
   nexthop are evicted and any nexthop groups referencing it are adjusted.

6. Follow on patches will allow IPv6 nexthops with IPv4 routes for users
   wanting support of RFC 5549.

7. Future extensions: active / backup nexthop. The nexthop groups are
   structured to allow a new group type to be added. One example is a
   group where a nexthop has a preferred device and gateway, but should
   the device go down or the gateway not resolve, the backup nexthop is
   used.

Additional Benefits
- smaller route notifications - messages contain a single nexthop id versus
  the detailed nexthop specification. This is especially noticeable as the
  number of paths increases. Smaller messages have a reduced load on
  userspace as well.

- smaller memory footprint for IPv6 routes.

Examples
1. Single path
    $ ip nexthop add id 1 via 10.99.1.2 dev veth1
    $ ip route add 10.1.1.0/24 nhid 1

    $ ip next ls
    id 1 via 10.99.1.2 src 10.99.1.1 dev veth1 scope link

    $ ip ro ls
    10.1.1.0/24 nhid 1 scope link
    ...

2. ECMP
    $ ip nexthop add id 2 via 10.99.3.2 dev veth3
    $ ip nexthop add id 1001 group 1/2
      --> creates a nexthop group with 2 component nexthops:
          id 1 and id 2 both the same weight

    $ ip route add 10.1.2.0/24 nhid 1001

    $ ip next ls
    id 1 via 10.99.1.2 src 10.99.1.1 dev veth1 scope link
    id 2 via 10.99.3.2 src 10.99.3.1 dev veth3 scope link
    id 1001 group 1/2

    $ ip ro ls
    10.1.1.0/24 nhid 1 scope link
    10.1.2.0/24 nhid 1001 scope link
    ...

3. Weighted multipath
    $ ip nexthop add id 1002 group 1,10/2,20
      --> creates a nexthop group with 2 component nexthops:
          id 1 with a weight of 10 and id 2 with a weight of 20

    $ ip route add 10.1.3.0/24 nhid 1002

    $  ip next ls
    id 1 via 10.99.1.2 src 10.99.1.1 dev veth1 scope link
    id 2 via 10.99.3.2 src 10.99.3.1 dev veth3 scope link
    id 1001 group 1/2
    id 1002 group 1,10/2,20

    $ ip ro ls
    10.1.1.0/24 nhid 1 scope link
    10.1.2.0/24 nhid 1001 scope link
    10.1.3.0/24 nhid 1002 scope link
    ...


Open Items
There is long to-do list before this is ready (e.g., IPv6 multipath, lwt
encap, and updating mlxsw). The point of this RFC is to get comments on
the API and overall idea. Specifically, any interested parties should
think about the API, the objects, the workflow, how it fits and
possibility for future extensions.

David Ahern (18):
  net: Rename net/nexthop.h net/rtnh.h
  net: ipv4: export fib_good_nh and fib_flush
  net/ipv4: export fib_info_update_nh_saddr
  net/ipv4: export fib_check_nh
  net/ipv4: Define fib_get_nhs when CONFIG_IP_ROUTE_MULTIPATH is
    disabled
  net/ipv4: Create init and release helpers for fib_nh
  net: ipv4: Add fib_nh to fib_result
  net/ipv4: Move device validation to helper
  net/ipv6: Create init and release helpers for fib6_nh
  net/ipv6: Make fib6_nh optional at the end of fib6_info
  net: Initial nexthop code
  net/ipv4: Add nexthop helpers for ipv4 integration
  net/ipv4: Convert existing use of fib_info to new helpers
  net/ipv4: Allow routes to use nexthop objects
  net/ipv6: Use helpers to access fib6_nh data
  net/ipv6: Allow routes to use nexthop objects
  net: Add support for nexthop groups
  net/ipv4: Optimization for fib_info lookup

 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |    4 +-
 drivers/net/ethernet/rocker/rocker_ofdpa.c         |   20 +-
 include/net/addrconf.h                             |    5 +
 include/net/ip6_fib.h                              |   22 +-
 include/net/ip6_route.h                            |   12 +-
 include/net/ip_fib.h                               |   39 +-
 include/net/net_namespace.h                        |    2 +
 include/net/netns/nexthop.h                        |   18 +
 include/net/nexthop.h                              |  253 +++-
 include/net/rtnh.h                                 |   34 +
 include/trace/events/fib6.h                        |   15 +-
 include/uapi/linux/nexthop.h                       |   56 +
 include/uapi/linux/rtnetlink.h                     |    8 +
 net/core/filter.c                                  |   13 +-
 net/core/lwtunnel.c                                |    2 +-
 net/decnet/dn_fib.c                                |    2 +-
 net/ipv4/Makefile                                  |    2 +-
 net/ipv4/fib_frontend.c                            |   60 +-
 net/ipv4/fib_rules.c                               |    3 +-
 net/ipv4/fib_semantics.c                           |  433 ++++--
 net/ipv4/fib_trie.c                                |   54 +-
 net/ipv4/ipmr.c                                    |    2 +-
 net/ipv4/nexthop.c                                 | 1541 ++++++++++++++++++++
 net/ipv4/route.c                                   |   34 +-
 net/ipv6/addrconf.c                                |    5 +-
 net/ipv6/addrconf_core.c                           |    9 +
 net/ipv6/af_inet6.c                                |    1 +
 net/ipv6/ip6_fib.c                                 |   27 +-
 net/ipv6/ndisc.c                                   |   15 +-
 net/ipv6/route.c                                   |  474 +++---
 net/mpls/af_mpls.c                                 |    2 +-
 security/selinux/nlmsgtab.c                        |    5 +-
 32 files changed, 2690 insertions(+), 482 deletions(-)
 create mode 100644 include/net/netns/nexthop.h
 create mode 100644 include/net/rtnh.h
 create mode 100644 include/uapi/linux/nexthop.h
 create mode 100644 net/ipv4/nexthop.c

-- 
2.11.0

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox