* [PATCH v3 15/18] ARM: davinci: dm646x-evm: use device properties for at24 eeprom
From: Bartosz Golaszewski @ 2018-06-28 14:32 UTC (permalink / raw)
To: Sekhar Nori, Kevin Hilman, Russell King, Grygorii Strashko,
David S . Miller, Srinivas Kandagatla, Lukas Wunner, Rob Herring,
Florian Fainelli, Dan Carpenter, Ivan Khoronzhuk, David Lechner,
Greg Kroah-Hartman, Andrew Lunn, Jonathan Corbet
Cc: linux-arm-kernel, linux-kernel, linux-omap, netdev,
Bartosz Golaszewski
In-Reply-To: <20180628143244.4561-1-brgl@bgdev.pl>
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
We want to work towards phasing out the at24_platform_data structure.
There are few users and its contents can be represented using generic
device properties. Using device properties only will allow us to
significantly simplify the at24 configuration code.
Remove the at24_platform_data structure and replace it with an array
of property entries. Drop the byte_len/size property, as the model name
already implies the EEPROM's size.
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
arch/arm/mach-davinci/board-dm646x-evm.c | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/arch/arm/mach-davinci/board-dm646x-evm.c b/arch/arm/mach-davinci/board-dm646x-evm.c
index 5a9de47bc8a2..5049f0c6cd1a 100644
--- a/arch/arm/mach-davinci/board-dm646x-evm.c
+++ b/arch/arm/mach-davinci/board-dm646x-evm.c
@@ -22,7 +22,7 @@
#include <linux/gpio.h>
#include <linux/platform_device.h>
#include <linux/i2c.h>
-#include <linux/platform_data/at24.h>
+#include <linux/property.h>
#include <linux/platform_data/pcf857x.h>
#include <media/i2c/tvp514x.h>
@@ -320,12 +320,9 @@ static struct nvmem_cell_lookup dm646x_evm_mac_address_cell = {
.nvmem_name = "1-00500",
};
-static struct at24_platform_data eeprom_info = {
- .byte_len = (256*1024) / 8,
- .page_size = 64,
- .flags = AT24_FLAG_ADDR16,
- .setup = davinci_get_mac_addr,
- .context = (void *)0x7f00,
+static const struct property_entry eeprom_properties[] = {
+ PROPERTY_ENTRY_U32("pagesize", 64),
+ { }
};
#endif
@@ -396,7 +393,7 @@ static void evm_init_cpld(void)
static struct i2c_board_info __initdata i2c_info[] = {
{
I2C_BOARD_INFO("24c256", 0x50),
- .platform_data = &eeprom_info,
+ .properties = eeprom_properties,
},
{
I2C_BOARD_INFO("pcf8574a", 0x38),
--
2.17.1
^ permalink raw reply related
* [PATCH v3 17/18] ARM: davinci: sffsdr: use device properties for at24 eeprom
From: Bartosz Golaszewski @ 2018-06-28 14:32 UTC (permalink / raw)
To: Sekhar Nori, Kevin Hilman, Russell King, Grygorii Strashko,
David S . Miller, Srinivas Kandagatla, Lukas Wunner, Rob Herring,
Florian Fainelli, Dan Carpenter, Ivan Khoronzhuk, David Lechner,
Greg Kroah-Hartman, Andrew Lunn, Jonathan Corbet
Cc: linux-arm-kernel, linux-kernel, linux-omap, netdev,
Bartosz Golaszewski
In-Reply-To: <20180628143244.4561-1-brgl@bgdev.pl>
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
We want to work towards phasing out the at24_platform_data structure.
There are few users and its contents can be represented using generic
device properties. Using device properties only will allow us to
significantly simplify the at24 configuration code.
Remove the at24_platform_data structure and replace it with an array
of property entries. Drop the byte_len/size property, as the model name
already implies the EEPROM's size.
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
arch/arm/mach-davinci/board-sffsdr.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/arch/arm/mach-davinci/board-sffsdr.c b/arch/arm/mach-davinci/board-sffsdr.c
index f6a4d094cbc3..680e5d7628a8 100644
--- a/arch/arm/mach-davinci/board-sffsdr.c
+++ b/arch/arm/mach-davinci/board-sffsdr.c
@@ -26,7 +26,7 @@
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/i2c.h>
-#include <linux/platform_data/at24.h>
+#include <linux/property.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/rawnand.h>
#include <linux/mtd/partitions.h>
@@ -92,16 +92,15 @@ static struct platform_device davinci_sffsdr_nandflash_device = {
.resource = davinci_sffsdr_nandflash_resource,
};
-static struct at24_platform_data eeprom_info = {
- .byte_len = (64*1024) / 8,
- .page_size = 32,
- .flags = AT24_FLAG_ADDR16,
+static const struct property_entry eeprom_properties[] = {
+ PROPERTY_ENTRY_U32("pagesize", 32),
+ { },
};
static struct i2c_board_info __initdata i2c_info[] = {
{
I2C_BOARD_INFO("24c64", 0x50),
- .platform_data = &eeprom_info,
+ .properties = eeprom_properties,
},
/* Other I2C devices:
* MSP430, addr 0x23 (not used)
--
2.17.1
^ permalink raw reply related
* [PATCH v3 18/18] ARM: davinci: remove dead code related to MAC address reading
From: Bartosz Golaszewski @ 2018-06-28 14:32 UTC (permalink / raw)
To: Sekhar Nori, Kevin Hilman, Russell King, Grygorii Strashko,
David S . Miller, Srinivas Kandagatla, Lukas Wunner, Rob Herring,
Florian Fainelli, Dan Carpenter, Ivan Khoronzhuk, David Lechner,
Greg Kroah-Hartman, Andrew Lunn, Jonathan Corbet
Cc: linux-arm-kernel, linux-kernel, linux-omap, netdev,
Bartosz Golaszewski
In-Reply-To: <20180628143244.4561-1-brgl@bgdev.pl>
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
There are no more users of davinci_get_mac_addr(). Remove it. Also
remove the mac_addr field from the emac platform data struct.
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
arch/arm/mach-davinci/common.c | 15 ---------------
include/linux/davinci_emac.h | 2 --
2 files changed, 17 deletions(-)
diff --git a/arch/arm/mach-davinci/common.c b/arch/arm/mach-davinci/common.c
index bcb6a7ba84e9..0c6cc354a4aa 100644
--- a/arch/arm/mach-davinci/common.c
+++ b/arch/arm/mach-davinci/common.c
@@ -28,21 +28,6 @@ EXPORT_SYMBOL(davinci_soc_info);
void __iomem *davinci_intc_base;
int davinci_intc_type;
-void davinci_get_mac_addr(struct nvmem_device *nvmem, void *context)
-{
- char *mac_addr = davinci_soc_info.emac_pdata->mac_addr;
- off_t offset = (off_t)context;
-
- if (!IS_BUILTIN(CONFIG_NVMEM)) {
- pr_warn("Cannot read MAC addr from EEPROM without CONFIG_NVMEM\n");
- return;
- }
-
- /* Read MAC addr from EEPROM */
- if (nvmem_device_read(nvmem, offset, ETH_ALEN, mac_addr) == ETH_ALEN)
- pr_info("Read MAC addr from EEPROM: %pM\n", mac_addr);
-}
-
static int __init davinci_init_id(struct davinci_soc_info *soc_info)
{
int i;
diff --git a/include/linux/davinci_emac.h b/include/linux/davinci_emac.h
index 05b97144d342..19888b27706d 100644
--- a/include/linux/davinci_emac.h
+++ b/include/linux/davinci_emac.h
@@ -19,7 +19,6 @@ struct mdio_platform_data {
};
struct emac_platform_data {
- char mac_addr[ETH_ALEN];
u32 ctrl_reg_offset;
u32 ctrl_mod_reg_offset;
u32 ctrl_ram_offset;
@@ -46,5 +45,4 @@ enum {
EMAC_VERSION_2, /* DM646x */
};
-void davinci_get_mac_addr(struct nvmem_device *nvmem, void *context);
#endif
--
2.17.1
^ permalink raw reply related
* [PATCH v3 03/18] ARM: davinci: dm365-evm: use nvmem lookup for mac address
From: Bartosz Golaszewski @ 2018-06-28 14:32 UTC (permalink / raw)
To: Sekhar Nori, Kevin Hilman, Russell King, Grygorii Strashko,
David S . Miller, Srinivas Kandagatla, Lukas Wunner, Rob Herring,
Florian Fainelli, Dan Carpenter, Ivan Khoronzhuk, David Lechner,
Greg Kroah-Hartman, Andrew Lunn, Jonathan Corbet
Cc: linux-arm-kernel, linux-kernel, linux-omap, netdev,
Bartosz Golaszewski
In-Reply-To: <20180628143244.4561-1-brgl@bgdev.pl>
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
We now support nvmem lookups for machine code. Add a lookup for
mac-address.
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
arch/arm/mach-davinci/board-dm365-evm.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/arch/arm/mach-davinci/board-dm365-evm.c b/arch/arm/mach-davinci/board-dm365-evm.c
index 435f7ec7d9af..cb0ac92a278e 100644
--- a/arch/arm/mach-davinci/board-dm365-evm.c
+++ b/arch/arm/mach-davinci/board-dm365-evm.c
@@ -28,6 +28,7 @@
#include <linux/spi/spi.h>
#include <linux/spi/eeprom.h>
#include <linux/v4l2-dv-timings.h>
+#include <linux/nvmem-provider.h>
#include <asm/mach-types.h>
#include <asm/mach/arch.h>
@@ -169,6 +170,15 @@ static struct platform_device davinci_nand_device = {
},
};
+static struct nvmem_cell_lookup dm365evm_mac_address_cell = {
+ .info = {
+ .name = "mac-address",
+ .offset = 0x7f00,
+ .bytes = ETH_ALEN,
+ },
+ .nvmem_name = "1-00500",
+};
+
static struct at24_platform_data eeprom_info = {
.byte_len = (256*1024) / 8,
.page_size = 64,
@@ -769,6 +779,8 @@ static __init void dm365_evm_init(void)
dm365_init_spi0(BIT(0), dm365_evm_spi_info,
ARRAY_SIZE(dm365_evm_spi_info));
+
+ nvmem_add_lookup_table(&dm365evm_mac_address_cell, 1);
}
MACHINE_START(DAVINCI_DM365_EVM, "DaVinci DM365 EVM")
--
2.17.1
^ permalink raw reply related
* [PATCH v3 04/18] ARM: davinci: dm644-evm: use nvmem lookup for mac address
From: Bartosz Golaszewski @ 2018-06-28 14:32 UTC (permalink / raw)
To: Sekhar Nori, Kevin Hilman, Russell King, Grygorii Strashko,
David S . Miller, Srinivas Kandagatla, Lukas Wunner, Rob Herring,
Florian Fainelli, Dan Carpenter, Ivan Khoronzhuk, David Lechner,
Greg Kroah-Hartman, Andrew Lunn, Jonathan Corbet
Cc: linux-arm-kernel, linux-kernel, linux-omap, netdev,
Bartosz Golaszewski
In-Reply-To: <20180628143244.4561-1-brgl@bgdev.pl>
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
We now support nvmem lookups for machine code. Add a lookup for
mac-address.
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
arch/arm/mach-davinci/board-dm644x-evm.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/arch/arm/mach-davinci/board-dm644x-evm.c b/arch/arm/mach-davinci/board-dm644x-evm.c
index 48436f74fd71..6d35c6e1b0bd 100644
--- a/arch/arm/mach-davinci/board-dm644x-evm.c
+++ b/arch/arm/mach-davinci/board-dm644x-evm.c
@@ -28,6 +28,7 @@
#include <linux/v4l2-dv-timings.h>
#include <linux/export.h>
#include <linux/leds.h>
+#include <linux/nvmem-provider.h>
#include <media/i2c/tvp514x.h>
@@ -476,6 +477,15 @@ static struct pcf857x_platform_data pcf_data_u35 = {
* - ... newer boards may have more
*/
+static struct nvmem_cell_lookup dm6446evm_mac_address_cell = {
+ .info = {
+ .name = "mac-address",
+ .offset = 0x7f00,
+ .bytes = ETH_ALEN,
+ },
+ .nvmem_name = "1-00500",
+};
+
static struct at24_platform_data eeprom_info = {
.byte_len = (256*1024) / 8,
.page_size = 64,
@@ -828,6 +838,8 @@ static __init void davinci_evm_init(void)
phy_register_fixup_for_uid(LXT971_PHY_ID, LXT971_PHY_MASK,
davinci_phy_fixup);
}
+
+ nvmem_add_lookup_table(&dm6446evm_mac_address_cell, 1);
}
MACHINE_START(DAVINCI_EVM, "DaVinci DM644x EVM")
--
2.17.1
^ permalink raw reply related
* [PATCH v3 05/18] ARM: davinci: dm646x-evm: use nvmem lookup for mac address
From: Bartosz Golaszewski @ 2018-06-28 14:32 UTC (permalink / raw)
To: Sekhar Nori, Kevin Hilman, Russell King, Grygorii Strashko,
David S . Miller, Srinivas Kandagatla, Lukas Wunner, Rob Herring,
Florian Fainelli, Dan Carpenter, Ivan Khoronzhuk, David Lechner,
Greg Kroah-Hartman, Andrew Lunn, Jonathan Corbet
Cc: linux-arm-kernel, linux-kernel, linux-omap, netdev,
Bartosz Golaszewski
In-Reply-To: <20180628143244.4561-1-brgl@bgdev.pl>
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
We now support nvmem lookups for machine code. Add a lookup for
mac-address.
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
arch/arm/mach-davinci/board-dm646x-evm.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/arch/arm/mach-davinci/board-dm646x-evm.c b/arch/arm/mach-davinci/board-dm646x-evm.c
index 584064fdabf5..5a9de47bc8a2 100644
--- a/arch/arm/mach-davinci/board-dm646x-evm.c
+++ b/arch/arm/mach-davinci/board-dm646x-evm.c
@@ -37,6 +37,7 @@
#include <linux/platform_data/i2c-davinci.h>
#include <linux/platform_data/mtd-davinci.h>
#include <linux/platform_data/mtd-davinci-aemif.h>
+#include <linux/nvmem-provider.h>
#include <asm/mach-types.h>
#include <asm/mach/arch.h>
@@ -310,6 +311,15 @@ static struct pcf857x_platform_data pcf_data = {
* - ... newer boards may have more
*/
+static struct nvmem_cell_lookup dm646x_evm_mac_address_cell = {
+ .info = {
+ .name = "mac-address",
+ .offset = 0x7f00,
+ .bytes = ETH_ALEN,
+ },
+ .nvmem_name = "1-00500",
+};
+
static struct at24_platform_data eeprom_info = {
.byte_len = (256*1024) / 8,
.page_size = 64,
@@ -802,6 +812,8 @@ static __init void evm_init(void)
davinci_init_ide();
soc_info->emac_pdata->phy_id = DM646X_EVM_PHY_ID;
+
+ nvmem_add_lookup_table(&dm646x_evm_mac_address_cell, 1);
}
MACHINE_START(DAVINCI_DM6467_EVM, "DaVinci DM646x EVM")
--
2.17.1
^ permalink raw reply related
* [PATCH v3 16/18] ARM: davinci: sffsdr: fix the at24 eeprom device name
From: Bartosz Golaszewski @ 2018-06-28 14:32 UTC (permalink / raw)
To: Sekhar Nori, Kevin Hilman, Russell King, Grygorii Strashko,
David S . Miller, Srinivas Kandagatla, Lukas Wunner, Rob Herring,
Florian Fainelli, Dan Carpenter, Ivan Khoronzhuk, David Lechner,
Greg Kroah-Hartman, Andrew Lunn, Jonathan Corbet
Cc: linux-arm-kernel, linux-kernel, linux-omap, netdev,
Bartosz Golaszewski
In-Reply-To: <20180628143244.4561-1-brgl@bgdev.pl>
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
The currently used 24lc64 i2c device name doesn't match against any
of the devices supported by the at24 driver. Change it to the closest
compatible chip.
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
arch/arm/mach-davinci/board-sffsdr.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm/mach-davinci/board-sffsdr.c b/arch/arm/mach-davinci/board-sffsdr.c
index e7c1728b0833..f6a4d094cbc3 100644
--- a/arch/arm/mach-davinci/board-sffsdr.c
+++ b/arch/arm/mach-davinci/board-sffsdr.c
@@ -100,7 +100,7 @@ static struct at24_platform_data eeprom_info = {
static struct i2c_board_info __initdata i2c_info[] = {
{
- I2C_BOARD_INFO("24lc64", 0x50),
+ I2C_BOARD_INFO("24c64", 0x50),
.platform_data = &eeprom_info,
},
/* Other I2C devices:
--
2.17.1
^ permalink raw reply related
* Re: [PATCH v1 net-next 02/14] net: Add a new socket option for a future transmit time.
From: Willem de Bruijn @ 2018-06-28 14:40 UTC (permalink / raw)
To: Jesus Sanchez-Palencia
Cc: Network Development, Thomas Gleixner, jan.altenberg,
Vinicius Gomes, kurt.kanzenbach, Henrik Austad, Richard Cochran,
Levi Pearson, ilias.apalodimas, ivan.khoronzhuk, Miroslav Lichvar,
Willem de Bruijn, Jamal Hadi Salim, Cong Wang,
Jiří Pírko, Richard Cochran
In-Reply-To: <CAF=yD-KY60-Hi2_BSb2jOR7W91wOJQtSQ11WSDEDVpY6wx+aUw@mail.gmail.com>
On Thu, Jun 28, 2018 at 10:26 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Wed, Jun 27, 2018 at 6:08 PM Jesus Sanchez-Palencia
> <jesus.sanchez-palencia@intel.com> wrote:
> >
> > From: Richard Cochran <rcochran@linutronix.de>
> >
> > This patch introduces SO_TXTIME. User space enables this option in
> > order to pass a desired future transmit time in a CMSG when calling
> > sendmsg(2). The argument to this socket option is a 6-bytes long struct
> > defined as:
> >
> > struct sock_txtime {
> > clockid_t clockid;
> > u16 flags;
> > };
>
> clockid_t is __kernel_clockid_t is int is a variable length field.
> Please use fixed length fields.
Sorry, int is fine, of course, and clockid_t is used between userspace and
kernel already.
> Also, as MAX_CLOCKS is 16, only 4 bits are needed. A single u16
> is probably sufficient as cmsg argument. To future proof, a u32 will
> allow for more
> than 4 flags. But in struct sock, 16 bits should be sufficient to
> encode both clock id
> and flags.
^ permalink raw reply
* Re: [PATCH] bpfilter: fix user mode helper cross compilation
From: Alexei Starovoitov @ 2018-06-28 14:43 UTC (permalink / raw)
To: Andrew Morton; +Cc: Matteo Croce, netdev
In-Reply-To: <20180627231709.01709545812b669e29e797f7@linux-foundation.org>
On Wed, Jun 27, 2018 at 11:17:09PM -0700, Andrew Morton wrote:
> On Wed, 20 Jun 2018 16:04:34 +0200 Matteo Croce <mcroce@redhat.com> wrote:
>
> > Use $(OBJDUMP) instead of literal 'objdump' to avoid
> > using host toolchain when cross compiling.
> >
>
> I'm still having issues here, with ld.
>
> x86_64 machine, ARCH=i386:
>
> y:/usr/src/25> make V=1 M=net/bpfilter
> test -e include/generated/autoconf.h -a -e include/config/auto.conf || ( \
> echo >&2; \
> echo >&2 " ERROR: Kernel configuration is invalid."; \
> echo >&2 " include/generated/autoconf.h or include/config/auto.conf are missing.";\
> echo >&2 " Run 'make oldconfig && make prepare' on kernel src to fix it."; \
> echo >&2 ; \
> /bin/false)
> mkdir -p net/bpfilter/.tmp_versions ; rm -f net/bpfilter/.tmp_versions/*
> make -f ./scripts/Makefile.build obj=net/bpfilter
> (cat /dev/null; echo kernel/net/bpfilter/bpfilter.ko;) > net/bpfilter/modules.order
> ld -m elf_i386 -r -o net/bpfilter/bpfilter.o net/bpfilter/bpfilter_kern.o net/bpfilter/bpfilter_umh.o ; scripts/mod/modpost net/bpfilter/bpfilter.o
> ld: i386:x86-64 architecture of input file `net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
could you please try with this patch
https://patchwork.ozlabs.org/patch/935246/
that is already in net tree?
^ permalink raw reply
* Re: [PATCH v2] net, mm: account sock objects to kmemcg
From: Kirill Tkhai @ 2018-06-28 14:52 UTC (permalink / raw)
To: Shakeel Butt, Andrew Morton
Cc: Johannes Weiner, Vladimir Davydov, Greg Thelen, Roman Gushchin,
David S . Miller, Eric Dumazet, linux-kernel, netdev, linux-mm
In-Reply-To: <20180627221642.247448-1-shakeelb@google.com>
On 28.06.2018 01:16, Shakeel Butt wrote:
> Currently the kernel accounts the memory for network traffic through
> mem_cgroup_[un]charge_skmem() interface. However the memory accounted
> only includes the truesize of sk_buff which does not include the size of
> sock objects. In our production environment, with opt-out kmem
> accounting, the sock kmem caches (TCP[v6], UDP[v6], RAW[v6], UNIX) are
> among the top most charged kmem caches and consume a significant amount
> of memory which can not be left as system overhead. So, this patch
> converts the kmem caches of all sock objects to SLAB_ACCOUNT.
>
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> Suggested-by: Eric Dumazet <edumazet@google.com>
Looks good for me.
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
> Changelog since v1:
> - Instead of specific sock kmem_caches, convert all sock kmem_caches to
> use SLAB_ACCOUNT.
>
> net/core/sock.c | 7 +++++--
> 1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index bcc41829a16d..9e8f65585b81 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -3243,7 +3243,8 @@ static int req_prot_init(const struct proto *prot)
>
> rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
> rsk_prot->obj_size, 0,
> - prot->slab_flags, NULL);
> + SLAB_ACCOUNT | prot->slab_flags,
> + NULL);
>
> if (!rsk_prot->slab) {
> pr_crit("%s: Can't create request sock SLAB cache!\n",
> @@ -3258,7 +3259,8 @@ int proto_register(struct proto *prot, int alloc_slab)
> if (alloc_slab) {
> prot->slab = kmem_cache_create_usercopy(prot->name,
> prot->obj_size, 0,
> - SLAB_HWCACHE_ALIGN | prot->slab_flags,
> + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
> + prot->slab_flags,
> prot->useroffset, prot->usersize,
> NULL);
>
> @@ -3281,6 +3283,7 @@ int proto_register(struct proto *prot, int alloc_slab)
> kmem_cache_create(prot->twsk_prot->twsk_slab_name,
> prot->twsk_prot->twsk_obj_size,
> 0,
> + SLAB_ACCOUNT |
> prot->slab_flags,
> NULL);
> if (prot->twsk_prot->twsk_slab == NULL)
>
^ permalink raw reply
* Re: [PATCH v2] net, mm: account sock objects to kmemcg
From: Eric Dumazet @ 2018-06-28 15:03 UTC (permalink / raw)
To: Shakeel Butt, Andrew Morton
Cc: Johannes Weiner, Vladimir Davydov, Greg Thelen, Roman Gushchin,
David S . Miller, Eric Dumazet, Kirill Tkhai, linux-kernel,
netdev, linux-mm
In-Reply-To: <20180627221642.247448-1-shakeelb@google.com>
On 06/27/2018 03:16 PM, Shakeel Butt wrote:
> Currently the kernel accounts the memory for network traffic through
> mem_cgroup_[un]charge_skmem() interface. However the memory accounted
> only includes the truesize of sk_buff which does not include the size of
> sock objects. In our production environment, with opt-out kmem
> accounting, the sock kmem caches (TCP[v6], UDP[v6], RAW[v6], UNIX) are
> among the top most charged kmem caches and consume a significant amount
> of memory which can not be left as system overhead. So, this patch
> converts the kmem caches of all sock objects to SLAB_ACCOUNT.
>
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> Suggested-by: Eric Dumazet <edumazet@google.com>
> ---
Reviewed-by: Eric Dumazet <edumazet@google.com>
Thanks !
^ permalink raw reply
* Re: [patch net-next v2 0/9] net: sched: introduce chain templates support with offloading to mlxsw
From: David Ahern @ 2018-06-28 15:10 UTC (permalink / raw)
To: Jiri Pirko
Cc: netdev, davem, jhs, xiyou.wangcong, jakub.kicinski, simon.horman,
john.hurley, mlxsw, sridhar.samudrala
In-Reply-To: <20180628142909.GE2177@nanopsycho.orion>
On 6/28/18 8:29 AM, Jiri Pirko wrote:
> Thu, Jun 28, 2018 at 04:18:47PM CEST, dsahern@gmail.com wrote:
>> On 6/28/18 7:08 AM, Jiri Pirko wrote:
>>> Create dummy device with clsact first:
>>> # ip link add type dummy
>>> # tc qdisc add dev dummy0 clsact
>>>
>>> There is no template assigned by default:
>>> # tc chaintemplate show dev dummy0 ingress
>>>
>>> Add a template of type flower allowing to insert rules matching on last
>>> 2 bytes of destination mac address:
>>> # tc chaintemplate add dev dummy0 ingress proto ip flower dst_mac 00:00:00:00:00:00/00:00:00:00:FF:FF
>>>
>>> The template is now showed in the list:
>>> # tc chaintemplate show dev dummy0 ingress
>>> chaintemplate flower chain 0
>>> dst_mac 00:00:00:00:00:00/00:00:00:00:ff:ff
>>> eth_type ipv4
>>>
>>> Add another template, this time for chain number 22:
>>> # tc chaintemplate add dev dummy0 ingress proto ip chain 22 flower dst_ip 0.0.0.0/16
>>> # tc chaintemplate show dev dummy0 ingress
>>> chaintemplate flower chain 0
>>> dst_mac 00:00:00:00:00:00/00:00:00:00:ff:ff
>>> eth_type ipv4
>>> chaintemplate flower chain 22
>>> eth_type ipv4
>>> dst_ip 0.0.0.0/16
>>>
>>> Add a filter that fits the template:
>>> # tc filter add dev dummy0 ingress proto ip flower dst_mac aa:bb:cc:dd:ee:ff/00:00:00:00:00:0F action drop
>>>
>>> Addition of filters that does not fit the template would fail:
>>> # tc filter add dev dummy0 ingress proto ip flower dst_mac aa:11:22:33:44:55/00:00:00:FF:00:00 action drop
>>> Error: Mask does not fit the template.
>>> We have an error talking to the kernel, -1
>>> # tc filter add dev dummy0 ingress proto ip flower dst_ip 10.0.0.1 action drop
>>> Error: Mask does not fit the template.
>>> We have an error talking to the kernel, -1
>>>
>>> Additions of filters to chain 22:
>>> # tc filter add dev dummy0 ingress proto ip chain 22 flower dst_ip 10.0.0.1/8 action drop
>>> # tc filter add dev dummy0 ingress proto ip chain 22 flower dst_ip 10.0.0.1 action drop
>>> Error: Mask does not fit the template.
>>> We have an error talking to the kernel, -1
>>> # tc filter add dev dummy0 ingress proto ip chain 22 flower dst_ip 10.0.0.1/24 action drop
>>> Error: Mask does not fit the template.
>>> We have an error talking to the kernel, -1
>>>
>>> Removal of a template from non-empty chain would fail:
>>> # tc chaintemplate del dev dummy0 ingress
>>> Error: The chain is not empty, unable to delete template.
>>> We have an error talking to the kernel, -1
>>
>> Why this restriction? It's a template, so why can't it be removed
>> regardless of whether there are filters?
>
> That means you could start to insert filters that does not match the
> original template. I wanted to avoid it. The chain is utilized in hw for
> the original template, the filter insertion would have to be sanitized
> in driver. With this restriction, drivers can depend on filters always
> be fitting.
>
Then the hardware driver should have that restriction not the core tc code.
^ permalink raw reply
* Re: [PATCH net] net: fib_rules: add protocol check in rule_find
From: David Ahern @ 2018-06-28 15:26 UTC (permalink / raw)
To: Roopa Prabhu, davem; +Cc: netdev
In-Reply-To: <1530149236-5144-1-git-send-email-roopa@cumulusnetworks.com>
On 6/27/18 7:27 PM, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
> After commit f9d4b0c1e969 ("fib_rules: move common handling of newrule
> delrule msgs into fib_nl2rule"), rule_find is strict about checking
> for an existing rule. rule_find must check against all
> user given attributes, else it may match against a subset
> of attributes and return an existing rule.
>
> In the below case, without support for protocol match, rule_find
> will match only against 'table main' and return an existing rule.
>
> $ip -4 rule add table main protocol boot
> RTNETLINK answers: File exists
>
> This patch adds protocol support to rule_find, forcing it to
> check protocol match if given by the user.
>
> Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule")
> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
> ---
Reviewed-by: David Ahern <dsahern@gmail.com>
^ permalink raw reply
* Re: [patch net-next v2 0/9] net: sched: introduce chain templates support with offloading to mlxsw
From: Jiri Pirko @ 2018-06-28 15:37 UTC (permalink / raw)
To: David Ahern
Cc: netdev, davem, jhs, xiyou.wangcong, jakub.kicinski, simon.horman,
john.hurley, mlxsw, sridhar.samudrala
In-Reply-To: <1dbc630d-2da3-afbe-18ba-e9dfc893bf04@gmail.com>
Thu, Jun 28, 2018 at 05:10:45PM CEST, dsahern@gmail.com wrote:
>On 6/28/18 8:29 AM, Jiri Pirko wrote:
>> Thu, Jun 28, 2018 at 04:18:47PM CEST, dsahern@gmail.com wrote:
>>> On 6/28/18 7:08 AM, Jiri Pirko wrote:
>>>> Create dummy device with clsact first:
>>>> # ip link add type dummy
>>>> # tc qdisc add dev dummy0 clsact
>>>>
>>>> There is no template assigned by default:
>>>> # tc chaintemplate show dev dummy0 ingress
>>>>
>>>> Add a template of type flower allowing to insert rules matching on last
>>>> 2 bytes of destination mac address:
>>>> # tc chaintemplate add dev dummy0 ingress proto ip flower dst_mac 00:00:00:00:00:00/00:00:00:00:FF:FF
>>>>
>>>> The template is now showed in the list:
>>>> # tc chaintemplate show dev dummy0 ingress
>>>> chaintemplate flower chain 0
>>>> dst_mac 00:00:00:00:00:00/00:00:00:00:ff:ff
>>>> eth_type ipv4
>>>>
>>>> Add another template, this time for chain number 22:
>>>> # tc chaintemplate add dev dummy0 ingress proto ip chain 22 flower dst_ip 0.0.0.0/16
>>>> # tc chaintemplate show dev dummy0 ingress
>>>> chaintemplate flower chain 0
>>>> dst_mac 00:00:00:00:00:00/00:00:00:00:ff:ff
>>>> eth_type ipv4
>>>> chaintemplate flower chain 22
>>>> eth_type ipv4
>>>> dst_ip 0.0.0.0/16
>>>>
>>>> Add a filter that fits the template:
>>>> # tc filter add dev dummy0 ingress proto ip flower dst_mac aa:bb:cc:dd:ee:ff/00:00:00:00:00:0F action drop
>>>>
>>>> Addition of filters that does not fit the template would fail:
>>>> # tc filter add dev dummy0 ingress proto ip flower dst_mac aa:11:22:33:44:55/00:00:00:FF:00:00 action drop
>>>> Error: Mask does not fit the template.
>>>> We have an error talking to the kernel, -1
>>>> # tc filter add dev dummy0 ingress proto ip flower dst_ip 10.0.0.1 action drop
>>>> Error: Mask does not fit the template.
>>>> We have an error talking to the kernel, -1
>>>>
>>>> Additions of filters to chain 22:
>>>> # tc filter add dev dummy0 ingress proto ip chain 22 flower dst_ip 10.0.0.1/8 action drop
>>>> # tc filter add dev dummy0 ingress proto ip chain 22 flower dst_ip 10.0.0.1 action drop
>>>> Error: Mask does not fit the template.
>>>> We have an error talking to the kernel, -1
>>>> # tc filter add dev dummy0 ingress proto ip chain 22 flower dst_ip 10.0.0.1/24 action drop
>>>> Error: Mask does not fit the template.
>>>> We have an error talking to the kernel, -1
>>>>
>>>> Removal of a template from non-empty chain would fail:
>>>> # tc chaintemplate del dev dummy0 ingress
>>>> Error: The chain is not empty, unable to delete template.
>>>> We have an error talking to the kernel, -1
>>>
>>> Why this restriction? It's a template, so why can't it be removed
>>> regardless of whether there are filters?
>>
>> That means you could start to insert filters that does not match the
>> original template. I wanted to avoid it. The chain is utilized in hw for
>> the original template, the filter insertion would have to be sanitized
>> in driver. With this restriction, drivers can depend on filters always
>> be fitting.
>>
>
>Then the hardware driver should have that restriction not the core tc code.
But why? The same restriction would be in all drivers. I believe it is
better to have in in tc in single place. Drivers can then depend on it.
Do you have a usecase where you need to remove template for non-empty
chain?
^ permalink raw reply
* Re: [patch net-next v2 0/9] net: sched: introduce chain templates support with offloading to mlxsw
From: David Ahern @ 2018-06-28 15:50 UTC (permalink / raw)
To: Jiri Pirko
Cc: netdev, davem, jhs, xiyou.wangcong, jakub.kicinski, simon.horman,
john.hurley, mlxsw, sridhar.samudrala
In-Reply-To: <20180628153752.GF2177@nanopsycho.orion>
On 6/28/18 9:37 AM, Jiri Pirko wrote:
>>>>
>>>> Why this restriction? It's a template, so why can't it be removed
>>>> regardless of whether there are filters?
>>>
>>> That means you could start to insert filters that does not match the
>>> original template. I wanted to avoid it. The chain is utilized in hw for
>>> the original template, the filter insertion would have to be sanitized
>>> in driver. With this restriction, drivers can depend on filters always
>>> be fitting.
>>>
>>
>> Then the hardware driver should have that restriction not the core tc code.
>
> But why? The same restriction would be in all drivers. I believe it is
> better to have in in tc in single place. Drivers can then depend on it.
> Do you have a usecase where you need to remove template for non-empty
> chain?
>
If the hardware has the limitation then the driver should be rejecting a
change.
^ permalink raw reply
* [PATCH] net: cleanup gfp mask in alloc_skb_with_frags
From: Michal Hocko @ 2018-06-28 15:53 UTC (permalink / raw)
To: davem; +Cc: edumazet, netdev, linux-kernel, Michal Hocko
From: Michal Hocko <mhocko@suse.com>
alloc_skb_with_frags uses __GFP_NORETRY for non-sleeping allocations
which is just a noop and a little bit confusing.
__GFP_NORETRY was added by ed98df3361f0 ("net: use __GFP_NORETRY for
high order allocations") to prevent from the OOM killer. Yet this was
not enough because fb05e7a89f50 ("net: don't wait for order-3 page
allocation") didn't want an excessive reclaim for non-costly orders
so it made it completely NOWAIT while it preserved __GFP_NORETRY in
place which is now redundant.
Drop the pointless __GFP_NORETRY because this function is used as
copy&paste source for other places.
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
net/core/skbuff.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c642304f178c..eba8dae22c25 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5276,8 +5276,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
if (npages >= 1 << order) {
page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
__GFP_COMP |
- __GFP_NOWARN |
- __GFP_NORETRY,
+ __GFP_NOWARN,
order);
if (page)
goto fill_page;
--
2.18.0
^ permalink raw reply related
* Re: [patch net-next v2 0/9] net: sched: introduce chain templates support with offloading to mlxsw
From: Jiri Pirko @ 2018-06-28 16:08 UTC (permalink / raw)
To: David Ahern
Cc: netdev, davem, jhs, xiyou.wangcong, jakub.kicinski, simon.horman,
john.hurley, mlxsw, sridhar.samudrala
In-Reply-To: <c5e3b143-787f-436f-1d64-64cfc7d137a2@gmail.com>
Thu, Jun 28, 2018 at 05:50:08PM CEST, dsahern@gmail.com wrote:
>On 6/28/18 9:37 AM, Jiri Pirko wrote:
>>>>>
>>>>> Why this restriction? It's a template, so why can't it be removed
>>>>> regardless of whether there are filters?
>>>>
>>>> That means you could start to insert filters that does not match the
>>>> original template. I wanted to avoid it. The chain is utilized in hw for
>>>> the original template, the filter insertion would have to be sanitized
>>>> in driver. With this restriction, drivers can depend on filters always
>>>> be fitting.
>>>>
>>>
>>> Then the hardware driver should have that restriction not the core tc code.
>>
>> But why? The same restriction would be in all drivers. I believe it is
>> better to have in in tc in single place. Drivers can then depend on it.
>> Do you have a usecase where you need to remove template for non-empty
>> chain?
>>
>
>If the hardware has the limitation then the driver should be rejecting a
>change.
The behaviour I defend is symmetrical with "template add". There is also
possible to add the template only if the chain is empty.
^ permalink raw reply
* [PATCH bpf-net 00/14] bpf: cgroup local storage
From: Roman Gushchin @ 2018-06-28 16:34 UTC (permalink / raw)
To: netdev
Cc: kernel-team, tj, Roman Gushchin, Alexei Starovoitov,
Daniel Borkmann, Martin KaFai Lau
This patchset implements cgroup local storage for bpf programs.
The main idea is to provide a fast accessible memory for storing
various per-cgroup data, e.g. number of transmitted packets.
Cgroup local storage looks as a special type of map for userspace,
and is accessible using generic bpf maps API for reading and
updating of the data. The (cgroup inode id, attachment type) pair
is used as a map key.
A user can't create new entries or destroy existing entries;
it happens automatically when a user attaches/detaches a bpf program
to a cgroup.
>From a bpf program's point of view, cgroup storage is accessible
without lookup using the special get_local_storage() helper function.
It takes a map fd as an argument. It always returns a valid pointer
to the corresponding memory area.
To implement such a lookup-free access a pointer to the cgroup
storage is saved for an attachment of a bpf program to a cgroup,
if required by the program. Before running the program, it's saved
in a special global per-cpu variable, which is accessible from the
get_local_storage() helper.
This patchset implement only cgroup local storage, however the API
is intentionally made extensible to support other local storage types
further: e.g. thread local storage, socket local storage, etc.
Patch (1) adds an ability to charge bpf maps for consuming memory
dynamically.
Patch (2) introduces cgroup storage maps.
Patch (3) implements a mechanism to pass cgroup storage pointer
to a bpf program.
Patch (4) implements allocation/releasing of cgroup local storage
on attaching/detaching of a bpf program to/from a cgroup.
Patch (5) extends bpf_prog_array to store cgroup storage pointers.
Patch (6) introduces BPF_PTR_TO_MAP_VALUE, required to skip
non-necessary NULL-check in bpf programs.
Patch (7) disables creation of maps of cgroup storage maps.
Patch (8) introduces the get_local_storage() helper.
Patch (9) syncs bpf.h to tools/.
Patch (10) adds cgroup storage maps support to bpftool.
Patch (11) adds support for testing programs which are using
cgroup storage without actually attaching them to cgroups.
Patches (12), (13) and (14) are adding necessary tests.
Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Martin KaFai Lau <kafai@fb.com>
Roman Gushchin (14):
bpf: add ability to charge bpf maps memory dynamically
bpf: introduce cgroup storage maps
bpf: pass a pointer to a cgroup storage using pcpu variable
bpf: allocate cgroup storage entries on attaching bpf programs
bpf: extend bpf_prog_array to store pointers to the cgroup storage
bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE
bpf: don't allow create maps of cgroup local storages
bpf: introduce the bpf_get_local_storage() helper function
bpf: sync bpf.h to tools/
bpftool: add support for CGROUP_STORAGE maps
bpf/test_run: support cgroup local storage
selftests/bpf: add verifier cgroup storage tests
selftests/bpf: add a cgroup storage test
samples/bpf: extend test_cgrp2_attach2 test to use cgroup storage
include/linux/bpf-cgroup.h | 53 ++++
include/linux/bpf.h | 25 +-
include/linux/bpf_types.h | 3 +
include/uapi/linux/bpf.h | 19 +-
kernel/bpf/Makefile | 1 +
kernel/bpf/cgroup.c | 54 +++-
kernel/bpf/core.c | 76 ++---
kernel/bpf/helpers.c | 20 ++
kernel/bpf/local_storage.c | 369 ++++++++++++++++++++++
kernel/bpf/map_in_map.c | 3 +-
kernel/bpf/syscall.c | 53 +++-
kernel/bpf/verifier.c | 38 ++-
net/bpf/test_run.c | 13 +-
net/core/filter.c | 23 +-
samples/bpf/test_cgrp2_attach2.c | 27 +-
tools/bpf/bpftool/map.c | 1 +
tools/include/uapi/linux/bpf.h | 9 +-
tools/testing/selftests/bpf/Makefile | 4 +-
tools/testing/selftests/bpf/bpf_helpers.h | 2 +
tools/testing/selftests/bpf/test_cgroup_storage.c | 130 ++++++++
tools/testing/selftests/bpf/test_verifier.c | 123 +++++++-
21 files changed, 965 insertions(+), 81 deletions(-)
create mode 100644 kernel/bpf/local_storage.c
create mode 100644 tools/testing/selftests/bpf/test_cgroup_storage.c
--
2.14.4
^ permalink raw reply
* [PATCH bpf-net 02/14] bpf: introduce cgroup storage maps
From: Roman Gushchin @ 2018-06-28 16:34 UTC (permalink / raw)
To: netdev; +Cc: kernel-team, tj, Roman Gushchin, Alexei Starovoitov,
Daniel Borkmann
In-Reply-To: <20180628163458.27193-1-guro@fb.com>
This commit introduces BPF_MAP_TYPE_CGROUP_STORAGE maps:
a special type of maps which are implementing the cgroup storage.
>From the userspace point of view it's almost a generic
hash map with the (cgroup inode id, attachment type) pair
used as a key.
The only difference is that some operations are restricted:
1) a user can't create new entries,
2) a user can't remove existing entries.
The lookup from userspace is o(log(n)).
Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
include/linux/bpf-cgroup.h | 38 +++++
include/linux/bpf.h | 1 +
include/linux/bpf_types.h | 3 +
include/uapi/linux/bpf.h | 6 +
kernel/bpf/Makefile | 1 +
kernel/bpf/local_storage.c | 367 +++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/verifier.c | 12 ++
7 files changed, 428 insertions(+)
create mode 100644 kernel/bpf/local_storage.c
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 975fb4cf1bb7..b4e2e42c1d2a 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -3,19 +3,39 @@
#define _BPF_CGROUP_H
#include <linux/jump_label.h>
+#include <linux/rbtree.h>
#include <uapi/linux/bpf.h>
struct sock;
struct sockaddr;
struct cgroup;
struct sk_buff;
+struct bpf_map;
+struct bpf_prog;
struct bpf_sock_ops_kern;
+struct bpf_cgroup_storage;
#ifdef CONFIG_CGROUP_BPF
extern struct static_key_false cgroup_bpf_enabled_key;
#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+struct bpf_cgroup_storage_map;
+
+struct bpf_storage_buffer {
+ struct rcu_head rcu;
+ char data[0];
+};
+
+struct bpf_cgroup_storage {
+ struct bpf_storage_buffer *buf;
+ struct bpf_cgroup_storage_map *map;
+ struct bpf_cgroup_storage_key key;
+ struct list_head list;
+ struct rb_node node;
+ struct rcu_head rcu;
+};
+
struct bpf_prog_list {
struct list_head node;
struct bpf_prog *prog;
@@ -76,6 +96,15 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
short access, enum bpf_attach_type type);
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
+void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
+void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+ struct cgroup *cgroup,
+ enum bpf_attach_type type);
+void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
+int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
+void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
+
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
({ \
@@ -194,6 +223,15 @@ struct cgroup_bpf {};
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
+static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
+ struct bpf_map *map) { return 0; }
+static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
+ struct bpf_map *map) {}
+static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
+ struct bpf_prog *prog) { return 0; }
+static inline void bpf_cgroup_storage_free(
+ struct bpf_cgroup_storage *storage) {}
+
#define cgroup_bpf_enabled (0)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e4d684ce3f5e..4b3e42e5b6d0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -281,6 +281,7 @@ struct bpf_prog_aux {
struct bpf_prog *prog;
struct user_struct *user;
u64 load_time; /* ns since boottime */
+ struct bpf_map *cgroup_storage;
char name[BPF_OBJ_NAME_LEN];
#ifdef CONFIG_SECURITY
void *security;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c5700c2d5549..add08be53b6f 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -37,6 +37,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops)
#ifdef CONFIG_CGROUPS
BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
#endif
+#ifdef CONFIG_CGROUP_BPF
+BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
+#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 59b19b6a40d7..7aa135e4c2f3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -75,6 +75,11 @@ struct bpf_lpm_trie_key {
__u8 data[0]; /* Arbitrary size */
};
+struct bpf_cgroup_storage_key {
+ __u64 cgroup_inode_id; /* cgroup inode id */
+ __u32 attach_type; /* program attach type */
+};
+
/* BPF syscall commands, see bpf(2) man-page for details. */
enum bpf_cmd {
BPF_MAP_CREATE,
@@ -120,6 +125,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
+ BPF_MAP_TYPE_CGROUP_STORAGE,
};
enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f27f5496d6fe..e8906cbad81f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,6 +3,7 @@ obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
new file mode 100644
index 000000000000..940889eda2c7
--- /dev/null
+++ b/kernel/bpf/local_storage.c
@@ -0,0 +1,367 @@
+//SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf-cgroup.h>
+#include <linux/bpf.h>
+#include <linux/bug.h>
+#include <linux/filter.h>
+#include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+
+#ifdef CONFIG_CGROUP_BPF
+
+struct bpf_cgroup_storage_map {
+ struct bpf_map map;
+ struct bpf_prog *prog;
+
+ spinlock_t lock;
+ struct rb_root root;
+ struct list_head list;
+};
+
+static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map)
+{
+ return container_of(map, struct bpf_cgroup_storage_map, map);
+}
+
+static int bpf_cgroup_storage_key_cmp(
+ const struct bpf_cgroup_storage_key *key1,
+ const struct bpf_cgroup_storage_key *key2)
+{
+ if (key1->cgroup_inode_id < key2->cgroup_inode_id)
+ return -1;
+ else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
+ return 1;
+ else if (key1->attach_type < key2->attach_type)
+ return -1;
+ else if (key1->attach_type > key2->attach_type)
+ return 1;
+ return 0;
+}
+
+static struct bpf_cgroup_storage *cgroup_storage_lookup(
+ struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key,
+ bool locked)
+{
+ struct rb_root *root = &map->root;
+ struct rb_node *node;
+
+ /*
+ * This lock protects rbtree and list of storage entries,
+ * which are used from the syscall context only.
+ * So, simple spin_lock()/unlock() is fine here.
+ */
+ if (!locked)
+ spin_lock(&map->lock);
+
+ node = root->rb_node;
+ while (node) {
+ struct bpf_cgroup_storage *storage;
+
+ storage = container_of(node, struct bpf_cgroup_storage, node);
+
+ switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) {
+ case -1:
+ node = node->rb_left;
+ break;
+ case 1:
+ node = node->rb_right;
+ break;
+ default:
+ if (!locked)
+ spin_unlock(&map->lock);
+ return storage;
+ }
+ }
+
+ if (!locked)
+ spin_unlock(&map->lock);
+
+ return NULL;
+}
+
+static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
+ struct bpf_cgroup_storage *storage)
+{
+ struct rb_root *root = &map->root;
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ while (*new) {
+ struct bpf_cgroup_storage *this;
+
+ this = container_of(*new, struct bpf_cgroup_storage, node);
+
+ parent = *new;
+ switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) {
+ case -1:
+ new = &((*new)->rb_left);
+ break;
+ case 1:
+ new = &((*new)->rb_right);
+ break;
+ default:
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&storage->node, parent, new);
+ rb_insert_color(&storage->node, root);
+
+ return 0;
+}
+
+static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct bpf_cgroup_storage_key *key = _key;
+ struct bpf_cgroup_storage *storage;
+
+ storage = cgroup_storage_lookup(map, key, false);
+ if (!storage)
+ return NULL;
+
+ return &READ_ONCE(storage->buf)->data[0];
+}
+
+static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
+ void *value, u64 flags)
+{
+ struct bpf_cgroup_storage_key *key = _key;
+ struct bpf_cgroup_storage *storage;
+ struct bpf_storage_buffer *new;
+
+ if (flags & BPF_NOEXIST)
+ return -EINVAL;
+
+ storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
+ key, false);
+ if (!storage)
+ return -ENOENT;
+
+ new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
+ map->value_size, __GFP_ZERO | GFP_USER,
+ map->numa_node);
+ if (!new)
+ return -ENOMEM;
+
+ memcpy(&new->data[0], value, map->value_size);
+
+ new = xchg(&storage->buf, new);
+ kfree_rcu(new, rcu);
+
+ return 0;
+}
+
+static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
+ void *_next_key)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct bpf_cgroup_storage_key *key = _key;
+ struct bpf_cgroup_storage_key *next = _next_key;
+ struct bpf_cgroup_storage *storage;
+
+ spin_lock(&map->lock);
+
+ if (list_empty(&map->list))
+ goto enoent;
+
+ if (key) {
+ storage = cgroup_storage_lookup(map, key, true);
+ if (!storage)
+ goto enoent;
+
+ storage = list_next_entry(storage, list);
+ if (!storage)
+ goto enoent;
+ } else {
+ storage = list_first_entry(&map->list,
+ struct bpf_cgroup_storage, list);
+ }
+
+ spin_unlock(&map->lock);
+ next->attach_type = storage->key.attach_type;
+ next->cgroup_inode_id = storage->key.cgroup_inode_id;
+ return 0;
+
+enoent:
+ spin_unlock(&map->lock);
+ return -ENOENT;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_cgroup_storage_map *map;
+
+ if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
+ return ERR_PTR(-EINVAL);
+
+ if (attr->value_size > PAGE_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
+ __GFP_ZERO | GFP_USER, numa_node);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map),
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* copy mandatory map attributes */
+ bpf_map_init_from_attr(&map->map, attr);
+
+ spin_lock_init(&map->lock);
+ map->root = RB_ROOT;
+ INIT_LIST_HEAD(&map->list);
+
+ return &map->map;
+}
+
+static void cgroup_storage_map_free(struct bpf_map *_map)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+ WARN_ON(!RB_EMPTY_ROOT(&map->root));
+ WARN_ON(!list_empty(&map->list));
+
+ kfree(map);
+}
+
+static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+const struct bpf_map_ops cgroup_storage_map_ops = {
+ .map_alloc = cgroup_storage_map_alloc,
+ .map_free = cgroup_storage_map_free,
+ .map_get_next_key = cgroup_storage_get_next_key,
+ .map_lookup_elem = cgroup_storage_lookup_elem,
+ .map_update_elem = cgroup_storage_update_elem,
+ .map_delete_elem = cgroup_storage_delete_elem,
+};
+
+/*
+ * Called by the verifier. bpf_verifier_lock must be locked.
+ */
+int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+ if (map->prog && map->prog != prog)
+ return -EBUSY;
+ if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map)
+ return -EBUSY;
+
+ map->prog = prog;
+ prog->aux->cgroup_storage = _map;
+
+ return 0;
+}
+
+/*
+ * Called by the verifier. bpf_verifier_lock must be locked.
+ */
+void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+
+ if (map->prog == prog) {
+ WARN_ON(prog->aux->cgroup_storage != _map);
+ map->prog = NULL;
+ }
+}
+
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog)
+{
+ struct bpf_cgroup_storage *storage;
+ struct bpf_map *map;
+ u32 pages;
+
+ map = prog->aux->cgroup_storage;
+ if (!map)
+ return NULL;
+
+ pages = round_up(sizeof(struct bpf_cgroup_storage) +
+ sizeof(struct bpf_storage_buffer) +
+ map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+ if (bpf_map_charge_memlock(map, pages))
+ return ERR_PTR(-EPERM);
+
+ storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
+ __GFP_ZERO | GFP_USER, map->numa_node);
+ if (!storage) {
+ bpf_map_uncharge_memlock(map, pages);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
+ map->value_size, __GFP_ZERO | GFP_USER,
+ map->numa_node);
+ if (!storage->buf) {
+ bpf_map_uncharge_memlock(map, pages);
+ kfree(storage);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ storage->map = (struct bpf_cgroup_storage_map *)map;
+
+ return storage;
+}
+
+void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
+{
+ u32 pages;
+ struct bpf_map *map;
+
+ if (!storage)
+ return;
+
+ map = &storage->map->map;
+ pages = round_up(sizeof(struct bpf_cgroup_storage) +
+ sizeof(struct bpf_storage_buffer) +
+ map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+ bpf_map_uncharge_memlock(map, pages);
+
+ kfree_rcu(storage->buf, rcu);
+ kfree_rcu(storage, rcu);
+}
+
+void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+ struct cgroup *cgroup,
+ enum bpf_attach_type type)
+{
+ struct bpf_cgroup_storage_map *map;
+
+ if (!storage)
+ return;
+
+ storage->key.attach_type = type;
+ storage->key.cgroup_inode_id = cgroup->kn->id.id;
+
+ map = storage->map;
+
+ spin_lock(&map->lock);
+ WARN_ON(cgroup_storage_insert(map, storage));
+ list_add(&storage->list, &map->list);
+ spin_unlock(&map->lock);
+}
+
+void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage)
+{
+ struct bpf_cgroup_storage_map *map;
+ struct rb_root *root;
+
+ if (!storage)
+ return;
+
+ map = storage->map;
+
+ spin_lock(&map->lock);
+ root = &map->root;
+ rb_erase(&storage->node, root);
+
+ list_del(&storage->list);
+ spin_unlock(&map->lock);
+}
+
+#endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9e2bf834f13a..de097a642c3f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5140,6 +5140,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
return -E2BIG;
}
+ if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
+ bpf_cgroup_storage_assign(env->prog, map)) {
+ verbose(env,
+ "only one cgroup storage is allowed\n");
+ fdput(f);
+ return -EBUSY;
+ }
+
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
@@ -5148,6 +5156,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
map = bpf_map_inc(map, false);
if (IS_ERR(map)) {
fdput(f);
+ if (map->map_type ==
+ BPF_MAP_TYPE_CGROUP_STORAGE)
+ bpf_cgroup_storage_release(env->prog,
+ map);
return PTR_ERR(map);
}
env->used_maps[env->used_map_cnt++] = map;
--
2.14.4
^ permalink raw reply related
* [PATCH bpf-net 03/14] bpf: pass a pointer to a cgroup storage using pcpu variable
From: Roman Gushchin @ 2018-06-28 16:34 UTC (permalink / raw)
To: netdev; +Cc: kernel-team, tj, Roman Gushchin, Alexei Starovoitov,
Daniel Borkmann
In-Reply-To: <20180628163458.27193-1-guro@fb.com>
This commit introduces the bpf_cgroup_storage_set() helper,
which will be used to pass a pointer to a cgroup storage
to the bpf helper.
Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
include/linux/bpf-cgroup.h | 14 ++++++++++++++
kernel/bpf/local_storage.c | 2 ++
2 files changed, 16 insertions(+)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index b4e2e42c1d2a..128fb0e39b4d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -20,6 +20,8 @@ struct bpf_cgroup_storage;
extern struct static_key_false cgroup_bpf_enabled_key;
#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+
struct bpf_cgroup_storage_map;
struct bpf_storage_buffer {
@@ -96,6 +98,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
short access, enum bpf_attach_type type);
+static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage)
+{
+ struct bpf_storage_buffer *buf;
+
+ if (!storage)
+ return;
+
+ buf = rcu_dereference(storage->buf);
+ this_cpu_write(bpf_cgroup_storage, &buf->data[0]);
+}
+
struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
@@ -223,6 +236,7 @@ struct cgroup_bpf {};
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
+static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {}
static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
struct bpf_map *map) { return 0; }
static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 940889eda2c7..38810a712971 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,6 +7,8 @@
#include <linux/rbtree.h>
#include <linux/slab.h>
+DEFINE_PER_CPU(void*, bpf_cgroup_storage);
+
#ifdef CONFIG_CGROUP_BPF
struct bpf_cgroup_storage_map {
--
2.14.4
^ permalink raw reply related
* [PATCH bpf-net 01/14] bpf: add ability to charge bpf maps memory dynamically
From: Roman Gushchin @ 2018-06-28 16:34 UTC (permalink / raw)
To: netdev; +Cc: kernel-team, tj, Roman Gushchin, Alexei Starovoitov,
Daniel Borkmann
In-Reply-To: <20180628163458.27193-1-guro@fb.com>
This commits extends existing bpf maps memory charging API
to support dynamic charging/uncharging.
This is required to account memory used by maps,
if all entries are created dynamically after
the map initialization.
Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
include/linux/bpf.h | 2 ++
kernel/bpf/syscall.c | 53 +++++++++++++++++++++++++++++++++++++---------------
2 files changed, 40 insertions(+), 15 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7df32a3200f7..e4d684ce3f5e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -434,6 +434,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
int bpf_map_precharge_memlock(u32 pages);
+int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
+void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
void *bpf_map_area_alloc(size_t size, int numa_node);
void bpf_map_area_free(void *base);
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35dc466641f2..e03aeeec01e0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -181,32 +181,55 @@ int bpf_map_precharge_memlock(u32 pages)
return 0;
}
-static int bpf_map_charge_memlock(struct bpf_map *map)
+static int bpf_charge_memlock(struct user_struct *user, u32 pages)
{
- struct user_struct *user = get_current_user();
- unsigned long memlock_limit;
+ unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) {
+ atomic_long_sub(pages, &user->locked_vm);
+ return -EPERM;
+ }
+ return 0;
+}
- atomic_long_add(map->pages, &user->locked_vm);
+static int bpf_map_init_memlock(struct bpf_map *map)
+{
+ struct user_struct *user = get_current_user();
+ int ret;
- if (atomic_long_read(&user->locked_vm) > memlock_limit) {
- atomic_long_sub(map->pages, &user->locked_vm);
+ ret = bpf_charge_memlock(user, map->pages);
+ if (ret) {
free_uid(user);
- return -EPERM;
+ return ret;
}
map->user = user;
- return 0;
+ return ret;
}
-static void bpf_map_uncharge_memlock(struct bpf_map *map)
+static void bpf_map_release_memlock(struct bpf_map *map)
{
struct user_struct *user = map->user;
-
- atomic_long_sub(map->pages, &user->locked_vm);
+ atomic_long_sub(map->pages, &map->user->locked_vm);
free_uid(user);
}
+int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
+{
+ int ret;
+
+ ret = bpf_charge_memlock(map->user, pages);
+ if (ret)
+ return ret;
+ map->pages += pages;
+ return ret;
+}
+
+void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
+{
+ atomic_long_sub(pages, &map->user->locked_vm);
+ map->pages -= pages;
+}
+
static int bpf_map_alloc_id(struct bpf_map *map)
{
int id;
@@ -256,7 +279,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
- bpf_map_uncharge_memlock(map);
+ bpf_map_release_memlock(map);
security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
@@ -492,7 +515,7 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_nouncharge;
- err = bpf_map_charge_memlock(map);
+ err = bpf_map_init_memlock(map);
if (err)
goto free_map_sec;
@@ -515,7 +538,7 @@ static int map_create(union bpf_attr *attr)
return err;
free_map:
- bpf_map_uncharge_memlock(map);
+ bpf_map_release_memlock(map);
free_map_sec:
security_bpf_map_free(map);
free_map_nouncharge:
--
2.14.4
^ permalink raw reply related
* [PATCH bpf-net 04/14] bpf: allocate cgroup storage entries on attaching bpf programs
From: Roman Gushchin @ 2018-06-28 16:34 UTC (permalink / raw)
To: netdev; +Cc: kernel-team, tj, Roman Gushchin, Alexei Starovoitov,
Daniel Borkmann
In-Reply-To: <20180628163458.27193-1-guro@fb.com>
If a bpf program is using cgroup local storage, allocate
a bpf_cgroup_storage structure automatically on attaching the program
to a cgroup and save the pointer into the corresponding bpf_prog_list
entry.
Analogically, release the cgroup local storage on detaching
of the bpf program.
Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
include/linux/bpf-cgroup.h | 1 +
kernel/bpf/cgroup.c | 28 ++++++++++++++++++++++++++--
2 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 128fb0e39b4d..25ba744d2364 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -41,6 +41,7 @@ struct bpf_cgroup_storage {
struct bpf_prog_list {
struct list_head node;
struct bpf_prog *prog;
+ struct bpf_cgroup_storage *storage;
};
struct bpf_prog_array;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index f7c00bd6f8e4..f0a809868f92 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -34,6 +34,8 @@ void cgroup_bpf_put(struct cgroup *cgrp)
list_for_each_entry_safe(pl, tmp, progs, node) {
list_del(&pl->node);
bpf_prog_put(pl->prog);
+ bpf_cgroup_storage_unlink(pl->storage);
+ bpf_cgroup_storage_free(pl->storage);
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
@@ -189,6 +191,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
{
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
+ struct bpf_cgroup_storage *storage, *old_storage = NULL;
struct cgroup_subsys_state *css;
struct bpf_prog_list *pl;
bool pl_was_allocated;
@@ -211,6 +214,10 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
+ storage = bpf_cgroup_storage_alloc(prog);
+ if (IS_ERR(storage))
+ return -ENOMEM;
+
if (flags & BPF_F_ALLOW_MULTI) {
list_for_each_entry(pl, progs, node)
if (pl->prog == prog)
@@ -218,24 +225,33 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
return -EINVAL;
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
- if (!pl)
+ if (!pl) {
+ bpf_cgroup_storage_free(storage);
return -ENOMEM;
+ }
+
pl_was_allocated = true;
pl->prog = prog;
+ pl->storage = storage;
list_add_tail(&pl->node, progs);
} else {
if (list_empty(progs)) {
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
- if (!pl)
+ if (!pl) {
+ bpf_cgroup_storage_free(storage);
return -ENOMEM;
+ }
pl_was_allocated = true;
list_add_tail(&pl->node, progs);
} else {
pl = list_first_entry(progs, typeof(*pl), node);
old_prog = pl->prog;
+ old_storage = pl->storage;
+ bpf_cgroup_storage_unlink(old_storage);
pl_was_allocated = false;
}
pl->prog = prog;
+ pl->storage = storage;
}
cgrp->bpf.flags[type] = flags;
@@ -258,10 +274,13 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
}
static_branch_inc(&cgroup_bpf_enabled_key);
+ if (old_storage)
+ bpf_cgroup_storage_free(old_storage);
if (old_prog) {
bpf_prog_put(old_prog);
static_branch_dec(&cgroup_bpf_enabled_key);
}
+ bpf_cgroup_storage_link(storage, cgrp, type);
return 0;
cleanup:
@@ -277,6 +296,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
/* and cleanup the prog list */
pl->prog = old_prog;
+ bpf_cgroup_storage_free(pl->storage);
+ pl->storage = old_storage;
+ bpf_cgroup_storage_link(old_storage, cgrp, type);
if (pl_was_allocated) {
list_del(&pl->node);
kfree(pl);
@@ -357,6 +379,8 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
/* now can actually delete it from this cgroup list */
list_del(&pl->node);
+ bpf_cgroup_storage_unlink(pl->storage);
+ bpf_cgroup_storage_free(pl->storage);
kfree(pl);
if (list_empty(progs))
/* last program was detached, reset flags to zero */
--
2.14.4
^ permalink raw reply related
* [PATCH bpf-net 06/14] bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE
From: Roman Gushchin @ 2018-06-28 16:34 UTC (permalink / raw)
To: netdev; +Cc: kernel-team, tj, Roman Gushchin, Alexei Starovoitov,
Daniel Borkmann
In-Reply-To: <20180628163458.27193-1-guro@fb.com>
BPF_MAP_TYPE_CGROUP_STORAGE maps are special in a way
that the access from the bpf program side is lookup-free.
That means the result is guaranteed to be a valid
pointer to the cgroup storage; no NULL-check is required.
This patch introduces BPF_PTR_TO_MAP_VALUE return type,
which is required to cause the verifier accept programs,
which are not checking the map value pointer for being NULL.
Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
include/linux/bpf.h | 1 +
kernel/bpf/verifier.c | 8 ++++++--
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 709354a0608a..aaaa6d7e0dfc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -154,6 +154,7 @@ enum bpf_arg_type {
enum bpf_return_type {
RET_INTEGER, /* function returns integer */
RET_VOID, /* function doesn't return anything */
+ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */
RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */
};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index de097a642c3f..cc0c7990f849 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2545,8 +2545,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
mark_reg_unknown(env, regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
+ fn->ret_type == RET_PTR_TO_MAP_VALUE) {
+ if (fn->ret_type == RET_PTR_TO_MAP_VALUE)
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
+ else
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].off = 0;
--
2.14.4
^ permalink raw reply related
* [PATCH bpf-net 05/14] bpf: extend bpf_prog_array to store pointers to the cgroup storage
From: Roman Gushchin @ 2018-06-28 16:34 UTC (permalink / raw)
To: netdev; +Cc: kernel-team, tj, Roman Gushchin, Alexei Starovoitov,
Daniel Borkmann
In-Reply-To: <20180628163458.27193-1-guro@fb.com>
This patch converts bpf_prog_array from an array of prog pointers
to the array of struct bpf_prog_array_item elements.
This allows to save a cgroup storage pointer for each bpf program
efficiently attached to a cgroup.
Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
include/linux/bpf.h | 19 +++++++++-----
kernel/bpf/cgroup.c | 24 ++++++++++-------
kernel/bpf/core.c | 76 +++++++++++++++++++++++++++--------------------------
3 files changed, 66 insertions(+), 53 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4b3e42e5b6d0..709354a0608a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -348,9 +348,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
* The 'struct bpf_prog_array *' should only be replaced with xchg()
* since other cpus are walking the array of pointers in parallel.
*/
+struct bpf_prog_array_item {
+ struct bpf_prog *prog;
+ struct bpf_cgroup_storage *cgroup_storage;
+};
+
struct bpf_prog_array {
struct rcu_head rcu;
- struct bpf_prog *progs[0];
+ struct bpf_prog_array_item items[0];
};
struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
@@ -371,7 +376,8 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \
({ \
- struct bpf_prog **_prog, *__prog; \
+ struct bpf_prog_array_item *_item; \
+ struct bpf_prog *_prog; \
struct bpf_prog_array *_array; \
u32 _ret = 1; \
preempt_disable(); \
@@ -379,10 +385,11 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
_array = rcu_dereference(array); \
if (unlikely(check_non_null && !_array))\
goto _out; \
- _prog = _array->progs; \
- while ((__prog = READ_ONCE(*_prog))) { \
- _ret &= func(__prog, ctx); \
- _prog++; \
+ _item = &_array->items[0]; \
+ while ((_prog = READ_ONCE(_item->prog))) { \
+ bpf_cgroup_storage_set(_item->cgroup_storage); \
+ _ret &= func(_prog, ctx); \
+ _item++; \
} \
_out: \
rcu_read_unlock(); \
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index f0a809868f92..14a1f6c94592 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -117,16 +117,20 @@ static int compute_effective_progs(struct cgroup *cgrp,
cnt = 0;
p = cgrp;
do {
- if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
- list_for_each_entry(pl,
- &p->bpf.progs[type], node) {
- if (!pl->prog)
- continue;
- rcu_dereference_protected(progs, 1)->
- progs[cnt++] = pl->prog;
- }
- p = cgroup_parent(p);
- } while (p);
+ if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ list_for_each_entry(pl, &p->bpf.progs[type], node) {
+ if (!pl->prog)
+ continue;
+
+ rcu_dereference_protected(progs, 1)->
+ items[cnt].prog = pl->prog;
+ rcu_dereference_protected(progs, 1)->
+ items[cnt].cgroup_storage = pl->storage;
+ cnt++;
+ }
+ } while ((p = cgroup_parent(p)));
*array = progs;
return 0;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a9e6c04d0f4a..145f44cb0cad 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1570,7 +1570,8 @@ struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
if (prog_cnt)
return kzalloc(sizeof(struct bpf_prog_array) +
- sizeof(struct bpf_prog *) * (prog_cnt + 1),
+ sizeof(struct bpf_prog_array_item) *
+ (prog_cnt + 1),
flags);
return &empty_prog_array.hdr;
@@ -1584,43 +1585,45 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
kfree_rcu(progs, rcu);
}
-int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+int bpf_prog_array_length(struct bpf_prog_array __rcu *array)
{
- struct bpf_prog **prog;
+ struct bpf_prog_array_item *item;
u32 cnt = 0;
rcu_read_lock();
- prog = rcu_dereference(progs)->progs;
- for (; *prog; prog++)
- if (*prog != &dummy_bpf_prog.prog)
+ item = rcu_dereference(array)->items;
+ for (; item->prog; item++)
+ if (item->prog != &dummy_bpf_prog.prog)
cnt++;
rcu_read_unlock();
return cnt;
}
-static bool bpf_prog_array_copy_core(struct bpf_prog **prog,
+
+static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
u32 *prog_ids,
u32 request_cnt)
{
+ struct bpf_prog_array_item *item;
int i = 0;
- for (; *prog; prog++) {
- if (*prog == &dummy_bpf_prog.prog)
+ item = rcu_dereference(array)->items;
+ for (; item->prog; item++) {
+ if (item->prog == &dummy_bpf_prog.prog)
continue;
- prog_ids[i] = (*prog)->aux->id;
+ prog_ids[i] = item->prog->aux->id;
if (++i == request_cnt) {
- prog++;
+ item++;
break;
}
}
- return !!(*prog);
+ return !!(item->prog);
}
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
__u32 __user *prog_ids, u32 cnt)
{
- struct bpf_prog **prog;
unsigned long err = 0;
bool nospc;
u32 *ids;
@@ -1639,8 +1642,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
if (!ids)
return -ENOMEM;
rcu_read_lock();
- prog = rcu_dereference(progs)->progs;
- nospc = bpf_prog_array_copy_core(prog, ids, cnt);
+ nospc = bpf_prog_array_copy_core(array, ids, cnt);
rcu_read_unlock();
err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
kfree(ids);
@@ -1651,14 +1653,14 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
return 0;
}
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array,
struct bpf_prog *old_prog)
{
- struct bpf_prog **prog = progs->progs;
+ struct bpf_prog_array_item *item = array->items;
- for (; *prog; prog++)
- if (*prog == old_prog) {
- WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+ for (; item->prog; item++)
+ if (item->prog == old_prog) {
+ WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
break;
}
}
@@ -1669,7 +1671,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
struct bpf_prog_array **new_array)
{
int new_prog_cnt, carry_prog_cnt = 0;
- struct bpf_prog **existing_prog;
+ struct bpf_prog_array_item *existing;
struct bpf_prog_array *array;
bool found_exclude = false;
int new_prog_idx = 0;
@@ -1678,15 +1680,15 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
* the new array.
*/
if (old_array) {
- existing_prog = old_array->progs;
- for (; *existing_prog; existing_prog++) {
- if (*existing_prog == exclude_prog) {
+ existing = old_array->items;
+ for (; existing->prog; existing++) {
+ if (existing->prog == exclude_prog) {
found_exclude = true;
continue;
}
- if (*existing_prog != &dummy_bpf_prog.prog)
+ if (existing->prog != &dummy_bpf_prog.prog)
carry_prog_cnt++;
- if (*existing_prog == include_prog)
+ if (existing->prog == include_prog)
return -EEXIST;
}
}
@@ -1712,15 +1714,17 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
/* Fill in the new prog array */
if (carry_prog_cnt) {
- existing_prog = old_array->progs;
- for (; *existing_prog; existing_prog++)
- if (*existing_prog != exclude_prog &&
- *existing_prog != &dummy_bpf_prog.prog)
- array->progs[new_prog_idx++] = *existing_prog;
+ existing = old_array->items;
+ for (; existing->prog; existing++)
+ if (existing->prog != exclude_prog &&
+ existing->prog != &dummy_bpf_prog.prog) {
+ array->items[new_prog_idx++].prog =
+ existing->prog;
+ }
}
if (include_prog)
- array->progs[new_prog_idx++] = include_prog;
- array->progs[new_prog_idx] = NULL;
+ array->items[new_prog_idx++].prog = include_prog;
+ array->items[new_prog_idx].prog = NULL;
*new_array = array;
return 0;
}
@@ -1729,7 +1733,6 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
u32 *prog_ids, u32 request_cnt,
u32 *prog_cnt)
{
- struct bpf_prog **prog;
u32 cnt = 0;
if (array)
@@ -1742,8 +1745,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
return 0;
/* this function is called under trace/bpf_trace.c: bpf_event_mutex */
- prog = rcu_dereference_check(array, 1)->progs;
- return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC
+ return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC
: 0;
}
--
2.14.4
^ permalink raw reply related
* [PATCH bpf-net 09/14] bpf: sync bpf.h to tools/
From: Roman Gushchin @ 2018-06-28 16:34 UTC (permalink / raw)
To: netdev; +Cc: kernel-team, tj, Roman Gushchin, Alexei Starovoitov,
Daniel Borkmann
In-Reply-To: <20180628163458.27193-1-guro@fb.com>
Sync cgroup storage related changes:
1) new BPF_MAP_TYPE_CGROUP_STORAGE map type
2) struct bpf_cgroup_sotrage_key definition
3) get_local_storage() helper
Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
tools/include/uapi/linux/bpf.h | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e0b06784f227..06e111181dda 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -75,6 +75,11 @@ struct bpf_lpm_trie_key {
__u8 data[0]; /* Arbitrary size */
};
+struct bpf_cgroup_storage_key {
+ __u64 cgroup_inode_id; /* cgroup inode id */
+ __u32 attach_type; /* program attach type */
+};
+
/* BPF syscall commands, see bpf(2) man-page for details. */
enum bpf_cmd {
BPF_MAP_CREATE,
@@ -120,6 +125,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_CPUMAP,
BPF_MAP_TYPE_XSKMAP,
BPF_MAP_TYPE_SOCKHASH,
+ BPF_MAP_TYPE_CGROUP_STORAGE,
};
enum bpf_prog_type {
@@ -2157,7 +2163,8 @@ union bpf_attr {
FN(rc_repeat), \
FN(rc_keydown), \
FN(skb_cgroup_id), \
- FN(get_current_cgroup_id),
+ FN(get_current_cgroup_id), \
+ FN(get_local_storage),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
--
2.14.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox