* [PATCH net-next 4/8] bpf samples: Update sock test to allow setting mark and priority
From: David Ahern @ 2017-08-23 0:20 UTC (permalink / raw)
To: netdev, daniel, ast, tj, davem; +Cc: David Ahern
In-Reply-To: <1503447621-27997-1-git-send-email-dsahern@gmail.com>
Update sock test to set mark and priority on socket create.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
samples/bpf/test_cgrp2_sock.c | 129 ++++++++++++++++++++++++++++++++++++-----
samples/bpf/test_cgrp2_sock.sh | 2 +-
2 files changed, 116 insertions(+), 15 deletions(-)
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index c3cfb23e23b5..c2501c9508a7 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -19,55 +19,156 @@
#include <errno.h>
#include <fcntl.h>
#include <net/if.h>
+#include <inttypes.h>
#include <linux/bpf.h>
#include "libbpf.h"
char bpf_log_buf[BPF_LOG_BUF_SIZE];
-static int prog_load(int idx)
+static int prog_load(__u32 idx, __u32 mark, __u32 prio)
{
- struct bpf_insn prog[] = {
+ /* save pointer to context */
+ struct bpf_insn prog_start[] = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ };
+ struct bpf_insn prog_end[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
+ BPF_EXIT_INSN(),
+ };
+
+ /* set sk_bound_dev_if on socket */
+ struct bpf_insn prog_dev[] = {
BPF_MOV64_IMM(BPF_REG_3, idx),
BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
- BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
- BPF_EXIT_INSN(),
};
- size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
- return bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt,
+ /* set mark on socket */
+ struct bpf_insn prog_mark[] = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_IMM(BPF_REG_3, mark),
+ BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)),
+ };
+
+ /* set priority on socket */
+ struct bpf_insn prog_prio[] = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_IMM(BPF_REG_3, prio),
+ BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)),
+ };
+
+ struct bpf_insn *prog;
+ size_t insns_cnt;
+ void *p;
+ int ret;
+
+ insns_cnt = sizeof(prog_start) + sizeof(prog_end);
+ if (idx)
+ insns_cnt += sizeof(prog_dev);
+
+ if (mark)
+ insns_cnt += sizeof(prog_mark);
+
+ if (prio)
+ insns_cnt += sizeof(prog_prio);
+
+ p = prog = malloc(insns_cnt);
+ if (!prog) {
+ fprintf(stderr, "Failed to allocate memory for instructions\n");
+ return EXIT_FAILURE;
+ }
+
+ memcpy(p, prog_start, sizeof(prog_start));
+ p += sizeof(prog_start);
+
+ if (idx) {
+ memcpy(p, prog_dev, sizeof(prog_dev));
+ p += sizeof(prog_dev);
+ }
+
+ if (mark) {
+ memcpy(p, prog_mark, sizeof(prog_mark));
+ p += sizeof(prog_mark);
+ }
+
+ if (prio) {
+ memcpy(p, prog_prio, sizeof(prog_prio));
+ p += sizeof(prog_prio);
+ }
+
+ memcpy(p, prog_end, sizeof(prog_end));
+ p += sizeof(prog_end);
+
+ insns_cnt /= sizeof(struct bpf_insn);
+
+ ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt,
"GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
+
+ free(prog);
+
+ return ret;
}
static int usage(const char *argv0)
{
- printf("Usage: %s cg-path device-index\n", argv0);
+ printf("Usage: %s -b bind-to-dev -m mark -p prio cg-path\n", argv0);
return EXIT_FAILURE;
}
int main(int argc, char **argv)
{
+ __u32 idx = 0, mark = 0, prio = 0;
+ const char *cgrp_path = NULL;
int cg_fd, prog_fd, ret;
- unsigned int idx;
+ int rc;
+
+ while ((rc = getopt(argc, argv, "b:m:p:")) != -1) {
+ switch (rc) {
+ case 'b':
+ idx = if_nametoindex(optarg);
+ if (!idx) {
+ idx = strtoumax(optarg, NULL, 0);
+ if (!idx) {
+ printf("Invalid device name\n");
+ return EXIT_FAILURE;
+ }
+ }
+ break;
+ case 'm':
+ mark = strtoumax(optarg, NULL, 0);
+ break;
+ case 'p':
+ prio = strtoumax(optarg, NULL, 0);
+ break;
+ default:
+ return usage(argv[0]);
+ }
+ }
- if (argc < 2)
+ if (optind == argc)
return usage(argv[0]);
- idx = if_nametoindex(argv[2]);
- if (!idx) {
- printf("Invalid device name\n");
+ cgrp_path = argv[optind];
+ if (!cgrp_path) {
+ fprintf(stderr, "cgroup path not given\n");
+ return EXIT_FAILURE;
+ }
+
+ if (!idx && !mark && !prio) {
+ fprintf(stderr, "One of device, mark or priority must be given\n");
return EXIT_FAILURE;
}
- cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+ cg_fd = open(cgrp_path, O_DIRECTORY | O_RDONLY);
if (cg_fd < 0) {
printf("Failed to open cgroup path: '%s'\n", strerror(errno));
return EXIT_FAILURE;
}
- prog_fd = prog_load(idx);
+ prog_fd = prog_load(idx, mark, prio);
printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
if (prog_fd < 0) {
diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh
index 925fd467c7cc..1153c33e8964 100755
--- a/samples/bpf/test_cgrp2_sock.sh
+++ b/samples/bpf/test_cgrp2_sock.sh
@@ -20,7 +20,7 @@ function attach_bpf {
mkdir -p /tmp/cgroupv2
mount -t cgroup2 none /tmp/cgroupv2
mkdir -p /tmp/cgroupv2/foo
- test_cgrp2_sock /tmp/cgroupv2/foo foo
+ test_cgrp2_sock -b foo /tmp/cgroupv2/foo
echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
}
--
2.1.4
^ permalink raw reply related
* [PATCH net-next 3/8] bpf: Allow cgroup sock filters to use get_current_uid_gid helper
From: David Ahern @ 2017-08-23 0:20 UTC (permalink / raw)
To: netdev, daniel, ast, tj, davem; +Cc: David Ahern
In-Reply-To: <1503447621-27997-1-git-send-email-dsahern@gmail.com>
Allow BPF programs run on sock create to use the get_current_uid_gid
helper.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
net/core/filter.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 7ee75a40ff03..6d3f693021f9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3139,6 +3139,17 @@ bpf_base_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
+sock_filter_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -4227,7 +4238,7 @@ const struct bpf_verifier_ops lwt_xmit_prog_ops = {
};
const struct bpf_verifier_ops cg_sock_prog_ops = {
- .get_func_proto = bpf_base_func_proto,
+ .get_func_proto = sock_filter_func_proto,
.is_valid_access = sock_filter_is_valid_access,
.convert_ctx_access = sock_filter_convert_ctx_access,
};
--
2.1.4
^ permalink raw reply related
* [PATCH net-next 2/3 v8] net: arp: Add support for raw IP device
From: Subash Abhinov Kasiviswanathan @ 2017-08-23 0:20 UTC (permalink / raw)
To: netdev, davem, fengguang.wu, dcbw, jiri, stephen, David.Laight,
marcel, andrew
Cc: Subash Abhinov Kasiviswanathan
In-Reply-To: <1503447610-11409-1-git-send-email-subashab@codeaurora.org>
Define the raw IP type. This is needed for raw IP net devices
like rmnet.
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
include/uapi/linux/if_arp.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h
index cf73510..a2a6356 100644
--- a/include/uapi/linux/if_arp.h
+++ b/include/uapi/linux/if_arp.h
@@ -59,6 +59,7 @@
#define ARPHRD_LAPB 516 /* LAPB */
#define ARPHRD_DDCMP 517 /* Digital's DDCMP protocol */
#define ARPHRD_RAWHDLC 518 /* Raw HDLC */
+#define ARPHRD_RAWIP 519 /* Raw IP */
#define ARPHRD_TUNNEL 768 /* IPIP tunnel */
#define ARPHRD_TUNNEL6 769 /* IP6IP6 tunnel */
--
1.9.1
^ permalink raw reply related
* [PATCH net-next 2/8] bpf: Add mark and priority to sock options that can be set
From: David Ahern @ 2017-08-23 0:20 UTC (permalink / raw)
To: netdev, daniel, ast, tj, davem; +Cc: David Ahern
In-Reply-To: <1503447621-27997-1-git-send-email-dsahern@gmail.com>
Add socket mark and priority to fields that can be set by
ebpf program when a socket is created.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/uapi/linux/bpf.h | 2 ++
net/core/filter.c | 26 ++++++++++++++++++++++++++
2 files changed, 28 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 843818dff96d..a89e5e6dff7c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -764,6 +764,8 @@ struct bpf_sock {
__u32 family;
__u32 type;
__u32 protocol;
+ __u32 mark;
+ __u32 priority;
};
#define XDP_PACKET_HEADROOM 256
diff --git a/net/core/filter.c b/net/core/filter.c
index fa2115695037..7ee75a40ff03 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3444,6 +3444,10 @@ static bool sock_filter_is_valid_access(int off, int size,
switch (off) {
case offsetof(struct bpf_sock, bound_dev_if):
break;
+ case offsetof(struct bpf_sock, mark):
+ break;
+ case offsetof(struct bpf_sock, priority):
+ break;
default:
return false;
}
@@ -3952,6 +3956,28 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
offsetof(struct sock, sk_bound_dev_if));
break;
+ case offsetof(struct bpf_sock, mark):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_mark));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_mark));
+ break;
+
+ case offsetof(struct bpf_sock, priority):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_priority));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_priority));
+ break;
+
case offsetof(struct bpf_sock, family):
BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
--
2.1.4
^ permalink raw reply related
* [PATCH net-next 1/3 v8] net: ether: Add support for multiplexing and aggregation type
From: Subash Abhinov Kasiviswanathan @ 2017-08-23 0:20 UTC (permalink / raw)
To: netdev, davem, fengguang.wu, dcbw, jiri, stephen, David.Laight,
marcel, andrew
Cc: Subash Abhinov Kasiviswanathan
In-Reply-To: <1503447610-11409-1-git-send-email-subashab@codeaurora.org>
Define the multiplexing and aggregation (MAP) ether type 0x00F9. This
is needed for receiving data in the MAP protocol like RMNET. This is
not an officially registered ID.
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
include/uapi/linux/if_ether.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 5bc9bfd..0d73ecc 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -137,6 +137,7 @@
#define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */
#define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */
#define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */
+#define ETH_P_MAP 0x00F9 /* Multiplex & aggregation proto*/
/*
* This is an Ethernet frame header.
--
1.9.1
^ permalink raw reply related
* [PATCH net-next 1/8] bpf: Recursively apply cgroup sock filters
From: David Ahern @ 2017-08-23 0:20 UTC (permalink / raw)
To: netdev, daniel, ast, tj, davem; +Cc: David Ahern
In-Reply-To: <1503447621-27997-1-git-send-email-dsahern@gmail.com>
Recursively apply sock filters attached to a cgroup. For now, start
with the inner cgroup attached to the socket and work back to the
root. If desired, the inverse can be done using an attach flag (start
with parent cgroup and go in).
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/linux/bpf-cgroup.h | 5 +++--
kernel/bpf/cgroup.c | 4 +---
kernel/cgroup/cgroup.c | 18 ++++++++++++++++++
3 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index d41d40ac3efd..d95e44ccd549 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -40,8 +40,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum bpf_attach_type type);
-int __cgroup_bpf_run_filter_sk(struct sock *sk,
+int __cgroup_bpf_run_filter_sk(struct cgroup *cgrp, struct sock *sk,
enum bpf_attach_type type);
+int cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type);
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
struct bpf_sock_ops_kern *sock_ops,
@@ -74,7 +75,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
({ \
int __ret = 0; \
if (cgroup_bpf_enabled && sk) { \
- __ret = __cgroup_bpf_run_filter_sk(sk, \
+ __ret = cgroup_bpf_run_filter_sk(sk, \
BPF_CGROUP_INET_SOCK_CREATE); \
} \
__ret; \
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..0480610bda83 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -217,14 +217,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
* This function will return %-EPERM if any if an attached program was found
* and if it returned != 1 during execution. In all other cases, 0 is returned.
*/
-int __cgroup_bpf_run_filter_sk(struct sock *sk,
+int __cgroup_bpf_run_filter_sk(struct cgroup *cgrp, struct sock *sk,
enum bpf_attach_type type)
{
- struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
struct bpf_prog *prog;
int ret = 0;
-
rcu_read_lock();
prog = rcu_dereference(cgrp->bpf.effective[type]);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index df2e0f14a95d..7480cebab073 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5186,4 +5186,22 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
mutex_unlock(&cgroup_mutex);
return ret;
}
+
+int cgroup_bpf_run_filter_sk(struct sock *sk,
+ enum bpf_attach_type type)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ int ret = 0;
+
+ while (cgrp) {
+ ret = __cgroup_bpf_run_filter_sk(cgrp, sk, type);
+ if (ret < 0)
+ break;
+
+ cgrp = cgroup_parent(cgrp);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(cgroup_bpf_run_filter_sk);
#endif /* CONFIG_CGROUP_BPF */
--
2.1.4
^ permalink raw reply related
* [PATCH net-next 0/8] bpf: Add option to set mark and priority in cgroup sock programs
From: David Ahern @ 2017-08-23 0:20 UTC (permalink / raw)
To: netdev, daniel, ast, tj, davem; +Cc: David Ahern
Add option to set mark and priority in addition to bound device for newly
created sockets. Also, allow the bpf programs to use the get_current_uid_gid
helper, meaning socket marks, priority and device can be set based on the
uid/gid of the running process.
For flexibility in deploying these programs, cgroups are walked from
current to root running any program attached. This allows one cgroup
level to control the device a socket is bound to (e.g, a VRF) while
cgroups can be used to set socket marks and priority.
Sample programs are updated to demonstrate the new options.
David Ahern (8):
bpf: Recursively apply cgroup sock filters
bpf: Add mark and priority to sock options that can be set
bpf: Allow cgroup sock filters to use get_current_uid_gid helper
bpf samples: Update sock test to allow setting mark and priority
bpf/samples: Add detach option to test_cgrp2_sock
samples/bpf: Add option to dump socket settings
samples/bpf: Add test case for nested socket options
samples/bpf: Update cgroup socket examples to use uid gid helper
include/linux/bpf-cgroup.h | 5 +-
include/uapi/linux/bpf.h | 2 +
kernel/bpf/cgroup.c | 4 +-
kernel/cgroup/cgroup.c | 18 +++
net/core/filter.c | 39 ++++++-
samples/bpf/sock_flags_kern.c | 5 +
samples/bpf/test_cgrp2_sock.c | 252 ++++++++++++++++++++++++++++++++++++----
samples/bpf/test_cgrp2_sock.sh | 2 +-
samples/bpf/test_cgrp2_sock3.sh | 118 +++++++++++++++++++
9 files changed, 413 insertions(+), 32 deletions(-)
create mode 100755 samples/bpf/test_cgrp2_sock3.sh
--
2.1.4
^ permalink raw reply
* [PATCH net-next 0/3 v8] Add support for rmnet driver
From: Subash Abhinov Kasiviswanathan @ 2017-08-23 0:20 UTC (permalink / raw)
To: netdev, davem, fengguang.wu, dcbw, jiri, stephen, David.Laight,
marcel, andrew
Cc: Subash Abhinov Kasiviswanathan
This patch adds support for the rmnet driver which is required to
support recent chipsets using Qualcomm Technologies, Inc. modems. The data
from hardware follows the multiplexing and aggregation protocol (MAP).
This driver can be used to register onto any physical network device in
IP mode. Physical transports include USB, HSIC, PCIe and IP accelerator.
rmnet driver helps to decode these packets and queue them to network
stack (and encode and transmit it to the physical device).
--
v1: Same as the RFC patch with some minor fixes for issues reported by
kbuild test robot.
v1->v2: Change datatypes and remove config IOCTL as mentioned by David.
Also fix checkpatch issues and remove some unused code.
v2->v3: Move location to drivers/net and rename to rmnet. Change the
userspace - netlink communication from custom netlink to rtnl_link_ops.
Refactor some code. Use a fixed config for ingress and egress.
v3->v4: Move location to drivers/net/ethernet/qualcomm/.
Fix comments from Stephen and Jiri -
Split the ether and arp type changes into separate patches.
Remove debug and custom logging and switch to standard netdevice log.
Remove module parameters. Refactor and change some code style issues.
v4->v5: Rename some structs and variables. Move the initializer
before the for loop start. Put the arp type in correct sequence.
v5->v6: Fix comments from Dan -
Use the upper link API. As a result, remove all the refcounting logic.
Device refcount is explicitly held on real_dev on rx_handler
registration only. Modify the flow control struct. Remove the unused
ethernet mode handling.
v6->v7: Fix comments from David - Add newline to end of Makefile. Remove
inline from .c files. Move the module init/exit to rmnet config. Fix an
error reported by kbuild test robot for an unused file.
v7->v8: Use a smaller value for ETH_P_MAP as mentioned by David. Change
netdev_info to netdev_dbg as mentioned by Andrew. Fix comments from
Stephen regarding netdev_priv and sparse related errors of using 0 as NULL.
Subash Abhinov Kasiviswanathan (3):
net: ether: Add support for multiplexing and aggregation type
net: arp: Add support for raw IP device
drivers: net: ethernet: qualcomm: rmnet: Initial implementation
Documentation/networking/rmnet.txt | 82 ++++
drivers/net/ethernet/qualcomm/Kconfig | 2 +
drivers/net/ethernet/qualcomm/Makefile | 2 +
drivers/net/ethernet/qualcomm/rmnet/Kconfig | 12 +
drivers/net/ethernet/qualcomm/rmnet/Makefile | 12 +
drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 417 +++++++++++++++++++++
drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h | 54 +++
.../net/ethernet/qualcomm/rmnet/rmnet_handlers.c | 271 +++++++++++++
.../net/ethernet/qualcomm/rmnet/rmnet_handlers.h | 26 ++
drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h | 88 +++++
.../ethernet/qualcomm/rmnet/rmnet_map_command.c | 107 ++++++
.../net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 105 ++++++
.../net/ethernet/qualcomm/rmnet/rmnet_private.h | 45 +++
drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 237 ++++++++++++
drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h | 31 ++
include/uapi/linux/if_arp.h | 1 +
include/uapi/linux/if_ether.h | 1 +
17 files changed, 1493 insertions(+)
create mode 100644 Documentation/networking/rmnet.txt
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/Kconfig
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/Makefile
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h
--
1.9.1
^ permalink raw reply
* [PATCH v2 2/2] selftests/net: add test to verify datagram socket timeout
From: Vallish Vaidyeshwara @ 2017-08-23 0:10 UTC (permalink / raw)
To: davem, shuah, richardcochran, xiyou.wangcong, netdev,
linux-kernel
Cc: eduval, anchalag, vallish
In-Reply-To: <1503447027-44399-1-git-send-email-vallish@amazon.com>
AF_UNIX and AF_INET datagram sockets use high resolution timer to time
SO_RCVTIMEO value used with setsockopt(2). This test checks for the
accuracy of kernel notifying these sockets timeout to application. Test
program has code to check AF_UNIX socket, however the kernel function used
to timeout AF_INET socket is the same kernel function used by AF_UNIX as
well which is __skb_wait_for_more_packets().
Reported-by: Manjula Peiris <thelgep@amazon.com>
Reviewed-by: Eduardo Valentin <eduval@amazon.com>
Reviewed-by: Anchal Agarwal <anchalag@amazon.com>
Signed-off-by: Vallish Vaidyeshwara <vallish@amazon.com>
---
tools/testing/selftests/net/Makefile | 3 +-
.../testing/selftests/net/datagram_sock_timeout.c | 119 +++++++++++++++++++++
.../selftests/net/run_datagram_sock_timeout.sh | 12 +++
3 files changed, 133 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/net/datagram_sock_timeout.c
create mode 100755 tools/testing/selftests/net/run_datagram_sock_timeout.sh
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index f6c9dbf..eb5a8c7 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -3,11 +3,12 @@
CFLAGS = -Wall -Wl,--no-as-needed -O2 -g
CFLAGS += -I../../../../usr/include/
-TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh
+TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh run_datagram_sock_timeout.sh
TEST_GEN_FILES = socket
TEST_GEN_FILES += psock_fanout psock_tpacket
TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
TEST_GEN_FILES += reuseport_dualstack
+TEST_GEN_FILES += datagram_sock_timeout
include ../lib.mk
diff --git a/tools/testing/selftests/net/datagram_sock_timeout.c b/tools/testing/selftests/net/datagram_sock_timeout.c
new file mode 100644
index 0000000..362becf
--- /dev/null
+++ b/tools/testing/selftests/net/datagram_sock_timeout.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2017 Amazon.com, Inc.
+ * Author: Manjula Peiris <thelgep@amazon.com>
+ * Vallish Vaidyeshwara <vallish@amazon.com>
+ *
+ * selftests/net: test to verify datagram socket timeout
+ *
+ * AF_UNIX and AF_INET datagram sockets use high resolution timer to time
+ * SO_RCVTIMEO value used with setsockopt(2). This test checks for the accuracy
+ * of kernel notifying these sockets timeout to application. Test program has
+ * code to check AF_UNIX socket, however the kernel function used to timeout
+ * AF_INET socket is the same kernel function used by AF_UNIX as well which is
+ * __skb_wait_for_more_packets().
+ *
+ * License (GPLv2):
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.
+ */
+
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <err.h>
+#include <errno.h>
+#include <sys/un.h>
+#include <time.h>
+#include <assert.h>
+
+#define BUF_SIZE 128
+#define KB 1024
+#define NUM_FD 2
+
+static int set_socket_timeout(int sockfd, unsigned int ms)
+{
+ int ret;
+ struct timeval timeout;
+ socklen_t cb = sizeof(timeout);
+
+ timeout.tv_sec = ms / 1000;
+ timeout.tv_usec = (ms % 1000) * 1000;
+ ret = setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO, &timeout, cb);
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ char err[BUF_SIZE];
+ int ret;
+ int fds[NUM_FD];
+ struct msghdr message;
+ char buffer[KB];
+ struct sockaddr_storage src_addr;
+ struct iovec iov[1];
+ time_t start, end;
+ unsigned int timeout;
+
+ iov[0].iov_base = buffer;
+ iov[0].iov_len = sizeof(buffer);
+ message.msg_name = &src_addr;
+ message.msg_namelen = sizeof(src_addr);
+ message.msg_iov = iov;
+ message.msg_iovlen = 1;
+ message.msg_control = 0;
+ message.msg_controllen = 0;
+
+ if (argc != 2) {
+ fprintf(stderr,
+ "datagram_sock_timeout failed: no timeout specified\n");
+ return -1;
+ }
+ timeout = (unsigned int)(atoi(argv[1]));
+
+ if (socketpair(AF_UNIX, SOCK_DGRAM, 0, fds) != 0) {
+ strerror_r(errno, err, BUF_SIZE);
+ fprintf(stderr, "socketpair() call failed with %s\n", err);
+ return -1;
+ }
+
+ if (set_socket_timeout(fds[0], timeout) != 0) {
+ strerror_r(errno, err, BUF_SIZE);
+ fprintf(stderr, "setsockopt() call failed with %s\n", err);
+ return -1;
+ }
+
+ start = time(NULL);
+ ret = (int)recvmsg(fds[0], &message, 0);
+ end = time(NULL);
+ if (!(ret == -1 && errno == 11)) {
+ fprintf(stderr,
+ "datagram_sock_timeout failed: test was interrupted\n");
+ return -1;
+ }
+
+ if ((end - start) != (timeout / 1000)) {
+ fprintf(stderr,
+ "datagram_sock_timeout failed: took %lu seconds\n",
+ (long)(end - start));
+ return -1;
+ }
+
+ close(fds[0]);
+ close(fds[1]);
+
+ fprintf(stderr, "datagram_sock_timeout passed\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/net/run_datagram_sock_timeout.sh b/tools/testing/selftests/net/run_datagram_sock_timeout.sh
new file mode 100755
index 0000000..d5f4f82
--- /dev/null
+++ b/tools/testing/selftests/net/run_datagram_sock_timeout.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+# Runs datagram socket timeout test
+
+echo "--------------------"
+echo "running run_datagram_sock_timeout test"
+echo "--------------------"
+./datagram_sock_timeout 180000
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+else
+ echo "[PASS]"
+fi
--
2.7.3.AMZN
^ permalink raw reply related
* [PATCH v2 1/2] net: enable high resolution timer mode to timeout datagram sockets
From: Vallish Vaidyeshwara @ 2017-08-23 0:10 UTC (permalink / raw)
To: davem, shuah, richardcochran, xiyou.wangcong, netdev,
linux-kernel
Cc: eduval, anchalag, vallish
In-Reply-To: <1503447027-44399-1-git-send-email-vallish@amazon.com>
Enable high resolution timer mode to time SO_RCVTIMEO value used with
setsockopt(2) on AF_UNIX and AF_INET datagram sockets. By default,
SO_RCVTIMEO uses low resolution timer which is good for most of socket
use cases.
Background:
Kernel timer wheel was refactored in 4.8 to avoid drawbacks with previous
implementation:
https://lwn.net/Articles/691064/
Unlike the previous "kernel timer wheel" implementation in 4.4 which aimed
for accuracy by paying cost for cascading tracked timers at the boundary of
256 jiffies, the new timer wheel implementation gets rid of cascading
latency by paying a price for being less accurate for far off timers.
Use Case:
New implementation is good for most of socket use cases. However we have a
use case where our application is sensitive to socket timeout including
long timeouts. Please refer to test code as part of this patch series.
One of the test runs with a timeout value of 180 seconds timed out at
190 seconds.
[root@]# ./datagram_sock_timeout 180000
datagram_sock_timeout failed: took 190.00 seconds
[root@]#
The same program when run on a 4.4 kernel would timeout more accurately and
the kernel added slack was not noticeable to user application.
Interesting text:
a) Standards for setsockopt:
http://pubs.opengroup.org/onlinepubs/009695399/functions/setsockopt.html
<snip>
SO_RCVTIMEO
Sets the timeout value that specifies the maximum amount of time an input
function waits until it completes. It accepts a timeval structure with the
number of seconds and microseconds specifying the limit on how long to wait
for an input operation to complete. If a receive operation has blocked for
this much time without receiving additional data, it shall return with a
partial count or errno set to [EAGAIN] or [EWOULDBLOCK] if no data is
received. The default for this option is zero, which indicates that a
receive operation shall not time out. This option takes a timeval
structure. Note that not all implementations allow this option to be set.
<end snip>
This only talks about the maximum time and the current behavior indeed
follows this standard. System call does not return before the time
specified and it does return EAGAIN.
b) Man page for SETSOCKOPT(3P):
<snip>
The option_name argument specifies a single option to set. It can be
one of the socket-level options defined in <sys_socket.h> and described in
Section 2.10.16, Use of Options. If option_name is equal to SO_RCVTIMEO
or SO_SNDTIMEO and the implementation supports setting the option, it is
unspecified whether the struct timeval pointed to by option_value is
stored as provided by this function or is rounded up to align with the
resolution of the clock being used. If setsockopt() is called with
option_name equal to SO_ACCEPTCONN, SO_ERROR, or SO_TYPE, the behavior is
unspecified.
<end snip>
Behavior is unspecified.
c) Man page for SELECT(2):
<snip>
Note that the timeout interval will be rounded up to the system
clock granularity, and kernel scheduling delays mean that the blocking
interval may overrun by a small amount. If both fields of the timeval
structure are zero, then select() returns immediately. (This is useful
for polling.) If timeout is NULL (no timeout), select() can block
indefinitely.
<end snip>
Select system call guarantees timeout interval and in turn uses highres
timer.
Reported-by: Manjula Peiris <thelgep@amazon.com>
Reviewed-by: Eduardo Valentin <eduval@amazon.com>
Reviewed-by: Anchal Agarwal <anchalag@amazon.com>
Signed-off-by: Vallish Vaidyeshwara <vallish@amazon.com>
---
net/core/datagram.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/net/core/datagram.c b/net/core/datagram.c
index ee5647b..832c147 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -80,6 +80,7 @@ static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, i
return 0;
return autoremove_wake_function(wait, mode, sync, key);
}
+
/*
* Wait for the last received packet to be different from skb
*/
@@ -87,6 +88,9 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
const struct sk_buff *skb)
{
int error;
+ ktime_t expires;
+ struct timespec64 time;
+ unsigned long pre_sched_time;
DEFINE_WAIT_FUNC(wait, receiver_wake_function);
prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
@@ -116,7 +120,15 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
goto interrupted;
error = 0;
- *timeo_p = schedule_timeout(*timeo_p);
+ /* Wait using highres timer */
+ time.tv_sec = *timeo_p / HZ;
+ time.tv_nsec = jiffies_to_nsecs(*timeo_p % HZ);
+ expires = ktime_add_safe(ktime_get(), timespec64_to_ktime(time));
+ pre_sched_time = jiffies;
+ if (schedule_hrtimeout(&expires, HRTIMER_MODE_ABS))
+ *timeo_p = jiffies - pre_sched_time;
+ else
+ *timeo_p = 0;
out:
finish_wait(sk_sleep(sk), &wait);
return error;
--
2.7.3.AMZN
^ permalink raw reply related
* [PATCH v2 0/2] enable hires timer to timeout datagram socket
From: Vallish Vaidyeshwara @ 2017-08-23 0:10 UTC (permalink / raw)
To: davem, shuah, richardcochran, xiyou.wangcong, netdev,
linux-kernel
Cc: eduval, anchalag, vallish
v1->v2:
- Cong Wang pointed out MAX_SCHEDULE_TIMEOUT wraparound, fixed the
patch 1/2 to accommodate MAX_SCHEDULE_TIMEOUT wait time
- Changed format of printing total time from float to long in
selftests patch 2/2
Hello Dave,
Resending the patch series to include netdev mailing list with a
cover letter.
I am submitting a 2-patch series to enable hires timer to timeout
datagram sockets (AF_UNIX & AF_INET domain) and test code to test
timeout accuracy on these sockets.
There has been a behavior change in 4.9 kernel with refactoring of Kernel
timer wheel in 4.8. We have a use case wherein our datagram socket
application is sensitive to socket timeout including long timeouts.
One of the test runs with a timeout value of 180 seconds timed out at
190 seconds.
[root@]# ./datagram_sock_timeout 180000
datagram_sock_timeout failed: took 190.00 seconds
[root@]#
The same program when run on a 4.4 kernel would timeout more accurately and
the kernel added slack was not noticeable to user application.
Patch 1: Has core code change of enabling hires timer to timeout datagram
socket on AF_UNIX and AF_INET domain
Patch 2: Test code to report regression in timeout behavior related to
patch 1
Vallish Vaidyeshwara (2):
net: enable high resolution timer mode to timeout datagram sockets
selftests/net: add test to verify datagram socket timeout
net/core/datagram.c | 14 ++-
tools/testing/selftests/net/Makefile | 3 +-
.../testing/selftests/net/datagram_sock_timeout.c | 119 +++++++++++++++++++++
.../selftests/net/run_datagram_sock_timeout.sh | 12 +++
4 files changed, 146 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/selftests/net/datagram_sock_timeout.c
create mode 100755 tools/testing/selftests/net/run_datagram_sock_timeout.sh
--
2.7.3.AMZN
^ permalink raw reply
* [trivial] gre: fix goto statement typo
From: William Tu @ 2017-08-23 0:04 UTC (permalink / raw)
To: netdev; +Cc: trivial
Fix typo: pnet_tap_faied.
Signed-off-by: William Tu <u9012063@gmail.com>
---
net/ipv4/ip_gre.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 6e8a62289e03..5a20ba9b9b50 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1467,7 +1467,7 @@ static int __init ipgre_init(void)
err = register_pernet_device(&ipgre_tap_net_ops);
if (err < 0)
- goto pnet_tap_faied;
+ goto pnet_tap_failed;
err = register_pernet_device(&erspan_net_ops);
if (err < 0)
@@ -1503,7 +1503,7 @@ static int __init ipgre_init(void)
unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
unregister_pernet_device(&ipgre_tap_net_ops);
-pnet_tap_faied:
+pnet_tap_failed:
unregister_pernet_device(&ipgre_net_ops);
return err;
}
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 0/2] Two minor BPF cleanups
From: Daniel Borkmann @ 2017-08-22 23:47 UTC (permalink / raw)
To: davem; +Cc: ast, john.fastabend, netdev, Daniel Borkmann
Two minor cleanups on devmap and redirect I still had
in my queue.
Thanks!
Daniel Borkmann (2):
bpf: misc xdp redirect cleanups
bpf: minor cleanups for dev_map
kernel/bpf/devmap.c | 100 +++++++++++++++++++++-------------------------------
net/core/filter.c | 72 +++++++++++++++++--------------------
2 files changed, 73 insertions(+), 99 deletions(-)
--
1.9.3
^ permalink raw reply
* [PATCH net-next 2/2] bpf: minor cleanups for dev_map
From: Daniel Borkmann @ 2017-08-22 23:47 UTC (permalink / raw)
To: davem; +Cc: ast, john.fastabend, netdev, Daniel Borkmann
In-Reply-To: <cover.1503445395.git.daniel@iogearbox.net>
Some minor code cleanups, while going over it I also noticed that
we're accounting the bitmap only for one CPU currently, so fix that
up as well.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
kernel/bpf/devmap.c | 100 +++++++++++++++++++++-------------------------------
1 file changed, 41 insertions(+), 59 deletions(-)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index fa08181..bfecabf 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,30 +48,30 @@
* calls will fail at this point.
*/
#include <linux/bpf.h>
-#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/rculist_nulls.h>
-#include "percpu_freelist.h"
-#include "bpf_lru_list.h"
-#include "map_in_map.h"
struct bpf_dtab_netdev {
struct net_device *dev;
- int key;
- struct rcu_head rcu;
struct bpf_dtab *dtab;
+ unsigned int bit;
+ struct rcu_head rcu;
};
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map;
- unsigned long int __percpu *flush_needed;
+ unsigned long __percpu *flush_needed;
struct list_head list;
};
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
+static u64 dev_map_bitmap_size(const union bpf_attr *attr)
+{
+ return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
struct bpf_dtab *dtab;
@@ -95,11 +95,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
dtab->map.map_flags = attr->map_flags;
dtab->map.numa_node = bpf_map_attr_numa_node(attr);
- err = -ENOMEM;
-
/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
- cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+ cost += dev_map_bitmap_size(attr) * num_possible_cpus();
if (cost >= U32_MAX - PAGE_SIZE)
goto free_dtab;
@@ -110,12 +108,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (err)
goto free_dtab;
- err = -ENOMEM;
/* A per cpu bitfield with a bit per possible net device */
- dtab->flush_needed = __alloc_percpu(
- BITS_TO_LONGS(attr->max_entries) *
- sizeof(unsigned long),
- __alignof__(unsigned long));
+ dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
+ __alignof__(unsigned long));
if (!dtab->flush_needed)
goto free_dtab;
@@ -128,12 +123,12 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
spin_lock(&dev_map_lock);
list_add_tail_rcu(&dtab->list, &dev_map_list);
spin_unlock(&dev_map_lock);
- return &dtab->map;
+ return &dtab->map;
free_dtab:
free_percpu(dtab->flush_needed);
kfree(dtab);
- return ERR_PTR(err);
+ return ERR_PTR(-ENOMEM);
}
static void dev_map_free(struct bpf_map *map)
@@ -178,9 +173,6 @@ static void dev_map_free(struct bpf_map *map)
kfree(dev);
}
- /* At this point bpf program is detached and all pending operations
- * _must_ be complete
- */
free_percpu(dtab->flush_needed);
bpf_map_area_free(dtab->netdev_map);
kfree(dtab);
@@ -190,7 +182,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
u32 index = key ? *(u32 *)key : U32_MAX;
- u32 *next = (u32 *)next_key;
+ u32 *next = next_key;
if (index >= dtab->map.max_entries) {
*next = 0;
@@ -199,29 +191,16 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
if (index == dtab->map.max_entries - 1)
return -ENOENT;
-
*next = index + 1;
return 0;
}
-void __dev_map_insert_ctx(struct bpf_map *map, u32 key)
+void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
- __set_bit(key, bitmap);
-}
-
-struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
-{
- struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct bpf_dtab_netdev *dev;
-
- if (key >= map->max_entries)
- return NULL;
-
- dev = READ_ONCE(dtab->netdev_map[key]);
- return dev ? dev->dev : NULL;
+ __set_bit(bit, bitmap);
}
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
@@ -248,7 +227,6 @@ void __dev_map_flush(struct bpf_map *map)
continue;
netdev = dev->dev;
-
__clear_bit(bit, bitmap);
if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush))
continue;
@@ -261,43 +239,49 @@ void __dev_map_flush(struct bpf_map *map)
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
*/
-static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev;
- u32 i = *(u32 *)key;
- if (i >= map->max_entries)
+ if (key >= map->max_entries)
return NULL;
- dev = READ_ONCE(dtab->netdev_map[i]);
- return dev ? &dev->dev->ifindex : NULL;
+ dev = READ_ONCE(dtab->netdev_map[key]);
+ return dev ? dev->dev : NULL;
}
-static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev)
+static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+
+ return dev ? &dev->ifindex : NULL;
+}
+
+static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
- if (old_dev->dev->netdev_ops->ndo_xdp_flush) {
- struct net_device *fl = old_dev->dev;
+ if (dev->dev->netdev_ops->ndo_xdp_flush) {
+ struct net_device *fl = dev->dev;
unsigned long *bitmap;
int cpu;
for_each_online_cpu(cpu) {
- bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu);
- __clear_bit(old_dev->key, bitmap);
+ bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
+ __clear_bit(dev->bit, bitmap);
- fl->netdev_ops->ndo_xdp_flush(old_dev->dev);
+ fl->netdev_ops->ndo_xdp_flush(dev->dev);
}
}
}
static void __dev_map_entry_free(struct rcu_head *rcu)
{
- struct bpf_dtab_netdev *old_dev;
+ struct bpf_dtab_netdev *dev;
- old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
- dev_map_flush_old(old_dev);
- dev_put(old_dev->dev);
- kfree(old_dev);
+ dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ dev_map_flush_old(dev);
+ dev_put(dev->dev);
+ kfree(dev);
}
static int dev_map_delete_elem(struct bpf_map *map, void *key)
@@ -309,8 +293,8 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
if (k >= map->max_entries)
return -EINVAL;
- /* Use synchronize_rcu() here to ensure any rcu critical sections
- * have completed, but this does not guarantee a flush has happened
+ /* Use call_rcu() here to ensure any rcu critical sections have
+ * completed, but this does not guarantee a flush has happened
* yet. Because driver side rcu_read_lock/unlock only protects the
* running XDP program. However, for pending flush operations the
* dev and ctx are stored in another per cpu map. And additionally,
@@ -334,10 +318,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
-
if (unlikely(i >= dtab->map.max_entries))
return -E2BIG;
-
if (unlikely(map_flags == BPF_NOEXIST))
return -EEXIST;
@@ -355,7 +337,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EINVAL;
}
- dev->key = i;
+ dev->bit = i;
dev->dtab = dtab;
}
--
1.9.3
^ permalink raw reply related
* [PATCH net-next 1/2] bpf: misc xdp redirect cleanups
From: Daniel Borkmann @ 2017-08-22 23:47 UTC (permalink / raw)
To: davem; +Cc: ast, john.fastabend, netdev, Daniel Borkmann
In-Reply-To: <cover.1503445395.git.daniel@iogearbox.net>
Few cleanups including: bpf_redirect_map() is really XDP only due to
the return code. Move it to a more appropriate location where we do
the XDP redirect handling and change its name into bpf_xdp_redirect_map()
to make it consistent to the bpf_xdp_redirect() helper.
xdp_do_redirect_map() helper can be static since it is only used in the filter.c
file. Drop the goto in xdp_do_generic_redirect() and only return errors
directly. In xdp_do_flush_map() only clear ri->map_to_flush which is the
arg we're using in that function, ri->map is cleared earlier along with
ri->ifindex.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
net/core/filter.c | 72 +++++++++++++++++++++++++------------------------------
1 file changed, 32 insertions(+), 40 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index fa21156..2a0d762 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1835,29 +1835,6 @@ int skb_do_redirect(struct sk_buff *skb)
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags)
-{
- struct redirect_info *ri = this_cpu_ptr(&redirect_info);
-
- if (unlikely(flags))
- return XDP_ABORTED;
-
- ri->ifindex = ifindex;
- ri->flags = flags;
- ri->map = map;
-
- return XDP_REDIRECT;
-}
-
-static const struct bpf_func_proto bpf_redirect_map_proto = {
- .func = bpf_redirect_map,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_ANYTHING,
-};
-
BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
@@ -2506,13 +2483,11 @@ static int __bpf_tx_xdp(struct net_device *dev,
err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
if (err)
return err;
-
if (map)
__dev_map_insert_ctx(map, index);
else
dev->netdev_ops->ndo_xdp_flush(dev);
-
- return err;
+ return 0;
}
void xdp_do_flush_map(void)
@@ -2520,16 +2495,14 @@ void xdp_do_flush_map(void)
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
struct bpf_map *map = ri->map_to_flush;
- ri->map = NULL;
ri->map_to_flush = NULL;
-
if (map)
__dev_map_flush(map);
}
EXPORT_SYMBOL_GPL(xdp_do_flush_map);
-int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
struct bpf_map *map = ri->map;
@@ -2545,14 +2518,12 @@ int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
err = -EINVAL;
goto out;
}
-
- if (ri->map_to_flush && (ri->map_to_flush != map))
+ if (ri->map_to_flush && ri->map_to_flush != map)
xdp_do_flush_map();
err = __bpf_tx_xdp(fwd, map, xdp, index);
if (likely(!err))
ri->map_to_flush = map;
-
out:
trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err);
return err;
@@ -2594,20 +2565,17 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb)
ri->ifindex = 0;
if (unlikely(!dev)) {
bpf_warn_invalid_xdp_redirect(index);
- goto err;
+ return -EINVAL;
}
if (unlikely(!(dev->flags & IFF_UP)))
- goto err;
-
+ return -ENETDOWN;
len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
if (skb->len > len)
- goto err;
+ return -E2BIG;
skb->dev = dev;
return 0;
-err:
- return -EINVAL;
}
EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
@@ -2620,6 +2588,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb)
ri->ifindex = ifindex;
ri->flags = flags;
+
return XDP_REDIRECT;
}
@@ -2631,6 +2600,29 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb)
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags)
+{
+ struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+
+ if (unlikely(flags))
+ return XDP_ABORTED;
+
+ ri->ifindex = ifindex;
+ ri->flags = flags;
+ ri->map = map;
+
+ return XDP_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
+ .func = bpf_xdp_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
bool bpf_helper_changes_pkt_data(void *func)
{
if (func == bpf_skb_vlan_push ||
@@ -3233,7 +3225,7 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
case BPF_FUNC_redirect:
return &bpf_xdp_redirect_proto;
case BPF_FUNC_redirect_map:
- return &bpf_redirect_map_proto;
+ return &bpf_xdp_redirect_map_proto;
default:
return bpf_base_func_proto(func_id);
}
--
1.9.3
^ permalink raw reply related
* linux-next: build warning after merge of the net tree
From: Stephen Rothwell @ 2017-08-22 23:34 UTC (permalink / raw)
To: David Miller, Networking
Cc: Linux-Next Mailing List, Linux Kernel Mailing List
Hi all,
After merging the net tree, today's linux-next build (x86_64 allmodconfig)
produced this warning:
net/ipv6/route.c: In function 'rt6_check':
net/ipv6/route.c:1294:43: warning: 'rt_cookie' may be used uninitialized in this function [-Wmaybe-uninitialized]
if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
^
Introduced by commit
c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node")
--
Cheers,
Stephen Rothwell
^ permalink raw reply
* Re: [PATCH net] fsl/man: Inherit parent device and of_node
From: David Miller @ 2017-08-22 23:32 UTC (permalink / raw)
To: f.fainelli; +Cc: netdev, junote, madalin.bucur, linux-kernel
In-Reply-To: <20170822222447.16754-1-f.fainelli@gmail.com>
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 22 Aug 2017 15:24:47 -0700
> Junote Cai reported that he was not able to get a DSA setup involving the
> Freescale DPAA/FMAN driver to work and narrowed it down to
> of_find_net_device_by_node(). This function requires the network device's
> device reference to be correctly set which is the case here, though we have
> lost any device_node association there.
>
> The problem is that dpaa_eth_add_device() allocates a "dpaa-ethernet" platform
> device, and later on dpaa_eth_probe() is called but SET_NETDEV_DEV() won't be
> propagating &pdev->dev.of_node properly. Fix this by inheriting both the parent
> device and the of_node when dpaa_eth_add_device() creates the platform device.
>
> Fixes: 3933961682a3 ("fsl/fman: Add FMan MAC driver")
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Applied and queued up for -stable, thanks.
^ permalink raw reply
* Re: [PATCH][net-next] MIPS,bpf: fix missing break in switch statement
From: David Miller @ 2017-08-22 23:18 UTC (permalink / raw)
To: colin.king; +Cc: netdev
In-Reply-To: <20170822224606.6273-1-colin.king@canonical.com>
From: Colin King <colin.king@canonical.com>
Date: Tue, 22 Aug 2017 23:46:06 +0100
> From: Colin Ian King <colin.king@canonical.com>
>
> There is a missing break causing a fall-through and setting
> ctx.use_bbit_insns to the wrong value. Fix this by adding the
> missing break.
>
> Detected with cppcheck:
> "Variable 'ctx.use_bbit_insns' is reassigned a value before the old
> one has been used. 'break;' missing?"
>
> Fixes: 8d8d18c3283f ("MIPS,bpf: Fix using smp_processor_id() in preemptible splat.")
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> Acked-by: David Daney <david.daney@cavium.com>
Applied, thanks.
^ permalink raw reply
* [PATCH][net-next] MIPS,bpf: fix missing break in switch statement
From: Colin King @ 2017-08-22 22:46 UTC (permalink / raw)
Cc: netdev
From: Colin Ian King <colin.king@canonical.com>
There is a missing break causing a fall-through and setting
ctx.use_bbit_insns to the wrong value. Fix this by adding the
missing break.
Detected with cppcheck:
"Variable 'ctx.use_bbit_insns' is reassigned a value before the old
one has been used. 'break;' missing?"
Fixes: 8d8d18c3283f ("MIPS,bpf: Fix using smp_processor_id() in preemptible splat.")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: David Daney <david.daney@cavium.com>
---
arch/mips/net/ebpf_jit.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 44ddc12cbb0e..7646891c4e9b 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -1892,6 +1892,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
case CPU_CAVIUM_OCTEON2:
case CPU_CAVIUM_OCTEON3:
ctx.use_bbit_insns = 1;
+ break;
default:
ctx.use_bbit_insns = 0;
}
--
2.14.1
^ permalink raw reply related
* Re: [PATCH net] bpf: fix map value attribute for hash of maps
From: Martin KaFai Lau @ 2017-08-22 22:27 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: davem, ast, netdev
In-Reply-To: <7f75667dcb2464053e2adf4fc01fdb8d77dfb335.1503439140.git.daniel@iogearbox.net>
On Wed, Aug 23, 2017 at 12:06:09AM +0200, Daniel Borkmann wrote:
> Currently, iproute2's BPF ELF loader works fine with array of maps
> when retrieving the fd from a pinned node and doing a selfcheck
> against the provided map attributes from the object file, but we
> fail to do the same for hash of maps and thus refuse to get the
> map from pinned node.
>
> Reason is that when allocating hash of maps, fd_htab_map_alloc() will
> set the value size to sizeof(void *), and any user space map creation
> requests are forced to set 4 bytes as value size. Thus, selfcheck
> will complain about exposed 8 bytes on 64 bit archs vs. 4 bytes from
> object file as value size. Contract is that fdinfo or BPF_MAP_GET_FD_BY_ID
> returns the value size used to create the map.
>
> Fix it by handling it the same way as we do for array of maps, which
> means that we leave value size at 4 bytes and in the allocation phase
> round up value size to 8 bytes. alloc_htab_elem() needs an adjustment
> in order to copy rounded up 8 bytes due to bpf_fd_htab_map_update_elem()
> calling into htab_map_update_elem() with the pointer of the map
> pointer as value. Unlike array of maps where we just xchg(), we're
> using the generic htab_map_update_elem() callback also used from helper
> calls, which published the key/value already on return, so we need
> to ensure to memcpy() the right size.
>
> Fixes: bcc6b1b7ebf8 ("bpf: Add hash of maps support")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
> ---
> kernel/bpf/hashtab.c | 30 +++++++++++++++++-------------
> 1 file changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 4fb4631..d11c818 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -652,12 +652,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
> }
> }
>
> +static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
> +{
> + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
> + BITS_PER_LONG == 64;
> +}
> +
> +static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
> +{
> + u32 size = htab->map.value_size;
> +
> + if (percpu || fd_htab_map_needs_adjust(htab))
> + size = round_up(size, 8);
> + return size;
> +}
> +
> static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
> void *value, u32 key_size, u32 hash,
> bool percpu, bool onallcpus,
> struct htab_elem *old_elem)
> {
> - u32 size = htab->map.value_size;
> + u32 size = htab_size_value(htab, percpu);
> bool prealloc = htab_is_prealloc(htab);
> struct htab_elem *l_new, **pl_new;
> void __percpu *pptr;
> @@ -696,9 +711,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
>
> memcpy(l_new->key, key, key_size);
> if (percpu) {
> - /* round up value_size to 8 bytes */
> - size = round_up(size, 8);
> -
> if (prealloc) {
> pptr = htab_elem_get_ptr(l_new, key_size);
> } else {
> @@ -1209,17 +1221,9 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
>
> static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
> {
> - struct bpf_map *map;
> -
> if (attr->value_size != sizeof(u32))
> return ERR_PTR(-EINVAL);
> -
> - /* pointer is stored internally */
> - attr->value_size = sizeof(void *);
> - map = htab_map_alloc(attr);
> - attr->value_size = sizeof(u32);
> -
> - return map;
> + return htab_map_alloc(attr);
> }
>
> static void fd_htab_map_free(struct bpf_map *map)
> --
> 1.9.3
>
^ permalink raw reply
* Re: [PATCH net-next v5] openvswitch: enable NSH support
From: kbuild test robot @ 2017-08-22 22:26 UTC (permalink / raw)
To: Yi Yang
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
jbenc-H+wXaHxf7aLQT0dZR+AlfA, e, kbuild-all-JC7UmRfGjtg
In-Reply-To: <1503294863-12859-1-git-send-email-yi.y.yang-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
[-- Attachment #1: Type: text/plain, Size: 2006 bytes --]
Hi Yi,
[auto build test WARNING on net-next/master]
url: https://github.com/0day-ci/linux/commits/Yi-Yang/openvswitch-enable-NSH-support/20170822-184251
config: x86_64-randconfig-a0-08230538 (attached as .config)
compiler: gcc-4.4 (Debian 4.4.7-8) 4.4.7
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
All warnings (new ones prefixed by >>):
net//openvswitch/flow.c: In function 'parse_nsh':
>> net//openvswitch/flow.c:498: warning: unused variable 'err'
vim +/err +498 net//openvswitch/flow.c
493
494 static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
495 {
496 struct nsh_hdr *nsh = (struct nsh_hdr *)skb_network_header(skb);
497 u8 version, length;
> 498 int err;
499
500 if (unlikely(skb->len < NSH_BASE_HDR_LEN))
501 return -EINVAL;
502
503 version = nsh_get_ver(nsh);
504 length = nsh_hdr_len(nsh);
505
506 if (version != 0)
507 return -EINVAL;
508
509 if (unlikely(skb->len < length))
510 return -EINVAL;
511
512 key->nsh.flags = nsh_get_flags(nsh);
513 key->nsh.ttl = nsh_get_ttl(nsh);
514 key->nsh.mdtype = nsh->md_type;
515 key->nsh.np = nsh->next_proto;
516 key->nsh.path_hdr = nsh->path_hdr;
517 switch (key->nsh.mdtype) {
518 case NSH_M_TYPE1:
519 if (length != NSH_M_TYPE1_LEN)
520 return -EINVAL;
521 memcpy(key->nsh.context, nsh->md1.context,
522 sizeof(nsh->md1));
523 break;
524 case NSH_M_TYPE2:
525 /* Don't support MD type 2 metedata parsing yet */
526 if (length < NSH_BASE_HDR_LEN)
527 return -EINVAL;
528
529 memset(key->nsh.context, 0,
530 sizeof(nsh->md1));
531 break;
532 default:
533 return -EINVAL;
534 }
535
536 return 0;
537 }
538
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: Type: text/plain, Size: 0 bytes --]
^ permalink raw reply
* [PATCH net] fsl/man: Inherit parent device and of_node
From: Florian Fainelli @ 2017-08-22 22:24 UTC (permalink / raw)
To: netdev; +Cc: junote, madalin.bucur, Florian Fainelli, open list
Junote Cai reported that he was not able to get a DSA setup involving the
Freescale DPAA/FMAN driver to work and narrowed it down to
of_find_net_device_by_node(). This function requires the network device's
device reference to be correctly set which is the case here, though we have
lost any device_node association there.
The problem is that dpaa_eth_add_device() allocates a "dpaa-ethernet" platform
device, and later on dpaa_eth_probe() is called but SET_NETDEV_DEV() won't be
propagating &pdev->dev.of_node properly. Fix this by inheriting both the parent
device and the of_node when dpaa_eth_add_device() creates the platform device.
Fixes: 3933961682a3 ("fsl/fman: Add FMan MAC driver")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
drivers/net/ethernet/freescale/fman/mac.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
index 6e67d22fd0d5..1c7da16ad0ff 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -623,6 +623,8 @@ static struct platform_device *dpaa_eth_add_device(int fman_id,
goto no_mem;
}
+ pdev->dev.of_node = node;
+ pdev->dev.parent = priv->dev;
set_dma_ops(&pdev->dev, get_dma_ops(priv->dev));
ret = platform_device_add_data(pdev, &data, sizeof(data));
--
2.9.3
^ permalink raw reply related
* [PATCH net v2 2/2] net: dsa: skb_put_padto() already frees nskb
From: Florian Fainelli @ 2017-08-22 22:12 UTC (permalink / raw)
To: netdev; +Cc: davem, vivien.didelot, Woojung.Huh, UNGLinuxDriver,
Florian Fainelli
In-Reply-To: <20170822221215.16305-1-f.fainelli@gmail.com>
The first call of skb_put_padto() will free up the SKB on error, but we
return NULL which tells dsa_slave_xmit() that the original SKB should be
freed so this would lead to a double free here.
The second skb_put_padto() already frees the passed sk_buff reference
upon error, so calling kfree_skb() on it again is not necessary.
Detected by CoverityScan, CID#1416687 ("USE_AFTER_FREE")
Fixes: e71cb9e00922 ("net: dsa: ksz: fix skb freeing")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
net/dsa/tag_ksz.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index de66ca8e6201..3bd6e2a83125 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -42,7 +42,8 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len;
if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) {
- if (skb_put_padto(skb, skb->len + padlen))
+ /* Let dsa_slave_xmit() free skb */
+ if (__skb_put_padto(skb, skb->len + padlen, false))
return NULL;
nskb = skb;
@@ -60,10 +61,11 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
skb_transport_header(skb) - skb->head);
skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
- if (skb_put_padto(nskb, nskb->len + padlen)) {
- kfree_skb(nskb);
+ /* Let skb_put_padto() free nskb, and let dsa_slave_xmit() free
+ * skb
+ */
+ if (skb_put_padto(nskb, nskb->len + padlen))
return NULL;
- }
kfree_skb(skb);
}
--
2.9.3
^ permalink raw reply related
* [PATCH net v2 1/2] net: core: Specify skb_pad()/skb_put_padto() SKB freeing
From: Florian Fainelli @ 2017-08-22 22:12 UTC (permalink / raw)
To: netdev; +Cc: davem, vivien.didelot, Woojung.Huh, UNGLinuxDriver,
Florian Fainelli
In-Reply-To: <20170822221215.16305-1-f.fainelli@gmail.com>
Rename skb_pad() into __skb_pad() and make it take a third argument:
free_on_error which controls whether kfree_skb() should be called or
not, skb_pad() directly makes use of it and passes true to preserve its
existing behavior. Do exactly the same thing with __skb_put_padto() and
skb_put_padto().
Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
include/linux/skbuff.h | 41 +++++++++++++++++++++++++++++++++++++----
net/core/skbuff.c | 13 ++++++++-----
2 files changed, 45 insertions(+), 9 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index dbe29b6c9bd6..d67a8182e5eb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -973,7 +973,23 @@ int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg
int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg,
int offset, int len);
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
-int skb_pad(struct sk_buff *skb, int pad);
+int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error);
+
+/**
+ * skb_pad - zero pad the tail of an skb
+ * @skb: buffer to pad
+ * @pad: space to pad
+ *
+ * Ensure that a buffer is followed by a padding area that is zero
+ * filled. Used by network drivers which may DMA or transfer data
+ * beyond the buffer end onto the wire.
+ *
+ * May return error in out of memory cases. The skb is freed on error.
+ */
+static inline int skb_pad(struct sk_buff *skb, int pad)
+{
+ return __skb_pad(skb, pad, true);
+}
#define dev_kfree_skb(a) consume_skb(a)
int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
@@ -2825,25 +2841,42 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len)
* skb_put_padto - increase size and pad an skbuff up to a minimal size
* @skb: buffer to pad
* @len: minimal length
+ * @free_on_error: free buffer on error
*
* Pads up a buffer to ensure the trailing bytes exist and are
* blanked. If the buffer already contains sufficient data it
* is untouched. Otherwise it is extended. Returns zero on
- * success. The skb is freed on error.
+ * success. The skb is freed on error if @free_on_error is true.
*/
-static inline int skb_put_padto(struct sk_buff *skb, unsigned int len)
+static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len,
+ bool free_on_error)
{
unsigned int size = skb->len;
if (unlikely(size < len)) {
len -= size;
- if (skb_pad(skb, len))
+ if (__skb_pad(skb, len, free_on_error))
return -ENOMEM;
__skb_put(skb, len);
}
return 0;
}
+/**
+ * skb_put_padto - increase size and pad an skbuff up to a minimal size
+ * @skb: buffer to pad
+ * @len: minimal length
+ *
+ * Pads up a buffer to ensure the trailing bytes exist and are
+ * blanked. If the buffer already contains sufficient data it
+ * is untouched. Otherwise it is extended. Returns zero on
+ * success. The skb is freed on error.
+ */
+static inline int skb_put_padto(struct sk_buff *skb, unsigned int len)
+{
+ return __skb_put_padto(skb, len, true);
+}
+
static inline int skb_add_data(struct sk_buff *skb,
struct iov_iter *from, int copy)
{
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f990eb8b30a9..e07556606284 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1363,18 +1363,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
EXPORT_SYMBOL(skb_copy_expand);
/**
- * skb_pad - zero pad the tail of an skb
+ * __skb_pad - zero pad the tail of an skb
* @skb: buffer to pad
* @pad: space to pad
+ * @free_on_error: free buffer on error
*
* Ensure that a buffer is followed by a padding area that is zero
* filled. Used by network drivers which may DMA or transfer data
* beyond the buffer end onto the wire.
*
- * May return error in out of memory cases. The skb is freed on error.
+ * May return error in out of memory cases. The skb is freed on error
+ * if @free_on_error is true.
*/
-int skb_pad(struct sk_buff *skb, int pad)
+int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
{
int err;
int ntail;
@@ -1403,10 +1405,11 @@ int skb_pad(struct sk_buff *skb, int pad)
return 0;
free_skb:
- kfree_skb(skb);
+ if (free_on_error)
+ kfree_skb(skb);
return err;
}
-EXPORT_SYMBOL(skb_pad);
+EXPORT_SYMBOL(__skb_pad);
/**
* pskb_put - add data to the tail of a potentially fragmented buffer
--
2.9.3
^ permalink raw reply related
* [PATCH net v2 0/2] net: dsa: Fix tag_ksz.c
From: Florian Fainelli @ 2017-08-22 22:12 UTC (permalink / raw)
To: netdev; +Cc: davem, vivien.didelot, Woojung.Huh, UNGLinuxDriver,
Florian Fainelli
This implements David's suggestion of providing low-level functions
to control whether skb_pad() and skb_put_padto() should be freeing
the passed skb.
We make use of them to fix a double free in net/dsa/tag_ksz.c that would
occur if we kept using skb_put_padto() in both places.
Florian Fainelli (2):
net: core: Specify skb_pad()/skb_put_padto() SKB freeing
net: dsa: skb_put_padto() already frees nskb
include/linux/skbuff.h | 41 +++++++++++++++++++++++++++++++++++++----
net/core/skbuff.c | 13 ++++++++-----
net/dsa/tag_ksz.c | 10 ++++++----
3 files changed, 51 insertions(+), 13 deletions(-)
--
2.9.3
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox