Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf 1/2] bpf: fix alignment of netns_dev/netns_ino fields in bpf_{map,prog}_info
From: Eugene Syromiatnikov @ 2018-05-27 11:28 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, Martin KaFai Lau, Daniel Borkmann,
	Alexei Starovoitov, David S. Miller, Jiri Olsa, Ingo Molnar,
	Lawrence Brakmo, Andrey Ignatov, Jakub Kicinski, John Fastabend,
	Dmitry V. Levin

Recent introduction of netns_dev/netns_ino to bpf_map_info/bpf_prog info
has broken compat, as offsets of these fields are different in 32-bit
and 64-bit ABIs.  One fix (other than implementing compat support in
syscall in order to handle this discrepancy) is to use __aligned_u64
instead of __u64 for these fields.

Reported-by: Dmitry V. Levin <ldv@altlinux.org>
Fixes: 52775b33bb507 ("bpf: offload: report device information about
offloaded maps")
Fixes: 675fc275a3a2d ("bpf: offload: report device information for
offloaded programs")

Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
---
 include/uapi/linux/bpf.h       | 8 ++++----
 tools/include/uapi/linux/bpf.h | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec897..903010a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1017,8 +1017,8 @@ struct bpf_prog_info {
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
-	__u64 netns_dev;
-	__u64 netns_ino;
+	__aligned_u64 netns_dev;
+	__aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -1030,8 +1030,8 @@ struct bpf_map_info {
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
-	__u64 netns_dev;
-	__u64 netns_ino;
+	__aligned_u64 netns_dev;
+	__aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c5ec897..903010a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1017,8 +1017,8 @@ struct bpf_prog_info {
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
-	__u64 netns_dev;
-	__u64 netns_ino;
+	__aligned_u64 netns_dev;
+	__aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -1030,8 +1030,8 @@ struct bpf_map_info {
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
-	__u64 netns_dev;
-	__u64 netns_ino;
+	__aligned_u64 netns_dev;
+	__aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
-- 
2.1.4

^ permalink raw reply related

* [PATCH bpf 2/2] bpf: enforce usage of __aligned_u64 in the UAPI header
From: Eugene Syromiatnikov @ 2018-05-27 11:28 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, Martin KaFai Lau, Daniel Borkmann,
	Alexei Starovoitov, David S. Miller, Jiri Olsa, Ingo Molnar,
	Lawrence Brakmo, Andrey Ignatov, Jakub Kicinski, John Fastabend,
	Dmitry V. Levin

Use __aligned_u64 instead of __u64 everywhere in the UAPI header,
similarly to v4.17-rc1~94^2~58^2 "RDMA: Change all uapi headers to use
__aligned_u64 instead of __u64".

This commit doesn't change structure layouts, but imposes additional
alignment requirements on struct bpf_stack_build_id, struct
bpf_sock_ops, struct bpf_perf_event_value, and struct
bpf_raw_tracepoint_args; of them only struct bpf_sock_ops and struct
bpf_perf_event_value have 64-bit fields that were present in a released
kernel without this additional alignment requirement (bytes_received
and bytes_acked were added to struct bpf_sock_ops in commit
v4.16-rc1~123^2~33^2~5^2~4, struct bpf_perf_event_value was added
in commit v4.15-rc1~84^2~532^2~3).

Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
---
 include/uapi/linux/bpf.h       | 22 +++++++++++-----------
 tools/include/uapi/linux/bpf.h | 22 +++++++++++-----------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 903010a..174e99a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -259,8 +259,8 @@ struct bpf_stack_build_id {
 	__s32		status;
 	unsigned char	build_id[BPF_BUILD_ID_SIZE];
 	union {
-		__u64	offset;
-		__u64	ip;
+		__aligned_u64	offset;
+		__aligned_u64	ip;
 	};
 };
 
@@ -288,7 +288,7 @@ union bpf_attr {
 			__aligned_u64 value;
 			__aligned_u64 next_key;
 		};
-		__u64		flags;
+		__aligned_u64	flags;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
@@ -360,7 +360,7 @@ union bpf_attr {
 	} query;
 
 	struct {
-		__u64 name;
+		__aligned_u64 name;
 		__u32 prog_fd;
 	} raw_tracepoint;
 } __attribute__((aligned(8)));
@@ -1011,7 +1011,7 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
-	__u64 load_time;	/* ns since boottime */
+	__aligned_u64 load_time;	/* ns since boottime */
 	__u32 created_by_uid;
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
@@ -1101,8 +1101,8 @@ struct bpf_sock_ops {
 	__u32 lost_out;
 	__u32 sacked_out;
 	__u32 sk_txhash;
-	__u64 bytes_received;
-	__u64 bytes_acked;
+	__aligned_u64 bytes_received;
+	__aligned_u64 bytes_acked;
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
@@ -1189,9 +1189,9 @@ enum {
 #define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */
 
 struct bpf_perf_event_value {
-	__u64 counter;
-	__u64 enabled;
-	__u64 running;
+	__aligned_u64 counter;
+	__aligned_u64 enabled;
+	__aligned_u64 running;
 };
 
 #define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
@@ -1209,7 +1209,7 @@ struct bpf_cgroup_dev_ctx {
 };
 
 struct bpf_raw_tracepoint_args {
-	__u64 args[0];
+	__aligned_u64 args[0];
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 903010a..174e99a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -259,8 +259,8 @@ struct bpf_stack_build_id {
 	__s32		status;
 	unsigned char	build_id[BPF_BUILD_ID_SIZE];
 	union {
-		__u64	offset;
-		__u64	ip;
+		__aligned_u64	offset;
+		__aligned_u64	ip;
 	};
 };
 
@@ -288,7 +288,7 @@ union bpf_attr {
 			__aligned_u64 value;
 			__aligned_u64 next_key;
 		};
-		__u64		flags;
+		__aligned_u64	flags;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
@@ -360,7 +360,7 @@ union bpf_attr {
 	} query;
 
 	struct {
-		__u64 name;
+		__aligned_u64 name;
 		__u32 prog_fd;
 	} raw_tracepoint;
 } __attribute__((aligned(8)));
@@ -1011,7 +1011,7 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
-	__u64 load_time;	/* ns since boottime */
+	__aligned_u64 load_time;	/* ns since boottime */
 	__u32 created_by_uid;
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
@@ -1101,8 +1101,8 @@ struct bpf_sock_ops {
 	__u32 lost_out;
 	__u32 sacked_out;
 	__u32 sk_txhash;
-	__u64 bytes_received;
-	__u64 bytes_acked;
+	__aligned_u64 bytes_received;
+	__aligned_u64 bytes_acked;
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
@@ -1189,9 +1189,9 @@ enum {
 #define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */
 
 struct bpf_perf_event_value {
-	__u64 counter;
-	__u64 enabled;
-	__u64 running;
+	__aligned_u64 counter;
+	__aligned_u64 enabled;
+	__aligned_u64 running;
 };
 
 #define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
@@ -1209,7 +1209,7 @@ struct bpf_cgroup_dev_ctx {
 };
 
 struct bpf_raw_tracepoint_args {
-	__u64 args[0];
+	__aligned_u64 args[0];
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH net] mlxsw: spectrum: Forbid creation of VLAN 1 over port/LAG
From: Jiri Pirko @ 2018-05-27 12:26 UTC (permalink / raw)
  To: Ido Schimmel; +Cc: netdev, davem, jiri, petrm, mlxsw
In-Reply-To: <20180527064841.32199-1-idosch@mellanox.com>

Sun, May 27, 2018 at 08:48:41AM CEST, idosch@mellanox.com wrote:
>From: Petr Machata <petrm@mellanox.com>
>
>VLAN 1 is internally used for untagged traffic. Prevent creation of
>explicit netdevice for that VLAN, because that currently isn't supported
>and leads to the NULL pointer dereference cited below.
>
>Fix by preventing creation of VLAN devices with VID of 1 over mlxsw
>devices or LAG devices that involve mlxsw devices.
>
>[  327.175816] ================================================================================
>[  327.184544] UBSAN: Undefined behaviour in drivers/net/ethernet/mellanox/mlxsw/spectrum_fid.c:200:12
>[  327.193667] member access within null pointer of type 'const struct mlxsw_sp_fid'
>[  327.201226] CPU: 0 PID: 8983 Comm: ip Not tainted 4.17.0-rc4-petrm_net_ip6gre_headroom-custom-140 #11
>[  327.210496] Hardware name: Mellanox Technologies Ltd. "MSN2410-CB2F"/"SA000874", BIOS 4.6.5 03/08/2016
>[  327.219872] Call Trace:
>[  327.222384]  dump_stack+0xc3/0x12b
>[  327.234007]  ubsan_epilogue+0x9/0x49
>[  327.237638]  ubsan_type_mismatch_common+0x1f9/0x2d0
>[  327.255769]  __ubsan_handle_type_mismatch+0x90/0xa7
>[  327.264716]  mlxsw_sp_fid_type+0x35/0x50 [mlxsw_spectrum]
>[  327.270255]  mlxsw_sp_port_vlan_router_leave+0x46/0xc0 [mlxsw_spectrum]
>[  327.277019]  mlxsw_sp_inetaddr_port_vlan_event+0xe1/0x340 [mlxsw_spectrum]
>[  327.315031]  mlxsw_sp_netdevice_vrf_event+0xa8/0x100 [mlxsw_spectrum]
>[  327.321626]  mlxsw_sp_netdevice_event+0x276/0x430 [mlxsw_spectrum]
>[  327.367863]  notifier_call_chain+0x4c/0x150
>[  327.372128]  __netdev_upper_dev_link+0x1b3/0x260
>[  327.399450]  vrf_add_slave+0xce/0x170 [vrf]
>[  327.403703]  do_setlink+0x658/0x1d70
>[  327.508998]  rtnl_newlink+0x908/0xf20
>[  327.559128]  rtnetlink_rcv_msg+0x50c/0x720
>[  327.571720]  netlink_rcv_skb+0x16a/0x1f0
>[  327.583450]  netlink_unicast+0x2ca/0x3e0
>[  327.599305]  netlink_sendmsg+0x3e2/0x7f0
>[  327.616655]  sock_sendmsg+0x76/0xc0
>[  327.620207]  ___sys_sendmsg+0x494/0x5d0
>[  327.666117]  __sys_sendmsg+0xc2/0x130
>[  327.690953]  do_syscall_64+0x66/0x370
>[  327.694677]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>[  327.699782] RIP: 0033:0x7f4c2f3f8037
>[  327.703393] RSP: 002b:00007ffe8c389708 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
>[  327.711035] RAX: ffffffffffffffda RBX: 000000005b03f53e RCX: 00007f4c2f3f8037
>[  327.718229] RDX: 0000000000000000 RSI: 00007ffe8c389760 RDI: 0000000000000003
>[  327.725431] RBP: 00007ffe8c389760 R08: 0000000000000000 R09: 00007f4c2f443630
>[  327.732632] R10: 00000000000005eb R11: 0000000000000246 R12: 0000000000000000
>[  327.739833] R13: 00000000006774e0 R14: 00007ffe8c3897e8 R15: 0000000000000000
>[  327.747096] ================================================================================
>
>Fixes: 9589a7b5d7d9 ("mlxsw: spectrum: Handle VLAN devices linking / unlinking")
>Suggested-by: Ido Schimmel <idosch@mellanox.com>
>Signed-off-by: Petr Machata <petrm@mellanox.com>
>Signed-off-by: Ido Schimmel <idosch@mellanox.com>

Acked-by: Jiri Pirko <jiri@mellanox.com>

^ permalink raw reply

* Congratulations,
From: Mikhail Fridman @ 2018-05-27 12:39 UTC (permalink / raw)




-- 
Hello,

I Mikhail Fridman. has selected you specially as one of my beneficiaries
for my Charitable Donation, Just as I have declared on May 23, 2016 to
give my fortune as charity.

Check the link below for confirmation:

https://www.ibtimes.co.uk/russias-second-wealthiest-man-mikhail-fridman-plans-leaving-14-2bn-fortune-charity-1561604

Reply as soon as possible with further directives.


Best Regards,
Mikhail Fridman.

^ permalink raw reply

* [PATCH] iptables-compat: homogenize error message
From: Arushi Singhal @ 2018-05-27 13:09 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, coreteam, netdev, linux-kernel

There is a difference between error messages in iptables and
iptables-compat:

#sudo iptables-compat -D INPUT 4
iptables: No chain/target/match by that name.

#sudo iptables -D INPUT 4
iptables: Index of deletion too big.

Now, will show same error message.

Signed-off-by: Arushi Singhal <arushisinghal19971997@gmail.com>
---
 iptables/nft.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/iptables/nft.c b/iptables/nft.c
index e33d00f..40646f4 100644
--- a/iptables/nft.c
+++ b/iptables/nft.c
@@ -2603,7 +2603,7 @@ const char *nft_strerror(int err)
 	    { nft_rule_add, E2BIG, "Index of insertion too big" },
 	    { nft_rule_check, ENOENT, "Bad rule (does a matching rule exist in that chain?)" },
 	    { nft_rule_replace, ENOENT, "Index of replacement too big" },
-	    { nft_rule_delete_num, E2BIG, "Index of deletion too big" },
+	    { nft_rule_delete_num, ENOENT, "Index of deletion too big" },
 /*	    { TC_READ_COUNTER, E2BIG, "Index of counter too big" },
 	    { TC_ZERO_COUNTER, E2BIG, "Index of counter too big" }, */
 	    { nft_rule_add, ELOOP, "Loop found in table" },
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 0/7] net: Add address attribute to control metric of prefix route
From: dsahern @ 2018-05-27 15:09 UTC (permalink / raw)
  To: netdev; +Cc: roopa, David Ahern

From: David Ahern <dsahern@gmail.com>

For use cases such as VRR (Virtual Router Redundancy) interface managers
want efficient control over the order of prefix routes when multiple
interfaces have addresses with overlapping/duplicate subnets.

Currently, if two interfaces have addresses in the same subnet, the order
of the prefix route entries is determined by the order in which the
addresses are assigned or the links brought up. Any actions like cycling
an interface up and down changes that order. This set adds a new attribute
for addresses to allow a user to specify the metric of the prefix route
associated with an address giving interface managers better and more
efficient control of the order of prefix routes.

Patches 1-3 refactor IPv6 address add functions to pass an ifa6_config
struct. The functions currently have a long list of arguments and adding
the metric just makes it worse. Because of the overall diff size in
moving the arguments to a struct, the change is done in stages to make
it easier to review starting with the bottom function and pushing the
struct up to callers in each successive patch.

Patch 4 introduces the new attribute.

Patches 5 and 6 add support for the new attribute to IPv4 and IPv6
addresses.

Patch 7 adds a set of test cases.

Patch 8 adds support to iproute2

Changes since RFC
- collapsed patches 1 and 3 into patch 2
- simplified stack variables in fib_modify_prefix_metric in patch 5

David Ahern (7):
  net/ipv6: Convert ipv6_add_addr to struct ifa6_config
  net/ipv6: Pass ifa6_config struct to inet6_addr_add
  net/ipv6: Pass ifa6_config struct to inet6_addr_modify
  net: Add IFA_RT_PRIORITY address attribute
  net/ipv4: Add support for specifying metric of connected routes
  net/ipv6: Add support for specifying metric of connected routes
  selftests: fib_tests: Add prefix route tests with metric

 include/linux/inetdevice.h               |   1 +
 include/net/addrconf.h                   |  13 ++
 include/net/if_inet6.h                   |   1 +
 include/net/route.h                      |   1 +
 include/uapi/linux/if_addr.h             |   1 +
 net/ipv4/devinet.c                       |  15 ++
 net/ipv4/fib_frontend.c                  |  53 ++++-
 net/ipv6/addrconf.c                      | 359 +++++++++++++++++++------------
 tools/testing/selftests/net/fib_tests.sh | 181 +++++++++++++++-
 9 files changed, 472 insertions(+), 153 deletions(-)
 mode change 100755 => 100644 tools/testing/selftests/net/fib_tests.sh

-- 
2.11.0

^ permalink raw reply

* [PATCH net-next 1/7] net/ipv6: Convert ipv6_add_addr to struct ifa6_config
From: dsahern @ 2018-05-27 15:09 UTC (permalink / raw)
  To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180527151000.30488-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Move config parameters for adding an ipv6 address to a struct. struct
names stem from inet6_rtm_newaddr which is the modern handler for
adding an address.

Start the conversion to ifa6_config with ipv6_add_addr. This is an argument
move only; no functional change intended. Mapping of variable changes:

    addr      -->  cfg->pfx
    peer_addr -->  cfg->peer_pfx
    pfxlen    -->  cfg->plen
    flags     -->  cfg->ifa_flags

scope, valid_lft, prefered_lft have the same names within cfg
(with corrected spelling).

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/addrconf.h |  12 +++++
 net/ipv6/addrconf.c    | 134 +++++++++++++++++++++++++++----------------------
 2 files changed, 87 insertions(+), 59 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index c07d4dd09361..f766af2cd1a4 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -59,6 +59,18 @@ struct in6_validator_info {
 	struct netlink_ext_ack	*extack;
 };
 
+struct ifa6_config {
+	const struct in6_addr	*pfx;
+	unsigned int		plen;
+
+	const struct in6_addr	*peer_pfx;
+
+	u32			ifa_flags;
+	u32			preferred_lft;
+	u32			valid_lft;
+	u16			scope;
+};
+
 int addrconf_init(void);
 void addrconf_cleanup(void);
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index fbfd71a2d9c8..4988f2265882 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -986,17 +986,15 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
 /* On success it returns ifp with increased reference count */
 
 static struct inet6_ifaddr *
-ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
-	      const struct in6_addr *peer_addr, int pfxlen,
-	      int scope, u32 flags, u32 valid_lft, u32 prefered_lft,
+ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
 	      bool can_block, struct netlink_ext_ack *extack)
 {
 	gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
+	int addr_type = ipv6_addr_type(cfg->pfx);
 	struct net *net = dev_net(idev->dev);
 	struct inet6_ifaddr *ifa = NULL;
 	struct fib6_info *f6i = NULL;
 	int err = 0;
-	int addr_type = ipv6_addr_type(addr);
 
 	if (addr_type == IPV6_ADDR_ANY ||
 	    addr_type & IPV6_ADDR_MULTICAST ||
@@ -1019,7 +1017,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 	 */
 	if (can_block) {
 		struct in6_validator_info i6vi = {
-			.i6vi_addr = *addr,
+			.i6vi_addr = *cfg->pfx,
 			.i6vi_dev = idev,
 			.extack = extack,
 		};
@@ -1036,7 +1034,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 		goto out;
 	}
 
-	f6i = addrconf_f6i_alloc(net, idev, addr, false, gfp_flags);
+	f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags);
 	if (IS_ERR(f6i)) {
 		err = PTR_ERR(f6i);
 		f6i = NULL;
@@ -1049,21 +1047,21 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 
 	neigh_parms_data_state_setall(idev->nd_parms);
 
-	ifa->addr = *addr;
-	if (peer_addr)
-		ifa->peer_addr = *peer_addr;
+	ifa->addr = *cfg->pfx;
+	if (cfg->peer_pfx)
+		ifa->peer_addr = *cfg->peer_pfx;
 
 	spin_lock_init(&ifa->lock);
 	INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work);
 	INIT_HLIST_NODE(&ifa->addr_lst);
-	ifa->scope = scope;
-	ifa->prefix_len = pfxlen;
-	ifa->flags = flags;
+	ifa->scope = cfg->scope;
+	ifa->prefix_len = cfg->plen;
+	ifa->flags = cfg->ifa_flags;
 	/* No need to add the TENTATIVE flag for addresses with NODAD */
-	if (!(flags & IFA_F_NODAD))
+	if (!(cfg->ifa_flags & IFA_F_NODAD))
 		ifa->flags |= IFA_F_TENTATIVE;
-	ifa->valid_lft = valid_lft;
-	ifa->prefered_lft = prefered_lft;
+	ifa->valid_lft = cfg->valid_lft;
+	ifa->prefered_lft = cfg->preferred_lft;
 	ifa->cstamp = ifa->tstamp = jiffies;
 	ifa->tokenized = false;
 
@@ -1260,11 +1258,10 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
 {
 	struct inet6_dev *idev = ifp->idev;
 	struct in6_addr addr, *tmpaddr;
-	unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age;
+	unsigned long tmp_tstamp, age;
 	unsigned long regen_advance;
-	int tmp_plen;
+	struct ifa6_config cfg;
 	int ret = 0;
-	u32 addr_flags;
 	unsigned long now = jiffies;
 	long max_desync_factor;
 	s32 cnf_temp_preferred_lft;
@@ -1326,13 +1323,12 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
 		}
 	}
 
-	tmp_valid_lft = min_t(__u32,
-			      ifp->valid_lft,
+	cfg.valid_lft = min_t(__u32, ifp->valid_lft,
 			      idev->cnf.temp_valid_lft + age);
-	tmp_prefered_lft = cnf_temp_preferred_lft + age -
-			    idev->desync_factor;
-	tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft);
-	tmp_plen = ifp->prefix_len;
+	cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
+	cfg.preferred_lft = min_t(__u32, ifp->prefered_lft, cfg.preferred_lft);
+
+	cfg.plen = ifp->prefix_len;
 	tmp_tstamp = ifp->tstamp;
 	spin_unlock_bh(&ifp->lock);
 
@@ -1346,21 +1342,22 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
 	 * temporary addresses being generated.
 	 */
 	age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
-	if (tmp_prefered_lft <= regen_advance + age) {
+	if (cfg.preferred_lft <= regen_advance + age) {
 		in6_ifa_put(ifp);
 		in6_dev_put(idev);
 		ret = -1;
 		goto out;
 	}
 
-	addr_flags = IFA_F_TEMPORARY;
+	cfg.ifa_flags = IFA_F_TEMPORARY;
 	/* set in addrconf_prefix_rcv() */
 	if (ifp->flags & IFA_F_OPTIMISTIC)
-		addr_flags |= IFA_F_OPTIMISTIC;
+		cfg.ifa_flags |= IFA_F_OPTIMISTIC;
+
+	cfg.pfx = &addr;
+	cfg.scope = ipv6_addr_scope(cfg.pfx);
 
-	ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen,
-			    ipv6_addr_scope(&addr), addr_flags,
-			    tmp_valid_lft, tmp_prefered_lft, block, NULL);
+	ift = ipv6_add_addr(idev, &cfg, block, NULL);
 	if (IS_ERR(ift)) {
 		in6_ifa_put(ifp);
 		in6_dev_put(idev);
@@ -2031,13 +2028,17 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
 	spin_lock_bh(&ifp->lock);
 
 	if (ifp->flags & IFA_F_STABLE_PRIVACY) {
-		int scope = ifp->scope;
-		u32 flags = ifp->flags;
 		struct in6_addr new_addr;
 		struct inet6_ifaddr *ifp2;
-		u32 valid_lft, preferred_lft;
-		int pfxlen = ifp->prefix_len;
 		int retries = ifp->stable_privacy_retry + 1;
+		struct ifa6_config cfg = {
+			.pfx = &new_addr,
+			.plen = ifp->prefix_len,
+			.ifa_flags = ifp->flags,
+			.valid_lft = ifp->valid_lft,
+			.preferred_lft = ifp->prefered_lft,
+			.scope = ifp->scope,
+		};
 
 		if (retries > net->ipv6.sysctl.idgen_retries) {
 			net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n",
@@ -2050,9 +2051,6 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
 						 idev))
 			goto errdad;
 
-		valid_lft = ifp->valid_lft;
-		preferred_lft = ifp->prefered_lft;
-
 		spin_unlock_bh(&ifp->lock);
 
 		if (idev->cnf.max_addresses &&
@@ -2063,9 +2061,7 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
 		net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n",
 				     ifp->idev->dev->name);
 
-		ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen,
-				     scope, flags, valid_lft,
-				     preferred_lft, false, NULL);
+		ifp2 = ipv6_add_addr(idev, &cfg, false, NULL);
 		if (IS_ERR(ifp2))
 			goto lock_errdad;
 
@@ -2507,12 +2503,20 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
 
 	if (!ifp && valid_lft) {
 		int max_addresses = in6_dev->cnf.max_addresses;
+		struct ifa6_config cfg = {
+			.pfx = addr,
+			.plen = pinfo->prefix_len,
+			.ifa_flags = addr_flags,
+			.valid_lft = valid_lft,
+			.preferred_lft = prefered_lft,
+			.scope = addr_type & IPV6_ADDR_SCOPE_MASK,
+		};
 
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 		if ((net->ipv6.devconf_all->optimistic_dad ||
 		     in6_dev->cnf.optimistic_dad) &&
 		    !net->ipv6.devconf_all->forwarding && sllao)
-			addr_flags |= IFA_F_OPTIMISTIC;
+			cfg.ifa_flags |= IFA_F_OPTIMISTIC;
 #endif
 
 		/* Do not allow to create too much of autoconfigured
@@ -2520,11 +2524,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
 		 */
 		if (!max_addresses ||
 		    ipv6_count_addresses(in6_dev) < max_addresses)
-			ifp = ipv6_add_addr(in6_dev, addr, NULL,
-					    pinfo->prefix_len,
-					    addr_type&IPV6_ADDR_SCOPE_MASK,
-					    addr_flags, valid_lft,
-					    prefered_lft, false, NULL);
+			ifp = ipv6_add_addr(in6_dev, &cfg, false, NULL);
 
 		if (IS_ERR_OR_NULL(ifp))
 			return -1;
@@ -2836,12 +2836,19 @@ static int inet6_addr_add(struct net *net, int ifindex,
 			  __u32 prefered_lft, __u32 valid_lft,
 			  struct netlink_ext_ack *extack)
 {
+	struct ifa6_config cfg = {
+		.pfx = pfx,
+		.plen = plen,
+		.peer_pfx = peer_pfx,
+		.ifa_flags = ifa_flags,
+		.preferred_lft = prefered_lft,
+		.valid_lft = valid_lft,
+	};
 	struct inet6_ifaddr *ifp;
 	struct inet6_dev *idev;
 	struct net_device *dev;
 	unsigned long timeout;
 	clock_t expires;
-	int scope;
 	u32 flags;
 
 	ASSERT_RTNL();
@@ -2872,7 +2879,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
 			return ret;
 	}
 
-	scope = ipv6_addr_scope(pfx);
+	cfg.scope = ipv6_addr_scope(pfx);
 
 	timeout = addrconf_timeout_fixup(valid_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
@@ -2892,9 +2899,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
 		prefered_lft = timeout;
 	}
 
-	ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags,
-			    valid_lft, prefered_lft, true, extack);
-
+	ifp = ipv6_add_addr(idev, &cfg, true, extack);
 	if (!IS_ERR(ifp)) {
 		if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
 			addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
@@ -3010,11 +3015,16 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 		     int plen, int scope)
 {
 	struct inet6_ifaddr *ifp;
+	struct ifa6_config cfg = {
+		.pfx = addr,
+		.plen = plen,
+		.ifa_flags = IFA_F_PERMANENT,
+		.valid_lft = INFINITY_LIFE_TIME,
+		.preferred_lft = INFINITY_LIFE_TIME,
+		.scope = scope
+	};
 
-	ifp = ipv6_add_addr(idev, addr, NULL, plen,
-			    scope, IFA_F_PERMANENT,
-			    INFINITY_LIFE_TIME, INFINITY_LIFE_TIME,
-			    true, NULL);
+	ifp = ipv6_add_addr(idev, &cfg, true, NULL);
 	if (!IS_ERR(ifp)) {
 		spin_lock_bh(&ifp->lock);
 		ifp->flags &= ~IFA_F_TENTATIVE;
@@ -3104,18 +3114,24 @@ static void init_loopback(struct net_device *dev)
 void addrconf_add_linklocal(struct inet6_dev *idev,
 			    const struct in6_addr *addr, u32 flags)
 {
+	struct ifa6_config cfg = {
+		.pfx = addr,
+		.plen = 64,
+		.ifa_flags = flags | IFA_F_PERMANENT,
+		.valid_lft = INFINITY_LIFE_TIME,
+		.preferred_lft = INFINITY_LIFE_TIME,
+		.scope = IFA_LINK
+	};
 	struct inet6_ifaddr *ifp;
-	u32 addr_flags = flags | IFA_F_PERMANENT;
 
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad ||
 	     idev->cnf.optimistic_dad) &&
 	    !dev_net(idev->dev)->ipv6.devconf_all->forwarding)
-		addr_flags |= IFA_F_OPTIMISTIC;
+		cfg.ifa_flags |= IFA_F_OPTIMISTIC;
 #endif
 
-	ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags,
-			    INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL);
+	ifp = ipv6_add_addr(idev, &cfg, true, NULL);
 	if (!IS_ERR(ifp)) {
 		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev,
 				      0, 0, GFP_ATOMIC);
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 2/7] net/ipv6: Pass ifa6_config struct to inet6_addr_add
From: dsahern @ 2018-05-27 15:09 UTC (permalink / raw)
  To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180527151000.30488-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Move the creation of struct ifa6_config up to callers of inet6_addr_add.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/addrconf.c | 112 ++++++++++++++++++++++++++--------------------------
 1 file changed, 57 insertions(+), 55 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4988f2265882..2db1acf70610 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2830,20 +2830,9 @@ static int ipv6_mc_config(struct sock *sk, bool join,
  *	Manual configuration of address on an interface
  */
 static int inet6_addr_add(struct net *net, int ifindex,
-			  const struct in6_addr *pfx,
-			  const struct in6_addr *peer_pfx,
-			  unsigned int plen, __u32 ifa_flags,
-			  __u32 prefered_lft, __u32 valid_lft,
+			  struct ifa6_config *cfg,
 			  struct netlink_ext_ack *extack)
 {
-	struct ifa6_config cfg = {
-		.pfx = pfx,
-		.plen = plen,
-		.peer_pfx = peer_pfx,
-		.ifa_flags = ifa_flags,
-		.preferred_lft = prefered_lft,
-		.valid_lft = valid_lft,
-	};
 	struct inet6_ifaddr *ifp;
 	struct inet6_dev *idev;
 	struct net_device *dev;
@@ -2853,14 +2842,14 @@ static int inet6_addr_add(struct net *net, int ifindex,
 
 	ASSERT_RTNL();
 
-	if (plen > 128)
+	if (cfg->plen > 128)
 		return -EINVAL;
 
 	/* check the lifetime */
-	if (!valid_lft || prefered_lft > valid_lft)
+	if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
 		return -EINVAL;
 
-	if (ifa_flags & IFA_F_MANAGETEMPADDR && plen != 64)
+	if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64)
 		return -EINVAL;
 
 	dev = __dev_get_by_index(net, ifindex);
@@ -2871,37 +2860,37 @@ static int inet6_addr_add(struct net *net, int ifindex,
 	if (IS_ERR(idev))
 		return PTR_ERR(idev);
 
-	if (ifa_flags & IFA_F_MCAUTOJOIN) {
+	if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
 		int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
-					 true, pfx, ifindex);
+					 true, cfg->pfx, ifindex);
 
 		if (ret < 0)
 			return ret;
 	}
 
-	cfg.scope = ipv6_addr_scope(pfx);
+	cfg->scope = ipv6_addr_scope(cfg->pfx);
 
-	timeout = addrconf_timeout_fixup(valid_lft, HZ);
+	timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
 		expires = jiffies_to_clock_t(timeout * HZ);
-		valid_lft = timeout;
+		cfg->valid_lft = timeout;
 		flags = RTF_EXPIRES;
 	} else {
 		expires = 0;
 		flags = 0;
-		ifa_flags |= IFA_F_PERMANENT;
+		cfg->ifa_flags |= IFA_F_PERMANENT;
 	}
 
-	timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+	timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
 		if (timeout == 0)
-			ifa_flags |= IFA_F_DEPRECATED;
-		prefered_lft = timeout;
+			cfg->ifa_flags |= IFA_F_DEPRECATED;
+		cfg->preferred_lft = timeout;
 	}
 
-	ifp = ipv6_add_addr(idev, &cfg, true, extack);
+	ifp = ipv6_add_addr(idev, cfg, true, extack);
 	if (!IS_ERR(ifp)) {
-		if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
+		if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
 			addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
 					      expires, flags, GFP_KERNEL);
 		}
@@ -2917,15 +2906,15 @@ static int inet6_addr_add(struct net *net, int ifindex,
 		 * manually configured addresses
 		 */
 		addrconf_dad_start(ifp);
-		if (ifa_flags & IFA_F_MANAGETEMPADDR)
-			manage_tempaddrs(idev, ifp, valid_lft, prefered_lft,
-					 true, jiffies);
+		if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR)
+			manage_tempaddrs(idev, ifp, cfg->valid_lft,
+					 cfg->preferred_lft, true, jiffies);
 		in6_ifa_put(ifp);
 		addrconf_verify_rtnl();
 		return 0;
-	} else if (ifa_flags & IFA_F_MCAUTOJOIN) {
-		ipv6_mc_config(net->ipv6.mc_autojoin_sk,
-			       false, pfx, ifindex);
+	} else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
+		ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
+			       cfg->pfx, ifindex);
 	}
 
 	return PTR_ERR(ifp);
@@ -2976,6 +2965,11 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
 
 int addrconf_add_ifaddr(struct net *net, void __user *arg)
 {
+	struct ifa6_config cfg = {
+		.ifa_flags = IFA_F_PERMANENT,
+		.preferred_lft = INFINITY_LIFE_TIME,
+		.valid_lft = INFINITY_LIFE_TIME,
+	};
 	struct in6_ifreq ireq;
 	int err;
 
@@ -2985,10 +2979,11 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
 	if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
 		return -EFAULT;
 
+	cfg.pfx = &ireq.ifr6_addr;
+	cfg.plen = ireq.ifr6_prefixlen;
+
 	rtnl_lock();
-	err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL,
-			     ireq.ifr6_prefixlen, IFA_F_PERMANENT,
-			     INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL);
+	err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL);
 	rtnl_unlock();
 	return err;
 }
@@ -4624,12 +4619,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct net *net = sock_net(skb->sk);
 	struct ifaddrmsg *ifm;
 	struct nlattr *tb[IFA_MAX+1];
-	struct in6_addr *pfx, *peer_pfx;
+	struct in6_addr *peer_pfx;
 	struct inet6_ifaddr *ifa;
 	struct net_device *dev;
 	struct inet6_dev *idev;
-	u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME;
-	u32 ifa_flags;
+	struct ifa6_config cfg;
 	int err;
 
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy,
@@ -4637,60 +4631,68 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	memset(&cfg, 0, sizeof(cfg));
+
 	ifm = nlmsg_data(nlh);
-	pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
-	if (!pfx)
+	cfg.pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
+	if (!cfg.pfx)
 		return -EINVAL;
 
+	cfg.peer_pfx = peer_pfx;
+	cfg.plen = ifm->ifa_prefixlen;
+	cfg.valid_lft = INFINITY_LIFE_TIME;
+	cfg.preferred_lft = INFINITY_LIFE_TIME;
+
 	if (tb[IFA_CACHEINFO]) {
 		struct ifa_cacheinfo *ci;
 
 		ci = nla_data(tb[IFA_CACHEINFO]);
-		valid_lft = ci->ifa_valid;
-		preferred_lft = ci->ifa_prefered;
-	} else {
-		preferred_lft = INFINITY_LIFE_TIME;
-		valid_lft = INFINITY_LIFE_TIME;
+		cfg.valid_lft = ci->ifa_valid;
+		cfg.preferred_lft = ci->ifa_prefered;
 	}
 
 	dev =  __dev_get_by_index(net, ifm->ifa_index);
 	if (!dev)
 		return -ENODEV;
 
-	ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags;
+	if (tb[IFA_FLAGS])
+		cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]);
+	else
+		cfg.ifa_flags = ifm->ifa_flags;
 
 	/* We ignore other flags so far. */
-	ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
-		     IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
+	cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
+			 IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
+			 IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
 
 	idev = ipv6_find_idev(dev);
 	if (IS_ERR(idev))
 		return PTR_ERR(idev);
 
 	if (!ipv6_allow_optimistic_dad(net, idev))
-		ifa_flags &= ~IFA_F_OPTIMISTIC;
+		cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;
 
-	if (ifa_flags & IFA_F_NODAD && ifa_flags & IFA_F_OPTIMISTIC) {
+	if (cfg.ifa_flags & IFA_F_NODAD &&
+	    cfg.ifa_flags & IFA_F_OPTIMISTIC) {
 		NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
 		return -EINVAL;
 	}
 
-	ifa = ipv6_get_ifaddr(net, pfx, dev, 1);
+	ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1);
 	if (!ifa) {
 		/*
 		 * It would be best to check for !NLM_F_CREATE here but
 		 * userspace already relies on not having to provide this.
 		 */
-		return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx,
-				      ifm->ifa_prefixlen, ifa_flags,
-				      preferred_lft, valid_lft, extack);
+		return inet6_addr_add(net, ifm->ifa_index, &cfg, extack);
 	}
 
 	if (nlh->nlmsg_flags & NLM_F_EXCL ||
 	    !(nlh->nlmsg_flags & NLM_F_REPLACE))
 		err = -EEXIST;
 	else
-		err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft);
+		err = inet6_addr_modify(ifa, cfg.ifa_flags, cfg.preferred_lft,
+					cfg.valid_lft);
 
 	in6_ifa_put(ifa);
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 3/7] net/ipv6: Pass ifa6_config struct to inet6_addr_modify
From: dsahern @ 2018-05-27 15:09 UTC (permalink / raw)
  To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180527151000.30488-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Update inet6_addr_modify to take ifa6_config argument versus a parameter
list. This is an argument move only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/addrconf.c | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2db1acf70610..1b1e63a4520b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4527,8 +4527,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 			      ifm->ifa_prefixlen);
 }
 
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
-			     u32 prefered_lft, u32 valid_lft)
+static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
 {
 	u32 flags;
 	clock_t expires;
@@ -4538,32 +4537,32 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
 
 	ASSERT_RTNL();
 
-	if (!valid_lft || (prefered_lft > valid_lft))
+	if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
 		return -EINVAL;
 
-	if (ifa_flags & IFA_F_MANAGETEMPADDR &&
+	if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
 	    (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
 		return -EINVAL;
 
 	if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
-		ifa_flags &= ~IFA_F_OPTIMISTIC;
+		cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;
 
-	timeout = addrconf_timeout_fixup(valid_lft, HZ);
+	timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
 		expires = jiffies_to_clock_t(timeout * HZ);
-		valid_lft = timeout;
+		cfg->valid_lft = timeout;
 		flags = RTF_EXPIRES;
 	} else {
 		expires = 0;
 		flags = 0;
-		ifa_flags |= IFA_F_PERMANENT;
+		cfg->ifa_flags |= IFA_F_PERMANENT;
 	}
 
-	timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+	timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
 		if (timeout == 0)
-			ifa_flags |= IFA_F_DEPRECATED;
-		prefered_lft = timeout;
+			cfg->ifa_flags |= IFA_F_DEPRECATED;
+		cfg->preferred_lft = timeout;
 	}
 
 	spin_lock_bh(&ifp->lock);
@@ -4573,16 +4572,16 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
 	ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD |
 			IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
 			IFA_F_NOPREFIXROUTE);
-	ifp->flags |= ifa_flags;
+	ifp->flags |= cfg->ifa_flags;
 	ifp->tstamp = jiffies;
-	ifp->valid_lft = valid_lft;
-	ifp->prefered_lft = prefered_lft;
+	ifp->valid_lft = cfg->valid_lft;
+	ifp->prefered_lft = cfg->preferred_lft;
 
 	spin_unlock_bh(&ifp->lock);
 	if (!(ifp->flags&IFA_F_TENTATIVE))
 		ipv6_ifa_notify(0, ifp);
 
-	if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
+	if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
 		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
 				      ifp->idev->dev, expires, flags,
 				      GFP_KERNEL);
@@ -4601,10 +4600,14 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
 	}
 
 	if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
-		if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR))
-			valid_lft = prefered_lft = 0;
-		manage_tempaddrs(ifp->idev, ifp, valid_lft, prefered_lft,
-				 !was_managetempaddr, jiffies);
+		if (was_managetempaddr &&
+		    !(ifp->flags & IFA_F_MANAGETEMPADDR)) {
+			cfg->valid_lft = 0;
+			cfg->preferred_lft = 0;
+		}
+		manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
+				 cfg->preferred_lft, !was_managetempaddr,
+				 jiffies);
 	}
 
 	addrconf_verify_rtnl();
@@ -4691,8 +4694,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 	    !(nlh->nlmsg_flags & NLM_F_REPLACE))
 		err = -EEXIST;
 	else
-		err = inet6_addr_modify(ifa, cfg.ifa_flags, cfg.preferred_lft,
-					cfg.valid_lft);
+		err = inet6_addr_modify(ifa, &cfg);
 
 	in6_ifa_put(ifa);
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 4/7] net: Add IFA_RT_PRIORITY address attribute
From: dsahern @ 2018-05-27 15:09 UTC (permalink / raw)
  To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180527151000.30488-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Currently, if two interfaces have addresses in the same connected route,
then the order of the prefix route entries is determined by the order in
which the addresses are assigned or the links brought up.

Add IFA_RT_PRIORITY to allow user to specify the metric of the prefix
route associated with an address giving interface managers better
control of the order of prefix routes.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/uapi/linux/if_addr.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index 2ef053d265de..ebaf5701c9db 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -33,6 +33,7 @@ enum {
 	IFA_CACHEINFO,
 	IFA_MULTICAST,
 	IFA_FLAGS,
+	IFA_RT_PRIORITY,  /* u32, priority/metric for prefix route */
 	__IFA_MAX,
 };

-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 6/7] net/ipv6: Add support for specifying metric of connected routes
From: dsahern @ 2018-05-27 15:09 UTC (permalink / raw)
  To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180527151000.30488-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add support for IFA_RT_PRIORITY to ipv6 addresses.

If the metric is changed on an existing address then the new route
is inserted before removing the old one. Since the metric is one
of the route keys, the prefix route can not be atomically replaced.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/addrconf.h |  1 +
 include/net/if_inet6.h |  1 +
 net/ipv6/addrconf.c    | 93 ++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index f766af2cd1a4..5f43f7a70fe6 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -65,6 +65,7 @@ struct ifa6_config {
 
 	const struct in6_addr	*peer_pfx;
 
+	u32			rt_priority;
 	u32			ifa_flags;
 	u32			preferred_lft;
 	u32			valid_lft;
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index db389253dc2a..d7578cf49c3a 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -42,6 +42,7 @@ enum {
 struct inet6_ifaddr {
 	struct in6_addr		addr;
 	__u32			prefix_len;
+	__u32			rt_priority;
 
 	/* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */
 	__u32			valid_lft;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1b1e63a4520b..f09afc232da4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1056,6 +1056,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
 	INIT_HLIST_NODE(&ifa->addr_lst);
 	ifa->scope = cfg->scope;
 	ifa->prefix_len = cfg->plen;
+	ifa->rt_priority = cfg->rt_priority;
 	ifa->flags = cfg->ifa_flags;
 	/* No need to add the TENTATIVE flag for addresses with NODAD */
 	if (!(cfg->ifa_flags & IFA_F_NODAD))
@@ -1356,6 +1357,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
 
 	cfg.pfx = &addr;
 	cfg.scope = ipv6_addr_scope(cfg.pfx);
+	cfg.rt_priority = 0;
 
 	ift = ipv6_add_addr(idev, &cfg, block, NULL);
 	if (IS_ERR(ift)) {
@@ -2314,12 +2316,13 @@ static void  ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
  */
 
 static void
-addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
-		      unsigned long expires, u32 flags, gfp_t gfp_flags)
+addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric,
+		      struct net_device *dev, unsigned long expires,
+		      u32 flags, gfp_t gfp_flags)
 {
 	struct fib6_config cfg = {
 		.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
-		.fc_metric = IP6_RT_PRIO_ADDRCONF,
+		.fc_metric = metric ? : IP6_RT_PRIO_ADDRCONF,
 		.fc_ifindex = dev->ifindex,
 		.fc_expires = expires,
 		.fc_dst_len = plen,
@@ -2683,7 +2686,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 				expires = jiffies_to_clock_t(rt_expires);
 			}
 			addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
-					      dev, expires, flags, GFP_ATOMIC);
+					      0, dev, expires, flags,
+					      GFP_ATOMIC);
 		}
 		fib6_info_release(rt);
 	}
@@ -2891,8 +2895,9 @@ static int inet6_addr_add(struct net *net, int ifindex,
 	ifp = ipv6_add_addr(idev, cfg, true, extack);
 	if (!IS_ERR(ifp)) {
 		if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
-			addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
-					      expires, flags, GFP_KERNEL);
+			addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+					      ifp->rt_priority, dev, expires,
+					      flags, GFP_KERNEL);
 		}
 
 		/* Send a netlink notification if DAD is enabled and
@@ -3056,7 +3061,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
 
 	if (addr.s6_addr32[3]) {
 		add_addr(idev, &addr, plen, scope);
-		addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags,
+		addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
 				      GFP_ATOMIC);
 		return;
 	}
@@ -3081,8 +3086,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
 				}
 
 				add_addr(idev, &addr, plen, flag);
-				addrconf_prefix_route(&addr, plen, idev->dev, 0,
-						      pflags, GFP_ATOMIC);
+				addrconf_prefix_route(&addr, plen, 0, idev->dev,
+						      0, pflags, GFP_ATOMIC);
 			}
 		}
 	}
@@ -3128,7 +3133,7 @@ void addrconf_add_linklocal(struct inet6_dev *idev,
 
 	ifp = ipv6_add_addr(idev, &cfg, true, NULL);
 	if (!IS_ERR(ifp)) {
-		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev,
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 0, idev->dev,
 				      0, 0, GFP_ATOMIC);
 		addrconf_dad_start(ifp);
 		in6_ifa_put(ifp);
@@ -3244,7 +3249,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 			addrconf_add_linklocal(idev, &addr,
 					       IFA_F_STABLE_PRIVACY);
 		else if (prefix_route)
-			addrconf_prefix_route(&addr, 64, idev->dev,
+			addrconf_prefix_route(&addr, 64, 0, idev->dev,
 					      0, 0, GFP_KERNEL);
 		break;
 	case IN6_ADDR_GEN_MODE_EUI64:
@@ -3255,7 +3260,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 		if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0)
 			addrconf_add_linklocal(idev, &addr, 0);
 		else if (prefix_route)
-			addrconf_prefix_route(&addr, 64, idev->dev,
+			addrconf_prefix_route(&addr, 64, 0, idev->dev,
 					      0, 0, GFP_KERNEL);
 		break;
 	case IN6_ADDR_GEN_MODE_NONE:
@@ -3375,7 +3380,8 @@ static int fixup_permanent_addr(struct net *net,
 
 	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
 		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
-				      idev->dev, 0, 0, GFP_ATOMIC);
+				      ifp->rt_priority, idev->dev, 0, 0,
+				      GFP_ATOMIC);
 	}
 
 	if (ifp->state == INET6_IFADDR_STATE_PREDAD)
@@ -4495,6 +4501,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
 	[IFA_LOCAL]		= { .len = sizeof(struct in6_addr) },
 	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) },
 	[IFA_FLAGS]		= { .len = sizeof(u32) },
+	[IFA_RT_PRIORITY]	= { .len = sizeof(u32) },
 };
 
 static int
@@ -4527,6 +4534,37 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 			      ifm->ifa_prefixlen);
 }
 
+static int modify_prefix_route(struct inet6_ifaddr *ifp,
+			       unsigned long expires, u32 flags)
+{
+	struct fib6_info *f6i;
+
+	f6i = addrconf_get_prefix_route(&ifp->addr,
+					ifp->prefix_len,
+					ifp->idev->dev,
+					0, RTF_GATEWAY | RTF_DEFAULT);
+	if (!f6i)
+		return -ENOENT;
+
+	if (f6i->fib6_metric != ifp->rt_priority) {
+		/* add new one */
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+				      ifp->rt_priority, ifp->idev->dev,
+				      expires, flags, GFP_KERNEL);
+		/* delete old one */
+		ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+	} else {
+		if (!expires)
+			fib6_clean_expires(f6i);
+		else
+			fib6_set_expires(f6i, expires);
+
+		fib6_info_release(f6i);
+	}
+
+	return 0;
+}
+
 static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
 {
 	u32 flags;
@@ -4577,14 +4615,25 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
 	ifp->valid_lft = cfg->valid_lft;
 	ifp->prefered_lft = cfg->preferred_lft;
 
+	if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
+		ifp->rt_priority = cfg->rt_priority;
+
 	spin_unlock_bh(&ifp->lock);
 	if (!(ifp->flags&IFA_F_TENTATIVE))
 		ipv6_ifa_notify(0, ifp);
 
 	if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
-		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
-				      ifp->idev->dev, expires, flags,
-				      GFP_KERNEL);
+		int rc = -ENOENT;
+
+		if (had_prefixroute)
+			rc = modify_prefix_route(ifp, expires, flags);
+
+		/* prefix route could have been deleted; if so restore it */
+		if (rc == -ENOENT) {
+			addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+					      ifp->rt_priority, ifp->idev->dev,
+					      expires, flags, GFP_KERNEL);
+		}
 	} else if (had_prefixroute) {
 		enum cleanup_prefix_rt_t action;
 		unsigned long rt_expires;
@@ -4643,6 +4692,9 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	cfg.peer_pfx = peer_pfx;
 	cfg.plen = ifm->ifa_prefixlen;
+	if (tb[IFA_RT_PRIORITY])
+		cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
 	cfg.valid_lft = INFINITY_LIFE_TIME;
 	cfg.preferred_lft = INFINITY_LIFE_TIME;
 
@@ -4745,7 +4797,8 @@ static inline int inet6_ifaddr_msgsize(void)
 	       + nla_total_size(16) /* IFA_LOCAL */
 	       + nla_total_size(16) /* IFA_ADDRESS */
 	       + nla_total_size(sizeof(struct ifa_cacheinfo))
-	       + nla_total_size(4)  /* IFA_FLAGS */;
+	       + nla_total_size(4)  /* IFA_FLAGS */
+	       + nla_total_size(4)  /* IFA_RT_PRIORITY */;
 }
 
 static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
@@ -4791,6 +4844,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
 		if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
 			goto error;
 
+	if (ifa->rt_priority &&
+	    nla_put_u32(skb, IFA_RT_PRIORITY, ifa->rt_priority))
+		goto error;
+
 	if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
 		goto error;
 
@@ -5635,7 +5692,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 		if (ifp->idev->cnf.forwarding)
 			addrconf_join_anycast(ifp);
 		if (!ipv6_addr_any(&ifp->peer_addr))
-			addrconf_prefix_route(&ifp->peer_addr, 128,
+			addrconf_prefix_route(&ifp->peer_addr, 128, 0,
 					      ifp->idev->dev, 0, 0,
 					      GFP_ATOMIC);
 		break;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 5/7] net/ipv4: Add support for specifying metric of connected routes
From: dsahern @ 2018-05-27 15:09 UTC (permalink / raw)
  To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180527151000.30488-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add support for IFA_RT_PRIORITY to ipv4 addresses.

If the metric is changed on an existing address then the new route
is inserted before removing the old one. Since the metric is one
of the route keys, the prefix route can not be replaced.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/linux/inetdevice.h |  1 +
 include/net/route.h        |  1 +
 net/ipv4/devinet.c         | 15 +++++++++++++
 net/ipv4/fib_frontend.c    | 53 ++++++++++++++++++++++++++++++++++++----------
 4 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index e16fe7d44a71..27650f1bff3d 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -139,6 +139,7 @@ struct in_ifaddr {
 	__be32			ifa_local;
 	__be32			ifa_address;
 	__be32			ifa_mask;
+	__u32			ifa_rt_priority;
 	__be32			ifa_broadcast;
 	unsigned char		ifa_scope;
 	unsigned char		ifa_prefixlen;
diff --git a/include/net/route.h b/include/net/route.h
index dbb032d5921b..bb53cdba38dc 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -225,6 +225,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 struct in_ifaddr;
 void fib_add_ifaddr(struct in_ifaddr *);
 void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *);
+void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric);
 
 void rt_add_uncached_list(struct rtable *rt);
 void rt_del_uncached_list(struct rtable *rt);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 40f001782c1b..d7585ab1a77a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -99,6 +99,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL]     	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) },
 	[IFA_FLAGS]		= { .type = NLA_U32 },
+	[IFA_RT_PRIORITY]	= { .type = NLA_U32 },
 };
 
 #define IN4_ADDR_HSIZE_SHIFT	8
@@ -835,6 +836,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
 	else
 		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
 
+	if (tb[IFA_RT_PRIORITY])
+		ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
 	if (tb[IFA_CACHEINFO]) {
 		struct ifa_cacheinfo *ci;
 
@@ -906,12 +910,20 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
 					 extack);
 	} else {
+		u32 new_metric = ifa->ifa_rt_priority;
+
 		inet_free_ifa(ifa);
 
 		if (nlh->nlmsg_flags & NLM_F_EXCL ||
 		    !(nlh->nlmsg_flags & NLM_F_REPLACE))
 			return -EEXIST;
 		ifa = ifa_existing;
+
+		if (ifa->ifa_rt_priority != new_metric) {
+			fib_modify_prefix_metric(ifa, new_metric);
+			ifa->ifa_rt_priority = new_metric;
+		}
+
 		set_ifa_lifetime(ifa, valid_lft, prefered_lft);
 		cancel_delayed_work(&check_lifetime_work);
 		queue_delayed_work(system_power_efficient_wq,
@@ -1549,6 +1561,7 @@ static size_t inet_nlmsg_size(void)
 	       + nla_total_size(4) /* IFA_BROADCAST */
 	       + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
 	       + nla_total_size(4)  /* IFA_FLAGS */
+	       + nla_total_size(4)  /* IFA_RT_PRIORITY */
 	       + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
 }
 
@@ -1618,6 +1631,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
 	    (ifa->ifa_label[0] &&
 	     nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
 	    nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+	    (ifa->ifa_rt_priority &&
+	     nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
 	    put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
 			  preferred, valid))
 		goto nla_put_failure;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 045c43a27c12..07989d9ab2f2 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -846,7 +846,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
  * to fib engine. It is legal, because all events occur
  * only when netlink is already locked.
  */
-static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
+		      struct in_ifaddr *ifa, u32 rt_priority)
 {
 	struct net *net = dev_net(ifa->ifa_dev->dev);
 	u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
@@ -856,6 +857,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
 		.fc_type = type,
 		.fc_dst = dst,
 		.fc_dst_len = dst_len,
+		.fc_priority = rt_priority,
 		.fc_prefsrc = ifa->ifa_local,
 		.fc_oif = ifa->ifa_dev->dev->ifindex,
 		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
@@ -901,31 +903,57 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 		}
 	}
 
-	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
 
 	if (!(dev->flags & IFF_UP))
 		return;
 
 	/* Add broadcast address, if it is explicitly assigned. */
 	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
-		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+			  prim, 0);
 
 	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
 		if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
 			fib_magic(RTM_NEWROUTE,
 				  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
-				  prefix, ifa->ifa_prefixlen, prim);
+				  prefix, ifa->ifa_prefixlen, prim,
+				  ifa->ifa_rt_priority);
 
 		/* Add network specific broadcasts, when it takes a sense */
 		if (ifa->ifa_prefixlen < 31) {
-			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
+				  prim, 0);
 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
-				  32, prim);
+				  32, prim, 0);
 		}
 	}
 }
 
+void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
+{
+	__be32 prefix = ifa->ifa_address & ifa->ifa_mask;
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct net_device *dev = in_dev->dev;
+
+	if (!(dev->flags & IFF_UP) ||
+	    ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
+	    ipv4_is_zeronet(prefix) ||
+	    prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)
+		return;
+
+	/* add the new */
+	fib_magic(RTM_NEWROUTE,
+		  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+		  prefix, ifa->ifa_prefixlen, ifa, new_metric);
+
+	/* delete the old */
+	fib_magic(RTM_DELROUTE,
+		  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+		  prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
+}
+
 /* Delete primary or secondary address.
  * Optionally, on secondary address promotion consider the addresses
  * from subnet iprim as deleted, even if they are in device list.
@@ -967,7 +995,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 		if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
 			fib_magic(RTM_DELROUTE,
 				  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
-				  any, ifa->ifa_prefixlen, prim);
+				  any, ifa->ifa_prefixlen, prim, 0);
 		subnet = 1;
 	}
 
@@ -1051,17 +1079,20 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 
 no_promotions:
 	if (!(ok & BRD_OK))
-		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+			  prim, 0);
 	if (subnet && ifa->ifa_prefixlen < 31) {
 		if (!(ok & BRD1_OK))
-			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
+				  prim, 0);
 		if (!(ok & BRD0_OK))
-			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
+				  prim, 0);
 	}
 	if (!(ok & LOCAL_OK)) {
 		unsigned int addr_type;
 
-		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
 
 		/* Check, that this local address finally disappeared. */
 		addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
-- 
2.11.0

^ permalink raw reply related

* [PATCH iproute2-next] ipaddress: Add support for address metric
From: dsahern @ 2018-05-27 15:10 UTC (permalink / raw)
  To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180527151000.30488-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add support for IFA_RT_PRIORITY using the same keywords as iproute for
RTA_PRIORITY.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/uapi/linux/if_addr.h |  1 +
 ip/ipaddress.c               | 15 ++++++++++++++-
 man/man8/ip-address.8.in     |  6 ++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index c4899e22cdaa..a924606f36e5 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -33,6 +33,7 @@ enum {
 	IFA_CACHEINFO,
 	IFA_MULTICAST,
 	IFA_FLAGS,
+	IFA_RT_PRIORITY,  /* u32, priority/metric for prefix route */
 	__IFA_MAX,
 };
 
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 75539e057f6a..6b53b753ead9 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -63,7 +63,7 @@ static void usage(void)
 	fprintf(stderr, "       ip address {showdump|restore}\n");
 	fprintf(stderr, "IFADDR := PREFIX | ADDR peer PREFIX\n");
 	fprintf(stderr, "          [ broadcast ADDR ] [ anycast ADDR ]\n");
-	fprintf(stderr, "          [ label IFNAME ] [ scope SCOPE-ID ]\n");
+	fprintf(stderr, "          [ label IFNAME ] [ scope SCOPE-ID ] [ metric METRIC ]\n");
 	fprintf(stderr, "SCOPE-ID := [ host | link | global | NUMBER ]\n");
 	fprintf(stderr, "FLAG-LIST := [ FLAG-LIST ] FLAG\n");
 	fprintf(stderr, "FLAG  := [ permanent | dynamic | secondary | primary |\n");
@@ -1328,6 +1328,10 @@ int print_addrinfo(const struct sockaddr_nl *who, struct nlmsghdr *n,
 							   rta_tb[IFA_ADDRESS]));
 		}
 		print_int(PRINT_ANY, "prefixlen", "/%d ", ifa->ifa_prefixlen);
+
+		if (rta_tb[IFA_RT_PRIORITY])
+			print_uint(PRINT_ANY, "metric", "metric %u ",
+				   rta_getattr_u32(rta_tb[IFA_RT_PRIORITY]));
 	}
 
 	if (brief)
@@ -2119,6 +2123,15 @@ static int ipaddr_modify(int cmd, int flags, int argc, char **argv)
 			NEXT_ARG();
 			l = *argv;
 			addattr_l(&req.n, sizeof(req), IFA_LABEL, l, strlen(l)+1);
+		} else if (matches(*argv, "metric") == 0 ||
+			   matches(*argv, "priority") == 0 ||
+			   matches(*argv, "preference") == 0) {
+			__u32 metric;
+
+			NEXT_ARG();
+			if (get_u32(&metric, *argv, 0))
+				invarg("\"metric\" value is invalid\n", *argv);
+			addattr32(&req.n, sizeof(req), IFA_RT_PRIORITY, metric);
 		} else if (matches(*argv, "valid_lft") == 0) {
 			if (valid_lftp)
 				duparg("valid_lft", *argv);
diff --git a/man/man8/ip-address.8.in b/man/man8/ip-address.8.in
index 7ebf0bc98dcd..c3861b3725cc 100644
--- a/man/man8/ip-address.8.in
+++ b/man/man8/ip-address.8.in
@@ -27,6 +27,8 @@ ip-address \- protocol address management
 .IR IFNAME " ] [ "
 .B  scope
 .IR SCOPE-ID " ] [ "
+.B  metric
+.IR METRIC " ] [ "
 .B  to
 .IR PREFIX " ] [ " FLAG-LIST " ] [ "
 .B  label
@@ -215,6 +217,10 @@ valid inside this site.
 .in -8
 
 .TP
+.BI metric " NUMBER"
+priority of prefix route associated with address.
+
+.TP
 .BI valid_lft " LFT"
 the valid lifetime of this address; see section 5.5.4 of
 RFC 4862. When it expires, the address is removed by the kernel.
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 7/7] selftests: fib_tests: Add prefix route tests with metric
From: dsahern @ 2018-05-27 15:09 UTC (permalink / raw)
  To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180527151000.30488-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add tests verifying prefix routes are inserted with expected metric.

IPv6 prefix route tests
    TEST: Default metric                                      [ OK ]
    TEST: User specified metric on first device               [ OK ]
    TEST: User specified metric on second device              [ OK ]
    TEST: Delete of address on first device                   [ OK ]
    TEST: Modify metric of address                            [ OK ]
    TEST: Prefix route removed on link down                   [ OK ]
    TEST: Prefix route with metric on link up                 [ OK ]

IPv4 prefix route tests
    TEST: Default metric                                      [ OK ]
    TEST: User specified metric on first device               [ OK ]
    TEST: User specified metric on second device              [ OK ]
    TEST: Delete of address on first device                   [ OK ]
    TEST: Modify metric of address                            [ OK ]
    TEST: Prefix route removed on link down                   [ OK ]
    TEST: Prefix route with metric on link up                 [ OK ]

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 tools/testing/selftests/net/fib_tests.sh | 181 ++++++++++++++++++++++++++++++-
 1 file changed, 180 insertions(+), 1 deletion(-)
 mode change 100755 => 100644 tools/testing/selftests/net/fib_tests.sh

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
old mode 100755
new mode 100644
index e7d76fbc36e9..9780c5a2d73f
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -6,7 +6,8 @@
 
 ret=0
 
-TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt"
+# all tests in this script. Can be overridden with -t option
+TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric"
 VERBOSE=0
 PAUSE_ON_FAIL=no
 PAUSE=no
@@ -642,6 +643,8 @@ check_route6()
 	local rc=0
 
 	out=$($IP -6 ro ls match ${pfx} | sed -e 's/ pref medium//')
+	[ "${out}" = "${expected}" ] && return 0
+
 	if [ -z "${out}" ]; then
 		if [ "$VERBOSE" = "1" ]; then
 			printf "\nNo route entry found\n"
@@ -911,6 +914,98 @@ ipv6_route_test()
 	route_cleanup
 }
 
+ip_addr_metric_check()
+{
+	ip addr help 2>&1 | grep -q metric
+	if [ $? -ne 0 ]; then
+		echo "iproute2 command does not support metric for addresses. Skipping test"
+		return 1
+	fi
+
+	return 0
+}
+
+ipv6_addr_metric_test()
+{
+	local rc
+
+	echo
+	echo "IPv6 prefix route tests"
+
+	ip_addr_metric_check || return 1
+
+	setup
+
+	set -e
+	$IP li add dummy1 type dummy
+	$IP li add dummy2 type dummy
+	$IP li set dummy1 up
+	$IP li set dummy2 up
+
+	# default entry is metric 256
+	run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64"
+	run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64"
+	set +e
+
+	check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 256 2001:db8:104::/64 dev dummy2 proto kernel metric 256"
+	log_test $? 0 "Default metric"
+
+	set -e
+	run_cmd "$IP -6 addr flush dev dummy1"
+	run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64 metric 257"
+	set +e
+
+	check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 256 2001:db8:104::/64 dev dummy1 proto kernel metric 257"
+	log_test $? 0 "User specified metric on first device"
+
+	set -e
+	run_cmd "$IP -6 addr flush dev dummy2"
+	run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64 metric 258"
+	set +e
+
+	check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 257 2001:db8:104::/64 dev dummy2 proto kernel metric 258"
+	log_test $? 0 "User specified metric on second device"
+
+	run_cmd "$IP -6 addr del dev dummy1 2001:db8:104::1/64 metric 257"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 258"
+		rc=$?
+	fi
+	log_test $rc 0 "Delete of address on first device"
+
+	run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::2/64 metric 259"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
+		rc=$?
+	fi
+	log_test $rc 0 "Modify metric of address"
+
+	# verify prefix route removed on down
+	run_cmd "ip netns exec testns sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1"
+	run_cmd "$IP li set dev dummy2 down"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 ""
+		rc=$?
+	fi
+	log_test $rc 0 "Prefix route removed on link down"
+
+	# verify prefix route re-inserted with assigned metric
+	run_cmd "$IP li set dev dummy2 up"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
+		rc=$?
+	fi
+	log_test $rc 0 "Prefix route with metric on link up"
+
+	$IP li del dummy1
+	$IP li del dummy2
+	cleanup
+}
+
 # add route for a prefix, flushing any existing routes first
 # expected to be the first step of a test
 add_route()
@@ -955,6 +1050,8 @@ check_route()
 	local rc=0
 
 	out=$($IP ro ls match ${pfx})
+	[ "${out}" = "${expected}" ] && return 0
+
 	if [ -z "${out}" ]; then
 		if [ "$VERBOSE" = "1" ]; then
 			printf "\nNo route entry found\n"
@@ -1181,6 +1278,86 @@ ipv4_route_test()
 	route_cleanup
 }
 
+ipv4_addr_metric_test()
+{
+	local rc
+
+	echo
+	echo "IPv4 prefix route tests"
+
+	ip_addr_metric_check || return 1
+
+	setup
+
+	set -e
+	$IP li add dummy1 type dummy
+	$IP li add dummy2 type dummy
+	$IP li set dummy1 up
+	$IP li set dummy2 up
+
+	# default entry is metric 256
+	run_cmd "$IP addr add dev dummy1 172.16.104.1/24"
+	run_cmd "$IP addr add dev dummy2 172.16.104.2/24"
+	set +e
+
+	check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2"
+	log_test $? 0 "Default metric"
+
+	set -e
+	run_cmd "$IP addr flush dev dummy1"
+	run_cmd "$IP addr add dev dummy1 172.16.104.1/24 metric 257"
+	set +e
+
+	check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257"
+	log_test $? 0 "User specified metric on first device"
+
+	set -e
+	run_cmd "$IP addr flush dev dummy2"
+	run_cmd "$IP addr add dev dummy2 172.16.104.2/24 metric 258"
+	set +e
+
+	check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
+	log_test $? 0 "User specified metric on second device"
+
+	run_cmd "$IP addr del dev dummy1 172.16.104.1/24 metric 257"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
+		rc=$?
+	fi
+	log_test $rc 0 "Delete of address on first device"
+
+	run_cmd "$IP addr change dev dummy2 172.16.104.2/24 metric 259"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
+		rc=$?
+	fi
+	log_test $rc 0 "Modify metric of address"
+
+	# verify prefix route removed on down
+	run_cmd "$IP li set dev dummy2 down"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route ""
+		rc=$?
+	fi
+	log_test $rc 0 "Prefix route removed on link down"
+
+	# verify prefix route re-inserted with assigned metric
+	run_cmd "$IP li set dev dummy2 up"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
+		rc=$?
+	fi
+	log_test $rc 0 "Prefix route with metric on link up"
+
+	$IP li del dummy1
+	$IP li del dummy2
+	cleanup
+}
+
 ################################################################################
 # usage
 
@@ -1245,6 +1422,8 @@ do
 	fib_nexthop_test|nexthop)	fib_nexthop_test;;
 	ipv6_route_test|ipv6_rt)	ipv6_route_test;;
 	ipv4_route_test|ipv4_rt)	ipv4_route_test;;
+	ipv6_addr_metric)		ipv6_addr_metric_test;;
+	ipv4_addr_metric)		ipv4_addr_metric_test;;
 
 	help) echo "Test names: $TESTS"; exit 0;;
 	esac
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net] VSOCK: check sk state before receive
From: Hangbin Liu @ 2018-05-27 15:29 UTC (permalink / raw)
  To: netdev; +Cc: Stefan Hajnoczi, Jorgen Hansen, David S. Miller
In-Reply-To: <1527382936-4850-1-git-send-email-liuhangbin@gmail.com>

Hmm...Although I won't reproduce this bug with my reproducer after
apply my patch. I could still get a similiar issue with syzkaller sock vnet test.

It looks this patch is not complete. Here is the KASAN call trace with my patch.
I can also reproduce it without my patch.

==================================================================
BUG: KASAN: use-after-free in vmci_transport_allow_dgram.part.7+0x155/0x1a0 [vmw_vsock_vmci_transport]
Read of size 4 at addr ffff880026a3a914 by task kworker/0:2/96

CPU: 0 PID: 96 Comm: kworker/0:2 Not tainted 4.17.0-rc6.vsock+ #28
Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
Workqueue: events dg_delayed_dispatch [vmw_vmci]
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0xdd/0x18e lib/dump_stack.c:113
 print_address_description+0x7a/0x3e0 mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report+0x1dd/0x460 mm/kasan/report.c:412
 vmci_transport_allow_dgram.part.7+0x155/0x1a0 [vmw_vsock_vmci_transport]
 vmci_transport_recv_dgram_cb+0x5d/0x200 [vmw_vsock_vmci_transport]
 dg_delayed_dispatch+0x99/0x1b0 [vmw_vmci]
 process_one_work+0xa4e/0x1720 kernel/workqueue.c:2145
 worker_thread+0x1df/0x1400 kernel/workqueue.c:2279
 kthread+0x343/0x4b0 kernel/kthread.c:240
 ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:412

Allocated by task 2684:
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xa0/0xd0 mm/kasan/kasan.c:553
 slab_post_alloc_hook mm/slab.h:444 [inline]
 slab_alloc_node mm/slub.c:2741 [inline]
 slab_alloc mm/slub.c:2749 [inline]
 kmem_cache_alloc+0x105/0x330 mm/slub.c:2754
 sk_prot_alloc+0x6a/0x2c0 net/core/sock.c:1468
 sk_alloc+0xc9/0xbb0 net/core/sock.c:1528
 __vsock_create+0xc8/0x9b0 [vsock]
 vsock_create+0xfd/0x1a0 [vsock]
 __sock_create+0x310/0x690 net/socket.c:1285
 sock_create net/socket.c:1325 [inline]
 __sys_socket+0x101/0x240 net/socket.c:1355
 __do_sys_socket net/socket.c:1364 [inline]
 __se_sys_socket net/socket.c:1362 [inline]
 __x64_sys_socket+0x7d/0xd0 net/socket.c:1362
 do_syscall_64+0x175/0x630 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Freed by task 2684:
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x130/0x180 mm/kasan/kasan.c:521
 slab_free_hook mm/slub.c:1388 [inline]
 slab_free_freelist_hook mm/slub.c:1415 [inline]
 slab_free mm/slub.c:2988 [inline]
 kmem_cache_free+0xce/0x410 mm/slub.c:3004
 sk_prot_free net/core/sock.c:1509 [inline]
 __sk_destruct+0x629/0x940 net/core/sock.c:1593
 sk_destruct+0x4e/0x90 net/core/sock.c:1601
 __sk_free+0xd3/0x320 net/core/sock.c:1612
 sk_free+0x2a/0x30 net/core/sock.c:1623
 __vsock_release+0x431/0x610 [vsock]
 vsock_release+0x3c/0xc0 [vsock]
 sock_release+0x91/0x200 net/socket.c:594
 sock_close+0x17/0x20 net/socket.c:1149
 __fput+0x368/0xa20 fs/file_table.c:209
 task_work_run+0x1c5/0x2a0 kernel/task_work.c:113
 exit_task_work include/linux/task_work.h:22 [inline]
 do_exit+0x1876/0x26c0 kernel/exit.c:865
 do_group_exit+0x159/0x3e0 kernel/exit.c:968
 get_signal+0x65a/0x1780 kernel/signal.c:2482
 do_signal+0xa4/0x1fe0 arch/x86/kernel/signal.c:810
 exit_to_usermode_loop+0x1b8/0x260 arch/x86/entry/common.c:162
 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
 syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
 do_syscall_64+0x505/0x630 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

The buggy address belongs to the object at ffff880026a3a600
 which belongs to the cache AF_VSOCK of size 1056
The buggy address is located 788 bytes inside of
 1056-byte region [ffff880026a3a600, ffff880026a3aa20)
The buggy address belongs to the page:
page:ffffea00009a8e00 count:1 mapcount:0 mapping:0000000000000000 index:0x0 compound_mapcount: 0
flags: 0xfffffc0008100(slab|head)
raw: 000fffffc0008100 0000000000000000 0000000000000000 00000001000d000d
raw: dead000000000100 dead000000000200 ffff880034471a40 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff880026a3a800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff880026a3a880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff880026a3a900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                         ^
 ffff880026a3a980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff880026a3aa00: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================

^ permalink raw reply

* [PATCH v2 0/5] build warnings cleanup
From: Atul Gupta @ 2018-05-27 15:45 UTC (permalink / raw)
  To: herbert, linux-crypto; +Cc: gustavo, dan.carpenter, netdev, davem, atul.gupta

Build warnings cleanup reported for
- using only 128b key
- wait for buffer in sendmsg/sendpage
- check for null before using skb
- free rspq_skb_cache in error path
- indentation

v2:
  Added bug report description for 0002
  Incorported comments from Dan Carpenter

Atul Gupta (5):
  crypto:chtls: key len correction
  crypto: chtls: wait for memory sendmsg, sendpage
  crypto: chtls: dereference null variable
  crypto: chtls: kbuild warnings
  crypto: chtls: free beyond end rspq_skb_cache

 drivers/crypto/chelsio/chtls/chtls.h      |   1 +
 drivers/crypto/chelsio/chtls/chtls_hw.c   |   6 +-
 drivers/crypto/chelsio/chtls/chtls_io.c   | 104 +++++++++++++++++++++++++++---
 drivers/crypto/chelsio/chtls/chtls_main.c |   3 +-
 4 files changed, 98 insertions(+), 16 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH v2 1/5] crypto:chtls: key len correction
From: Atul Gupta @ 2018-05-27 15:45 UTC (permalink / raw)
  To: herbert, linux-crypto; +Cc: gustavo, dan.carpenter, netdev, davem, atul.gupta
In-Reply-To: <1527435922-6727-1-git-send-email-atul.gupta@chelsio.com>

corrected the key length to copy 128b key. Removed 192b and 256b
key as user input supports key of size 128b in gcm_ctx

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
 drivers/crypto/chelsio/chtls/chtls_hw.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_hw.c b/drivers/crypto/chelsio/chtls/chtls_hw.c
index 54a13aa9..55d5014 100644
--- a/drivers/crypto/chelsio/chtls/chtls_hw.c
+++ b/drivers/crypto/chelsio/chtls/chtls_hw.c
@@ -213,7 +213,7 @@ static int chtls_key_info(struct chtls_sock *csk,
 			  struct _key_ctx *kctx,
 			  u32 keylen, u32 optname)
 {
-	unsigned char key[CHCR_KEYCTX_CIPHER_KEY_SIZE_256];
+	unsigned char key[AES_KEYSIZE_128];
 	struct tls12_crypto_info_aes_gcm_128 *gcm_ctx;
 	unsigned char ghash_h[AEAD_H_SIZE];
 	struct crypto_cipher *cipher;
@@ -228,10 +228,6 @@ static int chtls_key_info(struct chtls_sock *csk,
 
 	if (keylen == AES_KEYSIZE_128) {
 		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_128;
-	} else if (keylen == AES_KEYSIZE_192) {
-		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_192;
-	} else if (keylen == AES_KEYSIZE_256) {
-		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_256;
 	} else {
 		pr_err("GCM: Invalid key length %d\n", keylen);
 		return -EINVAL;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 2/5] crypto: chtls: wait for memory sendmsg, sendpage
From: Atul Gupta @ 2018-05-27 15:45 UTC (permalink / raw)
  To: herbert, linux-crypto; +Cc: gustavo, dan.carpenter, netdev, davem, atul.gupta
In-Reply-To: <1527435922-6727-1-git-send-email-atul.gupta@chelsio.com>

address suspicious code <gustavo@embeddedor.com>

1210       set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1211       }

The issue is that in the code above, set_bit is never reached
due to the 'continue' statement at line 1208.

Also reported by bug report:<dan.carpenter@oracle.com>
1210       set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Not reachable.

Its required to wait for buffer in the send path and takes care of
unaddress and un-handled SOCK_NOSPACE.

v2: use csk_mem_free where appropriate
    proper indent of goto do_nonblock
    replace out with do_rm_wq

Reported-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
 drivers/crypto/chelsio/chtls/chtls.h      |  1 +
 drivers/crypto/chelsio/chtls/chtls_io.c   | 90 +++++++++++++++++++++++++++++--
 drivers/crypto/chelsio/chtls/chtls_main.c |  1 +
 3 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls.h b/drivers/crypto/chelsio/chtls/chtls.h
index 1b2f43c..a53a0e6 100644
--- a/drivers/crypto/chelsio/chtls/chtls.h
+++ b/drivers/crypto/chelsio/chtls/chtls.h
@@ -144,6 +144,7 @@ struct chtls_dev {
 	struct list_head rcu_node;
 	struct list_head na_node;
 	unsigned int send_page_order;
+	int max_host_sndbuf;
 	struct key_map kmap;
 };
 
diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index 840dd01..7aa5d90 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -914,6 +914,78 @@ static u16 tls_header_read(struct tls_hdr *thdr, struct iov_iter *from)
 	return (__force u16)cpu_to_be16(thdr->length);
 }
 
+static int csk_mem_free(struct chtls_dev *cdev, struct sock *sk)
+{
+	return (cdev->max_host_sndbuf - sk->sk_wmem_queued);
+}
+
+static int csk_wait_memory(struct chtls_dev *cdev,
+			   struct sock *sk, long *timeo_p)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int sndbuf, err = 0;
+	long current_timeo;
+	long vm_wait = 0;
+	bool noblock;
+
+	current_timeo = *timeo_p;
+	noblock = (*timeo_p ? false : true);
+	sndbuf = cdev->max_host_sndbuf;
+	if (csk_mem_free(cdev, sk)) {
+		current_timeo = (prandom_u32() % (HZ / 5)) + 2;
+		vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+	}
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (1) {
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+			goto do_error;
+		if (!*timeo_p) {
+			if (noblock)
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			goto do_nonblock;
+		}
+		if (signal_pending(current))
+			goto do_interrupted;
+		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+		if (csk_mem_free(cdev, sk) && !vm_wait)
+			break;
+
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_pending++;
+		sk_wait_event(sk, &current_timeo, sk->sk_err ||
+			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
+			      (csk_mem_free(cdev, sk) && !vm_wait), &wait);
+		sk->sk_write_pending--;
+
+		if (vm_wait) {
+			vm_wait -= current_timeo;
+			current_timeo = *timeo_p;
+			if (current_timeo != MAX_SCHEDULE_TIMEOUT) {
+				current_timeo -= vm_wait;
+				if (current_timeo < 0)
+					current_timeo = 0;
+			}
+			vm_wait = 0;
+		}
+		*timeo_p = current_timeo;
+	}
+do_rm_wq:
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return err;
+do_error:
+	err = -EPIPE;
+	goto do_rm_wq;
+do_nonblock:
+	err = -EAGAIN;
+	goto do_rm_wq;
+do_interrupted:
+	err = sock_intr_errno(*timeo_p);
+	goto do_rm_wq;
+}
+
 int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 {
 	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
@@ -952,6 +1024,8 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			copy = mss - skb->len;
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 		}
+		if (!csk_mem_free(cdev, sk))
+			goto wait_for_sndbuf;
 
 		if (is_tls_tx(csk) && !csk->tlshws.txleft) {
 			struct tls_hdr hdr;
@@ -1099,8 +1173,10 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 		if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND)
 			push_frames_if_head(sk);
 		continue;
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
-		err = sk_stream_wait_memory(sk, &timeo);
+		err = csk_wait_memory(cdev, sk, &timeo);
 		if (err)
 			goto do_error;
 	}
@@ -1131,6 +1207,7 @@ int chtls_sendpage(struct sock *sk, struct page *page,
 		   int offset, size_t size, int flags)
 {
 	struct chtls_sock *csk;
+	struct chtls_dev *cdev;
 	int mss, err, copied;
 	struct tcp_sock *tp;
 	long timeo;
@@ -1138,6 +1215,7 @@ int chtls_sendpage(struct sock *sk, struct page *page,
 	tp = tcp_sk(sk);
 	copied = 0;
 	csk = rcu_dereference_sk_user_data(sk);
+	cdev = csk->cdev;
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
 	err = sk_stream_wait_connect(sk, &timeo);
@@ -1156,6 +1234,8 @@ int chtls_sendpage(struct sock *sk, struct page *page,
 		if (!skb || (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
 		    copy <= 0) {
 new_buf:
+			if (!csk_mem_free(cdev, sk))
+				goto wait_for_sndbuf;
 
 			if (is_tls_tx(csk)) {
 				skb = get_record_skb(sk,
@@ -1167,7 +1247,7 @@ int chtls_sendpage(struct sock *sk, struct page *page,
 				skb = get_tx_skb(sk, 0);
 			}
 			if (!skb)
-				goto do_error;
+				goto wait_for_memory;
 			copy = mss;
 		}
 		if (copy > size)
@@ -1206,8 +1286,12 @@ int chtls_sendpage(struct sock *sk, struct page *page,
 		if (unlikely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND))
 			push_frames_if_head(sk);
 		continue;
-
+wait_for_sndbuf:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		err = csk_wait_memory(cdev, sk, &timeo);
+		if (err)
+			goto do_error;
 	}
 out:
 	csk_reset_flag(csk, CSK_TX_MORE_DATA);
diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c
index 53ffb00..273afd3 100644
--- a/drivers/crypto/chelsio/chtls/chtls_main.c
+++ b/drivers/crypto/chelsio/chtls/chtls_main.c
@@ -238,6 +238,7 @@ static void *chtls_uld_add(const struct cxgb4_lld_info *info)
 	spin_lock_init(&cdev->idr_lock);
 	cdev->send_page_order = min_t(uint, get_order(32768),
 				      send_page_order);
+	cdev->max_host_sndbuf = 48 * 1024;
 
 	if (lldi->vr->key.size)
 		if (chtls_init_kmap(cdev, lldi))
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 3/5] crypto: chtls: dereference null variable
From: Atul Gupta @ 2018-05-27 15:45 UTC (permalink / raw)
  To: herbert, linux-crypto; +Cc: gustavo, dan.carpenter, netdev, davem, atul.gupta
In-Reply-To: <1527435922-6727-1-git-send-email-atul.gupta@chelsio.com>

skb dereferenced before check in sendpage

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
 drivers/crypto/chelsio/chtls/chtls_io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index 7aa5d90..8cfc27b 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -1230,9 +1230,8 @@ int chtls_sendpage(struct sock *sk, struct page *page,
 		struct sk_buff *skb = skb_peek_tail(&csk->txq);
 		int copy, i;
 
-		copy = mss - skb->len;
 		if (!skb || (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
-		    copy <= 0) {
+		    (copy = mss - skb->len) <= 0) {
 new_buf:
 			if (!csk_mem_free(cdev, sk))
 				goto wait_for_sndbuf;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 4/5] crypto: chtls: kbuild warnings
From: Atul Gupta @ 2018-05-27 15:45 UTC (permalink / raw)
  To: herbert, linux-crypto; +Cc: gustavo, dan.carpenter, netdev, davem, atul.gupta
In-Reply-To: <1527435922-6727-1-git-send-email-atul.gupta@chelsio.com>

- unindented continue
- check for null page
- signed return

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
 drivers/crypto/chelsio/chtls/chtls_io.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index 8cfc27b..51fc682 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -907,11 +907,11 @@ static int chtls_skb_copy_to_page_nocache(struct sock *sk,
 }
 
 /* Read TLS header to find content type and data length */
-static u16 tls_header_read(struct tls_hdr *thdr, struct iov_iter *from)
+static int tls_header_read(struct tls_hdr *thdr, struct iov_iter *from)
 {
 	if (copy_from_iter(thdr, sizeof(*thdr), from) != sizeof(*thdr))
 		return -EFAULT;
-	return (__force u16)cpu_to_be16(thdr->length);
+	return (__force int)cpu_to_be16(thdr->length);
 }
 
 static int csk_mem_free(struct chtls_dev *cdev, struct sock *sk)
@@ -1083,9 +1083,10 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			int off = TCP_OFF(sk);
 			bool merge;
 
-			if (page)
-				pg_size <<= compound_order(page);
+			if (!page)
+				goto wait_for_memory;
 
+			pg_size <<= compound_order(page);
 			if (off < pg_size &&
 			    skb_can_coalesce(skb, i, page, off)) {
 				merge = 1;
@@ -1492,7 +1493,7 @@ static int chtls_pt_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			break;
 		chtls_cleanup_rbuf(sk, copied);
 		sk_wait_data(sk, &timeo, NULL);
-			continue;
+		continue;
 found_ok_skb:
 		if (!skb->len) {
 			skb_dst_set(skb, NULL);
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 5/5] crypto: chtls: free beyond end rspq_skb_cache
From: Atul Gupta @ 2018-05-27 15:45 UTC (permalink / raw)
  To: herbert, linux-crypto; +Cc: gustavo, dan.carpenter, netdev, davem, atul.gupta
In-Reply-To: <1527435922-6727-1-git-send-email-atul.gupta@chelsio.com>

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
---
 drivers/crypto/chelsio/chtls/chtls_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c
index 273afd3..9b07f91 100644
--- a/drivers/crypto/chelsio/chtls/chtls_main.c
+++ b/drivers/crypto/chelsio/chtls/chtls_main.c
@@ -250,7 +250,7 @@ static void *chtls_uld_add(const struct cxgb4_lld_info *info)
 
 	return cdev;
 out_rspq_skb:
-	for (j = 0; j <= i; j++)
+	for (j = 0; j < i; j++)
 		kfree_skb(cdev->rspq_skb_cache[j]);
 	kfree_skb(cdev->askb);
 out_skb:
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH rdma-next v1 00/13] Verbs flow counters support
From: Leon Romanovsky @ 2018-05-27 16:22 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: RDMA mailing list, Boris Pismenny, Matan Barak, Raed Salem,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180527102346.15149-1-leon@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 904 bytes --]

On Sun, May 27, 2018 at 01:23:33PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@mellanox.com>
>
> Changelog v0->v1:
>  * Decouple from DevX submission
>  * Use uverbs_attr_get_obj at counters read method
>  * Added define for max read buffer size (MAX_COUNTERS_BUFF_SIZE)
>  * Removed the struct mlx5_ib_flow_counter basic_flow_cnts and
>    the related structs used, used define instead
>  * Took Matan's patch from DevX
>  * uverbs_free_counters removed void* casting
>  * Added check to bound ncounters value (added define
>  * Changed user supplied data buffer structure to be array of
>    struct <desc,index> pair (applied this change to user space also)
>
> Not changed:
>  * UAPI files
>  * Addition of uhw to flow
>
> Thanks

Sorry for the noise, but please drop this series.

We found improper locking sequence in one of error paths which can be
triggered by the user.

Thanks

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [PATCH v2 04/11] net: sched: always take reference to action
From: Vlad Buslov @ 2018-05-27 16:41 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, xiyou.wangcong, jiri, ast, daniel, kliteyn,
	Vlad Buslov
In-Reply-To: <1527439298-22306-1-git-send-email-vladbu@mellanox.com>

Without rtnl lock protection it is no longer safe to use pointer to tc
action without holding reference to it. (it can be destroyed concurrently)

Remove unsafe action idr lookup function. Instead of it, implement safe tcf
idr check function that atomically looks up action in idr and increments
its reference and bind counters. Implement both action search and check
using new safe function

Reference taken by idr check is temporal and should not be accounted by
userspace clients (both logically and to preserver current API behavior).
Subtract temporal reference when dumping action to userspace using existing
tca_get_fill function arguments.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---
 net/sched/act_api.c | 46 ++++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 256b0c93916c..aa304d36fee0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -284,44 +284,38 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(tcf_generic_walker);
 
-static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo)
+static bool __tcf_idr_check(struct tc_action_net *tn, u32 index,
+			    struct tc_action **a, int bind)
 {
-	struct tc_action *p = NULL;
+	struct tcf_idrinfo *idrinfo = tn->idrinfo;
+	struct tc_action *p;
 
 	spin_lock(&idrinfo->lock);
 	p = idr_find(&idrinfo->action_idr, index);
+	if (p) {
+		refcount_inc(&p->tcfa_refcnt);
+		if (bind)
+			atomic_inc(&p->tcfa_bindcnt);
+	}
 	spin_unlock(&idrinfo->lock);
 
-	return p;
+	if (p) {
+		*a = p;
+		return true;
+	}
+	return false;
 }
 
 int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
 {
-	struct tcf_idrinfo *idrinfo = tn->idrinfo;
-	struct tc_action *p = tcf_idr_lookup(index, idrinfo);
-
-	if (p) {
-		*a = p;
-		return 1;
-	}
-	return 0;
+	return __tcf_idr_check(tn, index, a, 0);
 }
 EXPORT_SYMBOL(tcf_idr_search);
 
 bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
 		   int bind)
 {
-	struct tcf_idrinfo *idrinfo = tn->idrinfo;
-	struct tc_action *p = tcf_idr_lookup(index, idrinfo);
-
-	if (index && p) {
-		if (bind)
-			atomic_inc(&p->tcfa_bindcnt);
-		refcount_inc(&p->tcfa_refcnt);
-		*a = p;
-		return true;
-	}
-	return false;
+	return __tcf_idr_check(tn, index, a, bind);
 }
 EXPORT_SYMBOL(tcf_idr_check);
 
@@ -932,7 +926,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
 	if (!skb)
 		return -ENOBUFS;
 	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
-			 0, 0) <= 0) {
+			 0, 1) <= 0) {
 		NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
 		kfree_skb(skb);
 		return -EINVAL;
@@ -1072,7 +1066,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
 		return -ENOBUFS;
 
 	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
-			 0, 1) <= 0) {
+			 0, 2) <= 0) {
 		NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes");
 		kfree_skb(skb);
 		return -EINVAL;
@@ -1131,14 +1125,14 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 	if (event == RTM_GETACTION)
 		ret = tcf_get_notify(net, portid, n, &actions, event, extack);
 	else { /* delete */
+		cleanup_a(&actions, 1); /* lookup took reference */
 		ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack);
 		if (ret)
 			goto err;
 		return ret;
 	}
 err:
-	if (event != RTM_GETACTION)
-		tcf_action_destroy(&actions, 0);
+	tcf_action_destroy(&actions, 0);
 	return ret;
 }
 
-- 
2.7.5

^ permalink raw reply related

* [PATCH v2 09/11] net: sched: use reference counting action init
From: Vlad Buslov @ 2018-05-27 16:41 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, xiyou.wangcong, jiri, ast, daniel, kliteyn,
	Vlad Buslov
In-Reply-To: <1527439298-22306-1-git-send-email-vladbu@mellanox.com>

Change action API to assume that action init function always takes
reference to action, even when overwriting existing action. This is
necessary because action API continues to use action pointer after init
function is done. At this point action becomes accessible for concurrent
modifications, so user must always hold reference to it.

Implement helper put list function to atomically release list of actions
after action API init code is done using them.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---
 net/sched/act_api.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f019f0464cec..eefe8c2fe667 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -627,6 +627,18 @@ static int tcf_action_put(struct tc_action *p)
 	return __tcf_action_put(p, false);
 }
 
+static void tcf_action_put_lst(struct list_head *actions)
+{
+	struct tc_action *a, *tmp;
+
+	list_for_each_entry_safe(a, tmp, actions, list) {
+		const struct tc_action_ops *ops = a->ops;
+
+		if (tcf_action_put(a))
+			module_put(ops->owner);
+	}
+}
+
 int
 tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
@@ -835,17 +847,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 	return ERR_PTR(err);
 }
 
-static void cleanup_a(struct list_head *actions, int ovr)
-{
-	struct tc_action *a;
-
-	if (!ovr)
-		return;
-
-	list_for_each_entry(a, actions, list)
-		refcount_dec(&a->tcfa_refcnt);
-}
-
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 		    struct nlattr *est, char *name, int ovr, int bind,
 		    struct list_head *actions, size_t *attr_size,
@@ -874,11 +875,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 	}
 
 	*attr_size = tcf_action_full_attrs_size(sz);
-
-	/* Remove the temp refcnt which was necessary to protect against
-	 * destroying an existing action which was being replaced
-	 */
-	cleanup_a(actions, ovr);
 	return 0;
 
 err:
@@ -1209,7 +1205,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 		return ret;
 	}
 err:
-	tcf_action_destroy(&actions, 0);
+	tcf_action_put_lst(&actions);
 	return ret;
 }
 
@@ -1251,8 +1247,11 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
 			      &attr_size, true, extack);
 	if (ret)
 		return ret;
+	ret = tcf_add_notify(net, n, &actions, portid, attr_size, extack);
+	if (ovr)
+		tcf_action_put_lst(&actions);
 
-	return tcf_add_notify(net, n, &actions, portid, attr_size, extack);
+	return ret;
 }
 
 static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
-- 
2.7.5

^ permalink raw reply related

* [PATCH v2 00/11] Modify action API for implementing lockless actions
From: Vlad Buslov @ 2018-05-27 16:41 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, xiyou.wangcong, jiri, ast, daniel, kliteyn,
	Vlad Buslov

Currently, all netlink protocol handlers for updating rules, actions and
qdiscs are protected with single global rtnl lock which removes any
possibility for parallelism. This patch set is a first step to remove
rtnl lock dependency from TC rules update path.

Recently, new rtnl registration flag RTNL_FLAG_DOIT_UNLOCKED was added.
Handlers registered with this flag are called without RTNL taken. End
goal is to have rule update handlers(RTM_NEWTFILTER, RTM_DELTFILTER,
etc.) to be registered with UNLOCKED flag to allow parallel execution.
However, there is no intention to completely remove or split rtnl lock
itself. This patch set addresses specific problems in action API that
prevents it from being executed concurrently.

As a preparation for executing TC rules update handlers without rtnl
lock, action API code was audited to determine areas that assume
external synchronization with rtnl lock and must be changed to allow
safe concurrent access with following results:

1. Action idr is already protected with spinlock. However, some code
   paths assume that idr state is not changes between several
   consecutive tcf_idr_* function calls.
2. tc_action reference and bind counters are implemented as plain
   integers. They purpose was to allow single actions to be shared
   between multiple filters, not to provide means for concurrent
   modification.
3. tc_action 'cookie' pointer field is not protected against
   modification.
4. Action API functions, that work with set of actions, use intrusive
   linked list, which cannot be used concurrently without additional
   synchronization.
5. Action API functions don't take reference to actions while using
   them, assuming external synchronization with rtnl lock.

Following solutions to these problems are implemented:

1. To remove assumption that idr state doesn't change between tcf_idr_*
   calls, implement new functions that atomically perform several
   operations on idr without releasing idr spinlock. (function to
   atomically lookup and delete action by index, function to atomically
   check if action exists and allocate new one if necessary, etc.)
2. Use atomic operations on counters to make them suitable for
   concurrent get/put operations.
3. Data that 'cookie' points to is never modified, so it enough to
   refactor it to rcu pointer to prevent concurrent de-allocation.
4. Action API doesn't actually use any linked list specific operations
   on actions intrusive linked list, so it can be refactored to array in
   straightforward manner.
5. Always take reference to action while accessing it in action API.
   tcf_idr_search function modified to take reference to action before
   returning it, so there is no way to lookup an action without
   incrementing its reference counter. All users of this function are
   modified to release the reference, after they done using action. With
   all users using reference counting, it is now safe to concurrently
   delete actions.

Since only shared state in action API module are actions themselves and
action idr, these changes are sufficient to not to rely on global rtnl
lock for protection of internal action API data structures.

Changes from V1 to V2:
- Removed redundant actions ops lookup during delete.
- Merge action ops delete definition and implementation.
- Assume all actions have delete implemented and don't check for it
  explicitly.
- Resplit action lookup/release code to prevent memory leaks in
  individual patches.
- Make __tcf_idr_check function static
- Remove unique idr insertion function. Change original idr insert to do
  the same thing.
- Merge changes that take reference to action when performing lookup and
  changes that account for this additional reference when dumping action
  to user space into single patch.
- Change convoluted commit message.
- Rename "unlocked" to "rtnl_held" for clarity.
- Remove estimator lock add patch.
- Refactor action check-alloc code into standalone function.
- Rename tcf_idr_find_delete to tcf_idr_delete_index.
- Rearrange variable definitions in tc_action_delete.
- Add patch that refactors action API code to use array of pointers to
  actions instead of intrusive linked list.
- Expand cover letter.

Vlad Buslov (11):
  net: sched: use rcu for action cookie update
  net: sched: change type of reference and bind counters
  net: sched: implement unlocked action init API
  net: sched: always take reference to action
  net: sched: implement action API that deletes action by index
  net: sched: add 'delete' function to action ops
  net: sched: implement reference counted action release
  net: sched: don't release reference on action overwrite
  net: sched: use reference counting action init
  net: sched: atomically check-allocate action
  net: sched: change action API to use array of pointers to actions

 include/net/act_api.h      |  25 ++-
 include/net/pkt_cls.h      |   1 +
 net/sched/act_api.c        | 408 +++++++++++++++++++++++++++++++--------------
 net/sched/act_bpf.c        |  34 ++--
 net/sched/act_connmark.c   |  29 +++-
 net/sched/act_csum.c       |  34 ++--
 net/sched/act_gact.c       |  31 +++-
 net/sched/act_ife.c        |  33 ++--
 net/sched/act_ipt.c        |  44 ++++-
 net/sched/act_mirred.c     |  38 +++--
 net/sched/act_nat.c        |  30 +++-
 net/sched/act_pedit.c      |  35 +++-
 net/sched/act_police.c     |  31 ++--
 net/sched/act_sample.c     |  34 ++--
 net/sched/act_simple.c     |  31 +++-
 net/sched/act_skbedit.c    |  31 +++-
 net/sched/act_skbmod.c     |  34 ++--
 net/sched/act_tunnel_key.c |  32 ++--
 net/sched/act_vlan.c       |  40 +++--
 net/sched/cls_api.c        |  25 +--
 20 files changed, 704 insertions(+), 296 deletions(-)

-- 
2.7.5

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox