Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net] net: defxx: replace dev_kfree_skb_irq by dev_consume_skb_irq for drop profiles
From: Maciej W. Rozycki @ 2019-02-05 19:57 UTC (permalink / raw)
  To: Yang Wei; +Cc: netdev, David S. Miller, yang.wei9
In-Reply-To: <1549382464-5138-1-git-send-email-albin_yang@163.com>

On Wed, 6 Feb 2019, Yang Wei wrote:

> From: Yang Wei <yang.wei9@zte.com.cn>
> 
> dev_consume_skb_irq() should be called in dfx_xmt_done() when skb
> xmit done. It makes drop profiles(dropwatch, perf) more friendly.
> 
> Signed-off-by: Yang Wei <yang.wei9@zte.com.cn>

Reviewed-by: Maciej W. Rozycki <macro@linux-mips.org>

 It looks to me the driver has to be reviewed WRT `dev_kfree_skb' vs 
`kfree_skb' use too.  I'll have a look into it unless you are happy to do 
that.

 Thanks for your contribution!

  Maciej

^ permalink raw reply

* Re: [PATCH v2] net: dsa: mv88e6xxx: Revise irq setup ordering
From: Andrew Lunn @ 2019-02-05 19:54 UTC (permalink / raw)
  To: John David Anglin; +Cc: Russell King, Vivien Didelot, Florian Fainelli, netdev
In-Reply-To: <964a7e63-82b5-1b21-830e-ec5c7f1eb917@bell.net>

> My hunch is the second hunk of the original patch will fix this.

O.K.

But please fully explain how the race happens and how the fix fully
fixes it. I don't really want to add code which just makes the race
window smaller, but the race still happens. That helps nobody.

In the end, i think you will end up polling. If so, feel free to
submit a patch which makes the poll interval build time configurable.

       Andrew

^ permalink raw reply

* Re: [PATCH] ipmr: ip6mr: Create new sockopt to clear mfc cache only
From: Nikolay Aleksandrov @ 2019-02-05 19:52 UTC (permalink / raw)
  To: Callum Sinclair, davem, kuznet, yoshfuji, netdev, linux-kernel,
	Roopa Prabhu
In-Reply-To: <20190205025800.17185-2-callum.sinclair@alliedtelesis.co.nz>

On 05/02/2019 04:58, Callum Sinclair wrote:
> Currently the only way to clear the mfc cache was to delete the entries
> one by one using the MRT_DEL_MFC socket option or to destroy and
> recreate the socket.
> 
> Create a new socket option which will clear the multicast forwarding
> cache on the socket without destroying the socket.
> 
> Signed-off-by: Callum Sinclair <callum.sinclair@alliedtelesis.co.nz>
> ---
>  include/uapi/linux/mroute.h  |  3 ++-
>  include/uapi/linux/mroute6.h |  3 ++-
>  net/ipv4/ipmr.c              | 40 +++++++++++++++++++++----------
>  net/ipv6/ip6mr.c             | 46 +++++++++++++++++++++++-------------
>  4 files changed, 61 insertions(+), 31 deletions(-)
> 

Hi,
Why don't you do it per-table ? That is - use the selected table and flush
only its entries, I think that would be more useful. Also would be nice to
make the interface flush optional, e.g. depending on a user-supplied value.
Some people would like to flush only the MFC entries without deleting the
devices. Since it's called DEL_MFC_ALL, I'd expect this call to act only
on the MFC entries and not the device list, but it also flushes that.
I'd rename it to MRT_FLUSH with an argument which chooses to flush MFCs +
VIFs based on flags (so you could flush either one separately).

Thanks,
 Nik

> diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h
> index 5d37a9ccce63..8a0beb885cd9 100644
> --- a/include/uapi/linux/mroute.h
> +++ b/include/uapi/linux/mroute.h
> @@ -28,7 +28,8 @@
>  #define MRT_TABLE	(MRT_BASE+9)	/* Specify mroute table ID		*/
>  #define MRT_ADD_MFC_PROXY	(MRT_BASE+10)	/* Add a (*,*|G) mfc entry	*/
>  #define MRT_DEL_MFC_PROXY	(MRT_BASE+11)	/* Del a (*,*|G) mfc entry	*/
> -#define MRT_MAX		(MRT_BASE+11)
> +#define MRT_DEL_MFC_ALL		(MRT_BASE+12)	/* Del all multicast entries	*/
> +#define MRT_MAX		(MRT_BASE+12)
>  
>  #define SIOCGETVIFCNT	SIOCPROTOPRIVATE	/* IP protocol privates */
>  #define SIOCGETSGCNT	(SIOCPROTOPRIVATE+1)
> diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h
> index 9999cc006390..7def70cdf571 100644
> --- a/include/uapi/linux/mroute6.h
> +++ b/include/uapi/linux/mroute6.h
> @@ -31,7 +31,8 @@
>  #define MRT6_TABLE	(MRT6_BASE+9)	/* Specify mroute table ID		*/
>  #define MRT6_ADD_MFC_PROXY	(MRT6_BASE+10)	/* Add a (*,*|G) mfc entry	*/
>  #define MRT6_DEL_MFC_PROXY	(MRT6_BASE+11)	/* Del a (*,*|G) mfc entry	*/
> -#define MRT6_MAX	(MRT6_BASE+11)
> +#define MRT6_DEL_MFC_ALL	(MRT6_BASE+12)	/* Del all multicast entries	*/
> +#define MRT6_MAX	(MRT6_BASE+12)
>  
>  #define SIOCGETMIFCNT_IN6	SIOCPROTOPRIVATE	/* IP protocol privates */
>  #define SIOCGETSGCNT_IN6	(SIOCPROTOPRIVATE+1)
> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
> index ddbf8c9a1abb..ccfeebd38e1a 100644
> --- a/net/ipv4/ipmr.c
> +++ b/net/ipv4/ipmr.c
> @@ -1298,22 +1298,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
>  	return 0;
>  }
>  
> -/* Close the multicast socket, and clear the vif tables etc */
> -static void mroute_clean_tables(struct mr_table *mrt, bool all)
> +/* Clear the vif tables */
> +static void mroute_clean_cache(struct mr_table *mrt, bool all)
>  {
>  	struct net *net = read_pnet(&mrt->net);
> -	struct mr_mfc *c, *tmp;
>  	struct mfc_cache *cache;
> -	LIST_HEAD(list);
> -	int i;
> -
> -	/* Shut down all active vif entries */
> -	for (i = 0; i < mrt->maxvif; i++) {
> -		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
> -			continue;
> -		vif_delete(mrt, i, 0, &list);
> -	}
> -	unregister_netdevice_many(&list);
> +	struct mr_mfc *c, *tmp;
>  
>  	/* Wipe the cache */
>  	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
> @@ -1340,6 +1330,23 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
>  	}
>  }
>  
> +/* Close the multicast socket, and clear the vif tables etc */
> +static void mroute_clean_tables(struct mr_table *mrt, bool all)
> +{
> +	LIST_HEAD(list);
> +	int i;
> +
> +	/* Shut down all active vif entries */
> +	for (i = 0; i < mrt->maxvif; i++) {
> +		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
> +			continue;
> +		vif_delete(mrt, i, 0, &list);
> +	}
> +	unregister_netdevice_many(&list);
> +
> +	mroute_clean_cache(mrt, all);
> +}
> +
>  /* called from ip_ra_control(), before an RCU grace period,
>   * we dont need to call synchronize_rcu() here
>   */
> @@ -1482,6 +1489,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
>  					   sk == rtnl_dereference(mrt->mroute_sk),
>  					   parent);
>  		break;
> +	case MRT_DEL_MFC_ALL:
> +		rtnl_lock();
> +		ipmr_for_each_table(mrt, net) {
> +			mroute_clean_cache(mrt, true);
> +		}
> +		rtnl_unlock();
> +		break;
>  	/* Control PIM assert. */
>  	case MRT_ASSERT:
>  		if (optlen != sizeof(val)) {
> diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
> index 30337b38274b..0168420d217b 100644
> --- a/net/ipv6/ip6mr.c
> +++ b/net/ipv6/ip6mr.c
> @@ -1492,25 +1492,11 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
>  	return 0;
>  }
>  
> -/*
> - *	Close the multicast socket, and clear the vif tables etc
> - */
> -
> -static void mroute_clean_tables(struct mr_table *mrt, bool all)
> +/* Clear the vif tables */
> +static void mroute_clean_cache(struct mr_table *mrt, bool all)
>  {
>  	struct mr_mfc *c, *tmp;
> -	LIST_HEAD(list);
> -	int i;
> -
> -	/* Shut down all active vif entries */
> -	for (i = 0; i < mrt->maxvif; i++) {
> -		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
> -			continue;
> -		mif6_delete(mrt, i, 0, &list);
> -	}
> -	unregister_netdevice_many(&list);
>  
> -	/* Wipe the cache */
>  	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
>  		if (!all && (c->mfc_flags & MFC_STATIC))
>  			continue;
> @@ -1536,6 +1522,27 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
>  	}
>  }
>  
> +/*
> + *	Close the multicast socket, and clear the vif tables etc
> + */
> +
> +static void mroute_clean_tables(struct mr_table *mrt, bool all)
> +{
> +	LIST_HEAD(list);
> +	int i;
> +
> +	/* Shut down all active vif entries */
> +	for (i = 0; i < mrt->maxvif; i++) {
> +		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
> +			continue;
> +		mif6_delete(mrt, i, 0, &list);
> +	}
> +	unregister_netdevice_many(&list);
> +
> +	/* Wipe the cache */
> +	mroute_clean_cache(mrt, all);
> +}
> +
>  static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
>  {
>  	int err = 0;
> @@ -1703,6 +1710,13 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
>  					    parent);
>  		rtnl_unlock();
>  		return ret;
> +	case MRT6_DEL_MFC_ALL:
> +		rtnl_lock();
> +		ip6mr_for_each_table(mrt, net) {
> +			mroute_clean_cache(mrt, true);
> +		}
> +		rtnl_unlock();
> +		return 0;
>  
>  	/*
>  	 *	Control PIM assert (to activate pim will activate assert)
> 


^ permalink raw reply

* [PATCH bpf-next 1/2] btf: separate btf creation and loading
From: Andrii Nakryiko @ 2019-02-05 19:48 UTC (permalink / raw)
  To: songliubraving, yhs, ast, kafai, netdev, daniel, andrii.nakryiko
  Cc: Andrii Nakryiko
In-Reply-To: <20190205194856.967463-1-andriin@fb.com>

This change splits out previous btf__new functionality of constructing
struct btf and loading it into kernel into two:
- btf__new() just creates and initializes struct btf
- btf__load() attempts to load existing struct btf into kernel

btf__free will still close BTF fd, if it was ever loaded successfully
into kernel.

This change allows users of libbpf to manipulate BTF using its API,
without the need to unnecessarily load it into kernel.

One of the intended use cases is pahole using libbpf to do DWARF to BTF
conversion and deduplication using libbpf, while handling ELF sections
overwrites and other concerns on its own.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
---
 tools/lib/bpf/btf.c      | 53 ++++++++++++++++++++++------------------
 tools/lib/bpf/btf.h      |  1 +
 tools/lib/bpf/libbpf.c   |  2 +-
 tools/lib/bpf/libbpf.map |  1 +
 4 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 4949f8840bda..065d51fa63e5 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -366,8 +366,6 @@ void btf__free(struct btf *btf)
 
 struct btf *btf__new(__u8 *data, __u32 size)
 {
-	__u32 log_buf_size = 0;
-	char *log_buf = NULL;
 	struct btf *btf;
 	int err;
 
@@ -377,15 +375,6 @@ struct btf *btf__new(__u8 *data, __u32 size)
 
 	btf->fd = -1;
 
-	log_buf = malloc(BPF_LOG_BUF_SIZE);
-	if (!log_buf) {
-		err = -ENOMEM;
-		goto done;
-	}
-
-	*log_buf = 0;
-	log_buf_size = BPF_LOG_BUF_SIZE;
-
 	btf->data = malloc(size);
 	if (!btf->data) {
 		err = -ENOMEM;
@@ -395,17 +384,6 @@ struct btf *btf__new(__u8 *data, __u32 size)
 	memcpy(btf->data, data, size);
 	btf->data_size = size;
 
-	btf->fd = bpf_load_btf(btf->data, btf->data_size,
-			       log_buf, log_buf_size, false);
-
-	if (btf->fd == -1) {
-		err = -errno;
-		pr_warning("Error loading BTF: %s(%d)\n", strerror(errno), errno);
-		if (log_buf && *log_buf)
-			pr_warning("%s\n", log_buf);
-		goto done;
-	}
-
 	err = btf_parse_hdr(btf);
 	if (err)
 		goto done;
@@ -417,8 +395,6 @@ struct btf *btf__new(__u8 *data, __u32 size)
 	err = btf_parse_type_sec(btf);
 
 done:
-	free(log_buf);
-
 	if (err) {
 		btf__free(btf);
 		return ERR_PTR(err);
@@ -427,6 +403,35 @@ struct btf *btf__new(__u8 *data, __u32 size)
 	return btf;
 }
 
+int btf__load(struct btf* btf) {
+	__u32 log_buf_size = BPF_LOG_BUF_SIZE;
+	char *log_buf = NULL;
+
+	if (btf->fd >= 0) {
+		return -EEXIST;
+	}
+
+	log_buf = malloc(log_buf_size);
+	if (!log_buf)
+		return -ENOMEM;
+
+	*log_buf = 0;
+
+	btf->fd = bpf_load_btf(btf->data, btf->data_size,
+			       log_buf, log_buf_size, false);
+	if (btf->fd < 0) {
+		btf->fd = -errno;
+		pr_warning("Error loading BTF: %s(%d)\n", strerror(errno), errno);
+		if (*log_buf)
+			pr_warning("%s\n", log_buf);
+		goto done;
+	}
+
+done:
+	free(log_buf);
+	return btf->fd;
+}
+
 int btf__fd(const struct btf *btf)
 {
 	return btf->fd;
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 25a9d2db035d..e8410887f93a 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -57,6 +57,7 @@ struct btf_ext_header {
 
 LIBBPF_API void btf__free(struct btf *btf);
 LIBBPF_API struct btf *btf__new(__u8 *data, __u32 size);
+LIBBPF_API int btf__load(struct btf* btf);
 LIBBPF_API __s32 btf__find_by_name(const struct btf *btf,
 				   const char *type_name);
 LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 47969aa0faf8..75b82c1cfc72 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -835,7 +835,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj, int flags)
 			obj->efile.maps_shndx = idx;
 		else if (strcmp(name, BTF_ELF_SEC) == 0) {
 			obj->btf = btf__new(data->d_buf, data->d_size);
-			if (IS_ERR(obj->btf)) {
+			if (IS_ERR(obj->btf) || btf__load(obj->btf) < 0) {
 				pr_warning("Error loading ELF section %s: %ld. Ignored and continue.\n",
 					   BTF_ELF_SEC, PTR_ERR(obj->btf));
 				obj->btf = NULL;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 89c1149e32ee..ffa1fe044f6a 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -134,6 +134,7 @@ LIBBPF_0.0.2 {
 		bpf_object__find_map_fd_by_name;
 		bpf_get_link_xdp_id;
 		btf__dedup;
+		btf__load;
 		btf__get_map_kv_tids;
 		btf__get_nr_types;
 		btf__get_strings;
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 2/2] btf: expose API to work with raw btf data
From: Andrii Nakryiko @ 2019-02-05 19:48 UTC (permalink / raw)
  To: songliubraving, yhs, ast, kafai, netdev, daniel, andrii.nakryiko
  Cc: Andrii Nakryiko
In-Reply-To: <20190205194856.967463-1-andriin@fb.com>

This patch exposes two new APIs btf__get_raw_data_size() and
btf__get_raw_data() that allows to get a copy of raw BTF data out of
struct btf. This is useful for external programs that need to manipulate
raw data, e.g., pahole using btf__dedup() to deduplicate BTF type info
and then writing it back to file.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
---
 tools/lib/bpf/btf.c      | 10 ++++++++++
 tools/lib/bpf/btf.h      |  2 ++
 tools/lib/bpf/libbpf.map |  2 ++
 3 files changed, 14 insertions(+)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 065d51fa63e5..6491d440c9a7 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -437,6 +437,16 @@ int btf__fd(const struct btf *btf)
 	return btf->fd;
 }
 
+__u32 btf__get_raw_data_size(const struct btf *btf)
+{
+	return btf->data_size;
+}
+
+void btf__get_raw_data(const struct btf *btf, char *data)
+{
+	memcpy(data, btf->data, btf->data_size);
+}
+
 void btf__get_strings(const struct btf *btf, const char **strings,
 		      __u32 *str_len)
 {
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index e8410887f93a..d46f680b9416 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -66,6 +66,8 @@ LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf,
 LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
 LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id);
 LIBBPF_API int btf__fd(const struct btf *btf);
+LIBBPF_API __u32 btf__get_raw_data_size(const struct btf *btf);
+LIBBPF_API void btf__get_raw_data(const struct btf *btf, char *data);
 LIBBPF_API void btf__get_strings(const struct btf *btf, const char **strings,
 				 __u32 *str_len);
 LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index ffa1fe044f6a..873a26cd714d 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -137,6 +137,8 @@ LIBBPF_0.0.2 {
 		btf__load;
 		btf__get_map_kv_tids;
 		btf__get_nr_types;
+		btf__get_raw_data;
+		btf__get_raw_data_size;
 		btf__get_strings;
 		btf_ext__free;
 		btf_ext__func_info_rec_size;
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 0/2] tools/btf: extends libbpf APIs to work with btf w/o kernel
From: Andrii Nakryiko @ 2019-02-05 19:48 UTC (permalink / raw)
  To: songliubraving, yhs, ast, kafai, netdev, daniel, andrii.nakryiko
  Cc: Andrii Nakryiko

This patchset changes existing btf__new() API call to only load and initialize
struct btf, while exposing new btf__load() API to attempt to load and validate
BTF in kernel. It also adds ability to copy raw BTF data out of struct btf for
further processing by external applications.

This makes utilizing libbpf's APIs that don't require kernel facilities (e.g.,
btf_dedup) simpler and more natural from external application.

Andrii Nakryiko (2):
  btf: separate btf creation and loading
  btf: expose API to work with raw btf data

 tools/lib/bpf/btf.c      | 63 +++++++++++++++++++++++++---------------
 tools/lib/bpf/btf.h      |  3 ++
 tools/lib/bpf/libbpf.c   |  2 +-
 tools/lib/bpf/libbpf.map |  3 ++
 4 files changed, 46 insertions(+), 25 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH bpf-next 2/2] tools/bpf: add log_level to bpf_load_program_attr
From: Yonghong Song @ 2019-02-05 19:48 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team, Yonghong Song
In-Reply-To: <20190205194821.1804747-1-yhs@fb.com>

The kernel verifier has three levels of logs:
    0: no logs
    1: logs mostly useful
  > 1: verbose

Current libbpf API functions bpf_load_program_xattr() and
bpf_load_program() cannot specify log_level.
The bcc, however, provides an interface for user to
specify log_level 2 for verbose output.

This patch added log_level into structure
bpf_load_program_attr, so users, including bcc, can use
bpf_load_program_xattr() to change log_level.

The bpf selftest test_sock.c is modified to enable log_level = 2.
If the "verbose" in test_sock.c is changed to true,
the test will output logs like below:
  $ ./test_sock
  func#0 @0
  0: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  0: (bf) r6 = r1
  1: R1=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  1: (61) r7 = *(u32 *)(r6 +28)
  invalid bpf_context access off=28 size=4

  Test case: bind4 load with invalid access: src_ip6 .. [PASS]
  ...
  Test case: bind6 allow all .. [PASS]
  Summary: 16 PASSED, 0 FAILED

Some test_sock tests are negative tests and verbose verifier
log will be printed out as shown in the above.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/lib/bpf/bpf.c                     | 4 +++-
 tools/lib/bpf/bpf.h                     | 1 +
 tools/testing/selftests/bpf/test_sock.c | 9 ++++++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 3defad77dc7a..78aa8c2b1a5c 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -214,6 +214,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 {
 	void *finfo = NULL, *linfo = NULL;
 	union bpf_attr attr;
+	__u32 log_level;
 	__u32 name_len;
 	int fd;
 
@@ -292,7 +293,8 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 	/* Try again with log */
 	attr.log_buf = ptr_to_u64(log_buf);
 	attr.log_size = log_buf_sz;
-	attr.log_level = 1;
+	log_level = load_attr->log_level;
+	attr.log_level = (log_level >= 2) ? log_level : 1;
 	log_buf[0] = 0;
 	fd = sys_bpf_prog_load(&attr, sizeof(attr));
 done:
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index ed09eed2dc3b..15a8e22e8eae 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -76,6 +76,7 @@ struct bpf_load_program_attr {
 	const struct bpf_insn *insns;
 	size_t insns_cnt;
 	const char *license;
+	__u32 log_level;
 	__u32 kern_version;
 	__u32 prog_ifindex;
 	__u32 prog_btf_fd;
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
index 561ffb6d6433..fb679ac3d4b0 100644
--- a/tools/testing/selftests/bpf/test_sock.c
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -20,6 +20,7 @@
 #define MAX_INSNS	512
 
 char bpf_log_buf[BPF_LOG_BUF_SIZE];
+static bool verbose = false;
 
 struct sock_test {
 	const char *descr;
@@ -325,6 +326,7 @@ static int load_sock_prog(const struct bpf_insn *prog,
 			  enum bpf_attach_type attach_type)
 {
 	struct bpf_load_program_attr attr;
+	int ret;
 
 	memset(&attr, 0, sizeof(struct bpf_load_program_attr));
 	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
@@ -332,8 +334,13 @@ static int load_sock_prog(const struct bpf_insn *prog,
 	attr.insns = prog;
 	attr.insns_cnt = probe_prog_length(attr.insns);
 	attr.license = "GPL";
+	attr.log_level = 2;
 
-	return bpf_load_program_xattr(&attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
+	ret = bpf_load_program_xattr(&attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
+	if (verbose && ret < 0)
+		fprintf(stderr, "%s\n", bpf_log_buf);
+
+	return ret;
 }
 
 static int attach_sock_prog(int cgfd, int progfd,
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 1/2] tools/bpf: add const qualifier to btf__get_map_kv_tids() map_name parameter
From: Yonghong Song @ 2019-02-05 19:48 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team, Yonghong Song
In-Reply-To: <20190205194821.1804747-1-yhs@fb.com>

Commit 96408c43447a ("tools/bpf: implement libbpf btf__get_map_kv_tids() API function")
added the API function btf__get_map_kv_tids():
  btf__get_map_kv_tids(const struct btf *btf, char *map_name, ...)

The parameter map_name has type "char *". This is okay inside libbpf library since
the map_name is from bpf_map->name which also has type "char *".

This will be problematic if the caller for map_name already has attribute "const",
e.g., from C++ string.c_str(). It will result in either a warning or an error.

  /home/yhs/work/bcc/src/cc/btf.cc:166:51:
    error: invalid conversion from ‘const char*’ to ‘char*’ [-fpermissive]
      return btf__get_map_kv_tids(btf_, map_name.c_str()

This patch added "const" attributes to map_name parameter.

Fixes: 96408c43447a ("tools/bpf: implement libbpf btf__get_map_kv_tids() API function")
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/lib/bpf/btf.c | 2 +-
 tools/lib/bpf/btf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 4949f8840bda..3b3a2959d03a 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -511,7 +511,7 @@ int btf__get_from_id(__u32 id, struct btf **btf)
 	return err;
 }
 
-int btf__get_map_kv_tids(const struct btf *btf, char *map_name,
+int btf__get_map_kv_tids(const struct btf *btf, const char *map_name,
 			 __u32 expected_key_size, __u32 expected_value_size,
 			 __u32 *key_type_id, __u32 *value_type_id)
 {
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 25a9d2db035d..b393da90cc85 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -69,7 +69,7 @@ LIBBPF_API void btf__get_strings(const struct btf *btf, const char **strings,
 				 __u32 *str_len);
 LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset);
 LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf);
-LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, char *map_name,
+LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name,
 				    __u32 expected_key_size,
 				    __u32 expected_value_size,
 				    __u32 *key_type_id, __u32 *value_type_id);
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 0/2] tools/bpf: two changes in libbpf
From: Yonghong Song @ 2019-02-05 19:48 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team, Yonghong Song

Two changes in libbpf. Patch #1 adds "const" qualifier to parameter
"map_name" of function btf__get_map_kv_tids().
Patch #2 adds log_level to structure bpf_load_program_attr so
verifier log level 2 can be used with api function
bpf_load_program_xattr(). Please see individual patches for details.

Yonghong Song (2):
  tools/bpf: add const qualifier to btf__get_map_kv_tids() map_name
    parameter
  tools/bpf: add log_level to bpf_load_program_attr

 tools/lib/bpf/bpf.c                     | 4 +++-
 tools/lib/bpf/bpf.h                     | 1 +
 tools/lib/bpf/btf.c                     | 2 +-
 tools/lib/bpf/btf.h                     | 2 +-
 tools/testing/selftests/bpf/test_sock.c | 9 ++++++++-
 5 files changed, 14 insertions(+), 4 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH v2 net-next] net: phy: improve genphy_c45_read_link
From: Heiner Kallweit @ 2019-02-05 19:41 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli, David Miller, Russell King
  Cc: netdev@vger.kernel.org

Let's make genphy_c45_read_link behave the same as genphy_update_link
and set phydev->link in the function directly. This allows to simplify
the callers. In addition don't check further devices once we detect
that at least one device reports link as down.

v2:
- remove an unused variable

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/phy/marvell10g.c |  2 --
 drivers/net/phy/phy-c45.c    | 15 ++++++---------
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 38cfc923a..be2cfdfd8 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -475,8 +475,6 @@ static int mv3310_read_status(struct phy_device *phydev)
 	if (val < 0)
 		return val;
 
-	phydev->link = val > 0 ? 1 : 0;
-
 	val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_STAT1);
 	if (val < 0)
 		return val;
diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 03af927fa..b874c4858 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -134,7 +134,7 @@ EXPORT_SYMBOL_GPL(genphy_c45_aneg_done);
  * @mmd_mask: MMDs to read status from
  *
  * Read the link status from the specified MMDs, and if they all indicate
- * that the link is up, return positive.  If an error is encountered,
+ * that the link is up, set phydev->link to 1.  If an error is encountered,
  * a negative errno will be returned, otherwise zero.
  */
 int genphy_c45_read_link(struct phy_device *phydev, u32 mmd_mask)
@@ -142,7 +142,7 @@ int genphy_c45_read_link(struct phy_device *phydev, u32 mmd_mask)
 	int val, devad;
 	bool link = true;
 
-	while (mmd_mask) {
+	while (mmd_mask && link) {
 		devad = __ffs(mmd_mask);
 		mmd_mask &= ~BIT(devad);
 
@@ -158,7 +158,9 @@ int genphy_c45_read_link(struct phy_device *phydev, u32 mmd_mask)
 			link = false;
 	}
 
-	return link;
+	phydev->link = link;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(genphy_c45_read_link);
 
@@ -278,7 +280,6 @@ EXPORT_SYMBOL_GPL(gen10g_config_aneg);
 int gen10g_read_status(struct phy_device *phydev)
 {
 	u32 mmd_mask = phydev->c45_ids.devices_in_package;
-	int ret;
 
 	/* For now just lie and say it's 10G all the time */
 	phydev->speed = SPEED_10000;
@@ -287,11 +288,7 @@ int gen10g_read_status(struct phy_device *phydev)
 	/* Avoid reading the vendor MMDs */
 	mmd_mask &= ~(BIT(MDIO_MMD_VEND1) | BIT(MDIO_MMD_VEND2));
 
-	ret = genphy_c45_read_link(phydev, mmd_mask);
-
-	phydev->link = ret > 0 ? 1 : 0;
-
-	return 0;
+	return genphy_c45_read_link(phydev, mmd_mask);
 }
 EXPORT_SYMBOL_GPL(gen10g_read_status);
 
-- 
2.20.1


^ permalink raw reply related

* TC stats / hw offload question
From: Edward Cree @ 2019-02-05 19:41 UTC (permalink / raw)
  To: netdev; +Cc: Jiri Pirko, Jamal Hadi Salim, Cong Wang

Regarding TC_CLSFLOWER_STATS, when a filter rule modifies the length of
 the packet, e.g. by adding a VLAN or encap header, should the bytes
 counter count the length of the packet _before_ edits (i.e. as seen by
 the match), or the length after edits?  If the latter, what is the
 correct behaviour when (say) a packet is mirrored as-is but also
 redirected with encapsulation?
The fact that the stats live in the struct tc_action suggests a per-
 action connection that would imply post-edit length, but then in
 tcf_exts_dump_stats() we only look at the first action, which seems to
 imply we really want the pre-edit length.
I can't find any kind of doc or spec defining what behaviour is required.

-Ed

^ permalink raw reply

* [vhost:linux-next 2/23] include/linux/swiotlb.h:99:22: error: static declaration of 'swiotlb_max_mapping_size' follows non-static declaration
From: kbuild test robot @ 2019-02-05 19:37 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: kbuild-all, kvm, virtualization, netdev, Michael S. Tsirkin,
	Konrad Rzeszutek Wilk, Christoph Hellwig

[-- Attachment #1: Type: text/plain, Size: 1685 bytes --]

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
head:   104f89a60ef5ec77d6f559eac4676844b3480740
commit: 951a381d4c0d45a9b44de30228c6ef17083854ea [2/23] swiotlb: Introduce swiotlb_max_mapping_size()
config: i386-tinyconfig (attached as .config)
compiler: gcc-8 (Debian 8.2.0-14) 8.2.0
reproduce:
        git checkout 951a381d4c0d45a9b44de30228c6ef17083854ea
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):


vim +/swiotlb_max_mapping_size +99 include/linux/swiotlb.h

    75	
    76	bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
    77			size_t size, enum dma_data_direction dir, unsigned long attrs);
    78	void __init swiotlb_exit(void);
    79	unsigned int swiotlb_max_segment(void);
    80	#else
    81	#define swiotlb_force SWIOTLB_NO_FORCE
    82	static inline bool is_swiotlb_buffer(phys_addr_t paddr)
    83	{
    84		return false;
    85	}
    86	static inline bool swiotlb_map(struct device *dev, phys_addr_t *phys,
    87			dma_addr_t *dma_addr, size_t size, enum dma_data_direction dir,
    88			unsigned long attrs)
    89	{
    90		return false;
    91	}
    92	static inline void swiotlb_exit(void)
    93	{
    94	}
    95	static inline unsigned int swiotlb_max_segment(void)
    96	{
    97		return 0;
    98	}
  > 99	static inline size_t swiotlb_max_mapping_size(struct device *dev)
   100	{
   101		return SIZE_MAX;
   102	}
   103	#endif /* CONFIG_SWIOTLB */
   104	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 6514 bytes --]

^ permalink raw reply

* [vhost:linux-next 3/23] include/linux/swiotlb.h:105:20: error: static declaration of 'is_swiotlb_active' follows non-static declaration
From: kbuild test robot @ 2019-02-05 19:37 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: kbuild-all, kvm, virtualization, netdev, Michael S. Tsirkin,
	Konrad Rzeszutek Wilk, Christoph Hellwig

[-- Attachment #1: Type: text/plain, Size: 3120 bytes --]

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
head:   104f89a60ef5ec77d6f559eac4676844b3480740
commit: 155fcd8511de5f99c27a726e9153b87cce528b6e [3/23] swiotlb: Add is_swiotlb_active() function
config: i386-tinyconfig (attached as .config)
compiler: gcc-8 (Debian 8.2.0-14) 8.2.0
reproduce:
        git checkout 155fcd8511de5f99c27a726e9153b87cce528b6e
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from arch/x86/include/asm/swiotlb.h:5,
                    from arch/x86/include/asm/dma-mapping.h:13,
                    from include/linux/dma-mapping.h:261,
                    from include/linux/skbuff.h:34,
                    from include/net/net_namespace.h:36,
                    from include/linux/inet.h:46,
                    from include/linux/sunrpc/msg_prot.h:204,
                    from include/linux/sunrpc/auth.h:16,
                    from include/linux/nfs_fs.h:31,
                    from init/do_mounts.c:22:
   include/linux/swiotlb.h:100:22: error: static declaration of 'swiotlb_max_mapping_size' follows non-static declaration
    static inline size_t swiotlb_max_mapping_size(struct device *dev)
                         ^~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/swiotlb.h:65:8: note: previous declaration of 'swiotlb_max_mapping_size' was here
    size_t swiotlb_max_mapping_size(struct device *dev);
           ^~~~~~~~~~~~~~~~~~~~~~~~
>> include/linux/swiotlb.h:105:20: error: static declaration of 'is_swiotlb_active' follows non-static declaration
    static inline bool is_swiotlb_active(void)
                       ^~~~~~~~~~~~~~~~~
   include/linux/swiotlb.h:66:6: note: previous declaration of 'is_swiotlb_active' was here
    bool is_swiotlb_active(void);
         ^~~~~~~~~~~~~~~~~

vim +/is_swiotlb_active +105 include/linux/swiotlb.h

    76	
    77	bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
    78			size_t size, enum dma_data_direction dir, unsigned long attrs);
    79	void __init swiotlb_exit(void);
    80	unsigned int swiotlb_max_segment(void);
    81	#else
    82	#define swiotlb_force SWIOTLB_NO_FORCE
    83	static inline bool is_swiotlb_buffer(phys_addr_t paddr)
    84	{
    85		return false;
    86	}
    87	static inline bool swiotlb_map(struct device *dev, phys_addr_t *phys,
    88			dma_addr_t *dma_addr, size_t size, enum dma_data_direction dir,
    89			unsigned long attrs)
    90	{
    91		return false;
    92	}
    93	static inline void swiotlb_exit(void)
    94	{
    95	}
    96	static inline unsigned int swiotlb_max_segment(void)
    97	{
    98		return 0;
    99	}
 > 100	static inline size_t swiotlb_max_mapping_size(struct device *dev)
   101	{
   102		return SIZE_MAX;
   103	}
   104	
 > 105	static inline bool is_swiotlb_active(void)
   106	{
   107		return false;
   108	}
   109	#endif /* CONFIG_SWIOTLB */
   110	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 6514 bytes --]

^ permalink raw reply

* Re: Stack sends oversize UDP packet to the driver
From: Michael Chan @ 2019-02-05 19:35 UTC (permalink / raw)
  To: Mahesh Bandewar (महेश बंडेवार)
  Cc: Daniel Axtens, Netdev, David Miller, Eric Dumazet,
	Willem de Bruijn
In-Reply-To: <CAF2d9jiMMez62n7U+k02V7sRp9Q-usFC1X0-h8aqq-BtjmtVmA@mail.gmail.com>

On Wed, Jan 30, 2019 at 5:00 PM Mahesh Bandewar (महेश बंडेवार)
<maheshb@google.com> wrote:
>
> On Wed, Jan 30, 2019 at 1:07 AM Michael Chan <michael.chan@broadcom.com> wrote:
> >
> > On Tue, Jan 22, 2019 at 10:29 AM Mahesh Bandewar (महेश बंडेवार)
> > <maheshb@google.com> wrote:
> >
> > >
> > > The idea behind the fix is very simple and it is to create a dst-only
> > > (unregistered) device with a very low MTU and use it instead of 'lo'
> > > while invalidating the dst. This would make it *not* forward packets
> > > to driver which might need fragmentation.
> > >
> >
> > We tested the 2 patches many times and including an overnight test.  I
> > can confirm that the oversize UDP packets are no longer seen with the
> > patches applied.  However, I don't see the blackhole xmit function
> > getting called to free the SKBs though.
> >
> Thanks for the confirmation Michael. The blackhole device mtu is
> really small, so I would assume the fragmentation code dropped those
> packets before calling the xmit function (in ip_fragment), you could
> verify that with icmp counters.
>

I've looked at this a little more.  The blackhole_dev is not IFF_UP |
IFF_RUNNING, right?  May be that's why the packets are never getting
to the xmit function?

^ permalink raw reply

* Re: Kernel panic in eth_header
From: Florian Fainelli @ 2019-02-05 19:34 UTC (permalink / raw)
  To: Eric Dumazet, Andrew, Netdev
In-Reply-To: <19716555-3522-cbdd-a128-e2ec672f89cd@gmail.com>

On 2/5/19 8:57 AM, Eric Dumazet wrote:
> 
> 
> On 02/05/2019 08:29 AM, Andrew wrote:
>> Hi all.
>>
>> After upgrade on PPPoE BRAS to kernel 4.9.153 I've got an kernel panic after a 3 days of uptime.
>>
>> Unfortunately kernel is compiled w/o debug info; I rebuilt kernel with debug info enabled (kernel is compiled with same function addresses - I compare vmlinux symbol maps) - it says that panic is in net/ethernet/eth.c:88
>>
>> Below there is a kernel panic trace. igb is from vendor, ver. 5.3.5.4. What extra info is needed?
>>
>> [263565.106441] BUG: unable to handle kernel paging request at ffff88015a4d2dd4
>> [263565.113527] IP: [<ffffffff8158e48b>] eth_header+0x3b/0xc0
>> [263565.119030] PGD 1e8f067 [263565.121474] PUD 0
>> [263565.123580]
>> [263565.125166] Oops: 0002 [#1] SMP
>> [263565.128398] Modules linked in: xt_nat iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 iptable_filter xt_length xt_TCPMSS xt_tcpudp xt_mark xt_dscp iptable_mangle ip_tables x_tables nf_nat_pptp nf_conntrack_pptp nf_conntrack_proto_gre nf_nat_proto_gre nf_nat nf_conntrack sch_sfq sch_htb cls_u32 sch_ingress sch_prio sch_tbf cls_flow cls_fw act_police ifb 8021q mrp garp stp llc softdog pppoe pppox ppp_generic slhc i2c_nforce2 i2c_core igb(O) parport_pc dca parport thermal asus_atk0110 fan ptp k10temp hwmon pps_core nv_tco
>> [263565.176083] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G           O    4.9.153-x86_64 #1
>> [263565.183996] Hardware name: System manufacturer System Product Name/M2N-E, BIOS ASUS M2N-E ACPI BIOS Revision 5001 03/23/2010
>> [263565.195289] task: ffff88007d0f5200 task.stack: ffffc9000006c000
>> [263565.201295] RIP: 0010:[<ffffffff8158e48b>] [<ffffffff8158e48b>] eth_header+0x3b/0xc0
>> [263565.209225] RSP: 0018:ffff88007fa83c58  EFLAGS: 00010286
>> [263565.214622] RAX: ffff88015a4d2dc8 RBX: 0000000000000008 RCX: ffff8800682434a0
>> [263565.221843] RDX: ffff88015a4d2dc8 RSI: ffff88015a4d2dc8 RDI: ffff880077aab000
>> [263565.229062] RBP: ffff88007b663d90 R08: ffff88007b663d90 R09: 0000000000000574
>> [263565.236281] R10: ffff88007d1fa000 R11: 0000000000000000 R12: ffff8800682434a0
>> [263565.243501] R13: ffff88007d1fa000 R14: 0000000000000574 R15: 0000000000000008
>> [263565.250719] FS:  0000000000000000(0000) GS:ffff88007fa80000(0000) knlGS:0000000000000000
>> [263565.258894] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [263565.264725] CR2: ffff88015a4d2dd4 CR3: 000000007ad73000 CR4: 00000000000006f0
>> [263565.271944] Stack:
>> [263565.274041]  ffff880077aab000 ffff880068243400 ffff88007a745000 ffff8800682434a0
>> [263565.281582]  0000000000000002 ffffffff81571d09 ffff880068243400 ffff88007fa83d00
>> [263565.289121]  ffff88007a745000 ffff880077aab000 ffff88007a712000 ffffffff815a8c61
>> [263565.296661] Call Trace:
>> [263565.299193]  <IRQ> [263565.301205] [<ffffffff81571d09>] ? neigh_connected_output+0xa9/0x100
>> [263565.307740]  [<ffffffff815a8c61>] ? ip_finish_output2+0x221/0x400
>> [263565.313920]  [<ffffffff8159e144>] ? nf_iterate+0x54/0x60
>> [263565.319319]  [<ffffffff815ab2fa>] ? ip_output+0x6a/0xf0
>> [263565.324631]  [<ffffffff8159e102>] ? nf_iterate+0x12/0x60
>> [263565.330030]  [<ffffffff815aa6e0>] ? ip_fragment.constprop.5+0x80/0x80
>> [263565.336556]  [<ffffffff815a73b6>] ? ip_forward+0x396/0x480
>> [263565.342128]  [<ffffffff815a6fb0>] ? ip_check_defrag+0x1e0/0x1e0
>> [263565.348134]  [<ffffffff815a5a2e>] ? ip_rcv+0x2ae/0x370
>> [263565.353361]  [<ffffffffa0107c02>] ? pppoe_rcv_core+0xd2/0x160 [pppoe]
>> [263565.359888]  [<ffffffff815a5170>] ? ip_local_deliver_finish+0x1d0/0x1d0
>> [263565.366586]  [<ffffffff81562a57>] ? __netif_receive_skb_core+0x527/0xa80
>> [263565.373373]  [<ffffffff81567632>] ? process_backlog+0x92/0x130
>> [263565.379291]  [<ffffffff8156745d>] ? net_rx_action+0x24d/0x390
>> [263565.385124]  [<ffffffff81628374>] ? __do_softirq+0xf4/0x2a0
>> [263565.390784]  [<ffffffff8107136c>] ? irq_exit+0xbc/0xd0
>> [263565.396008]  [<ffffffff81626cd6>] ? call_function_single_interrupt+0x96/0xa0
>> [263565.403141]  <EOI> [263565.405153] [<ffffffff81623eb0>] ? __sched_text_end+0x2/0x2
>> [263565.410907]  [<ffffffff81624182>] ? native_safe_halt+0x2/0x10
>> [263565.416741]  [<ffffffff81623ec8>] ? default_idle+0x18/0xd0
>> [263565.422314]  [<ffffffff810a7a46>] ? cpu_startup_entry+0x126/0x220
>> [263565.428492]  [<ffffffff8104c261>] ? start_secondary+0x161/0x180
>> [263565.434496] Code: 0e 00 00 00 53 89 d3 49 89 cc 4c 89 c5 45 89 ce e8 bb 8a fc ff 66 83 fb 01 48 89 c6 74 44 66 83 fb 04 74 3e 66 c1 c3 08 48 85 ed <66> 89 58 0c 74 40 8b 45 00 4d 85 e4 89 46 06 0f b7 45 04 66 89
>> [263565.454534] RIP  [<ffffffff8158e48b>] eth_header+0x3b/0xc0
>> [263565.460124]  RSP <ffff88007fa83c58>
>> [263565.463696] CR2: ffff88015a4d2dd4
>> [263565.467104] ---[ end trace a1bcaf3618724adf ]---
>> [263565.471807] Kernel panic - not syncing: Fatal exception in interrupt
>> [263565.478245] Kernel Offset: disabled
>> [263565.481818] Rebooting in 5 seconds..
>>
> 
> 
> This is a well known issue, a fix should come shortly in stable branches

Is Peter or yourself doing the backport? David would only take care of
the most two recent stable kernels.

Sorry about missing that change as part of the fragmenstack backport to
4.9...

> 
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index f8bbd693c19c247e41839c2d0b5318ca51b23ee8..d95b32af4a0e3f552405c9e61cc372729834160c 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -425,6 +425,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
>          * fragment.
>          */
>  
> +       err = -EINVAL;
>         /* Find out where to put this fragment.  */
>         prev_tail = qp->q.fragments_tail;
>         if (!prev_tail)
> @@ -501,7 +502,6 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
>  
>  discard_qp:
>         inet_frag_kill(&qp->q);
> -       err = -EINVAL;
>         __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
>  err:
>         kfree_skb(skb);
> 
> 
> 


-- 
Florian

^ permalink raw reply

* Re: Kernel panic in eth_header
From: Florian Fainelli @ 2019-02-05 19:34 UTC (permalink / raw)
  To: Eric Dumazet, Andrew, Netdev
In-Reply-To: <19716555-3522-cbdd-a128-e2ec672f89cd@gmail.com>

On 2/5/19 8:57 AM, Eric Dumazet wrote:
> 
> 
> On 02/05/2019 08:29 AM, Andrew wrote:
>> Hi all.
>>
>> After upgrade on PPPoE BRAS to kernel 4.9.153 I've got an kernel panic after a 3 days of uptime.
>>
>> Unfortunately kernel is compiled w/o debug info; I rebuilt kernel with debug info enabled (kernel is compiled with same function addresses - I compare vmlinux symbol maps) - it says that panic is in net/ethernet/eth.c:88
>>
>> Below there is a kernel panic trace. igb is from vendor, ver. 5.3.5.4. What extra info is needed?
>>
>> [263565.106441] BUG: unable to handle kernel paging request at ffff88015a4d2dd4
>> [263565.113527] IP: [<ffffffff8158e48b>] eth_header+0x3b/0xc0
>> [263565.119030] PGD 1e8f067 [263565.121474] PUD 0
>> [263565.123580]
>> [263565.125166] Oops: 0002 [#1] SMP
>> [263565.128398] Modules linked in: xt_nat iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 iptable_filter xt_length xt_TCPMSS xt_tcpudp xt_mark xt_dscp iptable_mangle ip_tables x_tables nf_nat_pptp nf_conntrack_pptp nf_conntrack_proto_gre nf_nat_proto_gre nf_nat nf_conntrack sch_sfq sch_htb cls_u32 sch_ingress sch_prio sch_tbf cls_flow cls_fw act_police ifb 8021q mrp garp stp llc softdog pppoe pppox ppp_generic slhc i2c_nforce2 i2c_core igb(O) parport_pc dca parport thermal asus_atk0110 fan ptp k10temp hwmon pps_core nv_tco
>> [263565.176083] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G           O    4.9.153-x86_64 #1
>> [263565.183996] Hardware name: System manufacturer System Product Name/M2N-E, BIOS ASUS M2N-E ACPI BIOS Revision 5001 03/23/2010
>> [263565.195289] task: ffff88007d0f5200 task.stack: ffffc9000006c000
>> [263565.201295] RIP: 0010:[<ffffffff8158e48b>] [<ffffffff8158e48b>] eth_header+0x3b/0xc0
>> [263565.209225] RSP: 0018:ffff88007fa83c58  EFLAGS: 00010286
>> [263565.214622] RAX: ffff88015a4d2dc8 RBX: 0000000000000008 RCX: ffff8800682434a0
>> [263565.221843] RDX: ffff88015a4d2dc8 RSI: ffff88015a4d2dc8 RDI: ffff880077aab000
>> [263565.229062] RBP: ffff88007b663d90 R08: ffff88007b663d90 R09: 0000000000000574
>> [263565.236281] R10: ffff88007d1fa000 R11: 0000000000000000 R12: ffff8800682434a0
>> [263565.243501] R13: ffff88007d1fa000 R14: 0000000000000574 R15: 0000000000000008
>> [263565.250719] FS:  0000000000000000(0000) GS:ffff88007fa80000(0000) knlGS:0000000000000000
>> [263565.258894] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [263565.264725] CR2: ffff88015a4d2dd4 CR3: 000000007ad73000 CR4: 00000000000006f0
>> [263565.271944] Stack:
>> [263565.274041]  ffff880077aab000 ffff880068243400 ffff88007a745000 ffff8800682434a0
>> [263565.281582]  0000000000000002 ffffffff81571d09 ffff880068243400 ffff88007fa83d00
>> [263565.289121]  ffff88007a745000 ffff880077aab000 ffff88007a712000 ffffffff815a8c61
>> [263565.296661] Call Trace:
>> [263565.299193]  <IRQ> [263565.301205] [<ffffffff81571d09>] ? neigh_connected_output+0xa9/0x100
>> [263565.307740]  [<ffffffff815a8c61>] ? ip_finish_output2+0x221/0x400
>> [263565.313920]  [<ffffffff8159e144>] ? nf_iterate+0x54/0x60
>> [263565.319319]  [<ffffffff815ab2fa>] ? ip_output+0x6a/0xf0
>> [263565.324631]  [<ffffffff8159e102>] ? nf_iterate+0x12/0x60
>> [263565.330030]  [<ffffffff815aa6e0>] ? ip_fragment.constprop.5+0x80/0x80
>> [263565.336556]  [<ffffffff815a73b6>] ? ip_forward+0x396/0x480
>> [263565.342128]  [<ffffffff815a6fb0>] ? ip_check_defrag+0x1e0/0x1e0
>> [263565.348134]  [<ffffffff815a5a2e>] ? ip_rcv+0x2ae/0x370
>> [263565.353361]  [<ffffffffa0107c02>] ? pppoe_rcv_core+0xd2/0x160 [pppoe]
>> [263565.359888]  [<ffffffff815a5170>] ? ip_local_deliver_finish+0x1d0/0x1d0
>> [263565.366586]  [<ffffffff81562a57>] ? __netif_receive_skb_core+0x527/0xa80
>> [263565.373373]  [<ffffffff81567632>] ? process_backlog+0x92/0x130
>> [263565.379291]  [<ffffffff8156745d>] ? net_rx_action+0x24d/0x390
>> [263565.385124]  [<ffffffff81628374>] ? __do_softirq+0xf4/0x2a0
>> [263565.390784]  [<ffffffff8107136c>] ? irq_exit+0xbc/0xd0
>> [263565.396008]  [<ffffffff81626cd6>] ? call_function_single_interrupt+0x96/0xa0
>> [263565.403141]  <EOI> [263565.405153] [<ffffffff81623eb0>] ? __sched_text_end+0x2/0x2
>> [263565.410907]  [<ffffffff81624182>] ? native_safe_halt+0x2/0x10
>> [263565.416741]  [<ffffffff81623ec8>] ? default_idle+0x18/0xd0
>> [263565.422314]  [<ffffffff810a7a46>] ? cpu_startup_entry+0x126/0x220
>> [263565.428492]  [<ffffffff8104c261>] ? start_secondary+0x161/0x180
>> [263565.434496] Code: 0e 00 00 00 53 89 d3 49 89 cc 4c 89 c5 45 89 ce e8 bb 8a fc ff 66 83 fb 01 48 89 c6 74 44 66 83 fb 04 74 3e 66 c1 c3 08 48 85 ed <66> 89 58 0c 74 40 8b 45 00 4d 85 e4 89 46 06 0f b7 45 04 66 89
>> [263565.454534] RIP  [<ffffffff8158e48b>] eth_header+0x3b/0xc0
>> [263565.460124]  RSP <ffff88007fa83c58>
>> [263565.463696] CR2: ffff88015a4d2dd4
>> [263565.467104] ---[ end trace a1bcaf3618724adf ]---
>> [263565.471807] Kernel panic - not syncing: Fatal exception in interrupt
>> [263565.478245] Kernel Offset: disabled
>> [263565.481818] Rebooting in 5 seconds..
>>
> 
> 
> This is a well known issue, a fix should come shortly in stable branches

Is Peter or yourself doing the backport? David would only take care of
the most two recent stable kernels.

Sorry about missing that change as part of the fragmenstack backport to
4.9...

> 
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index f8bbd693c19c247e41839c2d0b5318ca51b23ee8..d95b32af4a0e3f552405c9e61cc372729834160c 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -425,6 +425,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
>          * fragment.
>          */
>  
> +       err = -EINVAL;
>         /* Find out where to put this fragment.  */
>         prev_tail = qp->q.fragments_tail;
>         if (!prev_tail)
> @@ -501,7 +502,6 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
>  
>  discard_qp:
>         inet_frag_kill(&qp->q);
> -       err = -EINVAL;
>         __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
>  err:
>         kfree_skb(skb);
> 
> 
> 


-- 
Florian

^ permalink raw reply

* Re: [B.A.T.M.A.N.] [RFC v4 00/19] batman-adv: netlink restructuring, part 2
From: Linus Lüssing @ 2019-02-05 19:24 UTC (permalink / raw)
  To: The list for a Better Approach To Mobile Ad-hoc Networking
  Cc: netdev, Jiri Pirko
In-Reply-To: <1895931.G10psR3j26@sven-edge>

On Sat, Jan 26, 2019 at 11:47:20AM +0100, Sven Eckelmann wrote:
> On Saturday, 19 January 2019 16.56.07 CET Sven Eckelmann wrote:
> [...]
> > There were also two topics which were not yet really discussed and thus
> > these requests (from Linus) were not yet implemented:
> 
> @Jiri, @Linus maybe you can discuss these topics further and select the 
> correct solution.
> 
> > * convert BATADV_ATTR_MULTICAST_MODE_ENABLED to an u32 and let don't handle
> >   it like a boolean. Instead use it to select how multicast traffic has to
> >   be handled:
> >   
> >   - 0: ignore multicast optimization and just flood it like broadcast
> >     traffic
> >   - 1: enabled multicast optimization
> >   - 2: undefined but also some kind of multicast optimization
> >   - 3: undefined but also some kind of multicast of optimization
> >   - ...
> 
> Multicast mode is currently defined.
> 
> * according to batctl manpage:
> 
>     multicast_mode|mm [0|1]
>            If no parameter is given the current multicast mode set‐
>            ting is displayed. Otherwise the parameter is used to en‐
>            able or disable multicast optimizations (i.e. disabling
>            means always sending own multicast frames via classic
>            flooding).
> 
> * according to sysfs ABI:
> 
>    What:           /sys/class/net/<mesh_iface>/mesh/multicast_mode
>    Date:           Feb 2014
>    Contact:        Linus Lüssing <linus.luessing@web.de>
>    Description:
>                    Indicates whether multicast optimizations are enabled
>                    or disabled. If set to zero then all nodes in the
>                    mesh are going to use classic flooding for any
>                    multicast packet with no optimizations.
> 
> Both define it as boolean value and therefore it was converted to a boolean 
> value (via u8) in netlink.
> 
> But Linus now suggested that it is actually an u32. Most likely 0 == to 
> something like BATADV_MULTICAST_MODE_FLOODING. But I have no idea what 1 is or 
> what 2, 3, 4, .. would be. So I need some input here.

Right, 0 would be flooding. 1 is the current multicast-aware multi-to-single-unicast
conversion mechanism, which would be extended to a multi-to-multi-unicast mechanism
with the last multicast related patch to the batman mailing list.

In this particular case there is no need to assign a new
multicast_mode == 2, because the old approach can be achieved
again by setting multicast_fanout to 1.


After the multi-to-(multi-)unicast conversion mechanism the next
simple step I would have in mind would be adding a "struct
batadv_mcast_packet". Which would basically behave similar to a
normal batadv_unicast_packet, but would be able to hold a flexible
amount of destination addresses.


And then there was the old tracker packet idea which could come
after that.

And then, recently a source-specific-multicast, reverse tracker
packet approach was discussed. Which would have yet another magnitude
of complexity. But would have very compelling scalability properties.


From a user perspective, once someone relies on some degree of
multicast capabilities and if a new mechanism somehow fails for
the user (for instance a bug or special topology properties), it
might not be feasible to go back to flooding. For instance, let's
say with multicast-to-multi-unicast a Freifunk-like network were now
able to scale to 1500 nodes. And now we introduce some new
enhancements/mechanisms for multicast and things break. Then
flooding would likely leave a broken/unusable network for them
too, due to congestion.

Maybe seeing "multicast_mode" as a gear shift would be an analogy?


> And Jiri said that it should be renamed to BATADV_ATTR_MULTICAST_ENABLED -
> which seems to suggest that he doesn't like the idea of a u32 for some reason
> and prefers to use a boolean value.
> 
> And now Linus even said that it should be a bit field - which makes it even 
> more vague to me and I have now absolutely no idea what should be implemented.
> 
> * BIT 0 for flooding vs ?

Flooding would always be available.

BIT 0: Enable multicast-to-(multi-)unicast
BIT 1: Enable multicast-packet-type
BIT 2: Enable multicast-tracker-packets
BIT 3: Enable ssm-multicast-tracker-packets
...

The thing is, these mechanisms do not necessarilly have to be
exclusive. For instance, the tracker packet mechanism needs to
propagate the tracker packets for a second or so first before it
can be safely used for multicast data packets. In the mean time we
could either flood or buffer the packet. Or we could send them via
multicast-to-multi-unicast.

Hm, but maybe this is getting too flexible. If multicast_mode == n
would mean that all features 0 <= n are available would be fine,
too, I think.


Another thought, if all this is too vague for now... what about
ommiting the BATADV_ATTR_MULTICAST_(MODE)_ENABLED for now and use
a reverse logic instead? Like
BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED, defaulting to false.
That still leaves the opportunity to add a BATADV_ATTR_MULTICAST_MODE
option or BATADV_ATTR_MULTICAST_FEATURES bitset later when we have
actual code needing it.

(btw., is there a general preference in netlink towards either
grouping things in bitfields or individual _ENABLED attributes?)

Regards, Linus

^ permalink raw reply

* Re: [PATCH 0/6] Netfilter fixes for net
From: David Miller @ 2019-02-05 19:23 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <20190205190415.25041-1-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue,  5 Feb 2019 20:04:09 +0100

> The following patchset contains Netfilter fixes for net:
 ...
> Diffstat look rather larger than usual because of the new selftest, but
> Florian and I consider that having tests soon into the tree is good to
> improve coverage. If there's a different policy in this regard, please,
> let me know.

Adding a test case like this fine and in fact encouraged.

> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Pulled, thanks.

^ permalink raw reply

* Re: [PATCH v2] net: dsa: mv88e6xxx: Revise irq setup ordering
From: John David Anglin @ 2019-02-05 19:20 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: Russell King, Vivien Didelot, Florian Fainelli, netdev
In-Reply-To: <20190205022149.GA10838@lunn.ch>

On 2019-02-04 9:21 p.m., Andrew Lunn wrote:
>> The problem is INTn can go low before the interrupt handler for it is
>> registered and enabled.
>> This can't happen.  The domain is setup immediately after registering
>> the GPIO interrupt.
>> The interrupt can't fire until one of the enables is set.
> These two statement seem to contradict each other?
I don't think so.  In the first, I am referring to the enabling of the
GPIO pin interrupt in the
Armada 3720 CPU.  Specifically, this would be the setting of the enable
in the South Bridge
GPIO2 Interrupt Enable Register (RD0018C00).  In the second, I'm
referring to the enables
in the switch's Global Control Register.  Setting these enables to zero
forces the the switch's
INTn output high (assuming there isn't a short).  This shouldn't cause a
CPU interrupt if the
South Bridge GPIO2 Polarity Control is set correctly at the time.  INTn
goes low after a
switch reset because EEIntEn is set on reset, so clearing the enables
will causes a rising
edge on INTn.
>
>> These are set
>> by mv88e6xxx_g2_irq_setup(),
>> mv88e6xxx_g1_atu_prob_irq_setup() and
>> mv88e6xxx_g1_vtu_prob_irq_setup().  These irqs
>> are setup after mv88e6xxx_g1_irq_setup()/mv88e6xxx_irq_poll_setup() is
>> called.  Thus, the
>> irq domain is setup before the GPIO interrupt can fire.
> At what point is INTn going low, causing you all these problems? I've
> yet to see a real description of the race. Please give us a blow by
> blow of how the race happens. And then explain how your fix actually
> fixes this.
I'm going to withdraw my proposed change.  I had thought that calling
request_threaded_irq()
earlier fixed the issue at boot.  But given your mail, I reverted the
change in my 4.20.6 build
and I wasn't able to reproduce the problem.  Yet, my 4.20.4 build fails
every time doing a power
on boot.

We have the following interrupt status when it fails:

 44:          2          0     GPIO2  23 Edge      d0032004.mdio-mii:01
 48:          0          0  mv88e6xxx-g1   3 Edge      mv88e6xxx-g1-atu-prob
 50:          0          0  mv88e6xxx-g1   5 Edge      mv88e6xxx-g1-vtu-prob
 52:          0          1  mv88e6xxx-g1   7 Edge      mv88e6xxx-g2
 55:          0          1  mv88e6xxx-g2   1 Edge     
!soc!internal-regs@d0000000!mdio@32004!switch0@1!mdio:11
 56:          0          0  mv88e6xxx-g2   2 Edge     
!soc!internal-regs@d0000000!mdio@32004!switch0@1!mdio:12
 57:          0          0  mv88e6xxx-g2   3 Edge     
!soc!internal-regs@d0000000!mdio@32004!switch0@1!mdio:13
 69:          0          0  mv88e6xxx-g2  15 Edge      mv88e6xxx-watchdog

We have the following values in the Global1 registers:

MCLI> sys dumpGlobal1

------------------------------------------------------
Switch Global Status(0x0)                       c881
ATU FID Register(0x1)                           0000
VTU FID Register(0x2)                           0000
VTU SID Register(0x3)                           0000
Switch Global Control Register(0x4)             40a8

The DeviceInt bit is set in the Global Status register.  It would appear
we have missed an edge on GPIO2 23.
Yet, we have handled 2 interrupts on it.  So, this isn't the setup issue
that I thought it was.

>
> Also, i'm not yet convinced this hardware can actually work correctly
> with edge interrupts. You can probably reduce the size of the race
> window, but i don't think you can eliminate it. And if you cannot
> eliminate it, at some point it is going to hit you.
You could be right but I don't want to give up just yet.  I need to go
back and rebuild v4.20.4 and retest.
My hunch is the second hunk of the original patch will fix this.

Dave

-- 
John David Anglin  dave.anglin@bell.net


^ permalink raw reply

* Re: [PATCH v2 2/2] r8169: Avoid pointer aliasing
From: Joe Perches @ 2019-02-05 19:19 UTC (permalink / raw)
  To: David Miller
  Cc: thierry.reding, hkallweit1, andrew, nic_swsd, netdev,
	linux-kernel
In-Reply-To: <20190205.111404.1429981997134153696.davem@davemloft.net>

On Tue, 2019-02-05 at 11:14 -0800, David Miller wrote:
> From: Joe Perches <joe@perches.com>
> Date: Tue, 05 Feb 2019 10:42:54 -0800
> 
> > On Mon, 2019-02-04 at 19:20 -0800, David Miller wrote:
> >> From: Thierry Reding <thierry.reding@gmail.com>
> >> Date: Mon,  4 Feb 2019 17:42:13 +0100
> >> 
> >> > @@ -7316,7 +7325,7 @@ static int rtl_get_ether_clk(struct rtl8169_private *tp)
> >> >  static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
> >> >  {
> >> >       const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data;
> >> > -     u8 mac_addr[ETH_ALEN] __aligned(4) = {};
> >> > +     u8 mac_addr[ETH_ALEN] = {};
> >> >       struct rtl8169_private *tp;
> >> 
> >> I agree with Heiner, you have to provide at least 2 byte alignment for this
> >> buffer due to the reasons he stated.
> > 
> > It's declared after a pointer so it is already is 2 byte aligned.
> > 
> > A lot of drivers wouldn't work otherwise.
> 
> That's assuming a lot about what the compiler will do when nit allocates
> local variables to the stack.

It's also assuming what a compiler will do when
it defines a struct.

> I want it _explicit_.

Your choice, but there are a _lot_ of existing uses
and I think requiring it is as senseless as requiring
void * arithmetic to be cast to char * as gcc and
clang already do not add padding after a pointer.



^ permalink raw reply

* [PATCH net-next] net: phy: improve genphy_c45_read_link
From: Heiner Kallweit @ 2019-02-05 19:17 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli, David Miller, Russell King
  Cc: netdev@vger.kernel.org

Let's make genphy_c45_read_link behave the same as genphy_update_link
and set phydev->link in the function directly. This allows to simplify
the callers. In addition don't check further devices once we detect
that at least one device reports link as down.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/phy/marvell10g.c |  2 --
 drivers/net/phy/phy-c45.c    | 14 ++++++--------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 38cfc923a..be2cfdfd8 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -475,8 +475,6 @@ static int mv3310_read_status(struct phy_device *phydev)
 	if (val < 0)
 		return val;
 
-	phydev->link = val > 0 ? 1 : 0;
-
 	val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_STAT1);
 	if (val < 0)
 		return val;
diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 03af927fa..8a82c3c7a 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -134,7 +134,7 @@ EXPORT_SYMBOL_GPL(genphy_c45_aneg_done);
  * @mmd_mask: MMDs to read status from
  *
  * Read the link status from the specified MMDs, and if they all indicate
- * that the link is up, return positive.  If an error is encountered,
+ * that the link is up, set phydev->link to 1.  If an error is encountered,
  * a negative errno will be returned, otherwise zero.
  */
 int genphy_c45_read_link(struct phy_device *phydev, u32 mmd_mask)
@@ -142,7 +142,7 @@ int genphy_c45_read_link(struct phy_device *phydev, u32 mmd_mask)
 	int val, devad;
 	bool link = true;
 
-	while (mmd_mask) {
+	while (mmd_mask && link) {
 		devad = __ffs(mmd_mask);
 		mmd_mask &= ~BIT(devad);
 
@@ -158,7 +158,9 @@ int genphy_c45_read_link(struct phy_device *phydev, u32 mmd_mask)
 			link = false;
 	}
 
-	return link;
+	phydev->link = link;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(genphy_c45_read_link);
 
@@ -287,11 +289,7 @@ int gen10g_read_status(struct phy_device *phydev)
 	/* Avoid reading the vendor MMDs */
 	mmd_mask &= ~(BIT(MDIO_MMD_VEND1) | BIT(MDIO_MMD_VEND2));
 
-	ret = genphy_c45_read_link(phydev, mmd_mask);
-
-	phydev->link = ret > 0 ? 1 : 0;
-
-	return 0;
+	return genphy_c45_read_link(phydev, mmd_mask);
 }
 EXPORT_SYMBOL_GPL(gen10g_read_status);
 
-- 
2.20.1






^ permalink raw reply related

* Re: [PATCH v2 2/2] r8169: Avoid pointer aliasing
From: David Miller @ 2019-02-05 19:14 UTC (permalink / raw)
  To: joe; +Cc: thierry.reding, hkallweit1, andrew, nic_swsd, netdev,
	linux-kernel
In-Reply-To: <8553086eaec167846992f1cff12aa388cee81767.camel@perches.com>

From: Joe Perches <joe@perches.com>
Date: Tue, 05 Feb 2019 10:42:54 -0800

> On Mon, 2019-02-04 at 19:20 -0800, David Miller wrote:
>> From: Thierry Reding <thierry.reding@gmail.com>
>> Date: Mon,  4 Feb 2019 17:42:13 +0100
>> 
>> > @@ -7316,7 +7325,7 @@ static int rtl_get_ether_clk(struct rtl8169_private *tp)
>> >  static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
>> >  {
>> >       const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data;
>> > -     u8 mac_addr[ETH_ALEN] __aligned(4) = {};
>> > +     u8 mac_addr[ETH_ALEN] = {};
>> >       struct rtl8169_private *tp;
>> 
>> I agree with Heiner, you have to provide at least 2 byte alignment for this
>> buffer due to the reasons he stated.
> 
> It's declared after a pointer so it is already is 2 byte aligned.
> 
> A lot of drivers wouldn't work otherwise.

That's assuming a lot about what the compiler will do when it allocates
local variables to the stack.

I want it _explicit_.

^ permalink raw reply

* [PATCH 3/6] netfilter: nf_nat: skip nat clash resolution for same-origin entries
From: Pablo Neira Ayuso @ 2019-02-05 19:04 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190205190415.25041-1-pablo@netfilter.org>

From: Martynas Pumputis <martynas@weave.works>

It is possible that two concurrent packets originating from the same
socket of a connection-less protocol (e.g. UDP) can end up having
different IP_CT_DIR_REPLY tuples which results in one of the packets
being dropped.

To illustrate this, consider the following simplified scenario:

1. Packet A and B are sent at the same time from two different threads
   by same UDP socket.  No matching conntrack entry exists yet.
   Both packets cause allocation of a new conntrack entry.
2. get_unique_tuple gets called for A.  No clashing entry found.
   conntrack entry for A is added to main conntrack table.
3. get_unique_tuple is called for B and will find that the reply
   tuple of B is already taken by A.
   It will allocate a new UDP source port for B to resolve the clash.
4. conntrack entry for B cannot be added to main conntrack table
   because its ORIGINAL direction is clashing with A and the REPLY
   directions of A and B are not the same anymore due to UDP source
   port reallocation done in step 3.

This patch modifies nf_conntrack_tuple_taken so it doesn't consider
colliding reply tuples if the IP_CT_DIR_ORIGINAL tuples are equal.

[ Florian: simplify patch to not use .allow_clash setting
  and always ignore identical flows ]

Signed-off-by: Martynas Pumputis <martynas@weave.works>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 741b533148ba..db4d46332e86 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1007,6 +1007,22 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 		}
 
 		if (nf_ct_key_equal(h, tuple, zone, net)) {
+			/* Tuple is taken already, so caller will need to find
+			 * a new source port to use.
+			 *
+			 * Only exception:
+			 * If the *original tuples* are identical, then both
+			 * conntracks refer to the same flow.
+			 * This is a rare situation, it can occur e.g. when
+			 * more than one UDP packet is sent from same socket
+			 * in different threads.
+			 *
+			 * Let nf_ct_resolve_clash() deal with this later.
+			 */
+			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
+				continue;
+
 			NF_CT_STAT_INC_ATOMIC(net, found);
 			rcu_read_unlock();
 			return 1;
-- 
2.11.0


^ permalink raw reply related

* [PATCH 2/6] selftests: netfilter: add simple masq/redirect test cases
From: Pablo Neira Ayuso @ 2019-02-05 19:04 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190205190415.25041-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Check basic nat/redirect/masquerade for ipv4 and ipv6.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/Makefile   |   2 +-
 tools/testing/selftests/netfilter/nft_nat.sh | 762 +++++++++++++++++++++++++++
 2 files changed, 763 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/netfilter/nft_nat.sh

diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile
index 47ed6cef93fb..c9ff2b47bd1c 100644
--- a/tools/testing/selftests/netfilter/Makefile
+++ b/tools/testing/selftests/netfilter/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for netfilter selftests
 
-TEST_PROGS := nft_trans_stress.sh
+TEST_PROGS := nft_trans_stress.sh nft_nat.sh
 
 include ../lib.mk
diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh
new file mode 100755
index 000000000000..8ec76681605c
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nft_nat.sh
@@ -0,0 +1,762 @@
+#!/bin/bash
+#
+# This test is for basic NAT functionality: snat, dnat, redirect, masquerade.
+#
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+ip netns add ns0
+ip netns add ns1
+ip netns add ns2
+
+ip link add veth0 netns ns0 type veth peer name eth0 netns ns1
+ip link add veth1 netns ns0 type veth peer name eth0 netns ns2
+
+ip -net ns0 link set lo up
+ip -net ns0 link set veth0 up
+ip -net ns0 addr add 10.0.1.1/24 dev veth0
+ip -net ns0 addr add dead:1::1/64 dev veth0
+
+ip -net ns0 link set veth1 up
+ip -net ns0 addr add 10.0.2.1/24 dev veth1
+ip -net ns0 addr add dead:2::1/64 dev veth1
+
+for i in 1 2; do
+  ip -net ns$i link set lo up
+  ip -net ns$i link set eth0 up
+  ip -net ns$i addr add 10.0.$i.99/24 dev eth0
+  ip -net ns$i route add default via 10.0.$i.1
+  ip -net ns$i addr add dead:$i::99/64 dev eth0
+  ip -net ns$i route add default via dead:$i::1
+done
+
+bad_counter()
+{
+	local ns=$1
+	local counter=$2
+	local expect=$3
+
+	echo "ERROR: $counter counter in $ns has unexpected value (expected $expect)" 1>&2
+	ip netns exec $ns nft list counter inet filter $counter 1>&2
+}
+
+check_counters()
+{
+	ns=$1
+	local lret=0
+
+	cnt=$(ip netns exec $ns nft list counter inet filter ns0in | grep -q "packets 1 bytes 84")
+	if [ $? -ne 0 ]; then
+		bad_counter $ns ns0in "packets 1 bytes 84"
+		lret=1
+	fi
+	cnt=$(ip netns exec $ns nft list counter inet filter ns0out | grep -q "packets 1 bytes 84")
+	if [ $? -ne 0 ]; then
+		bad_counter $ns ns0out "packets 1 bytes 84"
+		lret=1
+	fi
+
+	expect="packets 1 bytes 104"
+	cnt=$(ip netns exec $ns nft list counter inet filter ns0in6 | grep -q "$expect")
+	if [ $? -ne 0 ]; then
+		bad_counter $ns ns0in6 "$expect"
+		lret=1
+	fi
+	cnt=$(ip netns exec $ns nft list counter inet filter ns0out6 | grep -q "$expect")
+	if [ $? -ne 0 ]; then
+		bad_counter $ns ns0out6 "$expect"
+		lret=1
+	fi
+
+	return $lret
+}
+
+check_ns0_counters()
+{
+	local ns=$1
+	local lret=0
+
+	cnt=$(ip netns exec ns0 nft list counter inet filter ns0in | grep -q "packets 0 bytes 0")
+	if [ $? -ne 0 ]; then
+		bad_counter ns0 ns0in "packets 0 bytes 0"
+		lret=1
+	fi
+
+	cnt=$(ip netns exec ns0 nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0")
+	if [ $? -ne 0 ]; then
+		bad_counter ns0 ns0in6 "packets 0 bytes 0"
+		lret=1
+	fi
+
+	cnt=$(ip netns exec ns0 nft list counter inet filter ns0out | grep -q "packets 0 bytes 0")
+	if [ $? -ne 0 ]; then
+		bad_counter ns0 ns0out "packets 0 bytes 0"
+		lret=1
+	fi
+	cnt=$(ip netns exec ns0 nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0")
+	if [ $? -ne 0 ]; then
+		bad_counter ns0 ns0out6 "packets 0 bytes 0"
+		lret=1
+	fi
+
+	for dir in "in" "out" ; do
+		expect="packets 1 bytes 84"
+		cnt=$(ip netns exec ns0 nft list counter inet filter ${ns}${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns0 $ns$dir "$expect"
+			lret=1
+		fi
+
+		expect="packets 1 bytes 104"
+		cnt=$(ip netns exec ns0 nft list counter inet filter ${ns}${dir}6 | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns0 $ns$dir6 "$expect"
+			lret=1
+		fi
+	done
+
+	return $lret
+}
+
+reset_counters()
+{
+	for i in 0 1 2;do
+		ip netns exec ns$i nft reset counters inet > /dev/null
+	done
+}
+
+test_local_dnat6()
+{
+	local lret=0
+ip netns exec ns0 nft -f - <<EOF
+table ip6 nat {
+	chain output {
+		type nat hook output priority 0; policy accept;
+		ip6 daddr dead:1::99 dnat to dead:2::99
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		echo "SKIP: Could not add add ip6 dnat hook"
+		return $ksft_skip
+	fi
+
+	# ping netns1, expect rewrite to netns2
+	ip netns exec ns0 ping -q -c 1 dead:1::99 > /dev/null
+	if [ $? -ne 0 ]; then
+		lret=1
+		echo "ERROR: ping6 failed"
+		return $lret
+	fi
+
+	expect="packets 0 bytes 0"
+	for dir in "in6" "out6" ; do
+		cnt=$(ip netns exec ns0 nft list counter inet filter ns1${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns0 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		cnt=$(ip netns exec ns0 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns0 ns2$dir "$expect"
+			lret=1
+		fi
+	done
+
+	# expect 0 count in ns1
+	expect="packets 0 bytes 0"
+	for dir in "in6" "out6" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns0${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns0$dir "$expect"
+			lret=1
+		fi
+	done
+
+	# expect 1 packet in ns2
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		cnt=$(ip netns exec ns2 nft list counter inet filter ns0${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns0$dir "$expect"
+			lret=1
+		fi
+	done
+
+	test $lret -eq 0 && echo "PASS: ipv6 ping to ns1 was NATted to ns2"
+	ip netns exec ns0 nft flush chain ip6 nat output
+
+	return $lret
+}
+
+test_local_dnat()
+{
+	local lret=0
+ip netns exec ns0 nft -f - <<EOF
+table ip nat {
+	chain output {
+		type nat hook output priority 0; policy accept;
+		ip daddr 10.0.1.99 dnat to 10.0.2.99
+	}
+}
+EOF
+	# ping netns1, expect rewrite to netns2
+	ip netns exec ns0 ping -q -c 1 10.0.1.99 > /dev/null
+	if [ $? -ne 0 ]; then
+		lret=1
+		echo "ERROR: ping failed"
+		return $lret
+	fi
+
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns0 nft list counter inet filter ns1${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns0 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns0 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns0 ns2$dir "$expect"
+			lret=1
+		fi
+	done
+
+	# expect 0 count in ns1
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns0${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns0$dir "$expect"
+			lret=1
+		fi
+	done
+
+	# expect 1 packet in ns2
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns2 nft list counter inet filter ns0${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns0$dir "$expect"
+			lret=1
+		fi
+	done
+
+	test $lret -eq 0 && echo "PASS: ping to ns1 was NATted to ns2"
+
+	ip netns exec ns0 nft flush chain ip nat output
+
+	reset_counters
+	ip netns exec ns0 ping -q -c 1 10.0.1.99 > /dev/null
+	if [ $? -ne 0 ]; then
+		lret=1
+		echo "ERROR: ping failed"
+		return $lret
+	fi
+
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns0 nft list counter inet filter ns1${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns0 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns0 ns2$dir "$expect"
+			lret=1
+		fi
+	done
+
+	# expect 1 count in ns1
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns0${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns0 ns0$dir "$expect"
+			lret=1
+		fi
+	done
+
+	# expect 0 packet in ns2
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns2 nft list counter inet filter ns0${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns2$dir "$expect"
+			lret=1
+		fi
+	done
+
+	test $lret -eq 0 && echo "PASS: ping to ns1 OK after nat output chain flush"
+
+	return $lret
+}
+
+
+test_masquerade6()
+{
+	local lret=0
+
+	ip netns exec ns0 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+
+	ip netns exec ns2 ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
+	if [ $? -ne 0 ] ; then
+		echo "ERROR: cannot ping ns1 from ns2 via ipv6"
+		return 1
+		lret=1
+	fi
+
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns2$dir "$expect"
+			lret=1
+		fi
+
+		cnt=$(ip netns exec ns2 nft list counter inet filter ns1${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+
+	reset_counters
+
+# add masquerading rule
+ip netns exec ns0 nft -f - <<EOF
+table ip6 nat {
+	chain postrouting {
+		type nat hook postrouting priority 0; policy accept;
+		meta oif veth0 masquerade
+	}
+}
+EOF
+	ip netns exec ns2 ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
+	if [ $? -ne 0 ] ; then
+		echo "ERROR: cannot ping ns1 from ns2 with active ipv6 masquerading"
+		lret=1
+	fi
+
+	# ns1 should have seen packets from ns0, due to masquerade
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns0${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns0$dir "$expect"
+			lret=1
+		fi
+
+		cnt=$(ip netns exec ns2 nft list counter inet filter ns1${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+
+	# ns1 should not have seen packets from ns2, due to masquerade
+	expect="packets 0 bytes 0"
+	for dir in "in6" "out6" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns0$dir "$expect"
+			lret=1
+		fi
+
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+
+	ip netns exec ns0 nft flush chain ip6 nat postrouting
+	if [ $? -ne 0 ]; then
+		echo "ERROR: Could not flush ip6 nat postrouting" 1>&2
+		lret=1
+	fi
+
+	test $lret -eq 0 && echo "PASS: IPv6 masquerade for ns2"
+
+	return $lret
+}
+
+test_masquerade()
+{
+	local lret=0
+
+	ip netns exec ns0 sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+	ip netns exec ns0 sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+	ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
+	if [ $? -ne 0 ] ; then
+		echo "ERROR: canot ping ns1 from ns2"
+		lret=1
+	fi
+
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns2$dir "$expect"
+			lret=1
+		fi
+
+		cnt=$(ip netns exec ns2 nft list counter inet filter ns1${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+
+	reset_counters
+
+# add masquerading rule
+ip netns exec ns0 nft -f - <<EOF
+table ip nat {
+	chain postrouting {
+		type nat hook postrouting priority 0; policy accept;
+		meta oif veth0 masquerade
+	}
+}
+EOF
+	ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
+	if [ $? -ne 0 ] ; then
+		echo "ERROR: cannot ping ns1 from ns2 with active ip masquerading"
+		lret=1
+	fi
+
+	# ns1 should have seen packets from ns0, due to masquerade
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns0${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns0$dir "$expect"
+			lret=1
+		fi
+
+		cnt=$(ip netns exec ns2 nft list counter inet filter ns1${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+
+	# ns1 should not have seen packets from ns2, due to masquerade
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns0$dir "$expect"
+			lret=1
+		fi
+
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+
+	ip netns exec ns0 nft flush chain ip nat postrouting
+	if [ $? -ne 0 ]; then
+		echo "ERROR: Could not flush nat postrouting" 1>&2
+		lret=1
+	fi
+
+	test $lret -eq 0 && echo "PASS: IP masquerade for ns2"
+
+	return $lret
+}
+
+test_redirect6()
+{
+	local lret=0
+
+	ip netns exec ns0 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+
+	ip netns exec ns2 ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
+	if [ $? -ne 0 ] ; then
+		echo "ERROR: cannnot ping ns1 from ns2 via ipv6"
+		lret=1
+	fi
+
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns2$dir "$expect"
+			lret=1
+		fi
+
+		cnt=$(ip netns exec ns2 nft list counter inet filter ns1${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+
+	reset_counters
+
+# add redirect rule
+ip netns exec ns0 nft -f - <<EOF
+table ip6 nat {
+	chain prerouting {
+		type nat hook prerouting priority 0; policy accept;
+		meta iif veth1 meta l4proto icmpv6 ip6 saddr dead:2::99 ip6 daddr dead:1::99 redirect
+	}
+}
+EOF
+	ip netns exec ns2 ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
+	if [ $? -ne 0 ] ; then
+		echo "ERROR: cannot ping ns1 from ns2 with active ip6 redirect"
+		lret=1
+	fi
+
+	# ns1 should have seen no packets from ns2, due to redirection
+	expect="packets 0 bytes 0"
+	for dir in "in6" "out6" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns0$dir "$expect"
+			lret=1
+		fi
+	done
+
+	# ns0 should have seen packets from ns2, due to masquerade
+	expect="packets 1 bytes 104"
+	for dir in "in6" "out6" ; do
+		cnt=$(ip netns exec ns0 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns0$dir "$expect"
+			lret=1
+		fi
+	done
+
+	ip netns exec ns0 nft delete table ip6 nat
+	if [ $? -ne 0 ]; then
+		echo "ERROR: Could not delete ip6 nat table" 1>&2
+		lret=1
+	fi
+
+	test $lret -eq 0 && echo "PASS: IPv6 redirection for ns2"
+
+	return $lret
+}
+
+test_redirect()
+{
+	local lret=0
+
+	ip netns exec ns0 sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+	ip netns exec ns0 sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+	ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
+	if [ $? -ne 0 ] ; then
+		echo "ERROR: cannot ping ns1 from ns2"
+		lret=1
+	fi
+
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns2$dir "$expect"
+			lret=1
+		fi
+
+		cnt=$(ip netns exec ns2 nft list counter inet filter ns1${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns2 ns1$dir "$expect"
+			lret=1
+		fi
+	done
+
+	reset_counters
+
+# add redirect rule
+ip netns exec ns0 nft -f - <<EOF
+table ip nat {
+	chain prerouting {
+		type nat hook prerouting priority 0; policy accept;
+		meta iif veth1 ip protocol icmp ip saddr 10.0.2.99 ip daddr 10.0.1.99 redirect
+	}
+}
+EOF
+	ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
+	if [ $? -ne 0 ] ; then
+		echo "ERROR: cannot ping ns1 from ns2 with active ip redirect"
+		lret=1
+	fi
+
+	# ns1 should have seen no packets from ns2, due to redirection
+	expect="packets 0 bytes 0"
+	for dir in "in" "out" ; do
+
+		cnt=$(ip netns exec ns1 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns0$dir "$expect"
+			lret=1
+		fi
+	done
+
+	# ns0 should have seen packets from ns2, due to masquerade
+	expect="packets 1 bytes 84"
+	for dir in "in" "out" ; do
+		cnt=$(ip netns exec ns0 nft list counter inet filter ns2${dir} | grep -q "$expect")
+		if [ $? -ne 0 ]; then
+			bad_counter ns1 ns0$dir "$expect"
+			lret=1
+		fi
+	done
+
+	ip netns exec ns0 nft delete table ip nat
+	if [ $? -ne 0 ]; then
+		echo "ERROR: Could not delete nat table" 1>&2
+		lret=1
+	fi
+
+	test $lret -eq 0 && echo "PASS: IP redirection for ns2"
+
+	return $lret
+}
+
+
+# ip netns exec ns0 ping -c 1 -q 10.0.$i.99
+for i in 0 1 2; do
+ip netns exec ns$i nft -f - <<EOF
+table inet filter {
+	counter ns0in {}
+	counter ns1in {}
+	counter ns2in {}
+
+	counter ns0out {}
+	counter ns1out {}
+	counter ns2out {}
+
+	counter ns0in6 {}
+	counter ns1in6 {}
+	counter ns2in6 {}
+
+	counter ns0out6 {}
+	counter ns1out6 {}
+	counter ns2out6 {}
+
+	map nsincounter {
+		type ipv4_addr : counter
+		elements = { 10.0.1.1 : "ns0in",
+			     10.0.2.1 : "ns0in",
+			     10.0.1.99 : "ns1in",
+			     10.0.2.99 : "ns2in" }
+	}
+
+	map nsincounter6 {
+		type ipv6_addr : counter
+		elements = { dead:1::1 : "ns0in6",
+			     dead:2::1 : "ns0in6",
+			     dead:1::99 : "ns1in6",
+			     dead:2::99 : "ns2in6" }
+	}
+
+	map nsoutcounter {
+		type ipv4_addr : counter
+		elements = { 10.0.1.1 : "ns0out",
+			     10.0.2.1 : "ns0out",
+			     10.0.1.99: "ns1out",
+			     10.0.2.99: "ns2out" }
+	}
+
+	map nsoutcounter6 {
+		type ipv6_addr : counter
+		elements = { dead:1::1 : "ns0out6",
+			     dead:2::1 : "ns0out6",
+			     dead:1::99 : "ns1out6",
+			     dead:2::99 : "ns2out6" }
+	}
+
+	chain input {
+		type filter hook input priority 0; policy accept;
+		counter name ip saddr map @nsincounter
+		icmpv6 type { "echo-request", "echo-reply" } counter name ip6 saddr map @nsincounter6
+	}
+	chain output {
+		type filter hook output priority 0; policy accept;
+		counter name ip daddr map @nsoutcounter
+		icmpv6 type { "echo-request", "echo-reply" } counter name ip6 daddr map @nsoutcounter6
+	}
+}
+EOF
+done
+
+sleep 3
+# test basic connectivity
+for i in 1 2; do
+  ip netns exec ns0 ping -c 1 -q 10.0.$i.99 > /dev/null
+  if [ $? -ne 0 ];then
+  	echo "ERROR: Could not reach other namespace(s)" 1>&2
+	ret=1
+  fi
+
+  ip netns exec ns0 ping -c 1 -q dead:$i::99 > /dev/null
+  if [ $? -ne 0 ];then
+	echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2
+	ret=1
+  fi
+  check_counters ns$i
+  if [ $? -ne 0 ]; then
+	ret=1
+  fi
+
+  check_ns0_counters ns$i
+  if [ $? -ne 0 ]; then
+	ret=1
+  fi
+  reset_counters
+done
+
+if [ $ret -eq 0 ];then
+	echo "PASS: netns routing/connectivity: ns0 can reach ns1 and ns2"
+fi
+
+reset_counters
+test_local_dnat
+test_local_dnat6
+
+reset_counters
+test_masquerade
+test_masquerade6
+
+reset_counters
+test_redirect
+test_redirect6
+
+for i in 0 1 2; do ip netns del ns$i;done
+
+exit $ret
-- 
2.11.0


^ permalink raw reply related

* [PATCH 6/6] netfilter: nft_compat: don't use refcount_inc on newly allocated entry
From: Pablo Neira Ayuso @ 2019-02-05 19:04 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190205190415.25041-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

When I moved the refcount to refcount_t type I missed the fact that
refcount_inc() will result in use-after-free warning with
CONFIG_REFCOUNT_FULL=y builds.

The correct fix would be to init the reference count to 1 at allocation
time, but, unfortunately we cannot do this, as we can't undo that
in case something else fails later in the batch.

So only solution I see is to special-case the 'new entry' condition
and replace refcount_inc() with a "delayed" refcount_set(1) in this case,
as done here.

The .activate callback can be removed to simplify things, we only
need to make sure that deactivate() decrements/unlinks the entry
from the list at end of transaction phase (commit or abort).

Fixes: 12c44aba6618 ("netfilter: nft_compat: use refcnt_t type for nft_xt reference count")
Reported-by: Jordan Glover <Golden_Miller83@protonmail.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 62 +++++++++++++++++-----------------------------
 1 file changed, 23 insertions(+), 39 deletions(-)

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 0732a2fc697c..fe64df848365 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -61,6 +61,21 @@ static struct nft_compat_net *nft_compat_pernet(struct net *net)
 	return net_generic(net, nft_compat_net_id);
 }
 
+static void nft_xt_get(struct nft_xt *xt)
+{
+	/* refcount_inc() warns on 0 -> 1 transition, but we can't
+	 * init the reference count to 1 in .select_ops -- we can't
+	 * undo such an increase when another expression inside the same
+	 * rule fails afterwards.
+	 */
+	if (xt->listcnt == 0)
+		refcount_set(&xt->refcnt, 1);
+	else
+		refcount_inc(&xt->refcnt);
+
+	xt->listcnt++;
+}
+
 static bool nft_xt_put(struct nft_xt *xt)
 {
 	if (refcount_dec_and_test(&xt->refcnt)) {
@@ -291,7 +306,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		return -EINVAL;
 
 	nft_xt = container_of(expr->ops, struct nft_xt, ops);
-	refcount_inc(&nft_xt->refcnt);
+	nft_xt_get(nft_xt);
 	return 0;
 }
 
@@ -504,7 +519,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		return ret;
 
 	nft_xt = container_of(expr->ops, struct nft_xt, ops);
-	refcount_inc(&nft_xt->refcnt);
+	nft_xt_get(nft_xt);
 	return 0;
 }
 
@@ -558,45 +573,16 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 	__nft_match_destroy(ctx, expr, nft_expr_priv(expr));
 }
 
-static void nft_compat_activate(const struct nft_ctx *ctx,
-				const struct nft_expr *expr,
-				struct list_head *h)
-{
-	struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
-
-	if (xt->listcnt == 0)
-		list_add(&xt->head, h);
-
-	xt->listcnt++;
-}
-
-static void nft_compat_activate_mt(const struct nft_ctx *ctx,
-				   const struct nft_expr *expr)
-{
-	struct nft_compat_net *cn = nft_compat_pernet(ctx->net);
-
-	nft_compat_activate(ctx, expr, &cn->nft_match_list);
-}
-
-static void nft_compat_activate_tg(const struct nft_ctx *ctx,
-				   const struct nft_expr *expr)
-{
-	struct nft_compat_net *cn = nft_compat_pernet(ctx->net);
-
-	nft_compat_activate(ctx, expr, &cn->nft_target_list);
-}
-
 static void nft_compat_deactivate(const struct nft_ctx *ctx,
 				  const struct nft_expr *expr,
 				  enum nft_trans_phase phase)
 {
 	struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
 
-	if (phase == NFT_TRANS_COMMIT)
-		return;
-
-	if (--xt->listcnt == 0)
-		list_del_init(&xt->head);
+	if (phase == NFT_TRANS_ABORT || phase == NFT_TRANS_COMMIT) {
+		if (--xt->listcnt == 0)
+			list_del_init(&xt->head);
+	}
 }
 
 static void
@@ -852,7 +838,6 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 	nft_match->ops.eval = nft_match_eval;
 	nft_match->ops.init = nft_match_init;
 	nft_match->ops.destroy = nft_match_destroy;
-	nft_match->ops.activate = nft_compat_activate_mt;
 	nft_match->ops.deactivate = nft_compat_deactivate;
 	nft_match->ops.dump = nft_match_dump;
 	nft_match->ops.validate = nft_match_validate;
@@ -870,7 +855,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 
 	nft_match->ops.size = matchsize;
 
-	nft_match->listcnt = 1;
+	nft_match->listcnt = 0;
 	list_add(&nft_match->head, &cn->nft_match_list);
 
 	return &nft_match->ops;
@@ -957,7 +942,6 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
 	nft_target->ops.init = nft_target_init;
 	nft_target->ops.destroy = nft_target_destroy;
-	nft_target->ops.activate = nft_compat_activate_tg;
 	nft_target->ops.deactivate = nft_compat_deactivate;
 	nft_target->ops.dump = nft_target_dump;
 	nft_target->ops.validate = nft_target_validate;
@@ -968,7 +952,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	else
 		nft_target->ops.eval = nft_target_eval_xt;
 
-	nft_target->listcnt = 1;
+	nft_target->listcnt = 0;
 	list_add(&nft_target->head, &cn->nft_target_list);
 
 	return &nft_target->ops;
-- 
2.11.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox