Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH ethtool] ethtool.c: fix memory leaks
From: Ivan Vecera @ 2016-04-08  8:06 UTC (permalink / raw)
  To: netdev; +Cc: bwh
In-Reply-To: <1458303855-4976-1-git-send-email-ivecera@redhat.com>

On 18.3.2016 13:24, Ivan Vecera wrote:
> Memory allocated at several places is not appropriately freed.
>
> Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Ben, ping.

I.
> ---
>   ethtool.c | 60 +++++++++++++++++++++++++++++++++++++++++++++---------------
>   1 file changed, 45 insertions(+), 15 deletions(-)
>
> diff --git a/ethtool.c b/ethtool.c
> index 0cd0d4f..ca0bf28 100644
> --- a/ethtool.c
> +++ b/ethtool.c
> @@ -2065,10 +2065,14 @@ static int do_gfeatures(struct cmd_context *ctx)
>   	features = get_features(ctx, defs);
>   	if (!features) {
>   		fprintf(stdout, "no feature info available\n");
> +		free(defs);
>   		return 1;
>   	}
>
>   	dump_features(defs, features, NULL);
> +
> +	free(features);
> +	free(defs);
>   	return 0;
>   }
>
> @@ -2078,11 +2082,11 @@ static int do_sfeatures(struct cmd_context *ctx)
>   	int any_changed = 0, any_mismatch = 0;
>   	u32 off_flags_wanted = 0;
>   	u32 off_flags_mask = 0;
> -	struct ethtool_sfeatures *efeatures;
> +	struct ethtool_sfeatures *efeatures = NULL;
>   	struct cmdline_info *cmdline_features;
> -	struct feature_state *old_state, *new_state;
> +	struct feature_state *old_state = NULL, *new_state = NULL;
>   	struct ethtool_value eval;
> -	int err;
> +	int err, retval = 1;
>   	int i, j;
>
>   	defs = get_feature_defs(ctx);
> @@ -2096,7 +2100,7 @@ static int do_sfeatures(struct cmd_context *ctx)
>   				   sizeof(efeatures->features[0]));
>   		if (!efeatures) {
>   			perror("Cannot parse arguments");
> -			return 1;
> +			goto finish;
>   		}
>   		efeatures->cmd = ETHTOOL_SFEATURES;
>   		efeatures->size = FEATURE_BITS_TO_BLOCKS(defs->n_features);
> @@ -2114,7 +2118,7 @@ static int do_sfeatures(struct cmd_context *ctx)
>   				  sizeof(cmdline_features[0]));
>   	if (!cmdline_features) {
>   		perror("Cannot parse arguments");
> -		return 1;
> +		goto finish;
>   	}
>   	for (i = 0; i < ARRAY_SIZE(off_flag_def); i++)
>   		flag_to_cmdline_info(off_flag_def[i].short_name,
> @@ -2133,12 +2137,13 @@ static int do_sfeatures(struct cmd_context *ctx)
>
>   	if (!any_changed) {
>   		fprintf(stdout, "no features changed\n");
> -		return 0;
> +		retval = 0;
> +		goto finish;
>   	}
>
>   	old_state = get_features(ctx, defs);
>   	if (!old_state)
> -		return 1;
> +		goto finish;
>
>   	if (efeatures) {
>   		/* For each offload that the user specified, update any
> @@ -2182,7 +2187,7 @@ static int do_sfeatures(struct cmd_context *ctx)
>   		err = send_ioctl(ctx, efeatures);
>   		if (err < 0) {
>   			perror("Cannot set device feature settings");
> -			return 1;
> +			goto finish;
>   		}
>   	} else {
>   		for (i = 0; i < ARRAY_SIZE(off_flag_def); i++) {
> @@ -2197,7 +2202,7 @@ static int do_sfeatures(struct cmd_context *ctx)
>   					fprintf(stderr,
>   						"Cannot set device %s settings: %m\n",
>   						off_flag_def[i].long_name);
> -					return 1;
> +					goto finish;
>   				}
>   			}
>   		}
> @@ -2211,7 +2216,8 @@ static int do_sfeatures(struct cmd_context *ctx)
>   			err = send_ioctl(ctx, &eval);
>   			if (err) {
>   				perror("Cannot set device flag settings");
> -				return 92;
> +				retval = 92;
> +				goto finish;
>   			}
>   		}
>   	}
> @@ -2219,7 +2225,7 @@ static int do_sfeatures(struct cmd_context *ctx)
>   	/* Compare new state with requested state */
>   	new_state = get_features(ctx, defs);
>   	if (!new_state)
> -		return 1;
> +		goto finish;
>   	any_changed = new_state->off_flags != old_state->off_flags;
>   	any_mismatch = (new_state->off_flags !=
>   			((old_state->off_flags & ~off_flags_mask) |
> @@ -2238,13 +2244,19 @@ static int do_sfeatures(struct cmd_context *ctx)
>   		if (!any_changed) {
>   			fprintf(stderr,
>   				"Could not change any device features\n");
> -			return 1;
> +			goto finish;
>   		}
>   		printf("Actual changes:\n");
>   		dump_features(defs, new_state, old_state);
>   	}
>
> -	return 0;
> +	retval = 0;
> +finish:
> +	free(new_state);
> +	free(old_state);
> +	free(efeatures);
> +	free(defs);
> +	return retval;
>   }
>
>   static int do_gset(struct cmd_context *ctx)
> @@ -2705,8 +2717,18 @@ static int do_gregs(struct cmd_context *ctx)
>   			return 75;
>   		}
>
> -		regs = realloc(regs, sizeof(*regs) + st.st_size);
> -		regs->len = st.st_size;
> +		if (regs->len != st.st_size) {
> +			struct ethtool_regs *new_regs;
> +			new_regs = realloc(regs, sizeof(*regs) + st.st_size);
> +			if (!new_regs) {
> +				perror("Cannot allocate memory for register "
> +				       "dump");
> +				free(regs);
> +				return 73;
> +			}
> +			regs = new_regs;
> +			regs->len = st.st_size;
> +		}
>   		nread = fread(regs->data, regs->len, 1, f);
>   		fclose(f);
>   		if (nread != 1) {
> @@ -3775,6 +3797,7 @@ static int do_gprivflags(struct cmd_context *ctx)
>   	}
>   	if (strings->len == 0) {
>   		fprintf(stderr, "No private flags defined\n");
> +		free(strings);
>   		return 1;
>   	}
>   	if (strings->len > 32) {
> @@ -3786,6 +3809,7 @@ static int do_gprivflags(struct cmd_context *ctx)
>   	flags.cmd = ETHTOOL_GPFLAGS;
>   	if (send_ioctl(ctx, &flags)) {
>   		perror("Cannot get private flags");
> +		free(strings);
>   		return 1;
>   	}
>
> @@ -3804,6 +3828,7 @@ static int do_gprivflags(struct cmd_context *ctx)
>   		       (const char *)strings->data + i * ETH_GSTRING_LEN,
>   		       (flags.data & (1U << i)) ? "on" : "off");
>
> +	free(strings);
>   	return 0;
>   }
>
> @@ -3825,6 +3850,7 @@ static int do_sprivflags(struct cmd_context *ctx)
>   	}
>   	if (strings->len == 0) {
>   		fprintf(stderr, "No private flags defined\n");
> +		free(strings);
>   		return 1;
>   	}
>   	if (strings->len > 32) {
> @@ -3836,6 +3862,7 @@ static int do_sprivflags(struct cmd_context *ctx)
>   	cmdline = calloc(strings->len, sizeof(*cmdline));
>   	if (!cmdline) {
>   		perror("Cannot parse arguments");
> +		free(strings);
>   		return 1;
>   	}
>   	for (i = 0; i < strings->len; i++) {
> @@ -3852,6 +3879,7 @@ static int do_sprivflags(struct cmd_context *ctx)
>   	flags.cmd = ETHTOOL_GPFLAGS;
>   	if (send_ioctl(ctx, &flags)) {
>   		perror("Cannot get private flags");
> +		free(strings);
>   		return 1;
>   	}
>
> @@ -3859,9 +3887,11 @@ static int do_sprivflags(struct cmd_context *ctx)
>   	flags.data = (flags.data & ~seen_flags) | wanted_flags;
>   	if (send_ioctl(ctx, &flags)) {
>   		perror("Cannot set private flags");
> +		free(strings);
>   		return 1;
>   	}
>
> +	free(strings);
>   	return 0;
>   }
>
>

^ permalink raw reply

* Re: [PATCH net] tuntap: restore default qdisc
From: Phil Sutter @ 2016-04-08  9:02 UTC (permalink / raw)
  To: Jason Wang; +Cc: davem, netdev, linux-kernel, mst
In-Reply-To: <1460093208-4364-1-git-send-email-jasowang@redhat.com>

On Fri, Apr 08, 2016 at 01:26:48PM +0800, Jason Wang wrote:
> After commit f84bb1eac027 ("net: fix IFF_NO_QUEUE for drivers using
> alloc_netdev"), default qdisc was changed to noqueue because
> tuntap does not set tx_queue_len during .setup(). This patch restores
> default qdisc by setting tx_queue_len in tun_setup().
> 
> Fixes: f84bb1eac027 ("net: fix IFF_NO_QUEUE for drivers using alloc_netdev")
> Cc: Phil Sutter <phil@nwl.cc>
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Acked-by: Phil Sutter <phil@nwl.cc>

^ permalink raw reply

* [PATCH net-next] net: ethernet: stmmac: GMAC4.xx: Fix TX descriptor preparation
From: Alexandre TORGUE @ 2016-04-08  9:18 UTC (permalink / raw)
  To: netdev, peppe.cavallaro; +Cc: dan.carpenter, kernel-janitors

On GMAC4.xx each descriptor contains 2 buffers of 16KB (each).
Initially, those 2 buffers was filled in dwmac4_rd_prepare_tx_desc but
it is actually not needed. Indeed, stmmac driver supports frame up to
9000 bytes (jumbo). So only one buffer is needed.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alexandre TORGUE <alexandre.torgue@st.com>

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
index d4952c7..4ec7397 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
@@ -254,14 +254,7 @@ static void dwmac4_rd_prepare_tx_desc(struct dma_desc *p, int is_fs, int len,
 {
 	unsigned int tdes3 = p->des3;
 
-	if (unlikely(len > BUF_SIZE_16KiB)) {
-		p->des2 |= (((len - BUF_SIZE_16KiB) <<
-			     TDES2_BUFFER2_SIZE_MASK_SHIFT)
-			    & TDES2_BUFFER2_SIZE_MASK)
-			    | (BUF_SIZE_16KiB & TDES2_BUFFER1_SIZE_MASK);
-	} else {
-		p->des2 |= (len & TDES2_BUFFER1_SIZE_MASK);
-	}
+	p->des2 |= (len & TDES2_BUFFER1_SIZE_MASK);
 
 	if (is_fs)
 		tdes3 |= TDES3_FIRST_DESCRIPTOR;
-- 
1.9.1

^ permalink raw reply related

* [PATCH] ipv6: rework the lock in addrconf_permanent_addr
From: roy.qing.li @ 2016-04-08  9:22 UTC (permalink / raw)
  To: netdev

From: Li RongQing <roy.qing.li@gmail.com>

1. nothing of idev is changed, so read lock is enough
2. ifp is changed, so used ifp->lock or cmpxchg to protect it

Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
---
 net/ipv6/addrconf.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 27aed1a..f6e7605b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3184,14 +3184,21 @@ static void addrconf_gre_config(struct net_device *dev)
 static void l3mdev_check_host_rt(struct inet6_dev *idev,
 				  struct inet6_ifaddr *ifp)
 {
+	struct rt6_info *rt = NULL;
+
+	spin_lock(&ifp->lock);
 	if (ifp->rt) {
 		u32 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
 
 		if (tb_id != ifp->rt->rt6i_table->tb6_id) {
-			ip6_del_rt(ifp->rt);
+			rt = ifp->rt;
 			ifp->rt = NULL;
 		}
 	}
+	spin_unlock(&ifp->lock);
+
+	if (rt)
+		ip6_del_rt(rt);
 }
 #else
 static void l3mdev_check_host_rt(struct inet6_dev *idev,
@@ -3203,6 +3210,8 @@ static void l3mdev_check_host_rt(struct inet6_dev *idev,
 static int fixup_permanent_addr(struct inet6_dev *idev,
 				struct inet6_ifaddr *ifp)
 {
+	struct rt6_info *prev;
+
 	l3mdev_check_host_rt(idev, ifp);
 
 	if (!ifp->rt) {
@@ -3212,7 +3221,12 @@ static int fixup_permanent_addr(struct inet6_dev *idev,
 		if (unlikely(IS_ERR(rt)))
 			return PTR_ERR(rt);
 
-		ifp->rt = rt;
+		prev = cmpxchg(&ifp->rt, NULL, rt);
+
+		/*if cmpxchg failed*/
+		if (prev) {
+			ip6_rt_put(rt);
+		}
 	}
 
 	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
@@ -3234,21 +3248,21 @@ static void addrconf_permanent_addr(struct net_device *dev)
 	if (!idev)
 		return;
 
-	write_lock_bh(&idev->lock);
+	read_lock_bh(&idev->lock);
 
 	list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
 		if ((ifp->flags & IFA_F_PERMANENT) &&
 		    fixup_permanent_addr(idev, ifp) < 0) {
-			write_unlock_bh(&idev->lock);
+			read_unlock_bh(&idev->lock);
 			ipv6_del_addr(ifp);
-			write_lock_bh(&idev->lock);
+			read_lock_bh(&idev->lock);
 
 			net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
 					     idev->dev->name, &ifp->addr);
 		}
 	}
 
-	write_unlock_bh(&idev->lock);
+	read_unlock_bh(&idev->lock);
 }
 
 static int addrconf_notify(struct notifier_block *this, unsigned long event,
-- 
2.1.4

^ permalink raw reply related

* Re: [RFC PATCH v2 2/5] net: add ndo to set bpf prog in adapter rx
From: Jesper Dangaard Brouer @ 2016-04-08  9:38 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, brouer
In-Reply-To: <1460090930-11219-2-git-send-email-bblanco@plumgrid.com>


On Thu,  7 Apr 2016 21:48:47 -0700 Brenden Blanco <bblanco@plumgrid.com> wrote:

> Add two new set/get netdev ops for drivers implementing the
> BPF_PROG_TYPE_PHYS_DEV filter.
> 
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
[...]
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index cb4e508..3acf732 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
[...]
> @@ -1102,6 +1103,14 @@ struct tc_to_netdev {
>   *	appropriate rx headroom value allows avoiding skb head copy on
>   *	forward. Setting a negative value resets the rx headroom to the
>   *	default value.
> + * int  (*ndo_bpf_set)(struct net_device *dev, struct bpf_prog *prog);
> + *	This function is used to set or clear a bpf program used in the
> + *	earliest stages of packet rx. The prog will have been loaded as
> + *	BPF_PROG_TYPE_PHYS_DEV. The callee is responsible for calling
> + *	bpf_prog_put on any old progs that are stored, but not on the passed
> + *	in prog.
> + * bool (*ndo_bpf_get)(struct net_device *dev);
> + *	This function is used to check if a bpf program is set on the device.
>   *

This interface for the entire device, right.  I can imagine that users
want to attach a eBPF program per RX queue.  Can we adapt the interface
to support this? (could default to adding it all queues)


I'm also wondering if we should add a "flags" parameter.  Or maybe we
can extend 'struct bpf_prog' with I have in mind.

When the eBPF program is attached to a RX queue, I want to know if the
program want to modify packet-data, up-front.

The problem is that most drivers use dma_sync, which means that data is
considered 'read-only' (the "considered" part depend on DMA engine, and
we might find a DMA loop-hole for some configs).
  If the program want to write, the driver have the option of
reconfiguring the driver routine to use dma_unmap, before handing over
the page.  Or driver can also choose to realloc the specific RX ring
queue pages as single pages (using dma_map/unmap consistently).
 This also allow us to give a return code indicating given driver does
not support writable packet-pages (rejecting program).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
From: Jesper Dangaard Brouer @ 2016-04-08 10:36 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, brouer
In-Reply-To: <1460090930-11219-1-git-send-email-bblanco@plumgrid.com>

On Thu,  7 Apr 2016 21:48:46 -0700
Brenden Blanco <bblanco@plumgrid.com> wrote:

> Add a new bpf prog type that is intended to run in early stages of the
> packet rx path. Only minimal packet metadata will be available, hence a
> new context type, struct bpf_phys_dev_md, is exposed to userspace. So
> far only expose the readable packet length, and only in read mode.
> 
> The PHYS_DEV name is chosen to represent that the program is meant only
> for physical adapters, rather than all netdevs.
> 
> While the user visible struct is new, the underlying context must be
> implemented as a minimal skb in order for the packet load_* instructions
> to work. The skb filled in by the driver must have skb->len, skb->head,
> and skb->data set, and skb->data_len == 0.
> 
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> ---
>  include/uapi/linux/bpf.h | 14 ++++++++++
>  kernel/bpf/verifier.c    |  1 +
>  net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 70eda5a..3018d83 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -93,6 +93,7 @@ enum bpf_prog_type {
>  	BPF_PROG_TYPE_SCHED_CLS,
>  	BPF_PROG_TYPE_SCHED_ACT,
>  	BPF_PROG_TYPE_TRACEPOINT,
> +	BPF_PROG_TYPE_PHYS_DEV,
>  };
>  
>  #define BPF_PSEUDO_MAP_FD	1
> @@ -368,6 +369,19 @@ struct __sk_buff {
>  	__u32 tc_classid;
>  };
>  
> +/* user return codes for PHYS_DEV prog type */
> +enum bpf_phys_dev_action {
> +	BPF_PHYS_DEV_DROP,
> +	BPF_PHYS_DEV_OK,
> +};

I can imagine these extra return codes:

 BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
 BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
 BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */

The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
which we can look at when we get that far...

For the "MODIFIED" case, people caring about checksumming, might want
to voice their concern if we want additional info or return codes,
indicating if software need to recalc CSUM (or e.g. if only IP-pseudo
hdr was modified).


> +/* user accessible metadata for PHYS_DEV packet hook
> + * new fields must be added to the end of this structure
> + */
> +struct bpf_phys_dev_md {
> +	__u32 len;
> +};

This is userspace visible.  I can easily imagine this structure will get
extended.  How does a userspace eBPF program know that the struct got
extended? (bet you got some scheme, normally I would add a "version" as
first elem).

BTW, how and where is this "struct bpf_phys_dev_md" allocated?


>  struct bpf_tunnel_key {
>  	__u32 tunnel_id;
>  	union {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 58792fe..877542d 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1344,6 +1344,7 @@ static bool may_access_skb(enum bpf_prog_type type)
>  	case BPF_PROG_TYPE_SOCKET_FILTER:
>  	case BPF_PROG_TYPE_SCHED_CLS:
>  	case BPF_PROG_TYPE_SCHED_ACT:
> +	case BPF_PROG_TYPE_PHYS_DEV:
>  		return true;
>  	default:
>  		return false;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e8486ba..4f73fb9 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2021,6 +2021,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
>  	}
>  }
>  
> +static const struct bpf_func_proto *
> +phys_dev_func_proto(enum bpf_func_id func_id)
> +{
> +	return sk_filter_func_proto(func_id);
> +}
> +
>  static bool __is_valid_access(int off, int size, enum bpf_access_type type)
>  {
>  	/* check bounds */
> @@ -2076,6 +2082,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
>  	return __is_valid_access(off, size, type);
>  }
>  
> +static bool __is_valid_phys_dev_access(int off, int size,
> +				       enum bpf_access_type type)
> +{
> +	if (off < 0 || off >= sizeof(struct bpf_phys_dev_md))
> +		return false;
> +
> +	if (off % size != 0)
> +		return false;
> +
> +	if (size != 4)
> +		return false;
> +
> +	return true;
> +}
> +
> +static bool phys_dev_is_valid_access(int off, int size,
> +				     enum bpf_access_type type)
> +{
> +	if (type == BPF_WRITE)
> +		return false;
> +
> +	switch (off) {
> +	case offsetof(struct bpf_phys_dev_md, len):
> +		break;
> +	default:
> +		return false;
> +	}
> +	return __is_valid_phys_dev_access(off, size, type);
> +}
> +
>  static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>  				      int src_reg, int ctx_off,
>  				      struct bpf_insn *insn_buf,
> @@ -2213,6 +2249,26 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>  	return insn - insn_buf;
>  }
>  
> +static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
> +					   int dst_reg, int src_reg,
> +					   int ctx_off,
> +					   struct bpf_insn *insn_buf,
> +					   struct bpf_prog *prog)
> +{
> +	struct bpf_insn *insn = insn_buf;
> +
> +	switch (ctx_off) {
> +	case offsetof(struct bpf_phys_dev_md, len):
> +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
> +
> +		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
> +				      offsetof(struct sk_buff, len));
> +		break;
> +	}
> +
> +	return insn - insn_buf;
> +}
> +
>  static const struct bpf_verifier_ops sk_filter_ops = {
>  	.get_func_proto = sk_filter_func_proto,
>  	.is_valid_access = sk_filter_is_valid_access,
> @@ -2225,6 +2281,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
>  	.convert_ctx_access = bpf_net_convert_ctx_access,
>  };
>  
> +static const struct bpf_verifier_ops phys_dev_ops = {
> +	.get_func_proto = phys_dev_func_proto,
> +	.is_valid_access = phys_dev_is_valid_access,
> +	.convert_ctx_access = bpf_phys_dev_convert_ctx_access,
> +};
> +
>  static struct bpf_prog_type_list sk_filter_type __read_mostly = {
>  	.ops = &sk_filter_ops,
>  	.type = BPF_PROG_TYPE_SOCKET_FILTER,
> @@ -2240,11 +2302,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
>  	.type = BPF_PROG_TYPE_SCHED_ACT,
>  };
>  
> +static struct bpf_prog_type_list phys_dev_type __read_mostly = {
> +	.ops = &phys_dev_ops,
> +	.type = BPF_PROG_TYPE_PHYS_DEV,
> +};
> +
>  static int __init register_sk_filter_ops(void)
>  {
>  	bpf_register_prog_type(&sk_filter_type);
>  	bpf_register_prog_type(&sched_cls_type);
>  	bpf_register_prog_type(&sched_act_type);
> +	bpf_register_prog_type(&phys_dev_type);
>  
>  	return 0;
>  }



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
From: Daniel Borkmann @ 2016-04-08 11:09 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, eric.dumazet,
	ecree, john.fastabend, tgraf, johannes, eranlinuxmellanox,
	lorenzo
In-Reply-To: <20160408123614.2a15a346@redhat.com>

On 04/08/2016 12:36 PM, Jesper Dangaard Brouer wrote:
> On Thu,  7 Apr 2016 21:48:46 -0700
> Brenden Blanco <bblanco@plumgrid.com> wrote:
>
>> Add a new bpf prog type that is intended to run in early stages of the
>> packet rx path. Only minimal packet metadata will be available, hence a
>> new context type, struct bpf_phys_dev_md, is exposed to userspace. So
>> far only expose the readable packet length, and only in read mode.
>>
>> The PHYS_DEV name is chosen to represent that the program is meant only
>> for physical adapters, rather than all netdevs.
>>
>> While the user visible struct is new, the underlying context must be
>> implemented as a minimal skb in order for the packet load_* instructions
>> to work. The skb filled in by the driver must have skb->len, skb->head,
>> and skb->data set, and skb->data_len == 0.
>>
>> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
>> ---
>>   include/uapi/linux/bpf.h | 14 ++++++++++
>>   kernel/bpf/verifier.c    |  1 +
>>   net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 83 insertions(+)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 70eda5a..3018d83 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -93,6 +93,7 @@ enum bpf_prog_type {
>>   	BPF_PROG_TYPE_SCHED_CLS,
>>   	BPF_PROG_TYPE_SCHED_ACT,
>>   	BPF_PROG_TYPE_TRACEPOINT,
>> +	BPF_PROG_TYPE_PHYS_DEV,
>>   };
>>
>>   #define BPF_PSEUDO_MAP_FD	1
>> @@ -368,6 +369,19 @@ struct __sk_buff {
>>   	__u32 tc_classid;
>>   };
>>
>> +/* user return codes for PHYS_DEV prog type */
>> +enum bpf_phys_dev_action {
>> +	BPF_PHYS_DEV_DROP,
>> +	BPF_PHYS_DEV_OK,
>> +};
>
> I can imagine these extra return codes:
>
>   BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
>   BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
>   BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
>
> The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> which we can look at when we get that far...

I'd probably let the tcpdump case be handled by the rest of the stack.
Forwarding could be to some txqueue or perhaps directly to a virtual net
device e.g. the veth end sitting in a container (where fake skb would
need to be promoted to a real one). Or, perhaps instead of veth end,
directly demuxed into a target socket queue in that container ...
Alternatively for tcpdump use-case, you could also do sampling on the
bpf_phy_dev with eBPF maps.

> For the "MODIFIED" case, people caring about checksumming, might want
> to voice their concern if we want additional info or return codes,
> indicating if software need to recalc CSUM (or e.g. if only IP-pseudo
> hdr was modified).
>
>> +/* user accessible metadata for PHYS_DEV packet hook
>> + * new fields must be added to the end of this structure
>> + */
>> +struct bpf_phys_dev_md {
>> +	__u32 len;
>> +};
>
> This is userspace visible.  I can easily imagine this structure will get
> extended.  How does a userspace eBPF program know that the struct got
> extended? (bet you got some scheme, normally I would add a "version" as
> first elem).
>
> BTW, how and where is this "struct bpf_phys_dev_md" allocated?

It isn't, see bpf_phys_dev_convert_ctx_access(). At load time the verifier
will convert/rewrite this based on offsetof() to a real access of the per
cpu sk_buff, that's the only purpose.

Cheers,
Daniel

^ permalink raw reply

* Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk
From: Yang Yingliang @ 2016-04-08 11:18 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, davem, Ding Tianhong
In-Reply-To: <1460040665.6473.398.camel@edumazet-glaptop3.roam.corp.google.com>



On 2016/4/7 22:51, Eric Dumazet wrote:
> On Thu, 2016-04-07 at 03:21 -0700, Eric Dumazet wrote:
>
>> Please do not send patches before really understanding the issue you
>> have.
>>
>> Having a backlog of 12506206 bytes is ridiculous. Dropping packets is
>> absolutely fine if this ever happens.
>>
>> Something is really wrong on your host, or the sender simply does not
>> comply with TCP protocol (not caring of receiver window at all)
>>
>> Since you added a trace of truesize, please also trace skb->len
>>

[2016-04-08 18:33:39][ 9748.726948] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:31992, len:17540
[2016-04-08 18:33:39][ 9748.726964] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:29326, truesize:18662, 
len:10240
[2016-04-08 18:33:39][ 9748.726986] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:39990, len:21920
[2016-04-08 18:33:39][ 9748.727028] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:58652, len:32140
[2016-04-08 18:33:39][ 9748.727068] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:58652, len:32140
[2016-04-08 18:33:39][ 9748.727082] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:21328, truesize:5332, len:2940
[2016-04-08 18:33:39][ 9748.727310] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:53320, len:29220
[2016-04-08 18:33:39][ 9748.727326] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:26660, truesize:7998, len:4400
[2016-04-08 18:33:39][ 9748.727352] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:47988, truesize:58652, 
len:32140
[2016-04-08 18:33:39][ 9748.727389] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:0, truesize:39990, len:21920
[2016-04-08 18:33:39][ 9748.727409] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:12607514 rmem_alloc:58652, truesize:18662, 
len:10240

If I expand buffer 5 times((sndbuf+rcvbuf)*5). There are only 5M data in 
backlog at most.

[2016-04-08 18:33:39][ 9748.777743] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5435954 rmem_alloc:0, truesize:55986, len:30680
[2016-04-08 18:33:39][ 9748.777762] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5457282 rmem_alloc:58652, truesize:21328, 
len:11700
[2016-04-08 18:33:39][ 9748.777804] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5515934 rmem_alloc:55986, truesize:58652, 
len:32140
[2016-04-08 18:33:39][ 9748.777818] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5537262 rmem_alloc:0, truesize:21328, len:11700
[2016-04-08 18:33:39][ 9748.777839] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5574586 rmem_alloc:0, truesize:37324, len:20460
[2016-04-08 18:33:39][ 9748.777854] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5601246 rmem_alloc:58652, truesize:26660, 
len:14620
[2016-04-08 18:33:39][ 9748.777881] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5659898 rmem_alloc:21328, truesize:58652, 
len:32140
[2016-04-08 18:33:39][ 9748.777894] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:5675894 rmem_alloc:37324, truesize:15996, len:8780
[2016-04-08 18:33:39][ 9748.778047] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:58652 rmem_alloc:0, truesize:58652, len:32140
[2016-04-08 18:33:39][ 9748.778075] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:117304 rmem_alloc:0, truesize:58652, len:32140
[2016-04-08 18:33:39][ 9748.778084] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:122636 rmem_alloc:0, truesize:5332, len:2940
[2016-04-08 18:33:39][ 9748.778109] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:175956 rmem_alloc:0, truesize:53320, len:29220
[2016-04-08 18:33:39][ 9748.778156] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:234608 rmem_alloc:0, truesize:58652, len:32140
[2016-04-08 18:33:39][ 9748.778178] TCP: rcvbuf:10485760 sndbuf:2097152 
limit:12582912 backloglen:282596 rmem_alloc:58652, truesize:47988, len:26300
>
> BTW, have you played with /proc/sys/net/ipv4/tcp_adv_win_scale ?
>

I expand  tcp_adv_win_scale and tcp_rmem. It has no effect.
>
>
>
>

^ permalink raw reply

* [PATCH RESEND] net: ethernet: renesas: ravb_main: test clock rate to avoid division by 0
From: Wolfram Sang @ 2016-04-08 11:28 UTC (permalink / raw)
  To: netdev; +Cc: Wolfram Sang, linux-renesas-soc, David Miller, Sergei Shtylyov

From: Wolfram Sang <wsa+renesas@sang-engineering.com>

The clk API may return 0 on clk_get_rate, so we should check the result before
using it as a divisor.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Acked-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
---

The original patch was marked as "NOT APPLICABLE" in patchwork. I reviewed
again and can't see why. So, in case this remains true, please explain.

 drivers/net/ethernet/renesas/ravb_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 4e1a7dba7c4abb..791930b63991dc 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -1691,6 +1691,9 @@ static int ravb_set_gti(struct net_device *ndev)
 	rate = clk_get_rate(clk);
 	clk_put(clk);
 
+	if (!rate)
+		return -EINVAL;
+
 	inc = 1000000000ULL << 20;
 	do_div(inc, rate);
 
-- 
2.7.0

^ permalink raw reply related

* Re: [RFC PATCH v2 4/5] mlx4: add support for fast rx drop bpf program
From: Jesper Dangaard Brouer @ 2016-04-08 11:41 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, brouer
In-Reply-To: <1460090930-11219-4-git-send-email-bblanco@plumgrid.com>


On Thu,  7 Apr 2016 21:48:49 -0700 Brenden Blanco <bblanco@plumgrid.com> wrote:

> +int mlx4_call_bpf(struct bpf_prog *prog, void *data, unsigned int length)
> +{
> +	struct sk_buff *skb = this_cpu_ptr(&percpu_bpf_phys_dev_md);
> +	int ret;
> +
> +	build_bpf_phys_dev_md(skb, data, length);
> +
> +	rcu_read_lock();
> +	ret = BPF_PROG_RUN(prog, (void *)skb);
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
[...]

> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> index 86bcfe5..287da02 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> @@ -840,6 +843,23 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>  		l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
>  			(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
>  
> +		/* A bpf program gets first chance to drop the packet. It may
> +		 * read bytes but not past the end of the frag.
> +		 */
> +		if (prog) {
> +			struct ethhdr *ethh;
> +			dma_addr_t dma;
> +
> +			dma = be64_to_cpu(rx_desc->data[0].addr);
> +			dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
> +						DMA_FROM_DEVICE);
> +			ethh = page_address(frags[0].page) +
> +							frags[0].page_offset;
> +			if (mlx4_call_bpf(prog, ethh, frags[0].page_size) ==
> +							BPF_PHYS_DEV_DROP)
> +				goto next;
> +		}
> +

Do the program need to know if the packet-page we handover is
considered 'read-only' at the call site?  Or do we rely on my idea
requiring the registration to "know" this...

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH] ipv6: rework the lock in addrconf_permanent_addr
From: Sergei Shtylyov @ 2016-04-08 11:50 UTC (permalink / raw)
  To: roy.qing.li, netdev
In-Reply-To: <1460107359-8937-1-git-send-email-roy.qing.li@gmail.com>

Hello.

On 4/8/2016 12:22 PM, roy.qing.li@gmail.com wrote:

> From: Li RongQing <roy.qing.li@gmail.com>
>
> 1. nothing of idev is changed, so read lock is enough
> 2. ifp is changed, so used ifp->lock or cmpxchg to protect it
>
> Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
> ---
>   net/ipv6/addrconf.c | 26 ++++++++++++++++++++------
>   1 file changed, 20 insertions(+), 6 deletions(-)
>
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index 27aed1a..f6e7605b 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
[...]
> @@ -3212,7 +3221,12 @@ static int fixup_permanent_addr(struct inet6_dev *idev,
>   		if (unlikely(IS_ERR(rt)))
>   			return PTR_ERR(rt);
>
> -		ifp->rt = rt;
> +		prev = cmpxchg(&ifp->rt, NULL, rt);
> +
> +		/*if cmpxchg failed*/

    Please add spaces after /* and before */

> +		if (prev) {
> +			ip6_rt_put(rt);
> +		}

    {} not needed here, please remove.

[...]

MBR, Sergei

^ permalink raw reply

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
From: Jesper Dangaard Brouer @ 2016-04-08 12:33 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, brouer, linux-mm
In-Reply-To: <20160408123614.2a15a346@redhat.com>


On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> > +/* user return codes for PHYS_DEV prog type */
> > +enum bpf_phys_dev_action {
> > +	BPF_PHYS_DEV_DROP,
> > +	BPF_PHYS_DEV_OK,
> > +};  
> 
> I can imagine these extra return codes:
> 
>  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
>  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
>  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> 
> The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> which we can look at when we get that far...

I want to point out something which is quite FUNDAMENTAL, for
understanding these return codes (and network stack).


At driver RX time, the network stack basically have two ways of
building an SKB, which is send up the stack.

Option-A (fastest): The packet page is writable. The SKB can be
allocated and skb->data/head can point directly to the page.  And
we place/write skb_shared_info in the end/tail-room. (This is done by
calling build_skb()).

Option-B (slower): The packet page is read-only.  The SKB cannot point
skb->data/head directly to the page, because skb_shared_info need to be
written into skb->end (slightly hidden via skb_shinfo() casting).  To
get around this, a separate piece of memory is allocated (speedup by
__alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
be written. (This is done when calling netdev/napi_alloc_skb()).
  Drivers then need to copy over packet headers, and assign + adjust
skb_shinfo(skb)->frags[0] offset to skip copied headers.


Unfortunately most drivers use option-B.  Due to cost of calling the
page allocator.  It is only slightly most expensive to get a larger
compound page from the page allocator, which then can be partitioned into
page-fragments, thus amortizing the page alloc cost.  Unfortunately the
cost is added later, when constructing the SKB.
 Another reason for option-B, is that archs with expensive IOMMU
requirements (like PowerPC), don't need to dma_unmap on every packet,
but only on the compound page level.

Side-note: Most drivers have a "copy-break" optimization.  Especially
for option-B, when copying header data anyhow. For small packet, one
might as well free (or recycle) the RX page, if header size fits into
the newly allocated memory (for skb_shared_info).


For the early filter drop (DDoS use-case), it does not matter that the
packet-page is read-only.

BUT for the future XDP (eXpress Data Path) use-case it does matter.  If
we ever want to see speeds comparable to DPDK, then drivers to
need to implement option-A, as this allow forwarding at the packet-page
level.

I hope, my future page-pool facility can remove/hide the cost calling
the page allocator.


Back to the return codes, thus:
-------------------------------
BPF_PHYS_DEV_SHARED requires driver use option-B, when constructing
the SKB, and treat packet data as read-only.

BPF_PHYS_DEV_MODIFIED requires driver to provide a writable packet-page.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH] iproute2: tc_bpf.c: fix building with musl libc
From: Gustavo Zacarias @ 2016-04-08 12:59 UTC (permalink / raw)
  To: netdev; +Cc: Gustavo Zacarias

We need limits.h for PATH_MAX, fixes:

tc_bpf.c: In function ‘bpf_map_selfcheck_pinned’:
tc_bpf.c:222:12: error: ‘PATH_MAX’ undeclared (first use in this
function)
  char file[PATH_MAX], buff[4096];

Signed-off-by: Gustavo Zacarias <gustavo@zacarias.com.ar>
---
 tc/tc_bpf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c
index d94af82..042e76f 100644
--- a/tc/tc_bpf.c
+++ b/tc/tc_bpf.c
@@ -20,6 +20,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <stdarg.h>
+#include <limits.h>
 
 #ifdef HAVE_ELF
 #include <libelf.h>
-- 
2.7.3

^ permalink raw reply related

* [PATCH net-next] sock: tigthen lockdep checks for sock_owned_by_user
From: Hannes Frederic Sowa @ 2016-04-08 13:11 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-cifs-u79uwXL29TY76Z2rM5mHXA,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA

sock_owned_by_user should not be used without socket lock held. It seems
to be a common practice to check .owned before lock reclassification, so
provide a little help to abstract this check away.

Cc: linux-cifs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-bluetooth-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Signed-off-by: Hannes Frederic Sowa <hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r@public.gmane.org>
---
 fs/cifs/connect.c            |  4 ++--
 include/net/sock.h           | 44 +++++++++++++++++++++++++++++---------------
 net/bluetooth/af_bluetooth.c |  2 +-
 net/llc/llc_proc.c           |  2 +-
 net/sunrpc/svcsock.c         |  3 +--
 net/sunrpc/xprtsock.c        |  3 +--
 6 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a763cd3d9e7c80..3e37e52639394b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2918,7 +2918,7 @@ static inline void
 cifs_reclassify_socket4(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
-	BUG_ON(sock_owned_by_user(sk));
+	BUG_ON(!sock_allow_reclassification(sk));
 	sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS",
 		&cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]);
 }
@@ -2927,7 +2927,7 @@ static inline void
 cifs_reclassify_socket6(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
-	BUG_ON(sock_owned_by_user(sk));
+	BUG_ON(!sock_allow_reclassification(sk));
 	sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS",
 		&cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]);
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index 81d6fecec0a2c0..baba58770ac593 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1316,21 +1316,6 @@ static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
-/* Used by processes to "lock" a socket state, so that
- * interrupts and bottom half handlers won't change it
- * from under us. It essentially blocks any incoming
- * packets, so that we won't get any new data or any
- * packets that change the state of the socket.
- *
- * While locked, BH processing will add new packets to
- * the backlog queue.  This queue is processed by the
- * owner of the socket lock right before it is released.
- *
- * Since ~2.3.5 it is also exclusive sleep lock serializing
- * accesses from user process context.
- */
-#define sock_owned_by_user(sk)	((sk)->sk_lock.owned)
-
 static inline void sock_release_ownership(struct sock *sk)
 {
 	if (sk->sk_lock.owned) {
@@ -1403,6 +1388,35 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
 		spin_unlock_bh(&sk->sk_lock.slock);
 }
 
+/* Used by processes to "lock" a socket state, so that
+ * interrupts and bottom half handlers won't change it
+ * from under us. It essentially blocks any incoming
+ * packets, so that we won't get any new data or any
+ * packets that change the state of the socket.
+ *
+ * While locked, BH processing will add new packets to
+ * the backlog queue.  This queue is processed by the
+ * owner of the socket lock right before it is released.
+ *
+ * Since ~2.3.5 it is also exclusive sleep lock serializing
+ * accesses from user process context.
+ */
+
+static inline bool sock_owned_by_user(const struct sock *sk)
+{
+#ifdef CONFIG_LOCKDEP
+	WARN_ON(!lockdep_sock_is_held(sk));
+#endif
+	return sk->sk_lock.owned;
+}
+
+/* no reclassification while locks are held */
+static inline bool sock_allow_reclassification(const struct sock *csk)
+{
+	struct sock *sk = (struct sock *)csk;
+
+	return !sk->sk_lock.owned && !spin_is_locked(&sk->sk_lock.slock);
+}
 
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		      struct proto *prot, int kern);
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 955eda93e66f32..3df7aefb766335 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -65,7 +65,7 @@ static const char *const bt_slock_key_strings[BT_MAX_PROTO] = {
 void bt_sock_reclassify_lock(struct sock *sk, int proto)
 {
 	BUG_ON(!sk);
-	BUG_ON(sock_owned_by_user(sk));
+	BUG_ON(!sock_allow_reclassification(sk));
 
 	sock_lock_init_class_and_name(sk,
 			bt_slock_key_strings[proto], &bt_slock_key[proto],
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index 1a3c7e0f5d0de3..29c509c54bb22d 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -195,7 +195,7 @@ static int llc_seq_core_show(struct seq_file *seq, void *v)
 		   timer_pending(&llc->pf_cycle_timer.timer),
 		   timer_pending(&llc->rej_sent_timer.timer),
 		   timer_pending(&llc->busy_state_timer.timer),
-		   !!sk->sk_backlog.tail, !!sock_owned_by_user(sk));
+		   !!sk->sk_backlog.tail, !!sk->sk_lock.owned);
 out:
 	return 0;
 }
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 1413cdcc131c4a..bfe0a06530f6aa 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -85,8 +85,7 @@ static void svc_reclassify_socket(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 
-	WARN_ON_ONCE(sock_owned_by_user(sk));
-	if (sock_owned_by_user(sk))
+	if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
 		return;
 
 	switch (sk->sk_family) {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 65e759569e4873..b7d6b76dede095 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1881,8 +1881,7 @@ static inline void xs_reclassify_socket6(struct socket *sock)
 
 static inline void xs_reclassify_socket(int family, struct socket *sock)
 {
-	WARN_ON_ONCE(sock_owned_by_user(sock->sk));
-	if (sock_owned_by_user(sock->sk))
+	if (WARN_ON_ONCE(!sock_allow_reclassification(sock->sk)))
 		return;
 
 	switch (family) {
-- 
2.5.5

^ permalink raw reply related

* Re: [PATCH] iproute2: tc_bpf.c: fix building with musl libc
From: Daniel Borkmann @ 2016-04-08 13:14 UTC (permalink / raw)
  To: Gustavo Zacarias, netdev
In-Reply-To: <1460120373-32096-1-git-send-email-gustavo@zacarias.com.ar>

On 04/08/2016 02:59 PM, Gustavo Zacarias wrote:
> We need limits.h for PATH_MAX, fixes:
>
> tc_bpf.c: In function ‘bpf_map_selfcheck_pinned’:
> tc_bpf.c:222:12: error: ‘PATH_MAX’ undeclared (first use in this
> function)
>    char file[PATH_MAX], buff[4096];
>
> Signed-off-by: Gustavo Zacarias <gustavo@zacarias.com.ar>

Fine with me, thanks!

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

^ permalink raw reply

* Please get back to me.
From:  Nawal @ 2016-04-08 13:29 UTC (permalink / raw)
  To: Recipients

Hi, my name is Nawal denison, i need your assistance to move my family
out of Egypt to your country due to the conflict and killings going on here . I have $3. million dollars. which i will use to Establish A
viable joint company with you for your help, attach is my pic and my daughter

Thank you.
Nawal denison

^ permalink raw reply

* [PATCH net-next] ipv6, token: allow for clearing the current device token
From: Daniel Borkmann @ 2016-04-08 13:55 UTC (permalink / raw)
  To: davem; +Cc: hannes, robbat2, netdev, Daniel Borkmann

The original tokenized iid support implemented via f53adae4eae5 ("net: ipv6:
add tokenized interface identifier support") didn't allow for clearing a
device token as it was intended that this addressing mode was the only one
active for globally scoped IPv6 addresses. Later we relaxed that restriction
via 617fe29d45bd ("net: ipv6: only invalidate previously tokenized addresses"),
and we should also allow for clearing tokens as there's no good reason why
it shouldn't be allowed.

Fixes: 617fe29d45bd ("net: ipv6: only invalidate previously tokenized addresses")
Reported-by: Robin H. Johnson <robbat2@gentoo.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 Dave, I think net-next is fine, but don't have any objections if you rather
 want to put it into net. Thanks!

 net/ipv6/addrconf.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 27aed1a..a6c9927 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4995,15 +4995,13 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
 {
 	struct inet6_ifaddr *ifp;
 	struct net_device *dev = idev->dev;
-	bool update_rs = false;
+	bool clear_token, update_rs = false;
 	struct in6_addr ll_addr;
 
 	ASSERT_RTNL();
 
 	if (!token)
 		return -EINVAL;
-	if (ipv6_addr_any(token))
-		return -EINVAL;
 	if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
 		return -EINVAL;
 	if (!ipv6_accept_ra(idev))
@@ -5018,10 +5016,13 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
 
 	write_unlock_bh(&idev->lock);
 
+	clear_token = ipv6_addr_any(token);
+	if (clear_token)
+		goto update_lft;
+
 	if (!idev->dead && (idev->if_flags & IF_READY) &&
 	    !ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE |
 			     IFA_F_OPTIMISTIC)) {
-
 		/* If we're not ready, then normal ifup will take care
 		 * of this. Otherwise, we need to request our rs here.
 		 */
@@ -5029,6 +5030,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
 		update_rs = true;
 	}
 
+update_lft:
 	write_lock_bh(&idev->lock);
 
 	if (update_rs) {
-- 
1.9.3

^ permalink raw reply related

* Re: [PATCH net-next] ipv6, token: allow for clearing the current device token
From: Hannes Frederic Sowa @ 2016-04-08 13:57 UTC (permalink / raw)
  To: Daniel Borkmann, davem; +Cc: robbat2, netdev
In-Reply-To: <307b4d32099f606d0fbe0ce9fecd3a039b796361.1460123261.git.daniel@iogearbox.net>

On 08.04.2016 15:55, Daniel Borkmann wrote:
> The original tokenized iid support implemented via f53adae4eae5 ("net: ipv6:
> add tokenized interface identifier support") didn't allow for clearing a
> device token as it was intended that this addressing mode was the only one
> active for globally scoped IPv6 addresses. Later we relaxed that restriction
> via 617fe29d45bd ("net: ipv6: only invalidate previously tokenized addresses"),
> and we should also allow for clearing tokens as there's no good reason why
> it shouldn't be allowed.
>
> Fixes: 617fe29d45bd ("net: ipv6: only invalidate previously tokenized addresses")
> Reported-by: Robin H. Johnson <robbat2@gentoo.org>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

Thanks,
Hannes

^ permalink raw reply

* Re: [PATCH net-next] ipv6, token: allow for clearing the current device token
From: Bjørn Mork @ 2016-04-08 14:18 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: davem, hannes, robbat2, netdev
In-Reply-To: <307b4d32099f606d0fbe0ce9fecd3a039b796361.1460123261.git.daniel@iogearbox.net>

Daniel Borkmann <daniel@iogearbox.net> writes:

>  
>  	if (!token)
>  		return -EINVAL;
> -	if (ipv6_addr_any(token))
> -		return -EINVAL;
>  	if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
>  		return -EINVAL;

Not directly related to the patch in question.  It just made me aware of
this restriction...

I realize that I'm a few years late here, but what's with the IFF_NOARP?
Is that just because we can't do DAD for the token based addresses?  How
is that different from manually configuring the whole address?



Bjørn

^ permalink raw reply

* Re: qdisc spin lock
From: Eric Dumazet @ 2016-04-08 14:19 UTC (permalink / raw)
  To: Michael Ma; +Cc: Cong Wang, Linux Kernel Network Developers
In-Reply-To: <CAAmHdhxagKnLP1_5ZW7HTsVBu0TSFYKCvNstAEWN-NHrdnvvVQ@mail.gmail.com>

On Thu, 2016-03-31 at 16:48 -0700, Michael Ma wrote:
> I didn't really know that multiple qdiscs can be isolated using MQ so
> that each txq can be associated with a particular qdisc. Also we don't
> really have multiple interfaces...
> 
> With this MQ solution we'll still need to assign transmit queues to
> different classes by doing some math on the bandwidth limit if I
> understand correctly, which seems to be less convenient compared with
> a solution purely within HTB.
> 
> I assume that with this solution I can still share qdisc among
> multiple transmit queues - please let me know if this is not the case.

Note that this MQ + HTB thing works well, unless you use a bonding
device. (Or you need the MQ+HTB on the slaves, with no way of sharing
tokens between the slaves)


https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bb1d912323d5dd50e1079e389f4e964be14f0ae3

bonding can not really be used as a true MQ device yet.

I might send a patch to disable this 'bonding feature' if no slave sets
a queue_id. 

^ permalink raw reply

* Re: [PATCH net-next] ipv6, token: allow for clearing the current device token
From: Hannes Frederic Sowa @ 2016-04-08 14:33 UTC (permalink / raw)
  To: Bjørn Mork, Daniel Borkmann; +Cc: davem, robbat2, netdev
In-Reply-To: <878u0otc96.fsf@nemi.mork.no>



On Fri, Apr 8, 2016, at 16:18, Bjørn Mork wrote:
> Daniel Borkmann <daniel@iogearbox.net> writes:
> 
> >  
> >  	if (!token)
> >  		return -EINVAL;
> > -	if (ipv6_addr_any(token))
> > -		return -EINVAL;
> >  	if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
> >  		return -EINVAL;
> 
> Not directly related to the patch in question.  It just made me aware of
> this restriction...
> 
> I realize that I'm a few years late here, but what's with the IFF_NOARP?
> Is that just because we can't do DAD for the token based addresses?  How
> is that different from manually configuring the whole address?

IFF_NOARP is kind of the equivalent to no neighbor discovery. If you set
a token and never get in a router advertisement you never create a
tokenized ip address, thus the feature is useless.

Bye,
Hannes

^ permalink raw reply

* Re: [PATCH RFC] net: decrease the length of backlog queue immediately after it's detached from sk
From: Eric Dumazet @ 2016-04-08 14:44 UTC (permalink / raw)
  To: Yang Yingliang; +Cc: netdev, davem, Ding Tianhong
In-Reply-To: <5707939B.2030907@huawei.com>

On Fri, 2016-04-08 at 19:18 +0800, Yang Yingliang wrote:

> I expand  tcp_adv_win_scale and tcp_rmem. It has no effect.

Try :

echo -2 >/proc/sys/net/ipv4/tcp_adv_win_scale

And restart your flows.

^ permalink raw reply

* [PATCH] [linux-next] Doc: networking: Fix typo in dsa
From: Masanari Iida @ 2016-04-08 15:00 UTC (permalink / raw)
  To: linux-kernel, corbet, linux-doc, davem, netdev; +Cc: Masanari Iida

This patch fix typos in Documentation/networking/dsa.

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
---
 Documentation/networking/dsa/bcm_sf2.txt | 2 +-
 Documentation/networking/dsa/dsa.txt     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/dsa/bcm_sf2.txt b/Documentation/networking/dsa/bcm_sf2.txt
index d999d0c1c5b8..eba3a2431e91 100644
--- a/Documentation/networking/dsa/bcm_sf2.txt
+++ b/Documentation/networking/dsa/bcm_sf2.txt
@@ -38,7 +38,7 @@ Implementation details
 ======================
 
 The driver is located in drivers/net/dsa/bcm_sf2.c and is implemented as a DSA
-driver; see Documentation/networking/dsa/dsa.txt for details on the subsytem
+driver; see Documentation/networking/dsa/dsa.txt for details on the subsystem
 and what it provides.
 
 The SF2 switch is configured to enable a Broadcom specific 4-bytes switch tag
diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index 3b196c304b73..36f905d9c77c 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -334,7 +334,7 @@ more specifically with its VLAN filtering portion when configuring VLANs on top
 of per-port slave network devices. Since DSA primarily deals with
 MDIO-connected switches, although not exclusively, SWITCHDEV's
 prepare/abort/commit phases are often simplified into a prepare phase which
-checks whether the operation is supporte by the DSA switch driver, and a commit
+checks whether the operation is supported by the DSA switch driver, and a commit
 phase which applies the changes.
 
 As of today, the only SWITCHDEV objects supported by DSA are the FDB and VLAN
-- 
2.8.0

^ permalink raw reply related

* Re: [RFC PATCH] possible bug in handling of ipv4 route caching
From: Chris Friesen @ 2016-04-08 15:00 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: netdev
In-Reply-To: <alpine.LFD.2.11.1604072305400.2154@ja.home.ssi.bg>

On 04/07/2016 03:20 PM, Julian Anastasov wrote:
> On Thu, 7 Apr 2016, Chris Friesen wrote:
>
>> Hi,
>>
>> We think we may have found a bug in the handling of ipv4 route caching,
>> and are curious what you think.
>>
>> For local routes that require a particular output interface we do not
>> want to cache the result.  Caching the result causes incorrect behaviour
>> when there are multiple source addresses on the interface.  The end
>> result being that if the intended recipient is waiting on that interface
>> for the packet he won't receive it because it will be delivered on the
>> loopback interface and the IP_PKTINFO ipi_ifindex will be set to the
>> loopback interface as well.

>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
>> index 02c6229..e965d4b 100644
>> --- a/net/ipv4/route.c
>> +++ b/net/ipv4/route.c
>> @@ -2045,6 +2045,17 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
>>   		 */
>>   		if (fi && res->prefixlen < 4)
>>   			fi = NULL;
>> +	} else if ((type == RTN_LOCAL) && (orig_oif != 0)) {
>
> 	So, we can be more specific. Can this work?:
>
> 	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
> 		   (orig_oif != dev_out->ifindex)) {
>
> 	I.e. we should allow to cache orig_oif=LOOPBACK_IFINDEX
> but eth1 should not be cached.

Yes, we think that will work.  New patch to follow.

Chris

^ permalink raw reply

* [PATCH v2] route: do not cache fib route info on local routes with oif
From: Chris Friesen @ 2016-04-08 15:08 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: netdev
In-Reply-To: <alpine.LFD.2.11.1604072305400.2154@ja.home.ssi.bg>

For local routes that require a particular output interface we do not want to
cache the result.  Caching the result causes incorrect behaviour when there are
multiple source addresses on the interface.  The end result being that if the
intended recipient is waiting on that interface for the packet he won't receive
it because it will be delivered on the loopback interface and the IP_PKTINFO
ipi_ifindex will be set to the loopback interface as well.

This can be tested by running a program such as "dhcp_release" which attempts
to inject a packet on a particular interface so that it is received by another
program on the same board.  The receiving process should see an IP_PKTINFO
ipi_ifndex value of the source interface (e.g., eth1) instead of the loopback
interface (e.g., lo).  The packet will still appear on the loopback interface
in tcpdump but the important aspect is that the CMSG info is correct.

Sample dhcp_release command line:

    dhcp_release eth1 192.168.204.222 02:11:33:22:44:66

Signed-off-by: Allain Legacy <allain.legacy@windriver.com>
Signed off-by: Chris Friesen <chris.friesen@windriver.com>
---
  net/ipv4/route.c | 12 ++++++++++++
  1 file changed, 12 insertions(+)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 02c6229..437a377 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2045,6 +2045,18 @@ static struct rtable *__mkroute_output(const struct 
fib_result *res,
  		 */
  		if (fi && res->prefixlen < 4)
  			fi = NULL;
+	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
+		   (orig_oif != dev_out->ifindex)) {
+		/* For local routes that require a particular output interface
+		 * we do not want to cache the result.  Caching the result
+		 * causes incorrect behaviour when there are multiple source
+		 * addresses on the interface, the end result being that if the
+		 * intended recipient is waiting on that interface for the
+		 * packet he won't receive it because it will be delivered on
+		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
+		 * be set to the loopback interface as well.
+		 */
+		fi = NULL;
  	}

  	fnhe = NULL;

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox