Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: Question on rhashtable in worst-case scenario.
From: Herbert Xu @ 2016-04-02  1:46 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Ben Greear, David Miller, linux-kernel, linux-wireless, netdev,
	tgraf
In-Reply-To: <1459546450.3342.22.camel@sipsolutions.net>

On Fri, Apr 01, 2016 at 11:34:10PM +0200, Johannes Berg wrote:
>
> I was thinking about that one - it's not obvious to me from the code
> how this "explicitly checking for dups" would be done or let's say how
> rhashtable differentiates. But since it seems to work for Ben until
> hitting a certain number of identical keys, surely that's just me not
> understanding the code rather than anything else :)

It's really simple, rhashtable_insert_fast does not check for dups
while rhashtable_lookup_insert_* do.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [RFC PATCH 5/5] Add sample for adding simple drop program to link
From: Brenden Blanco @ 2016-04-02  1:21 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, gerlitz, daniel,
	john.fastabend, brouer
In-Reply-To: <1459560118-5582-1-git-send-email-bblanco@plumgrid.com>

Add a sample program that only drops packets at the
BPF_PROG_TYPE_PHYS_DEV hook of a link. With the drop-only program,
observed single core rate is ~14.6Mpps.

Other tests were run, for instance without the dropcnt increment or
without reading from the packet header, the packet rate was mostly
unchanged.

$ perf record -a samples/bpf/netdrvx1 $(</sys/class/net/eth0/ifindex)
proto 17:   14597724 drops/s

./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 6486875(c6485849+d1026) usec, 23689465 (60byte,0frags)
  3651906pps 1752Mb/sec (1752914880bps) errors: 0
Device: eth4@1
Result: OK: 6486874(c6485656+d1217) usec, 23689489 (60byte,0frags)
  3651911pps 1752Mb/sec (1752917280bps) errors: 0
Device: eth4@2
Result: OK: 6486851(c6485730+d1120) usec, 23687853 (60byte,0frags)
  3651672pps 1752Mb/sec (1752802560bps) errors: 0
Device: eth4@3
Result: OK: 6486879(c6485807+d1071) usec, 23688954 (60byte,0frags)
  3651825pps 1752Mb/sec (1752876000bps) errors: 0

perf report --no-children:
  18.36%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_process_rx_cq
  15.98%  swapper        [kernel.vmlinux]  [k] poll_idle
  12.71%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_alloc_frags
   6.87%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_free_frag
   4.20%  ksoftirqd/1    [kernel.vmlinux]  [k] get_page_from_freelist
   4.09%  swapper        [mlx4_en]         [k] mlx4_en_process_rx_cq
   3.32%  ksoftirqd/1    [kernel.vmlinux]  [k] sk_load_byte_positive_offset
   2.39%  ksoftirqd/1    [mdio]            [k] 0x00000000000074cd
   2.23%  swapper        [mlx4_en]         [k] mlx4_en_alloc_frags
   2.20%  ksoftirqd/1    [kernel.vmlinux]  [k] free_pages_prepare
   2.08%  ksoftirqd/1    [mlx4_en]         [k] mlx4_call_bpf
   1.57%  ksoftirqd/1    [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
   1.35%  ksoftirqd/1    [mdio]            [k] 0x00000000000074fa
   1.09%  ksoftirqd/1    [kernel.vmlinux]  [k] free_one_page
   1.02%  ksoftirqd/1    [kernel.vmlinux]  [k] bpf_map_lookup_elem
   0.90%  ksoftirqd/1    [kernel.vmlinux]  [k] __alloc_pages_nodemask
   0.88%  swapper        [kernel.vmlinux]  [k] intel_idle
   0.82%  ksoftirqd/1    [mdio]            [k] 0x00000000000074be
   0.80%  swapper        [mlx4_en]         [k] mlx4_en_free_frag

machine specs:
 receiver - Intel E5-1630 v3 @ 3.70GHz
 sender - Intel E5645 @ 2.40GHz
 Mellanox ConnectX-3 @40G

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 samples/bpf/Makefile        |   4 ++
 samples/bpf/bpf_load.c      |   8 +++
 samples/bpf/netdrvx1_kern.c |  26 ++++++++
 samples/bpf/netdrvx1_user.c | 155 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 193 insertions(+)
 create mode 100644 samples/bpf/netdrvx1_kern.c
 create mode 100644 samples/bpf/netdrvx1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 502c9fc..ad36bb8 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -19,6 +19,7 @@ hostprogs-y += lathist
 hostprogs-y += offwaketime
 hostprogs-y += spintest
 hostprogs-y += map_perf_test
+hostprogs-y += netdrvx1
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -38,6 +39,7 @@ lathist-objs := bpf_load.o libbpf.o lathist_user.o
 offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
 spintest-objs := bpf_load.o libbpf.o spintest_user.o
 map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
+netdrvx1-objs := bpf_load.o libbpf.o netdrvx1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -56,6 +58,7 @@ always += lathist_kern.o
 always += offwaketime_kern.o
 always += spintest_kern.o
 always += map_perf_test_kern.o
+always += netdrvx1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -75,6 +78,7 @@ HOSTLOADLIBES_lathist += -lelf
 HOSTLOADLIBES_offwaketime += -lelf
 HOSTLOADLIBES_spintest += -lelf
 HOSTLOADLIBES_map_perf_test += -lelf -lrt
+HOSTLOADLIBES_netdrvx1 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 58f86bd..9308fbc 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -49,6 +49,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_socket = strncmp(event, "socket", 6) == 0;
 	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
 	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+	bool is_phys_dev = strncmp(event, "phys_dev", 8) == 0;
 	enum bpf_prog_type prog_type;
 	char buf[256];
 	int fd, efd, err, id;
@@ -63,6 +64,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
 	} else if (is_kprobe || is_kretprobe) {
 		prog_type = BPF_PROG_TYPE_KPROBE;
+	} else if (is_phys_dev) {
+		prog_type = BPF_PROG_TYPE_PHYS_DEV;
 	} else {
 		printf("Unknown event '%s'\n", event);
 		return -1;
@@ -76,6 +79,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
+	if (is_phys_dev)
+		return 0;
+
 	if (is_socket) {
 		event += 6;
 		if (*event != '/')
@@ -304,6 +310,7 @@ int load_bpf_file(char *path)
 
 			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
 			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
+			    memcmp(shname_prog, "phys_dev", 8) == 0 ||
 			    memcmp(shname_prog, "socket", 6) == 0)
 				load_and_attach(shname_prog, insns, data_prog->d_size);
 		}
@@ -320,6 +327,7 @@ int load_bpf_file(char *path)
 
 		if (memcmp(shname, "kprobe/", 7) == 0 ||
 		    memcmp(shname, "kretprobe/", 10) == 0 ||
+		    memcmp(shname, "phys_dev", 8) == 0 ||
 		    memcmp(shname, "socket", 6) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
diff --git a/samples/bpf/netdrvx1_kern.c b/samples/bpf/netdrvx1_kern.c
new file mode 100644
index 0000000..9837d8a
--- /dev/null
+++ b/samples/bpf/netdrvx1_kern.c
@@ -0,0 +1,26 @@
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") dropcnt = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 256,
+};
+
+SEC("phys_dev1")
+int bpf_prog1(struct xdp_metadata *ctx)
+{
+	int index = load_byte(ctx, ETH_HLEN + offsetof(struct iphdr, protocol));
+	long *value;
+
+	value = bpf_map_lookup_elem(&dropcnt, &index);
+	if (value)
+		*value += 1;
+
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/netdrvx1_user.c b/samples/bpf/netdrvx1_user.c
new file mode 100644
index 0000000..9e6ec9a
--- /dev/null
+++ b/samples/bpf/netdrvx1_user.c
@@ -0,0 +1,155 @@
+#include <linux/bpf.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "bpf_load.h"
+#include "libbpf.h"
+
+static int set_link_bpf_fd(int ifindex, int fd)
+{
+	struct sockaddr_nl sa;
+	int sock, seq = 0, len, ret = -1;
+	char buf[4096];
+	struct rtattr *rta;
+	struct {
+		struct nlmsghdr  nh;
+		struct ifinfomsg ifinfo;
+		char             attrbuf[64];
+	} req;
+	struct nlmsghdr *nh;
+	struct nlmsgerr *err;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.nl_family = AF_NETLINK;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0) {
+		printf("open netlink socket: %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+		printf("bind to netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_type = RTM_SETLINK;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.ifinfo.ifi_family = AF_UNSPEC;
+	req.ifinfo.ifi_index = ifindex;
+	rta = (struct rtattr *)(((char *) &req)
+				+ NLMSG_ALIGN(req.nh.nlmsg_len));
+	rta->rta_type = 42/*IFLA_BPF_FD*/;
+	rta->rta_len = RTA_LENGTH(sizeof(unsigned int));
+	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len)
+		+ RTA_LENGTH(sizeof(fd));
+	memcpy(RTA_DATA(rta), &fd, sizeof(fd));
+	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+		printf("send to netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	len = recv(sock, buf, sizeof(buf), 0);
+	if (len < 0) {
+		printf("recv from netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+	     nh = NLMSG_NEXT(nh, len)) {
+		if (nh->nlmsg_pid != getpid()) {
+			printf("Wrong pid %d, expected %d\n",
+			       nh->nlmsg_pid, getpid());
+			goto cleanup;
+		}
+		if (nh->nlmsg_seq != seq) {
+			printf("Wrong seq %d, expected %d\n",
+			       nh->nlmsg_seq, seq);
+			goto cleanup;
+		}
+		switch (nh->nlmsg_type) {
+		case NLMSG_ERROR:
+			err = (struct nlmsgerr *)NLMSG_DATA(nh);
+			if (!err->error)
+				continue;
+			printf("nlmsg error %s\n", strerror(-err->error));
+			goto cleanup;
+		case NLMSG_DONE:
+			break;
+		}
+	}
+
+	ret = 0;
+
+cleanup:
+	close(sock);
+	return ret;
+}
+
+/* simple per-protocol drop counter
+ */
+static void poll_stats(int secs)
+{
+	unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+	__u64 values[nr_cpus];
+	__u32 key;
+	int i;
+
+	sleep(secs);
+
+	for (key = 0; key < 256; key++) {
+		__u64 sum = 0;
+
+		assert(bpf_lookup_elem(map_fd[0], &key, values) == 0);
+		for (i = 0; i < nr_cpus; i++)
+			sum += values[i];
+		if (sum)
+			printf("proto %u: %10llu drops/s\n", key, sum/secs);
+	}
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int ifindex;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (ac != 2) {
+		printf("usage: %s IFINDEX\n", argv[0]);
+		return 1;
+	}
+
+	ifindex = strtoul(argv[1], NULL, 0);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	if (!prog_fd[0]) {
+		printf("load_bpf_file: %s\n", strerror(errno));
+		return 1;
+	}
+
+	if (set_link_bpf_fd(ifindex, prog_fd[0]) < 0) {
+		printf("link set bpf fd failed\n");
+		return 1;
+	}
+
+	poll_stats(5);
+
+	set_link_bpf_fd(ifindex, -1);
+
+	return 0;
+}
-- 
2.8.0

^ permalink raw reply related

* [RFC PATCH 4/5] mlx4: add support for fast rx drop bpf program
From: Brenden Blanco @ 2016-04-02  1:21 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, gerlitz, daniel,
	john.fastabend, brouer
In-Reply-To: <1459560118-5582-1-git-send-email-bblanco@plumgrid.com>

Add support for the BPF_PROG_TYPE_PHYS_DEV hook in mlx4 driver.  Since
bpf programs require a skb context to navigate the packet, build a
percpu fake skb with the minimal fields. This avoids the costly
allocation for packets that end up being dropped.

Since mlx4 is so far the only user of this pseudo skb, the build
function is defined locally.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 61 ++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/en_rx.c     | 18 ++++++++
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  2 +
 3 files changed, 81 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index b4b258c..89ca787 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -31,6 +31,7 @@
  *
  */
 
+#include <linux/bpf.h>
 #include <linux/etherdevice.h>
 #include <linux/tcp.h>
 #include <linux/if_vlan.h>
@@ -1966,6 +1967,9 @@ void mlx4_en_free_resources(struct mlx4_en_priv *priv)
 			mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
 	}
 
+	if (priv->prog)
+		bpf_prog_put(priv->prog);
+
 }
 
 int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
@@ -2456,6 +2460,61 @@ static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 m
 	return err;
 }
 
+static DEFINE_PER_CPU(struct sk_buff, percpu_pseudo_skb);
+
+static void build_pseudo_skb_for_bpf(struct sk_buff *skb, void *data,
+				     unsigned int length)
+{
+	/* data_len is intentionally not set here so that skb_is_nonlinear()
+	 * returns false
+	 */
+
+	skb->len = length;
+	skb->head = data;
+	skb->data = data;
+}
+
+int mlx4_call_bpf(struct bpf_prog *prog, void *data, unsigned int length)
+{
+	struct sk_buff *skb = this_cpu_ptr(&percpu_pseudo_skb);
+	int ret;
+
+	build_pseudo_skb_for_bpf(skb, data, length);
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, (void *)skb);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int mlx4_bpf_set(struct net_device *dev, int fd)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct bpf_prog *prog = NULL, *old_prog;
+
+	if (fd >= 0) {
+		prog = bpf_prog_get(fd);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+
+		if (prog->type != BPF_PROG_TYPE_PHYS_DEV) {
+			bpf_prog_put(prog);
+			return -EINVAL;
+		}
+	}
+
+	old_prog = xchg(&priv->prog, prog);
+	if (old_prog) {
+		synchronize_net();
+		bpf_prog_put(old_prog);
+	}
+
+	priv->dev->bpf_valid = !!prog;
+
+	return 0;
+}
+
 static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_open		= mlx4_en_open,
 	.ndo_stop		= mlx4_en_close,
@@ -2486,6 +2545,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_features_check	= mlx4_en_features_check,
 #endif
 	.ndo_set_tx_maxrate	= mlx4_en_set_tx_maxrate,
+	.ndo_bpf_set		= mlx4_bpf_set,
 };
 
 static const struct net_device_ops mlx4_netdev_ops_master = {
@@ -2524,6 +2584,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
 	.ndo_features_check	= mlx4_en_features_check,
 #endif
 	.ndo_set_tx_maxrate	= mlx4_en_set_tx_maxrate,
+	.ndo_bpf_set		= mlx4_bpf_set,
 };
 
 struct mlx4_en_bond {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 86bcfe5..03fe005 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -748,6 +748,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
 	struct mlx4_en_rx_alloc *frags;
 	struct mlx4_en_rx_desc *rx_desc;
+	struct bpf_prog *prog;
 	struct sk_buff *skb;
 	int index;
 	int nr;
@@ -764,6 +765,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	if (budget <= 0)
 		return polled;
 
+	prog = READ_ONCE(priv->prog);
+
 	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
 	 * descriptor offset can be deduced from the CQE index instead of
 	 * reading 'cqe->index' */
@@ -840,6 +843,21 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 		l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
 			(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
 
+		/* A bpf program gets first chance to drop the packet. It may
+		 * read bytes but not past the end of the frag. A non-zero
+		 * return indicates packet should be dropped.
+		 */
+		if (prog) {
+			struct ethhdr *ethh;
+
+			ethh = (struct ethhdr *)(page_address(frags[0].page) +
+						 frags[0].page_offset);
+			if (mlx4_call_bpf(prog, ethh, length)) {
+				priv->stats.rx_dropped++;
+				goto next;
+			}
+		}
+
 		if (likely(dev->features & NETIF_F_RXCSUM)) {
 			if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
 						      MLX4_CQE_STATUS_UDP)) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index d12ab6a..3d0fc89 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -568,6 +568,7 @@ struct mlx4_en_priv {
 	struct hlist_head mac_hash[MLX4_EN_MAC_HASH_SIZE];
 	struct hwtstamp_config hwtstamp_config;
 	u32 counter_index;
+	struct bpf_prog *prog;
 
 #ifdef CONFIG_MLX4_EN_DCB
 	struct ieee_ets ets;
@@ -682,6 +683,7 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv);
 void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv);
 int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring);
 void mlx4_en_rx_irq(struct mlx4_cq *mcq);
+int mlx4_call_bpf(struct bpf_prog *prog, void *data, unsigned int length);
 
 int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
 int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv);
-- 
2.8.0

^ permalink raw reply related

* [RFC PATCH 3/5] rtnl: add option for setting link bpf prog
From: Brenden Blanco @ 2016-04-02  1:21 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, gerlitz, daniel,
	john.fastabend, brouer
In-Reply-To: <1459560118-5582-1-git-send-email-bblanco@plumgrid.com>

Sets the bpf program represented by fd as an early filter in the rx path
of the netdev. The fd must have been created as BPF_PROG_TYPE_PHYS_DEV.
Providing a negative value as fd clears the program. Getting the fd back
via rtnl is not possible, therefore reading of this value merely
provides a bool whether the program is valid on the link or not.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 include/uapi/linux/if_link.h |  1 +
 net/core/rtnetlink.c         | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index c488066..08d66a3 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -155,6 +155,7 @@ enum {
 	IFLA_PROTO_DOWN,
 	IFLA_GSO_MAX_SEGS,
 	IFLA_GSO_MAX_SIZE,
+	IFLA_BPF_FD,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f206677..1c4a603 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -909,6 +909,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */
 	       + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
 	       + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+	       + nla_total_size(4) /* IFLA_BPF_FD */
 	       + nla_total_size(1); /* IFLA_PROTO_DOWN */
 
 }
@@ -1241,6 +1242,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	     nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
 	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,
 			atomic_read(&dev->carrier_changes)) ||
+	    nla_put_s32(skb, IFLA_BPF_FD, dev->bpf_valid) ||
 	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
 		goto nla_put_failure;
 
@@ -1374,6 +1376,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PHYS_SWITCH_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
 	[IFLA_LINK_NETNSID]	= { .type = NLA_S32 },
 	[IFLA_PROTO_DOWN]	= { .type = NLA_U8 },
+	[IFLA_BPF_FD]		= { .type = NLA_S32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2028,6 +2031,13 @@ static int do_setlink(const struct sk_buff *skb,
 		status |= DO_SETLINK_NOTIFY;
 	}
 
+	if (tb[IFLA_BPF_FD]) {
+		err = dev_change_bpf_fd(dev, nla_get_s32(tb[IFLA_BPF_FD]));
+		if (err)
+			goto errout;
+		status |= DO_SETLINK_NOTIFY;
+	}
+
 errout:
 	if (status & DO_SETLINK_MODIFIED) {
 		if (status & DO_SETLINK_NOTIFY)
-- 
2.8.0

^ permalink raw reply related

* [RFC PATCH 2/5] net: add ndo to set bpf prog in adapter rx
From: Brenden Blanco @ 2016-04-02  1:21 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, gerlitz, daniel,
	john.fastabend, brouer
In-Reply-To: <1459560118-5582-1-git-send-email-bblanco@plumgrid.com>

Add a new netdev op for drivers implementing the BPF_PROG_TYPE_PHYS_DEV
filter to get configuration. Since the fd is only used by the driver to
fetch the prog, the netdev should just keep a bit to indicate the
program is valid.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 include/linux/netdevice.h |  8 ++++++++
 net/core/dev.c            | 12 ++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb0d5d0..c46e2e3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1102,6 +1102,11 @@ struct tc_to_netdev {
  *	appropriate rx headroom value allows avoiding skb head copy on
  *	forward. Setting a negative value resets the rx headroom to the
  *	default value.
+ * int  (*ndo_bpf_set)(struct net_device *dev, int fd);
+ *	This function is used to set or clear a bpf program used in the
+ *	earliest stages of packet rx. The fd must be a program loaded as
+ *	BPF_PROG_TYPE_PHYS_DEV. Negative values of fd indicate the program
+ *	should be removed.
  *
  */
 struct net_device_ops {
@@ -1292,6 +1297,7 @@ struct net_device_ops {
 						       struct sk_buff *skb);
 	void			(*ndo_set_rx_headroom)(struct net_device *dev,
 						       int needed_headroom);
+	int			(*ndo_bpf_set)(struct net_device *dev, int fd);
 };
 
 /**
@@ -1875,6 +1881,7 @@ struct net_device {
 	struct phy_device	*phydev;
 	struct lock_class_key	*qdisc_tx_busylock;
 	bool			proto_down;
+	bool			bpf_valid;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -3268,6 +3275,7 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
 			   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
+int dev_change_bpf_fd(struct net_device *dev, int fd);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
diff --git a/net/core/dev.c b/net/core/dev.c
index b9bcbe7..eb93414 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6480,6 +6480,18 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
+int dev_change_bpf_fd(struct net_device *dev, int fd)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!ops->ndo_bpf_set)
+		return -EOPNOTSUPP;
+	if (!netif_device_present(dev))
+		return -ENODEV;
+	return ops->ndo_bpf_set(dev, fd);
+}
+EXPORT_SYMBOL(dev_change_bpf_fd);
+
 /**
  *	dev_new_index	-	allocate an ifindex
  *	@net: the applicable net namespace
-- 
2.8.0

^ permalink raw reply related

* [RFC PATCH 1/5] bpf: add PHYS_DEV prog type for early driver filter
From: Brenden Blanco @ 2016-04-02  1:21 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, gerlitz, daniel,
	john.fastabend, brouer
In-Reply-To: <1459560118-5582-1-git-send-email-bblanco@plumgrid.com>

Add a new bpf prog type that is intended to run in early stages of the
packet rx path. Only minimal packet metadata will be available, hence a new
context type, struct xdp_metadata, is exposed to userspace. So far only
expose the readable packet length, and only in read mode.

The PHYS_DEV name is chosen to represent that the program is meant only
for physical adapters, rather than all netdevs.

While the user visible struct is new, the underlying context must be
implemented as a minimal skb in order for the packet load_* instructions
to work. The skb filled in by the driver must have skb->len, skb->head,
and skb->data set, and skb->data_len == 0.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 include/uapi/linux/bpf.h |  5 ++++
 kernel/bpf/verifier.c    |  1 +
 net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 924f537..b8a4ef2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_KPROBE,
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
+	BPF_PROG_TYPE_PHYS_DEV,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
@@ -367,6 +368,10 @@ struct __sk_buff {
 	__u32 tc_classid;
 };
 
+struct xdp_metadata {
+	__u32 len;
+};
+
 struct bpf_tunnel_key {
 	__u32 tunnel_id;
 	union {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e..804ca70 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1340,6 +1340,7 @@ static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_PHYS_DEV:
 		return true;
 	default:
 		return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index b7177d0..c417db6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2018,6 +2018,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+phys_dev_func_proto(enum bpf_func_id func_id)
+{
+	return sk_filter_func_proto(func_id);
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	/* check bounds */
@@ -2073,6 +2079,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool __is_valid_xdp_access(int off, int size,
+				  enum bpf_access_type type)
+{
+	if (off < 0 || off >= sizeof(struct xdp_metadata))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	if (size != 4)
+		return false;
+
+	return true;
+}
+
+static bool phys_dev_is_valid_access(int off, int size,
+				     enum bpf_access_type type)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	switch (off) {
+	case offsetof(struct xdp_metadata, len):
+		break;
+	default:
+		return false;
+	}
+	return __is_valid_xdp_access(off, size, type);
+}
+
 static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 				      int src_reg, int ctx_off,
 				      struct bpf_insn *insn_buf,
@@ -2210,6 +2246,26 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
+					   int dst_reg, int src_reg,
+					   int ctx_off,
+					   struct bpf_insn *insn_buf,
+					   struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct xdp_metadata, len):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sk_buff, len));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static const struct bpf_verifier_ops sk_filter_ops = {
 	.get_func_proto = sk_filter_func_proto,
 	.is_valid_access = sk_filter_is_valid_access,
@@ -2222,6 +2278,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
 	.convert_ctx_access = bpf_net_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops phys_dev_ops = {
+	.get_func_proto = phys_dev_func_proto,
+	.is_valid_access = phys_dev_is_valid_access,
+	.convert_ctx_access = bpf_phys_dev_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops = &sk_filter_ops,
 	.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2237,11 +2299,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
 	.type = BPF_PROG_TYPE_SCHED_ACT,
 };
 
+static struct bpf_prog_type_list phys_dev_type __read_mostly = {
+	.ops = &phys_dev_ops,
+	.type = BPF_PROG_TYPE_PHYS_DEV,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
 	bpf_register_prog_type(&sched_cls_type);
 	bpf_register_prog_type(&sched_act_type);
+	bpf_register_prog_type(&phys_dev_type);
 
 	return 0;
 }
-- 
2.8.0

^ permalink raw reply related

* [RFC PATCH 0/5] Add driver bpf hook for early packet drop
From: Brenden Blanco @ 2016-04-02  1:21 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, gerlitz, daniel,
	john.fastabend, brouer

This patch set introduces new infrastructure for programmatically
processing packets in the earliest stages of rx, as part of an effort
others are calling Express Data Path (XDP) [1]. Start this effort by
introducing a new bpf program type for early packet filtering, before even
an skb has been allocated.

With this, hope to enable line rate filtering, with this initial
implementation providing drop/allow action only.

Patch 1 introduces the new prog type and helpers for validating the bpf
program. A new userspace struct is defined containing only len as a field,
with others to follow in the future.
In patch 2, create a new ndo to pass the fd to support drivers. 
In patch 3, expose a new rtnl option to userspace.
In patch 4, enable support in mlx4 driver. No skb allocation is required,
instead a static percpu skb is kept in the driver and minimally initialized
for each driver frag.
In patch 5, create a sample drop and count program. With single core,
achieved ~14.5 Mpps drop rate on a 40G mlx4. This includes packet data
access, bpf array lookup, and increment.

Interestingly, accessing packet data from the program did not have a
noticeable impact on performance. Even so, future enhancements to
prefetching / batching / page-allocs should hopefully improve the
performance in this path.

[1] https://github.com/iovisor/bpf-docs/blob/master/Express_Data_Path.pdf

Brenden Blanco (5):
  bpf: add PHYS_DEV prog type for early driver filter
  net: add ndo to set bpf prog in adapter rx
  rtnl: add option for setting link bpf prog
  mlx4: add support for fast rx drop bpf program
  Add sample for adding simple drop program to link

 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  61 ++++++++++
 drivers/net/ethernet/mellanox/mlx4/en_rx.c     |  18 +++
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |   2 +
 include/linux/netdevice.h                      |   8 ++
 include/uapi/linux/bpf.h                       |   5 +
 include/uapi/linux/if_link.h                   |   1 +
 kernel/bpf/verifier.c                          |   1 +
 net/core/dev.c                                 |  12 ++
 net/core/filter.c                              |  68 +++++++++++
 net/core/rtnetlink.c                           |  10 ++
 samples/bpf/Makefile                           |   4 +
 samples/bpf/bpf_load.c                         |   8 ++
 samples/bpf/netdrvx1_kern.c                    |  26 +++++
 samples/bpf/netdrvx1_user.c                    | 155 +++++++++++++++++++++++++
 14 files changed, 379 insertions(+)
 create mode 100644 samples/bpf/netdrvx1_kern.c
 create mode 100644 samples/bpf/netdrvx1_user.c

-- 
2.8.0

^ permalink raw reply

* Re: [PATCH] RDS: sync congestion map updating
From: Leon Romanovsky @ 2016-04-02  1:14 UTC (permalink / raw)
  To: santosh shilimkar; +Cc: linux-rdma, Wengang Wang, netdev
In-Reply-To: <56FED04C.2060806@oracle.com>

On Fri, Apr 01, 2016 at 12:47:24PM -0700, santosh shilimkar wrote:
> (cc-ing netdev)
> On 3/30/2016 7:59 PM, Wengang Wang wrote:
> >
> >
> >在 2016年03月31日 09:51, Wengang Wang 写道:
> >>
> >>
> >>在 2016年03月31日 01:16, santosh shilimkar 写道:
> >>>Hi Wengang,
> >>>
> >>>On 3/30/2016 9:19 AM, Leon Romanovsky wrote:
> >>>>On Wed, Mar 30, 2016 at 05:08:22PM +0800, Wengang Wang wrote:
> >>>>>Problem is found that some among a lot of parallel RDS
> >>>>>communications hang.
> >>>>>In my test ten or so among 33 communications hang. The send
> >>>>>requests got
> >>>>>-ENOBUF error meaning the peer socket (port) is congested. But
> >>>>>meanwhile,
> >>>>>peer socket (port) is not congested.
> >>>>>
> >>>>>The congestion map updating can happen in two paths: one is in
> >>>>>rds_recvmsg path
> >>>>>and the other is when it receives packets from the hardware. There
> >>>>>is no
> >>>>>synchronization when updating the congestion map. So a bit
> >>>>>operation (clearing)
> >>>>>in the rds_recvmsg path can be skipped by another bit operation
> >>>>>(setting) in
> >>>>>hardware packet receving path.
> >>>>>
> >
> >To be more detailed.  Here, the two paths (user calls recvmsg and
> >hardware receives data) are for different rds socks. thus the
> >rds_sock->rs_recv_lock is not helpful to sync the updating on congestion
> >map.
> >
> For archive purpose, let me try to conclude the thread. I synced
> with Wengang offlist and came up with below fix. I was under
> impression that __set_bit_le() was atmoic version. After fixing
> it like patch(end of the email), the bug gets addressed.
> 
> I will probably send this as fix for stable as well.
> 
> 
> From 5614b61f6fdcd6ae0c04e50b97efd13201762294 Mon Sep 17 00:00:00 2001
> From: Santosh Shilimkar <santosh.shilimkar@oracle.com>
> Date: Wed, 30 Mar 2016 23:26:47 -0700
> Subject: [PATCH] RDS: Fix the atomicity for congestion map update
> 
> Two different threads with different rds sockets may be in
> rds_recv_rcvbuf_delta() via receive path. If their ports
> both map to the same word in the congestion map, then
> using non-atomic ops to update it could cause the map to
> be incorrect. Lets use atomics to avoid such an issue.
> 
> Full credit to Wengang <wen.gang.wang@oracle.com> for
> finding the issue, analysing it and also pointing out
> to offending code with spin lock based fix.

I'm glad that you solved the issue without spinlocks.
Out of curiosity, I see that this patch is needed to be sent
to Dave and applied by him. Is it right?

➜  linus-tree git:(master) ./scripts/get_maintainer.pl -f net/rds/cong.c
Santosh Shilimkar <santosh.shilimkar@oracle.com> (supporter:RDS -
RELIABLE DATAGRAM SOCKETS)
"David S. Miller" <davem@davemloft.net> (maintainer:NETWORKING
[GENERAL])
netdev@vger.kernel.org (open list:RDS - RELIABLE DATAGRAM SOCKETS)
linux-rdma@vger.kernel.org (open list:RDS - RELIABLE DATAGRAM SOCKETS)
rds-devel@oss.oracle.com (moderated list:RDS - RELIABLE DATAGRAM
SOCKETS)
linux-kernel@vger.kernel.org (open list)

> 
> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>

Reviewed-by: Leon Romanovsky <leon@leon.nu>

^ permalink raw reply

* Re: [PATCH net] vlan: pull on __vlan_insert_tag error path and fix csum correction
From: Daniel Borkmann @ 2016-04-02  0:04 UTC (permalink / raw)
  To: David Miller; +Cc: jiri, alexei.starovoitov, jesse, tom, netdev
In-Reply-To: <56FEE80C.9080806@iogearbox.net>

On 04/01/2016 11:28 PM, Daniel Borkmann wrote:
> On 04/01/2016 09:00 PM, David Miller wrote:
>> From: Daniel Borkmann <daniel@iogearbox.net>
>> Date: Fri,  1 Apr 2016 11:41:03 +0200
>>
>>> Moreover, I noticed that when in the non-error path the __skb_pull()
>>> is done and the original offset to mac header was non-zero, we fixup
>>> from a wrong skb->data offset in the checksum complete processing.
>>>
>>> So the skb_postpush_rcsum() really needs to be done before __skb_pull()
>>> where skb->data still points to the mac header start.
>>
>> Ugh, what a mess, are you sure any of this is right even after your
>> change?  What happens (outside of the csum part) is this:
>>
>>     __skb_push(offset);
>>     __vlan_insert_tag(); {
>>         skb_push(VLAN_HLEN);
>>     ...
>>         memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN);
>>     }
>>     __skb_pull(offset);
>>
>> If I understand this correctly, the last pull will therefore put
>> skb->data pointing at vlan_ethhdr->h_vlan_TCI of the new VLAN header
>> pushed by __vlan_insert_tag().
>>
>> That is assuming skb->data began right after the original ethernet
>> header.
>
> Yes, this is correct. Now, continuing this train of thought: you have
> skb->data pointing _currently_ at vlan_ethhdr->h_vlan_TCI.
>
> And then you call:
>
>    skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
>
> So, we point from the ->vlan_TCI + (2 * ETH_ALEN) as start offset, and
> with VLAN_HLEN (= 4 bytes that we added) as length for the csum
> correction as input. So, we point way beyond what we actually wanted
> to fixup wrt csum, no?
>
> But what we actually want to sum is [h_vlan_proto + h_vlan_TCI], which
> is where above skb_postpush_rcsum() call points to _before_ the last
> __skb_pull() happens. In other words, still at that time, we have the
> same expectations as in __vlan_insert_tag().
>
>> To me, that postpull csum currently is absolutely in the correct spot,
>> because it's acting upon the pull done by __vlan_insert_tag(), not the
>> one done here by skb_vlan_push().
>>
>> Right?
>>
>> Can you tell me how you tested this?  Just curious...
>
> I noticed both while reviewing the code, the error path fixup is not
> critical for ovs or act_vlan as the skb is dropped afterwards, but not
> necessarily in eBPF case, so there it matters as eBPF doesn't know at
> this point, what the program is going to do with it (similar fixup is
> done in __skb_vlan_pop() error path, btw). For the csum, I did a hexdump
> to compare what we write and what is being passed in for the csum correction.
>
> Anyway ...
>
> Aside from all this and based on your comment, I'm investigating whether
> for the vlan push and also pop case the __skb_pull(skb, offset) in success
> case is actually enough and whether it needs to take VLAN_HLEN into account
> as well. But, I need to do more test for that one first. At least the skb_vlan_push()
> comment says "__vlan_insert_tag expect skb->data pointing to mac header.
> So change skb->data before calling it and change back to original position
> later", Jiri?

For this part, what is meant with "original" position (relative to the start
of the ethernet header [currently the case], or relative to some data, e.g.
before/after the call, I still expect skb->data position to point to my IP header)?

Thanks,
Daniel

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2016-04-02  0:04 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


1) Missing device reference in IPSEC input path results in crashes during
   device unregistration.  From Subash Abhinov Kasiviswanathan.

2) Per-queue ISR register writes not being done properly in macb driver,
   from Cyrille Pitchen.

3) Stats accounting bugs in bcmgenet, from Patri Gynther.

4) Lightweight tunnel's TTL and TOS were swapped in netlink dumps,
   from Quentin Armitage.

5) SXGBE driver has off-by-one in probe error paths, from Rasmus
   Villemoes.

6) Fix race in save/swap/delete options in netfilter ipset, from
   Vishwanath Pai.

7) Ageing time of bridge not set properly when not operating over a
   switchdev device.  Fix from Haishuang Yan.

8) Fix GRO regression wrt. nested FOU/GUE based tunnels, from
   Alexander Duyck.

9) IPV6 UDP code bumps wrong stats, from Eric Dumazet.

10) FEC driver should only access registers that actually exist on
    the given chipset, fix from Fabio Estevam.

Please pull, thanks a lot!

The following changes since commit e46b4e2b46e173889b19999b8bd033d5e8b3acf0:

  Merge tag 'trace-v4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace (2016-03-24 10:52:25 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 

for you to fetch changes up to db5dd0db2d8352bb7fd5e9d16e17b79d66c7e4e3:

  net: mvneta: fix changing MTU when using per-cpu processing (2016-04-01 15:16:37 -0400)

----------------------------------------------------------------
Alexander Duyck (3):
      ixgbe: Store VXLAN port number in network order
      ixgbe: Fix ATR so that it correctly handles IPv6 extension headers
      gro: Allow tunnel stacking in the case of FOU/GUE

Arnd Bergmann (1):
      openvswitch: call only into reachable nf-nat code

Bjorn Helgaas (1):
      netpoll: Fix extra refcount release in netpoll_cleanup()

Bjørn Mork (1):
      qmi_wwan: add "D-Link DWM-221 B1" device id

Charles Keepax (1):
      net: macb: Only call GPIO functions if there is a valid GPIO

Colin Ian King (1):
      qed: initialize return rc to avoid returning garbage

Cosmin-Gabriel Samoila (1):
      Drivers: isdn: hisax: isac.c: Fix assignment and check into one expression.

Cyrille Pitchen (2):
      net: macb: replace macb_writel() call by queue_writel() to update queue ISR
      net: macb: remove BUG_ON() and reset the queue to handle RX errors

Daniel Borkmann (3):
      bpf: add missing map_flags to bpf_map_show_fdinfo
      bpf: make padding in bpf_tunnel_key explicit
      tun, bpf: fix suspicious RCU usage in tun_{attach, detach}_filter

Daniele Palmas (1):
      net: usb: cdc_ncm: adding Telit LE910 V2 mobile broadband card

David S. Miller (4):
      Merge branch 'hns-fixes'
      Merge git://git.kernel.org/.../pablo/nf
      Merge branch '10GbE' of git://git.kernel.org/.../jkirsher/net-queue
      Merge branch 'stmmac-fixes'

Diego Viola (1):
      drivers/net/usb/plusb.c: Fix typo

Emil Tantilov (2):
      ixgbevf: fix error code path when setting MAC address
      ixgbe: make __ixgbe_setup_tc static

Eric Dumazet (1):
      ipv6: udp: fix UDP_MIB_IGNOREDMULTI updates

Fabio Estevam (1):
      fec: Do not access unexisting register in Coldfire

Florian Westphal (3):
      netfilter: x_tables: validate e->target_offset early
      netfilter: x_tables: make sure e->next_offset covers remaining blob size
      netfilter: x_tables: fix unconditional helper

Giuseppe CAVALLARO (3):
      stmmac: fix TX normal DESC
      Revert "stmmac: Fix 'eth0: No PHY found' regression"
      stmmac: fix MDIO settings

Haishuang Yan (2):
      openvswitch: Use proper buffer size in nla_memcpy
      bridge: Allow set bridge ageing time when switchdev disabled

Jaedon Shin (1):
      net: phy: bcm7xxx: Add entries for Broadcom BCM7346 and BCM7362

Jarno Rajahalme (1):
      openvswitch: Fix checking for new expected connections.

Jisheng Zhang (5):
      net: mvpp2: replace MVPP2_CPU_D_CACHE_LINE_SIZE with L1_CACHE_BYTES
      net: mvneta: replace MVNETA_CPU_D_CACHE_LINE_SIZE with L1_CACHE_BYTES
      net: mvpp2: fix maybe-uninitialized warning
      net: mvpp2: use cache_line_size() to get cacheline size
      net: mvneta: use cache_line_size() to get cacheline size

Kejian Yan (1):
      net: hns: fix warning of passing zero to 'PTR_ERR'

Lino Sanfilippo (1):
      ravb: fix software timestamping

Liping Zhang (1):
      netfilter: ipv4: fix NULL dereference

Lisheng (2):
      net: hns: fixed the setting and getting overtime bug
      net: hns: set-coalesce-usecs returns errno by dsaf.ko

Manish Chopra (1):
      qlge: Update version to 1.00.00.35

Marcelo Ricardo Leitner (1):
      sctp: really allow using GFP_KERNEL on sctp_packet_transmit

Marcin Wojtas (1):
      net: mvneta: fix changing MTU when using per-cpu processing

Mark Rustad (1):
      ixgbe: Use udelay to avoid sleeping while atomic

Michael Chan (3):
      bnxt_en: Implement proper firmware message padding.
      bnxt_en: Fix typo in bnxt_hwrm_set_pause_common().
      bnxt_en: Fix ethtool -a reporting.

Nicolas Dichtel (2):
      switchdev: fix typo in comments/doc
      rtnl: fix msg size calculation in if_nlmsg_size()

Pablo Neira Ayuso (2):
      netfilter: nfnetlink_queue: honor NFQA_CFG_F_FAIL_OPEN when netlink unicast fails
      netfilter: x_tables: enforce nul-terminated table name from getsockopt GET_ENTRIES

Patrick Uiterwijk (2):
      net: dsa: mv88e6xxx: Introduce _mv88e6xxx_phy_page_{read, write}
      net: dsa: mv88e6xxx: Clear the PDOWN bit on setup

Petri Gynther (2):
      net: bcmgenet: fix dev->stats.tx_bytes accounting
      net: bcmgenet: fix skb_len in bcmgenet_xmit_single()

Prashant Sreedharan (1):
      bnxt_en: Initialize CP doorbell value before ring allocation

Qianqian Xie (4):
      net: hns: fix a bug for cycle index
      net: hns: optimizate fmt of snprintf()
      net: hns: bug fix for return values
      net: hns: remove useless variable assignment and comment

Quentin Armitage (1):
      Fix returned tc and hoplimit values for route with IPv6 encapsulation

Rasmus Villemoes (1):
      net: sxgbe: fix error paths in sxgbe_platform_probe()

Sheng Li (2):
      net: hns: optimizate irq proccess for HNS V2
      net: hns: bug fix about getting hilink status for HNS v2

Sridhar Samudrala (3):
      ixgbe: fix error handling in TC cls_u32 offload routines
      ixgbe: Fix cls_u32 offload support for fields with masks
      ixgbe: Fix cls_u32 offload support for L4 ports

Stefan Assmann (2):
      ixgbe: call ndo_stop() instead of dev_close() when running offline selftest
      ixgbevf: call ndo_stop() instead of dev_close() when running offline selftest

Tushar Dave (1):
      ixgbe: Fix for RAR0 not being set to default MAC addr

Vishwanath Pai (1):
      netfilter: ipset: fix race condition in ipset save, swap and delete

Xin Long (1):
      team: team should sync the port's uc/mc addrs when add a port

subashab@codeaurora.org (1):
      xfrm: Fix crash observed during device unregistration and decryption

 Documentation/networking/switchdev.txt                |   2 +-
 drivers/isdn/hisax/isac.c                             |  15 ++--
 drivers/net/dsa/mv88e6xxx.c                           |  85 +++++++++++++++++----
 drivers/net/dsa/mv88e6xxx.h                           |   8 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt.c             |  10 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h             |   2 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c     |   6 +-
 drivers/net/ethernet/broadcom/genet/bcmgenet.c        |  16 ++--
 drivers/net/ethernet/broadcom/genet/bcmgenet.h        |   6 ++
 drivers/net/ethernet/cadence/macb.c                   |  69 +++++++++++++----
 drivers/net/ethernet/freescale/fec_main.c             |   2 +-
 drivers/net/ethernet/hisilicon/hns/hnae.h             |   2 +-
 drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c     |  64 ++++++----------
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c    |   3 +-
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c    |  12 +--
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c    |  40 +++++-----
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c     | 196 +++++++++++++++++++++++--------------------------
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h     |  23 +++---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h     |   3 +
 drivers/net/ethernet/hisilicon/hns/hns_enet.c         |  16 ++--
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c      |  10 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe.h              |  10 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c      |   4 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c         | 165 +++++++++++++++++++----------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_model.h        |  21 ++----
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c         |   2 +-
 drivers/net/ethernet/intel/ixgbevf/ethtool.c          |   4 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h          |   2 +
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c     |  16 ++--
 drivers/net/ethernet/intel/ixgbevf/vf.c               |   4 +-
 drivers/net/ethernet/marvell/mvneta.c                 |  40 +++++-----
 drivers/net/ethernet/marvell/mvpp2.c                  |  18 +----
 drivers/net/ethernet/qlogic/qed/qed_int.c             |   2 +-
 drivers/net/ethernet/qlogic/qlge/qlge.h               |   2 +-
 drivers/net/ethernet/renesas/ravb_main.c              |   2 +-
 drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c   |   4 +-
 drivers/net/ethernet/stmicro/stmmac/norm_desc.c       |  16 ++--
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c     |  16 ++--
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c     |  10 +--
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c |  91 ++++++++++++++++-------
 drivers/net/phy/bcm7xxx.c                             |   4 +
 drivers/net/team/team.c                               |   5 ++
 drivers/net/tun.c                                     |   8 +-
 drivers/net/usb/cdc_ncm.c                             |   7 ++
 drivers/net/usb/plusb.c                               |   2 +-
 drivers/net/usb/qmi_wwan.c                            |   1 +
 include/linux/brcmphy.h                               |   2 +
 include/linux/filter.h                                |   4 +
 include/linux/netfilter/ipset/ip_set.h                |   4 +
 include/linux/stmmac.h                                |   1 -
 include/uapi/linux/bpf.h                              |   1 +
 kernel/bpf/syscall.c                                  |   6 +-
 net/bridge/br_stp.c                                   |   2 +-
 net/bridge/netfilter/ebtables.c                       |   4 +
 net/bridge/netfilter/nft_reject_bridge.c              |  20 ++---
 net/core/filter.c                                     |  38 ++++++----
 net/core/netpoll.c                                    |   3 +-
 net/core/rtnetlink.c                                  |   1 +
 net/ipv4/fou.c                                        |  16 ++++
 net/ipv4/ip_tunnel_core.c                             |   4 +-
 net/ipv4/netfilter/arp_tables.c                       |  43 ++++++-----
 net/ipv4/netfilter/ip_tables.c                        |  48 ++++++------
 net/ipv4/netfilter/ipt_SYNPROXY.c                     |  54 +++++++-------
 net/ipv6/netfilter/ip6_tables.c                       |  48 ++++++----

^ permalink raw reply

* Re: [RFC PATCH net 3/4] ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update
From: Cong Wang @ 2016-04-01 23:15 UTC (permalink / raw)
  To: Martin KaFai Lau; +Cc: netdev, Eric Dumazet, Wei Wang, Kernel Team
In-Reply-To: <CAM_iQpVeOKtTJ=oc9z6=SEaHDBTfpaPLKa-43TH1b_A3+TdWwA@mail.gmail.com>

On Fri, Apr 1, 2016 at 4:13 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Fri, Apr 1, 2016 at 3:56 PM, Martin KaFai Lau <kafai@fb.com> wrote:
>> +       bh_lock_sock(sk);
>> +       if (!sock_owned_by_user(sk))
>> +               ip6_datagram_dst_update(sk, false);
>> +       bh_unlock_sock(sk);
>
>
> My discussion with Eric shows that we probably don't need to hold
> this sock lock here, and you are Cc'ed in that thread, so
>
> 1) why do you still take the lock here?
> 2) why didn't you involve in our discussion if you disagree?

Here is the original thread:
http://comments.gmane.org/gmane.linux.network/405404

^ permalink raw reply

* Re: [RFC PATCH net 3/4] ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update
From: Cong Wang @ 2016-04-01 23:13 UTC (permalink / raw)
  To: Martin KaFai Lau; +Cc: netdev, Eric Dumazet, Wei Wang, Kernel Team
In-Reply-To: <1459551391-69969-4-git-send-email-kafai@fb.com>

On Fri, Apr 1, 2016 at 3:56 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> +       bh_lock_sock(sk);
> +       if (!sock_owned_by_user(sk))
> +               ip6_datagram_dst_update(sk, false);
> +       bh_unlock_sock(sk);


My discussion with Eric shows that we probably don't need to hold
this sock lock here, and you are Cc'ed in that thread, so

1) why do you still take the lock here?
2) why didn't you involve in our discussion if you disagree?

^ permalink raw reply

* [RFC PATCH net 2/4] ipv6: datagram: Refactor dst lookup and update codes to a new function
From: Martin KaFai Lau @ 2016-04-01 22:56 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Eric Dumazet, Wei Wang, Kernel Team
In-Reply-To: <1459551391-69969-1-git-send-email-kafai@fb.com>

This patch moves the route lookup and update codes for connected
datagram sk to a newly created function ip6_datagram_dst_update()

It will be reused during the pmtu update in the later patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
---
 net/ipv6/datagram.c | 103 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 57 insertions(+), 46 deletions(-)

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index f07c1dd..140665b 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -64,16 +64,65 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
 	security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
 }
 
+static int ip6_datagram_dst_update(struct sock *sk)
+{
+	struct ip6_flowlabel *flowlabel = NULL;
+	struct in6_addr *final_p, final;
+	struct ipv6_txoptions *opt;
+	struct dst_entry *dst;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct flowi6 fl6;
+	int err = 0;
+
+	if (np->sndflow && np->flow_label) {
+		flowlabel = fl6_sock_lookup(sk, np->flow_label);
+		if (!flowlabel)
+			return -EINVAL;
+	}
+	ip6_datagram_flow_key_init(&fl6, sk);
+
+	rcu_read_lock();
+	opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
+	final_p = fl6_update_dst(&fl6, opt, &final);
+	rcu_read_unlock();
+
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		goto out;
+	}
+
+	if (ipv6_addr_any(&np->saddr))
+		np->saddr = fl6.saddr;
+
+	if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+		sk->sk_v6_rcv_saddr = fl6.saddr;
+		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+		if (sk->sk_prot->rehash)
+			sk->sk_prot->rehash(sk);
+	}
+
+	ip6_dst_store(sk, dst,
+		      ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ?
+		      &sk->sk_v6_daddr : NULL,
+#ifdef CONFIG_IPV6_SUBTREES
+		      ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
+		      &np->saddr :
+#endif
+		      NULL);
+
+out:
+	fl6_sock_release(flowlabel);
+	return err;
+}
+
 static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
 	struct inet_sock	*inet = inet_sk(sk);
 	struct ipv6_pinfo	*np = inet6_sk(sk);
-	struct in6_addr	*daddr, *final_p, final;
-	struct dst_entry	*dst;
-	struct flowi6		fl6;
-	struct ip6_flowlabel	*flowlabel = NULL;
-	struct ipv6_txoptions	*opt;
+	struct in6_addr		*daddr;
 	int			addr_type;
 	int			err;
 	__be32			fl6_flowlabel = 0;
@@ -91,14 +140,8 @@ static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int a
 	if (usin->sin6_family != AF_INET6)
 		return -EAFNOSUPPORT;
 
-	if (np->sndflow) {
+	if (np->sndflow)
 		fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
-		if (fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
-			flowlabel = fl6_sock_lookup(sk, fl6_flowlabel);
-			if (!flowlabel)
-				return -EINVAL;
-		}
-	}
 
 	addr_type = ipv6_addr_type(&usin->sin6_addr);
 
@@ -178,45 +221,13 @@ ipv4_connected:
 	 *	destination cache for it.
 	 */
 
-	ip6_datagram_flow_key_init(&fl6, sk);
-
-	rcu_read_lock();
-	opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
-	final_p = fl6_update_dst(&fl6, opt, &final);
-	rcu_read_unlock();
-
-	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
-	err = 0;
-	if (IS_ERR(dst)) {
-		err = PTR_ERR(dst);
+	err = ip6_datagram_dst_update(sk);
+	if (err)
 		goto out;
-	}
-
-	/* source address lookup done in ip6_dst_lookup */
-
-	if (ipv6_addr_any(&np->saddr))
-		np->saddr = fl6.saddr;
-
-	if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-		sk->sk_v6_rcv_saddr = fl6.saddr;
-		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
-		if (sk->sk_prot->rehash)
-			sk->sk_prot->rehash(sk);
-	}
-
-	ip6_dst_store(sk, dst,
-		      ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ?
-		      &sk->sk_v6_daddr : NULL,
-#ifdef CONFIG_IPV6_SUBTREES
-		      ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
-		      &np->saddr :
-#endif
-		      NULL);
 
 	sk->sk_state = TCP_ESTABLISHED;
 	sk_set_txhash(sk);
 out:
-	fl6_sock_release(flowlabel);
 	return err;
 }
 
-- 
2.5.1

^ permalink raw reply related

* [RFC PATCH net 3/4] ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update
From: Martin KaFai Lau @ 2016-04-01 22:56 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Eric Dumazet, Wei Wang, Kernel Team
In-Reply-To: <1459551391-69969-1-git-send-email-kafai@fb.com>

There is a case in connected UDP socket such that
getsockopt(IPV6_MTU) will return a stale MTU value. The reproducible
sequence could be the following:
1. Create a connected UDP socket
2. Send some datagrams out
3. Receive a ICMPV6_PKT_TOOBIG
4. No new outgoing datagrams to trigger the sk_dst_check()
   logic to update the sk->sk_dst_cache.
5. getsockopt(IPV6_MTU) returns the mtu from the invalid
   sk->sk_dst_cache instead of the newly created RTF_CACHE clone.

This patch updates the sk->sk_dst_cache for a connected datagram sk.

It is done under '!sock_owned_by_user(sk)' condition because
the user may make another ip6_datagram_connect() while
dst lookup and update are happening.

For the sock_owned_by_user(sk) == true case, the next patch will
introduce a release_cb() which will update the sk->sk_dst_cache.

Test:

Server (Connected UDP Socket):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Route Details:
[root@arch-fb-vm1 ~]# ip -6 r show | egrep '2fac'
2fac::/64 dev eth0  proto kernel  metric 256  pref medium
2fac:face::/64 via 2fac::face dev eth0  metric 1024  pref medium

A simple python code to create a connected UDP socket:

import socket
import errno

HOST = '2fac::1'
PORT = 8080

s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
s.bind((HOST, PORT))
s.connect(('2fac:face::face', 53))
print("connected")
while True:
    try:
	data = s.recv(1024)
    except socket.error as se:
	if se.errno == errno.EMSGSIZE:
		pmtu = s.getsockopt(41, 24)
		print("PMTU:%d" % pmtu)
		break
s.close()

Python program output after getting a ICMPV6_PKT_TOOBIG:
[root@arch-fb-vm1 ~]# python2 ~/devshare/kernel/tasks/fib6/udp-connect-53-8080.py
connected
PMTU:1300

Cache routes after recieving TOOBIG:
[root@arch-fb-vm1 ~]# ip -6 r show table cache
2fac:face::face via 2fac::face dev eth0  metric 0
    cache  expires 463sec mtu 1300 pref medium

Client (Send the ICMPV6_PKT_TOOBIG):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
scapy is used to generate the TOOBIG message.  Here is the scapy script I have
used:

>>> p=Ether(src='da:75:4d:36:ac:32', dst='52:54:00:12:34:66', type=0x86dd)/IPv6(src='2fac::face', dst='2fac::1')/ICMPv6PacketTooBig(mtu=1300)/IPv6(src='2fac::
1',dst='2fac:face::face', nh='UDP')/UDP(sport=8080,dport=53)
>>> sendp(p, iface='qemubr0')

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reported-by: Wei Wang <weiwan@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
---
 include/net/ipv6.h  |  1 +
 net/ipv6/datagram.c | 20 +++++++++++---------
 net/ipv6/route.c    | 11 +++++++++++
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index d0aeb97..fd02e90 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -959,6 +959,7 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname,
 int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
 int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
 				 int addr_len);
+int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
 
 int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
 		    int *addr_len);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 140665b..0b60f1e 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -64,7 +64,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
 	security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
 }
 
-static int ip6_datagram_dst_update(struct sock *sk)
+int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
 {
 	struct ip6_flowlabel *flowlabel = NULL;
 	struct in6_addr *final_p, final;
@@ -93,14 +93,16 @@ static int ip6_datagram_dst_update(struct sock *sk)
 		goto out;
 	}
 
-	if (ipv6_addr_any(&np->saddr))
-		np->saddr = fl6.saddr;
+	if (fix_sk_saddr) {
+		if (ipv6_addr_any(&np->saddr))
+			np->saddr = fl6.saddr;
 
-	if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-		sk->sk_v6_rcv_saddr = fl6.saddr;
-		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
-		if (sk->sk_prot->rehash)
-			sk->sk_prot->rehash(sk);
+		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+			sk->sk_v6_rcv_saddr = fl6.saddr;
+			inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+			if (sk->sk_prot->rehash)
+				sk->sk_prot->rehash(sk);
+		}
 	}
 
 	ip6_dst_store(sk, dst,
@@ -221,7 +223,7 @@ ipv4_connected:
 	 *	destination cache for it.
 	 */
 
-	err = ip6_datagram_dst_update(sk);
+	err = ip6_datagram_dst_update(sk, true);
 	if (err)
 		goto out;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ed44663..f7e6a6d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1417,8 +1417,19 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu);
 
 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
 {
+	struct dst_entry *dst;
+
 	ip6_update_pmtu(skb, sock_net(sk), mtu,
 			sk->sk_bound_dev_if, sk->sk_mark);
+
+	dst = __sk_dst_get(sk);
+	if (!dst || dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
+		return;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk))
+		ip6_datagram_dst_update(sk, false);
+	bh_unlock_sock(sk);
 }
 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
 
-- 
2.5.1

^ permalink raw reply related

* [RFC PATCH net 1/4] ipv6: datagram: Refactor flowi6 init codes to a new function
From: Martin KaFai Lau @ 2016-04-01 22:56 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Eric Dumazet, Wei Wang, Kernel Team
In-Reply-To: <1459551391-69969-1-git-send-email-kafai@fb.com>

Move flowi6 init codes for connected datagram sk to a newly created
function ip6_datagram_flow_key_init().

It will be reused during pmtu update in the later patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
---
 net/ipv6/datagram.c | 50 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 4281621..f07c1dd 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -40,6 +40,30 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a)
 	return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0);
 }
 
+static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	memset(fl6, 0, sizeof(*fl6));
+	fl6->flowi6_proto = sk->sk_protocol;
+	fl6->daddr = sk->sk_v6_daddr;
+	fl6->saddr = np->saddr;
+	fl6->flowi6_oif = sk->sk_bound_dev_if;
+	fl6->flowi6_mark = sk->sk_mark;
+	fl6->fl6_dport = inet->inet_dport;
+	fl6->fl6_sport = inet->inet_sport;
+	fl6->flowlabel = np->flow_label;
+
+	if (!fl6->flowi6_oif)
+		fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+
+	if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr))
+		fl6->flowi6_oif = np->mcast_oif;
+
+	security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+}
+
 static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
@@ -52,6 +76,7 @@ static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int a
 	struct ipv6_txoptions	*opt;
 	int			addr_type;
 	int			err;
+	__be32			fl6_flowlabel = 0;
 
 	if (usin->sin6_family == AF_INET) {
 		if (__ipv6_only_sock(sk))
@@ -66,11 +91,10 @@ static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int a
 	if (usin->sin6_family != AF_INET6)
 		return -EAFNOSUPPORT;
 
-	memset(&fl6, 0, sizeof(fl6));
 	if (np->sndflow) {
-		fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
-		if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
-			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+		fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+		if (fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
+			flowlabel = fl6_sock_lookup(sk, fl6_flowlabel);
 			if (!flowlabel)
 				return -EINVAL;
 		}
@@ -145,7 +169,7 @@ ipv4_connected:
 	}
 
 	sk->sk_v6_daddr = *daddr;
-	np->flow_label = fl6.flowlabel;
+	np->flow_label = fl6_flowlabel;
 
 	inet->inet_dport = usin->sin6_port;
 
@@ -154,21 +178,7 @@ ipv4_connected:
 	 *	destination cache for it.
 	 */
 
-	fl6.flowi6_proto = sk->sk_protocol;
-	fl6.daddr = sk->sk_v6_daddr;
-	fl6.saddr = np->saddr;
-	fl6.flowi6_oif = sk->sk_bound_dev_if;
-	fl6.flowi6_mark = sk->sk_mark;
-	fl6.fl6_dport = inet->inet_dport;
-	fl6.fl6_sport = inet->inet_sport;
-
-	if (!fl6.flowi6_oif)
-		fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
-
-	if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST))
-		fl6.flowi6_oif = np->mcast_oif;
-
-	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+	ip6_datagram_flow_key_init(&fl6, sk);
 
 	rcu_read_lock();
 	opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
-- 
2.5.1

^ permalink raw reply related

* [RFC PATCH net 4/4] ipv6: udp: Do a route lookup and update during release_cb
From: Martin KaFai Lau @ 2016-04-01 22:56 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Eric Dumazet, Wei Wang, Kernel Team
In-Reply-To: <1459551391-69969-1-git-send-email-kafai@fb.com>

This patch adds a release_cb for UDPv6.  It does a route lookup
and updates sk->sk_dst_cache if it is needed.  It picks up the
left-over job from ip6_sk_update_pmtu() if the sk is owned
by user during the pmtu update.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reported-by: Wei Wang <weiwan@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
---
 include/net/ipv6.h  |  1 +
 net/ipv6/datagram.c | 12 ++++++++++++
 net/ipv6/udp.c      |  1 +
 3 files changed, 14 insertions(+)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index fd02e90..1be050a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -960,6 +960,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
 int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
 				 int addr_len);
 int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
+void ip6_datagram_release_cb(struct sock *sk);
 
 int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
 		    int *addr_len);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 0b60f1e..a4f06bd 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -119,6 +119,18 @@ out:
 	return err;
 }
 
+void ip6_datagram_release_cb(struct sock *sk)
+{
+	struct dst_entry *dst;
+
+	dst = __sk_dst_get(sk);
+	if (!dst || dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
+		return;
+
+	ip6_datagram_dst_update(sk, false);
+}
+EXPORT_SYMBOL_GPL(ip6_datagram_release_cb);
+
 static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 8125931..6bc5c66 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1539,6 +1539,7 @@ struct proto udpv6_prot = {
 	.sendmsg	   = udpv6_sendmsg,
 	.recvmsg	   = udpv6_recvmsg,
 	.backlog_rcv	   = __udpv6_queue_rcv_skb,
+	.release_cb	   = ip6_datagram_release_cb,
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
 	.rehash		   = udp_v6_rehash,
-- 
2.5.1

^ permalink raw reply related

* [RFC PATCH net 0/4] ip6: datagram: Update dst cache of a connected datagram sk during pmtu update
From: Martin KaFai Lau @ 2016-04-01 22:56 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Eric Dumazet, Wei Wang, Kernel Team

There is a case in connected UDP socket such that
getsockopt(IPV6_MTU) will return a stale MTU value. The reproducible
sequence could be the following:
1. Create a connected UDP socket
2. Send some datagrams out
3. Receive a ICMPV6_PKT_TOOBIG
4. No new outgoing datagrams to trigger the sk_dst_check()
   logic to update the sk->sk_dst_cache.
5. getsockopt(IPV6_MTU) returns the mtu from the invalid
   sk->sk_dst_cache instead of the newly created RTF_CACHE clone.

Patch 1 and 2 are the prep work.
Patch 3 and 4 are the fixes.

^ permalink raw reply

* Re: [v7, 0/5] Fix eSDHC host version register bug
From: Scott Wood @ 2016-04-01 22:25 UTC (permalink / raw)
  To: Yangbo Lu, devicetree, linux-arm-kernel, linux-kernel,
	linuxppc-dev, linux-clk, linux-i2c, iommu, netdev, linux-mmc
  Cc: ulf.hansson, Zhao Qiang, Russell King, Bhupesh Sharma,
	Joerg Roedel, Santosh Shilimkar, Jochen Friedrich, scott.wood,
	Rob Herring, Claudiu Manoil, Kumar Gala, leoyang.li, xiaobo.xie
In-Reply-To: <1459480051-3701-1-git-send-email-yangbo.lu@nxp.com>

On Fri, 2016-04-01 at 11:07 +0800, Yangbo Lu wrote:
> This patchset is used to fix a host version register bug in the T4240-R1.0
> -R2.0
> eSDHC controller. To get the SoC version and revision, it's needed to add
> the
> GUTS driver to access the global utilities registers.
> 
> So, the first three patches are to add the GUTS driver.
> The following two patches are to enable GUTS driver support to get SVR in
> eSDHC
> driver and fix host version for T4240.
> 
> Yangbo Lu (5):
>   ARM64: dts: ls2080a: add device configuration node
>   soc: fsl: add GUTS driver for QorIQ platforms
>   dt: move guts devicetree doc out of powerpc directory
>   powerpc/fsl: move mpc85xx.h to include/linux/fsl
>   mmc: sdhci-of-esdhc: fix host version for T4240-R1.0-R2.0

Acked-by: Scott Wood <oss@buserror.net>

-Scott


^ permalink raw reply

* [net-next PATCH 1/2 v4] ibmvnic: map L2/L3/L4 header descriptors to firmware
From: Thomas Falcon @ 2016-04-01 22:20 UTC (permalink / raw)
  To: netdev; +Cc: linuxppc-dev, davem, John Allen

Allow the VNIC driver to provide descriptors containing
L2/L3/L4 headers to firmware.  This feature is needed
for greater hardware compatibility and enablement of checksum
and TCP offloading features.

A new function is included for the hypervisor call,
H_SEND_SUBCRQ_INDIRECT, allowing a DMA-mapped array of SCRQ
descriptor elements to be sent to the VNIC server.

These additions will help fully enable checksum offloading as
well as other features as they are included later.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Cc: John Allen <jallen@linux.vnet.ibm.com>
---
v2: Fixed typo error caught by kbuild test bot
v3: Fixed erroneous patch sender
v4: sorry for the delay in resending, Thanks to David Miller for comments,
    removed all extra memory allocations,
    merged some helper functions,
    calculate all header lengths to meet firmware requirements,
    fixed endian bugs in the send_subcrq_indirect
---
 drivers/net/ethernet/ibm/ibmvnic.c | 195 ++++++++++++++++++++++++++++++++++++-
 drivers/net/ethernet/ibm/ibmvnic.h |   3 +
 2 files changed, 194 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 6e9e16ee..4e97e76 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -61,6 +61,7 @@
 #include <linux/proc_fs.h>
 #include <linux/in.h>
 #include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/irq.h>
 #include <linux/kthread.h>
 #include <linux/seq_file.h>
@@ -94,6 +95,7 @@ static int ibmvnic_reenable_crq_queue(struct ibmvnic_adapter *);
 static int ibmvnic_send_crq(struct ibmvnic_adapter *, union ibmvnic_crq *);
 static int send_subcrq(struct ibmvnic_adapter *adapter, u64 remote_handle,
 		       union sub_crq *sub_crq);
+static int send_subcrq_indirect(struct ibmvnic_adapter *, u64, u64, u64);
 static irqreturn_t ibmvnic_interrupt_rx(int irq, void *instance);
 static int enable_scrq_irq(struct ibmvnic_adapter *,
 			   struct ibmvnic_sub_crq_queue *);
@@ -561,10 +563,141 @@ static int ibmvnic_close(struct net_device *netdev)
 	return 0;
 }
 
+/**
+ * build_hdr_data - creates L2/L3/L4 header data buffer
+ * @hdr_field - bitfield determining needed headers
+ * @skb - socket buffer
+ * @hdr_len - array of header lengths
+ * @tot_len - total length of data
+ *
+ * Reads hdr_field to determine which headers are needed by firmware.
+ * Builds a buffer containing these headers.  Saves individual header
+ * lengths and total buffer length to be used to build descriptors.
+ */
+static int build_hdr_data(u8 hdr_field, struct sk_buff *skb,
+			  int *hdr_len, u8 *hdr_data)
+{
+	int len = 0;
+	u8 *hdr;
+
+	hdr_len[0] = sizeof(struct ethhdr);
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		hdr_len[1] = ip_hdr(skb)->ihl * 4;
+		if (ip_hdr(skb)->protocol == IPPROTO_TCP)
+			hdr_len[2] = tcp_hdrlen(skb);
+		else if (ip_hdr(skb)->protocol == IPPROTO_UDP)
+			hdr_len[2] = sizeof(struct udphdr);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		hdr_len[1] = sizeof(struct ipv6hdr);
+		if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
+			hdr_len[2] = tcp_hdrlen(skb);
+		else if (ipv6_hdr(skb)->nexthdr == IPPROTO_UDP)
+			hdr_len[2] = sizeof(struct udphdr);
+	}
+
+	memset(hdr_data, 0, 120);
+	if ((hdr_field >> 6) & 1) {
+		hdr = skb_mac_header(skb);
+		memcpy(hdr_data, hdr, hdr_len[0]);
+		len += hdr_len[0];
+	}
+
+	if ((hdr_field >> 5) & 1) {
+		hdr = skb_network_header(skb);
+		memcpy(hdr_data + len, hdr, hdr_len[1]);
+		len += hdr_len[1];
+	}
+
+	if ((hdr_field >> 4) & 1) {
+		hdr = skb_transport_header(skb);
+		memcpy(hdr_data + len, hdr, hdr_len[2]);
+		len += hdr_len[2];
+	}
+	return len;
+}
+
+/**
+ * create_hdr_descs - create header and header extension descriptors
+ * @hdr_field - bitfield determining needed headers
+ * @data - buffer containing header data
+ * @len - length of data buffer
+ * @hdr_len - array of individual header lengths
+ * @scrq_arr - descriptor array
+ *
+ * Creates header and, if needed, header extension descriptors and
+ * places them in a descriptor array, scrq_arr
+ */
+
+static void create_hdr_descs(u8 hdr_field, u8 *hdr_data, int len, int *hdr_len,
+			     union sub_crq *scrq_arr)
+{
+	union sub_crq hdr_desc;
+	int tmp_len = len;
+	u8 *data, *cur;
+	int tmp;
+
+	while (tmp_len > 0) {
+		cur = hdr_data + len - tmp_len;
+
+		memset(&hdr_desc, 0, sizeof(hdr_desc));
+		if (cur != hdr_data) {
+			data = hdr_desc.hdr_ext.data;
+			tmp = tmp_len > 29 ? 29 : tmp_len;
+			hdr_desc.hdr_ext.first = IBMVNIC_CRQ_CMD;
+			hdr_desc.hdr_ext.type = IBMVNIC_HDR_EXT_DESC;
+			hdr_desc.hdr_ext.len = tmp;
+		} else {
+			data = hdr_desc.hdr.data;
+			tmp = tmp_len > 24 ? 24 : tmp_len;
+			hdr_desc.hdr.first = IBMVNIC_CRQ_CMD;
+			hdr_desc.hdr.type = IBMVNIC_HDR_DESC;
+			hdr_desc.hdr.len = tmp;
+			hdr_desc.hdr.l2_len = (u8)hdr_len[0];
+			hdr_desc.hdr.l3_len = cpu_to_be16((u16)hdr_len[1]);
+			hdr_desc.hdr.l4_len = (u8)hdr_len[2];
+			hdr_desc.hdr.flag = hdr_field << 1;
+		}
+		memcpy(data, cur, tmp);
+		tmp_len -= tmp;
+		*scrq_arr = hdr_desc;
+		scrq_arr++;
+	}
+}
+
+/**
+ * build_hdr_descs_arr - build a header descriptor array
+ * @skb - socket buffer
+ * @num_entries - number of descriptors to be sent
+ * @subcrq - first TX descriptor
+ * @hdr_field - bit field determining which headers will be sent
+ *
+ * This function will build a TX descriptor array with applicable
+ * L2/L3/L4 packet header descriptors to be sent by send_subcrq_indirect.
+ */
+
+static void build_hdr_descs_arr(struct ibmvnic_tx_buff *txbuff,
+				int *num_entries, u8 hdr_field)
+{
+	int hdr_len[3] = {0, 0, 0};
+	int tot_len, len;
+	u8 *hdr_data = txbuff->hdr_data;
+
+	tot_len = build_hdr_data(hdr_field, txbuff->skb, hdr_len,
+				 txbuff->hdr_data);
+	len = tot_len;
+	len -= 24;
+	if (len > 0)
+		num_entries += len % 29 ? len / 29 + 1 : len / 29;
+	create_hdr_descs(hdr_field, hdr_data, tot_len, hdr_len,
+			 txbuff->indir_arr + 1);
+}
+
 static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int queue_num = skb_get_queue_mapping(skb);
+	u8 *hdrs = (u8 *)&adapter->tx_rx_desc_req;
 	struct device *dev = &adapter->vdev->dev;
 	struct ibmvnic_tx_buff *tx_buff = NULL;
 	struct ibmvnic_tx_pool *tx_pool;
@@ -579,6 +712,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	unsigned long lpar_rc;
 	union sub_crq tx_crq;
 	unsigned int offset;
+	int num_entries = 1;
 	unsigned char *dst;
 	u64 *handle_array;
 	int index = 0;
@@ -644,11 +778,34 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 			tx_crq.v1.flags1 |= IBMVNIC_TX_PROT_UDP;
 	}
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL)
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		tx_crq.v1.flags1 |= IBMVNIC_TX_CHKSUM_OFFLOAD;
-
-	lpar_rc = send_subcrq(adapter, handle_array[0], &tx_crq);
-
+		hdrs += 2;
+	}
+	/* determine if l2/3/4 headers are sent to firmware */
+	if ((*hdrs >> 7) & 1 &&
+	    (skb->protocol == htons(ETH_P_IP) ||
+	     skb->protocol == htons(ETH_P_IPV6))) {
+		build_hdr_descs_arr(tx_buff, &num_entries, *hdrs);
+		tx_crq.v1.n_crq_elem = num_entries;
+		tx_buff->indir_arr[0] = tx_crq;
+		tx_buff->indir_dma = dma_map_single(dev, tx_buff->indir_arr,
+						    sizeof(tx_buff->indir_arr),
+						    DMA_TO_DEVICE);
+		if (dma_mapping_error(dev, tx_buff->indir_dma)) {
+			if (!firmware_has_feature(FW_FEATURE_CMO))
+				dev_err(dev, "tx: unable to map descriptor array\n");
+			tx_map_failed++;
+			tx_dropped++;
+			ret = NETDEV_TX_BUSY;
+			goto out;
+		}
+		lpar_rc = send_subcrq_indirect(adapter, handle_array[0],
+					       (u64)tx_buff->indir_dma,
+					       (u64)num_entries);
+	} else {
+		lpar_rc = send_subcrq(adapter, handle_array[0], &tx_crq);
+	}
 	if (lpar_rc != H_SUCCESS) {
 		dev_err(dev, "tx failed with code %ld\n", lpar_rc);
 
@@ -1159,6 +1316,7 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter,
 	union sub_crq *next;
 	int index;
 	int i, j;
+	u8 first;
 
 restart_loop:
 	while (pending_scrq(adapter, scrq)) {
@@ -1181,6 +1339,13 @@ restart_loop:
 				txbuff->data_dma[j] = 0;
 				txbuff->used_bounce = false;
 			}
+			/* if sub_crq was sent indirectly */
+			first = txbuff->indir_arr[0].generic.first;
+			if (first == IBMVNIC_CRQ_CMD) {
+				dma_unmap_single(dev, txbuff->indir_dma,
+						 sizeof(txbuff->indir_arr),
+						 DMA_TO_DEVICE);
+			}
 
 			if (txbuff->last_frag)
 				dev_kfree_skb_any(txbuff->skb);
@@ -1494,6 +1659,28 @@ static int send_subcrq(struct ibmvnic_adapter *adapter, u64 remote_handle,
 	return rc;
 }
 
+static int send_subcrq_indirect(struct ibmvnic_adapter *adapter,
+				u64 remote_handle, u64 ioba, u64 num_entries)
+{
+	unsigned int ua = adapter->vdev->unit_address;
+	struct device *dev = &adapter->vdev->dev;
+	int rc;
+
+	/* Make sure the hypervisor sees the complete request */
+	mb();
+	rc = plpar_hcall_norets(H_SEND_SUB_CRQ_INDIRECT, ua,
+				cpu_to_be64(remote_handle),
+				ioba, num_entries);
+
+	if (rc) {
+		if (rc == H_CLOSED)
+			dev_warn(dev, "CRQ Queue closed\n");
+		dev_err(dev, "Send (indirect) error (rc=%d)\n", rc);
+	}
+
+	return rc;
+}
+
 static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter,
 			    union ibmvnic_crq *crq)
 {
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 1a9993c..5af8a79 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -879,6 +879,9 @@ struct ibmvnic_tx_buff {
 	int pool_index;
 	bool last_frag;
 	bool used_bounce;
+	union sub_crq indir_arr[6];
+	u8 hdr_data[140];
+	dma_addr_t indir_dma;
 };
 
 struct ibmvnic_tx_pool {
-- 
2.4.11

^ permalink raw reply related

* [net-next PATCH 2/2 v4] ibmvnic: enable RX checksum offload
From: Thomas Falcon @ 2016-04-01 22:20 UTC (permalink / raw)
  To: netdev; +Cc: linuxppc-dev, davem, John Allen
In-Reply-To: <1459549235-22039-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

Enable RX Checksum offload feature in the ibmvnic driver.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Cc: John Allen <jallen@linux.vnet.ibm.com>
---
v4: this patch included since it is enabled by the previous patch
---
 drivers/net/ethernet/ibm/ibmvnic.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4e97e76..21bccf6 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -2105,6 +2105,10 @@ static void handle_query_ip_offload_rsp(struct ibmvnic_adapter *adapter)
 	if (buf->tcp_ipv6_chksum || buf->udp_ipv6_chksum)
 		adapter->netdev->features |= NETIF_F_IPV6_CSUM;
 
+	if ((adapter->netdev->features &
+	    (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)))
+		adapter->netdev->features |= NETIF_F_RXCSUM;
+
 	memset(&crq, 0, sizeof(crq));
 	crq.control_ip_offload.first = IBMVNIC_CRQ_CMD;
 	crq.control_ip_offload.cmd = CONTROL_IP_OFFLOAD;
-- 
2.4.11

^ permalink raw reply related

* Re: [Odd commit author id merge via netdev]
From: santosh shilimkar @ 2016-04-01 22:15 UTC (permalink / raw)
  To: Johannes Berg, netdev, David S. Miller
In-Reply-To: <1459540904.3342.2.camel@sipsolutions.net>

On 4/1/2016 1:01 PM, Johannes Berg wrote:
> On Fri, 2016-04-01 at 10:51 -0700, santosh shilimkar wrote:
>> Hi Dave,
>>
>> I noticed something odd while checking the recent
>> commits of mine in kernel.org tree made it via netdev.
>>
>> Don't know if its patchwork tool doing this.
>> Usual author line in my git objects	:
>> 	Author: Santosh Shilimkar <emaid-id>
>>
>> But the commits going via your tree seems to be like below..
>> 	Author: email-id <email-id>
>>
>> Few more examples of the commits end of the email. Can this
>> be fixed for future commits ? The git objects you pulled from
>> my tree directly have right author format where as ones which
>> are picked from patchworks seems to be odd.
>>
>
> Patchwork does store this info somehow and re-use it, quite possibly
> from the very first patch you ever sent. I think this bug was *just*
> fixed in patchwork, but it'll probably be a while until that fix lands.
>
> However, you can go and create a patchwork account with the real name,
> associate it with all the email addresses you use and then I think
> it'll pick it up. Not entirely sure though, you'll have to test it.
>
I will try that. Thanks for the tip.

Reagrds,
Santosh

^ permalink raw reply

* Re: Question on rhashtable in worst-case scenario.
From: Johannes Berg @ 2016-04-01 21:34 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Ben Greear, David Miller, linux-kernel, linux-wireless, netdev,
	tgraf
In-Reply-To: <20160401004627.GA9367@gondor.apana.org.au>

On Fri, 2016-04-01 at 08:46 +0800, Herbert Xu wrote:
> On Thu, Mar 31, 2016 at 05:29:59PM +0200, Johannes Berg wrote:
> > 
> > 
> > Does removing this completely disable the "-EEXIST" error? I can't
> > say
> > I fully understand the elasticity stuff in
> > __rhashtable_insert_fast().
> What EEXIST error are you talking about? The only one that can be
> returned on insertion is if you're explicitly checking for dups
> which clearly can't be the case for you.

I was thinking about that one - it's not obvious to me from the code
how this "explicitly checking for dups" would be done or let's say how
rhashtable differentiates. But since it seems to work for Ben until
hitting a certain number of identical keys, surely that's just me not
understanding the code rather than anything else :)

johannes

^ permalink raw reply

* Re: [PATCH net] vlan: pull on __vlan_insert_tag error path and fix csum correction
From: Daniel Borkmann @ 2016-04-01 21:28 UTC (permalink / raw)
  To: David Miller; +Cc: jiri, alexei.starovoitov, jesse, tom, netdev
In-Reply-To: <20160401.150000.1165010490623034251.davem@davemloft.net>

On 04/01/2016 09:00 PM, David Miller wrote:
> From: Daniel Borkmann <daniel@iogearbox.net>
> Date: Fri,  1 Apr 2016 11:41:03 +0200
>
>> Moreover, I noticed that when in the non-error path the __skb_pull()
>> is done and the original offset to mac header was non-zero, we fixup
>> from a wrong skb->data offset in the checksum complete processing.
>>
>> So the skb_postpush_rcsum() really needs to be done before __skb_pull()
>> where skb->data still points to the mac header start.
>
> Ugh, what a mess, are you sure any of this is right even after your
> change?  What happens (outside of the csum part) is this:
>
> 	__skb_push(offset);
> 	__vlan_insert_tag(); {
> 		skb_push(VLAN_HLEN);
> 	...
> 		memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN);
> 	}
> 	__skb_pull(offset);
>
> If I understand this correctly, the last pull will therefore put
> skb->data pointing at vlan_ethhdr->h_vlan_TCI of the new VLAN header
> pushed by __vlan_insert_tag().
>
> That is assuming skb->data began right after the original ethernet
> header.

Yes, this is correct. Now, continuing this train of thought: you have
skb->data pointing _currently_ at vlan_ethhdr->h_vlan_TCI.

And then you call:

   skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);

So, we point from the ->vlan_TCI + (2 * ETH_ALEN) as start offset, and
with VLAN_HLEN (= 4 bytes that we added) as length for the csum
correction as input. So, we point way beyond what we actually wanted
to fixup wrt csum, no?

But what we actually want to sum is [h_vlan_proto + h_vlan_TCI], which
is where above skb_postpush_rcsum() call points to _before_ the last
__skb_pull() happens. In other words, still at that time, we have the
same expectations as in __vlan_insert_tag().

> To me, that postpull csum currently is absolutely in the correct spot,
> because it's acting upon the pull done by __vlan_insert_tag(), not the
> one done here by skb_vlan_push().
>
> Right?
>
> Can you tell me how you tested this?  Just curious...

I noticed both while reviewing the code, the error path fixup is not
critical for ovs or act_vlan as the skb is dropped afterwards, but not
necessarily in eBPF case, so there it matters as eBPF doesn't know at
this point, what the program is going to do with it (similar fixup is
done in __skb_vlan_pop() error path, btw). For the csum, I did a hexdump
to compare what we write and what is being passed in for the csum correction.

Anyway ...

Aside from all this and based on your comment, I'm investigating whether
for the vlan push and also pop case the __skb_pull(skb, offset) in success
case is actually enough and whether it needs to take VLAN_HLEN into account
as well. But, I need to do more test for that one first. At least the skb_vlan_push()
comment says "__vlan_insert_tag expect skb->data pointing to mac header.
So change skb->data before calling it and change back to original position
later", Jiri?

^ permalink raw reply

* RE: [net PATCH 2/2] ipv4/GRO: Make GRO conform to RFC 6864
From: Subash Abhinov Kasiviswanathan @ 2016-04-01 21:13 UTC (permalink / raw)
  To: 'Alexander Duyck', 'David Miller'
  Cc: 'Eric Dumazet', 'Alex Duyck',
	'Herbert Xu', 'Tom Herbert',
	'Jesse Gross', 'Eric Dumazet', 'Netdev'
In-Reply-To: <CAKgT0UcLvAF9gTzcaR=GZyTkM2UQ6ymDmnisjgZiG8hTE+PdLA@mail.gmail.com>

|  For transmit we can leave the IP ID code as is.  For receive we should not be
|  snooping into the IP ID for any frames that have the DF bit set as devices
|  that have adopted RFC 6864 on their transmit path will end up causing issues.

Currently, GRO does not coalesce TCP packets originating from nodes which do IPv6 to IPv4 translation.
These packets have DF set to 1 and IP ID set 0 according to https://tools.ietf.org/html/rfc6145#page-17 

With this patch, we should be able to see coalescing happen for those cases as well.

^ permalink raw reply

* [PATCH v4 net-next 15/15] nfp: allow ring size reconfiguration at runtime
From: Jakub Kicinski @ 2016-04-01 21:06 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1459544811-24879-1-git-send-email-jakub.kicinski@netronome.com>

Since much of the required changes have already been made for
changing MTU at runtime let's use it for ring size changes as
well.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h       |   1 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 126 +++++++++++++++++++++
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  30 ++---
 3 files changed, 136 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 1e08c9cf3ee0..90ad6264e62c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -724,6 +724,7 @@ void nfp_net_rss_write_key(struct nfp_net *nn);
 void nfp_net_coalesce_write_cfg(struct nfp_net *nn);
 int nfp_net_irqs_alloc(struct nfp_net *nn);
 void nfp_net_irqs_disable(struct nfp_net *nn);
+int nfp_net_set_ring_size(struct nfp_net *nn, u32 rxd_cnt, u32 txd_cnt);
 
 #ifdef CONFIG_NFP_NET_DEBUG
 void nfp_net_debugfs_create(void);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 631168a1971e..57f330a4736e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1445,6 +1445,59 @@ err_alloc:
 	return -ENOMEM;
 }
 
+static struct nfp_net_tx_ring *
+nfp_net_shadow_tx_rings_prepare(struct nfp_net *nn, u32 buf_cnt)
+{
+	struct nfp_net_tx_ring *rings;
+	unsigned int r;
+
+	rings = kcalloc(nn->num_tx_rings, sizeof(*rings), GFP_KERNEL);
+	if (!rings)
+		return NULL;
+
+	for (r = 0; r < nn->num_tx_rings; r++) {
+		nfp_net_tx_ring_init(&rings[r], nn->tx_rings[r].r_vec, r);
+
+		if (nfp_net_tx_ring_alloc(&rings[r], buf_cnt))
+			goto err_free_prev;
+	}
+
+	return rings;
+
+err_free_prev:
+	while (r--)
+		nfp_net_tx_ring_free(&rings[r]);
+	kfree(rings);
+	return NULL;
+}
+
+static struct nfp_net_tx_ring *
+nfp_net_shadow_tx_rings_swap(struct nfp_net *nn, struct nfp_net_tx_ring *rings)
+{
+	struct nfp_net_tx_ring *old = nn->tx_rings;
+	unsigned int r;
+
+	for (r = 0; r < nn->num_tx_rings; r++)
+		old[r].r_vec->tx_ring = &rings[r];
+
+	nn->tx_rings = rings;
+	return old;
+}
+
+static void
+nfp_net_shadow_tx_rings_free(struct nfp_net *nn, struct nfp_net_tx_ring *rings)
+{
+	unsigned int r;
+
+	if (!rings)
+		return;
+
+	for (r = 0; r < nn->num_tx_rings; r++)
+		nfp_net_tx_ring_free(&rings[r]);
+
+	kfree(rings);
+}
+
 /**
  * nfp_net_rx_ring_free() - Free resources allocated to a RX ring
  * @rx_ring:  RX ring to free
@@ -1561,6 +1614,9 @@ nfp_net_shadow_rx_rings_free(struct nfp_net *nn, struct nfp_net_rx_ring *rings)
 {
 	unsigned int r;
 
+	if (!rings)
+		return;
+
 	for (r = 0; r < nn->num_r_vecs; r++) {
 		nfp_net_rx_ring_bufs_free(nn, &rings[r]);
 		nfp_net_rx_ring_free(&rings[r]);
@@ -2106,6 +2162,76 @@ static int nfp_net_change_mtu(struct net_device *netdev, int new_mtu)
 	return err;
 }
 
+int nfp_net_set_ring_size(struct nfp_net *nn, u32 rxd_cnt, u32 txd_cnt)
+{
+	struct nfp_net_tx_ring *tx_rings = NULL;
+	struct nfp_net_rx_ring *rx_rings = NULL;
+	u32 old_rxd_cnt, old_txd_cnt;
+	int err;
+
+	if (!netif_running(nn->netdev)) {
+		nn->rxd_cnt = rxd_cnt;
+		nn->txd_cnt = txd_cnt;
+		return 0;
+	}
+
+	old_rxd_cnt = nn->rxd_cnt;
+	old_txd_cnt = nn->txd_cnt;
+
+	/* Prepare new rings */
+	if (nn->rxd_cnt != rxd_cnt) {
+		rx_rings = nfp_net_shadow_rx_rings_prepare(nn, nn->fl_bufsz,
+							   rxd_cnt);
+		if (!rx_rings)
+			return -ENOMEM;
+	}
+	if (nn->txd_cnt != txd_cnt) {
+		tx_rings = nfp_net_shadow_tx_rings_prepare(nn, txd_cnt);
+		if (!tx_rings) {
+			nfp_net_shadow_rx_rings_free(nn, rx_rings);
+			return -ENOMEM;
+		}
+	}
+
+	/* Stop device, swap in new rings, try to start the firmware */
+	nfp_net_close_stack(nn);
+	nfp_net_clear_config_and_disable(nn);
+
+	if (rx_rings)
+		rx_rings = nfp_net_shadow_rx_rings_swap(nn, rx_rings);
+	if (tx_rings)
+		tx_rings = nfp_net_shadow_tx_rings_swap(nn, tx_rings);
+
+	nn->rxd_cnt = rxd_cnt;
+	nn->txd_cnt = txd_cnt;
+
+	err = nfp_net_set_config_and_enable(nn);
+	if (err) {
+		const int err_new = err;
+
+		/* Try with old configuration and old rings */
+		if (rx_rings)
+			rx_rings = nfp_net_shadow_rx_rings_swap(nn, rx_rings);
+		if (tx_rings)
+			tx_rings = nfp_net_shadow_tx_rings_swap(nn, tx_rings);
+
+		nn->rxd_cnt = old_rxd_cnt;
+		nn->txd_cnt = old_txd_cnt;
+
+		err = __nfp_net_set_config_and_enable(nn);
+		if (err)
+			nn_err(nn, "Can't restore ring config - FW communication failed (%d,%d)\n",
+			       err_new, err);
+	}
+
+	nfp_net_shadow_rx_rings_free(nn, rx_rings);
+	nfp_net_shadow_tx_rings_free(nn, tx_rings);
+
+	nfp_net_open_stack(nn);
+
+	return err;
+}
+
 static struct rtnl_link_stats64 *nfp_net_stat64(struct net_device *netdev,
 						struct rtnl_link_stats64 *stats)
 {
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 9a4084a68db5..ccfef1f17627 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -153,37 +153,25 @@ static int nfp_net_set_ringparam(struct net_device *netdev,
 	struct nfp_net *nn = netdev_priv(netdev);
 	u32 rxd_cnt, txd_cnt;
 
-	if (netif_running(netdev)) {
-		/* Some NIC drivers allow reconfiguration on the fly,
-		 * some down the interface, change and then up it
-		 * again.  For now we don't allow changes when the
-		 * device is up.
-		 */
-		nn_warn(nn, "Can't change rings while device is up\n");
-		return -EBUSY;
-	}
-
 	/* We don't have separate queues/rings for small/large frames. */
 	if (ring->rx_mini_pending || ring->rx_jumbo_pending)
 		return -EINVAL;
 
 	/* Round up to supported values */
 	rxd_cnt = roundup_pow_of_two(ring->rx_pending);
-	rxd_cnt = max_t(u32, rxd_cnt, NFP_NET_MIN_RX_DESCS);
-	rxd_cnt = min_t(u32, rxd_cnt, NFP_NET_MAX_RX_DESCS);
-
 	txd_cnt = roundup_pow_of_two(ring->tx_pending);
-	txd_cnt = max_t(u32, txd_cnt, NFP_NET_MIN_TX_DESCS);
-	txd_cnt = min_t(u32, txd_cnt, NFP_NET_MAX_TX_DESCS);
 
-	if (nn->rxd_cnt != rxd_cnt || nn->txd_cnt != txd_cnt)
-		nn_dbg(nn, "Change ring size: RxQ %u->%u, TxQ %u->%u\n",
-		       nn->rxd_cnt, rxd_cnt, nn->txd_cnt, txd_cnt);
+	if (rxd_cnt < NFP_NET_MIN_RX_DESCS || rxd_cnt > NFP_NET_MAX_RX_DESCS ||
+	    txd_cnt < NFP_NET_MIN_TX_DESCS || txd_cnt > NFP_NET_MAX_TX_DESCS)
+		return -EINVAL;
 
-	nn->rxd_cnt = rxd_cnt;
-	nn->txd_cnt = txd_cnt;
+	if (nn->rxd_cnt == rxd_cnt && nn->txd_cnt == txd_cnt)
+		return 0;
 
-	return 0;
+	nn_dbg(nn, "Change ring size: RxQ %u->%u, TxQ %u->%u\n",
+	       nn->rxd_cnt, rxd_cnt, nn->txd_cnt, txd_cnt);
+
+	return nfp_net_set_ring_size(nn, rxd_cnt, txd_cnt);
 }
 
 static void nfp_net_get_strings(struct net_device *netdev,
-- 
1.9.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox