Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next 1/2] bpf: add helper for getting xfrm states
From: Eyal Birger @ 2018-04-17  4:48 UTC (permalink / raw)
  To: netdev; +Cc: shmulik, ast, daniel, fw, steffen.klassert, Eyal Birger
In-Reply-To: <1523940529-3791-1-git-send-email-eyal.birger@gmail.com>

This commit introduces a helper which allows fetching xfrm state
parameters by eBPF programs attached to TC.

Prototype:
bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)

skb: pointer to skb
index: the index in the skb xfrm_state secpath array
xfrm_state: pointer to 'struct bpf_xfrm_state'
size: size of 'struct bpf_xfrm_state'
flags: reserved for future extensions

The helper returns 0 on success. Non zero if no xfrm state at the index
is found - or non exists at all.

struct bpf_xfrm_state currently includes the SPI, peer IPv4/IPv6
address and the reqid; it can be further extended by adding elements to
its end - indicating the populated fields by the 'size' argument -
keeping backwards compatibility.

Typical usage:

struct bpf_xfrm_state x = {};
bpf_skb_get_xfrm_state(skb, 0, &x, sizeof(x), 0);
...

Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
---
 include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++-
 net/core/filter.c        | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec897..132e172 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -755,6 +755,15 @@ union bpf_attr {
  *     @addr: pointer to struct sockaddr to bind socket to
  *     @addr_len: length of sockaddr structure
  *     Return: 0 on success or negative error code
+ *
+ * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)
+ *     retrieve XFRM state
+ *     @skb: pointer to skb
+ *     @index: index of the xfrm state in the secpath
+ *     @key: pointer to 'struct bpf_xfrm_state'
+ *     @size: size of 'struct bpf_xfrm_state'
+ *     @flags: room for future extensions
+ *     Return: 0 on success or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -821,7 +830,8 @@ union bpf_attr {
 	FN(msg_apply_bytes),		\
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
-	FN(bind),
+	FN(bind),			\
+	FN(skb_get_xfrm_state),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -927,6 +937,19 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* user accessible mirror of in-kernel xfrm_state.
+ * new fields can only be added to the end of this structure
+ */
+struct bpf_xfrm_state {
+	__u32 reqid;
+	__u32 spi;
+	__u16 family;
+	union {
+		__u32 remote_ipv4;
+		__u32 remote_ipv6[4];
+	};
+};
+
 /* Generic BPF return codes which all BPF program types may support.
  * The values are binary compatible with their TC_ACT_* counter-part to
  * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
diff --git a/net/core/filter.c b/net/core/filter.c
index d31aff9..c06600a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -57,6 +57,7 @@
 #include <net/sock_reuseport.h>
 #include <net/busy_poll.h>
 #include <net/tcp.h>
+#include <net/xfrm.h>
 #include <linux/bpf_trace.h>
 
 /**
@@ -3703,6 +3704,49 @@ static const struct bpf_func_proto bpf_bind_proto = {
 	.arg3_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
+	   struct bpf_xfrm_state *, to, u32, size, u64, flags)
+{
+#ifdef CONFIG_XFRM
+	const struct sec_path *sp = skb_sec_path(skb);
+	const struct xfrm_state *x;
+
+	if (!sp || index >= sp->len)
+		goto err_clear;
+
+	x = sp->xvec[index];
+
+	if (unlikely(size != sizeof(struct bpf_xfrm_state)))
+		goto err_clear;
+
+	to->reqid = x->props.reqid;
+	to->spi = be32_to_cpu(x->id.spi);
+	to->family = x->props.family;
+	if (to->family == AF_INET6) {
+		memcpy(to->remote_ipv6, x->props.saddr.a6,
+		       sizeof(to->remote_ipv6));
+	} else {
+		to->remote_ipv4 = be32_to_cpu(x->props.saddr.a4);
+	}
+
+	return 0;
+err_clear:
+#endif
+	memset(to, 0, size);
+	return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
+	.func		= bpf_skb_get_xfrm_state,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3844,6 +3888,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+	case BPF_FUNC_skb_get_xfrm_state:
+		return &bpf_skb_get_xfrm_state_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next 2/2] openvswitch: Support conntrack zone limit (fwd)
From: Julia Lawall @ 2018-04-17  4:49 UTC (permalink / raw)
  To: Yi-Hung Wei; +Cc: netdev, kbuild-all

Line 1814 frees something that is dereferenced on the next line.

julia

---------- Forwarded message ----------
Date: Tue, 17 Apr 2018 10:32:17 +0800
From: kbuild test robot <lkp@intel.com>
To: kbuild@01.org
Cc: Julia Lawall <julia.lawall@lip6.fr>
Subject: Re: [PATCH net-next 2/2] openvswitch: Support conntrack zone limit

CC: kbuild-all@01.org
In-Reply-To: <1523902550-10767-3-git-send-email-yihung.wei@gmail.com>
References: <1523902550-10767-3-git-send-email-yihung.wei@gmail.com>
TO: Yi-Hung Wei <yihung.wei@gmail.com>
CC: netdev@vger.kernel.org
CC: Yi-Hung Wei <yihung.wei@gmail.com>

Hi Yi-Hung,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Yi-Hung-Wei/openvswitch-Support-conntrack-zone-limit/20180417-085035
:::::: branch date: 2 hours ago
:::::: commit date: 2 hours ago

>> net/openvswitch/conntrack.c:1815:17-39: ERROR: reference preceded by free on line 1814

# https://github.com/0day-ci/linux/commit/01487f05f032565b952e344abace672a12045d9e
git remote add linux-review https://github.com/0day-ci/linux
git remote update linux-review
git checkout 01487f05f032565b952e344abace672a12045d9e
vim +1815 net/openvswitch/conntrack.c

c2ac6673 Joe Stringer 2015-08-26  1786
01487f05 Yi-Hung Wei  2018-04-16  1787  #if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
01487f05 Yi-Hung Wei  2018-04-16  1788  static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
01487f05 Yi-Hung Wei  2018-04-16  1789  {
01487f05 Yi-Hung Wei  2018-04-16  1790  	int i;
01487f05 Yi-Hung Wei  2018-04-16  1791
01487f05 Yi-Hung Wei  2018-04-16  1792  	ovs_net->ct_limit_info = kmalloc(sizeof *ovs_net->ct_limit_info,
01487f05 Yi-Hung Wei  2018-04-16  1793  					 GFP_KERNEL);
01487f05 Yi-Hung Wei  2018-04-16  1794  	if (!ovs_net->ct_limit_info)
01487f05 Yi-Hung Wei  2018-04-16  1795  		return -ENOMEM;
01487f05 Yi-Hung Wei  2018-04-16  1796
01487f05 Yi-Hung Wei  2018-04-16  1797  	ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT;
01487f05 Yi-Hung Wei  2018-04-16  1798  	ovs_net->ct_limit_info->limits =
01487f05 Yi-Hung Wei  2018-04-16  1799  		kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head),
01487f05 Yi-Hung Wei  2018-04-16  1800  			      GFP_KERNEL);
01487f05 Yi-Hung Wei  2018-04-16  1801  	if (!ovs_net->ct_limit_info->limits) {
01487f05 Yi-Hung Wei  2018-04-16  1802  		kfree(ovs_net->ct_limit_info);
01487f05 Yi-Hung Wei  2018-04-16  1803  		return -ENOMEM;
01487f05 Yi-Hung Wei  2018-04-16  1804  	}
01487f05 Yi-Hung Wei  2018-04-16  1805
01487f05 Yi-Hung Wei  2018-04-16  1806  	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
01487f05 Yi-Hung Wei  2018-04-16  1807  		INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);
01487f05 Yi-Hung Wei  2018-04-16  1808
01487f05 Yi-Hung Wei  2018-04-16  1809  	ovs_net->ct_limit_info->data =
01487f05 Yi-Hung Wei  2018-04-16  1810  		nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
01487f05 Yi-Hung Wei  2018-04-16  1811
01487f05 Yi-Hung Wei  2018-04-16  1812  	if (IS_ERR(ovs_net->ct_limit_info->data)) {
01487f05 Yi-Hung Wei  2018-04-16  1813  		kfree(ovs_net->ct_limit_info->limits);
01487f05 Yi-Hung Wei  2018-04-16 @1814  		kfree(ovs_net->ct_limit_info);
01487f05 Yi-Hung Wei  2018-04-16 @1815  		return PTR_ERR(ovs_net->ct_limit_info->data);
01487f05 Yi-Hung Wei  2018-04-16  1816  	}
01487f05 Yi-Hung Wei  2018-04-16  1817  	return 0;
01487f05 Yi-Hung Wei  2018-04-16  1818  }
01487f05 Yi-Hung Wei  2018-04-16  1819

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply

* [PATCH bpf-next 0/2] bpf: add helper for getting xfrm states
From: Eyal Birger @ 2018-04-17  4:48 UTC (permalink / raw)
  To: netdev; +Cc: shmulik, ast, daniel, fw, steffen.klassert, Eyal Birger

This patchset adds support for fetching XFRM state information from
an eBPF program called from TC.

The first patch introduces a helper for fetching an XFRM state from the
skb's secpath. The XFRM state is modeled using a new virtual struct which
contains the SPI, peer address, and reqid values of the state; This struct
can be extended in the future to provide additional state information.

The second patch adds a test example in test_tunnel_bpf.sh. The sample
validates the correct extraction of state information by the eBPF program.

---


Eyal Birger (2):
  bpf: add helper for getting xfrm states
  samples/bpf: extend test_tunnel_bpf.sh with xfrm state test

 include/uapi/linux/bpf.h                  | 25 ++++++++++-
 net/core/filter.c                         | 46 ++++++++++++++++++++
 samples/bpf/tcbpf2_kern.c                 | 15 +++++++
 samples/bpf/test_tunnel_bpf.sh            | 71 +++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            | 25 ++++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++
 6 files changed, 183 insertions(+), 2 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH RESEND net-next] net/ncsi: Refactor MAC, VLAN filters
From: Samuel Mendoza-Jonas @ 2018-04-17  4:23 UTC (permalink / raw)
  To: netdev; +Cc: Samuel Mendoza-Jonas, David S . Miller, linux-kernel, openbmc

The NCSI driver defines a generic ncsi_channel_filter struct that can be
used to store arbitrarily formatted filters, and several generic methods
of accessing data stored in such a filter.
However in both the driver and as defined in the NCSI specification
there are only two actual filters: VLAN ID filters and MAC address
filters. The splitting of the MAC filter into unicast, multicast, and
mixed is also technically not necessary as these are stored in the same
location in hardware.

To save complexity, particularly in the set up and accessing of these
generic filters, remove them in favour of two specific structs. These
can be acted on directly and do not need several generic helper
functions to use.

This also fixes a memory error found by KASAN on ARM32 (which is not
upstream yet), where response handlers accessing a filter's data field
could write past allocated memory.

[  114.926512] ==================================================================
[  114.933861] BUG: KASAN: slab-out-of-bounds in ncsi_configure_channel+0x4b8/0xc58
[  114.941304] Read of size 2 at addr 94888558 by task kworker/0:2/546
[  114.947593]
[  114.949146] CPU: 0 PID: 546 Comm: kworker/0:2 Not tainted 4.16.0-rc6-00119-ge156398bfcad #13
...
[  115.170233] The buggy address belongs to the object at 94888540
[  115.170233]  which belongs to the cache kmalloc-32 of size 32
[  115.181917] The buggy address is located 24 bytes inside of
[  115.181917]  32-byte region [94888540, 94888560)
[  115.192115] The buggy address belongs to the page:
[  115.196943] page:9eeac100 count:1 mapcount:0 mapping:94888000 index:0x94888fc1
[  115.204200] flags: 0x100(slab)
[  115.207330] raw: 00000100 94888000 94888fc1 0000003f 00000001 9eea2014 9eecaa74 96c003e0
[  115.215444] page dumped because: kasan: bad access detected
[  115.221036]
[  115.222544] Memory state around the buggy address:
[  115.227384]  94888400: fb fb fb fb fc fc fc fc 04 fc fc fc fc fc fc fc
[  115.233959]  94888480: 00 00 00 fc fc fc fc fc 00 04 fc fc fc fc fc fc
[  115.240529] >94888500: 00 00 04 fc fc fc fc fc 00 00 04 fc fc fc fc fc
[  115.247077]                                             ^
[  115.252523]  94888580: 00 04 fc fc fc fc fc fc 06 fc fc fc fc fc fc fc
[  115.259093]  94888600: 00 00 06 fc fc fc fc fc 00 00 04 fc fc fc fc fc
[  115.265639] ==================================================================

Reported-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
---
 net/ncsi/internal.h     |  34 +++---
 net/ncsi/ncsi-manage.c  | 226 +++++++++-------------------------------
 net/ncsi/ncsi-netlink.c |  20 ++--
 net/ncsi/ncsi-rsp.c     | 178 +++++++++++++------------------
 4 files changed, 147 insertions(+), 311 deletions(-)

diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 8da84312cd3b..8055e3965cef 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -68,15 +68,6 @@ enum {
 	NCSI_MODE_MAX
 };
 
-enum {
-	NCSI_FILTER_BASE	= 0,
-	NCSI_FILTER_VLAN	= 0,
-	NCSI_FILTER_UC,
-	NCSI_FILTER_MC,
-	NCSI_FILTER_MIXED,
-	NCSI_FILTER_MAX
-};
-
 struct ncsi_channel_version {
 	u32 version;		/* Supported BCD encoded NCSI version */
 	u32 alpha2;		/* Supported BCD encoded NCSI version */
@@ -98,11 +89,18 @@ struct ncsi_channel_mode {
 	u32 data[8];	/* Data entries                */
 };
 
-struct ncsi_channel_filter {
-	u32 index;	/* Index of channel filters          */
-	u32 total;	/* Total entries in the filter table */
-	u64 bitmap;	/* Bitmap of valid entries           */
-	u32 data[];	/* Data for the valid entries        */
+struct ncsi_channel_mac_filter {
+	u8	n_uc;
+	u8	n_mc;
+	u8	n_mixed;
+	u64	bitmap;
+	unsigned char	*addrs;
+};
+
+struct ncsi_channel_vlan_filter {
+	u8	n_vids;
+	u64	bitmap;
+	u16	*vids;
 };
 
 struct ncsi_channel_stats {
@@ -186,7 +184,9 @@ struct ncsi_channel {
 	struct ncsi_channel_version version;
 	struct ncsi_channel_cap	    caps[NCSI_CAP_MAX];
 	struct ncsi_channel_mode    modes[NCSI_MODE_MAX];
-	struct ncsi_channel_filter  *filters[NCSI_FILTER_MAX];
+	/* Filtering Settings */
+	struct ncsi_channel_mac_filter	mac_filter;
+	struct ncsi_channel_vlan_filter	vlan_filter;
 	struct ncsi_channel_stats   stats;
 	struct {
 		struct timer_list   timer;
@@ -320,10 +320,6 @@ extern spinlock_t ncsi_dev_lock;
 	list_for_each_entry_rcu(nc, &np->channels, node)
 
 /* Resources */
-u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index);
-int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data);
-int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data);
-int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index);
 void ncsi_start_channel_monitor(struct ncsi_channel *nc);
 void ncsi_stop_channel_monitor(struct ncsi_channel *nc);
 struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index c3695ba0cf94..5561e221b71f 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -27,125 +27,6 @@
 LIST_HEAD(ncsi_dev_list);
 DEFINE_SPINLOCK(ncsi_dev_lock);
 
-static inline int ncsi_filter_size(int table)
-{
-	int sizes[] = { 2, 6, 6, 6 };
-
-	BUILD_BUG_ON(ARRAY_SIZE(sizes) != NCSI_FILTER_MAX);
-	if (table < NCSI_FILTER_BASE || table >= NCSI_FILTER_MAX)
-		return -EINVAL;
-
-	return sizes[table];
-}
-
-u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index)
-{
-	struct ncsi_channel_filter *ncf;
-	int size;
-
-	ncf = nc->filters[table];
-	if (!ncf)
-		return NULL;
-
-	size = ncsi_filter_size(table);
-	if (size < 0)
-		return NULL;
-
-	return ncf->data + size * index;
-}
-
-/* Find the first active filter in a filter table that matches the given
- * data parameter. If data is NULL, this returns the first active filter.
- */
-int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data)
-{
-	struct ncsi_channel_filter *ncf;
-	void *bitmap;
-	int index, size;
-	unsigned long flags;
-
-	ncf = nc->filters[table];
-	if (!ncf)
-		return -ENXIO;
-
-	size = ncsi_filter_size(table);
-	if (size < 0)
-		return size;
-
-	spin_lock_irqsave(&nc->lock, flags);
-	bitmap = (void *)&ncf->bitmap;
-	index = -1;
-	while ((index = find_next_bit(bitmap, ncf->total, index + 1))
-	       < ncf->total) {
-		if (!data || !memcmp(ncf->data + size * index, data, size)) {
-			spin_unlock_irqrestore(&nc->lock, flags);
-			return index;
-		}
-	}
-	spin_unlock_irqrestore(&nc->lock, flags);
-
-	return -ENOENT;
-}
-
-int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data)
-{
-	struct ncsi_channel_filter *ncf;
-	int index, size;
-	void *bitmap;
-	unsigned long flags;
-
-	size = ncsi_filter_size(table);
-	if (size < 0)
-		return size;
-
-	index = ncsi_find_filter(nc, table, data);
-	if (index >= 0)
-		return index;
-
-	ncf = nc->filters[table];
-	if (!ncf)
-		return -ENODEV;
-
-	spin_lock_irqsave(&nc->lock, flags);
-	bitmap = (void *)&ncf->bitmap;
-	do {
-		index = find_next_zero_bit(bitmap, ncf->total, 0);
-		if (index >= ncf->total) {
-			spin_unlock_irqrestore(&nc->lock, flags);
-			return -ENOSPC;
-		}
-	} while (test_and_set_bit(index, bitmap));
-
-	memcpy(ncf->data + size * index, data, size);
-	spin_unlock_irqrestore(&nc->lock, flags);
-
-	return index;
-}
-
-int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index)
-{
-	struct ncsi_channel_filter *ncf;
-	int size;
-	void *bitmap;
-	unsigned long flags;
-
-	size = ncsi_filter_size(table);
-	if (size < 0)
-		return size;
-
-	ncf = nc->filters[table];
-	if (!ncf || index >= ncf->total)
-		return -ENODEV;
-
-	spin_lock_irqsave(&nc->lock, flags);
-	bitmap = (void *)&ncf->bitmap;
-	if (test_and_clear_bit(index, bitmap))
-		memset(ncf->data + size * index, 0, size);
-	spin_unlock_irqrestore(&nc->lock, flags);
-
-	return 0;
-}
-
 static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
 {
 	struct ncsi_dev *nd = &ndp->ndev;
@@ -339,20 +220,13 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id)
 static void ncsi_remove_channel(struct ncsi_channel *nc)
 {
 	struct ncsi_package *np = nc->package;
-	struct ncsi_channel_filter *ncf;
 	unsigned long flags;
-	int i;
 
-	/* Release filters */
 	spin_lock_irqsave(&nc->lock, flags);
-	for (i = 0; i < NCSI_FILTER_MAX; i++) {
-		ncf = nc->filters[i];
-		if (!ncf)
-			continue;
 
-		nc->filters[i] = NULL;
-		kfree(ncf);
-	}
+	/* Release filters */
+	kfree(nc->mac_filter.addrs);
+	kfree(nc->vlan_filter.vids);
 
 	nc->state = NCSI_CHANNEL_INACTIVE;
 	spin_unlock_irqrestore(&nc->lock, flags);
@@ -670,32 +544,26 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
 static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
 			 struct ncsi_cmd_arg *nca)
 {
+	struct ncsi_channel_vlan_filter *ncf;
+	unsigned long flags;
+	void *bitmap;
 	int index;
-	u32 *data;
 	u16 vid;
 
-	index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, NULL);
-	if (index < 0) {
-		/* Filter table empty */
-		return -1;
-	}
+	ncf = &nc->vlan_filter;
+	bitmap = &ncf->bitmap;
 
-	data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, index);
-	if (!data) {
-		netdev_err(ndp->ndev.dev,
-			   "NCSI: failed to retrieve filter %d\n", index);
-		/* Set the VLAN id to 0 - this will still disable the entry in
-		 * the filter table, but we won't know what it was.
-		 */
-		vid = 0;
-	} else {
-		vid = *(u16 *)data;
+	spin_lock_irqsave(&nc->lock, flags);
+	index = find_next_bit(bitmap, ncf->n_vids, 0);
+	if (index >= ncf->n_vids) {
+		spin_unlock_irqrestore(&nc->lock, flags);
+		return -1;
 	}
+	vid = ncf->vids[index];
 
-	netdev_printk(KERN_DEBUG, ndp->ndev.dev,
-		      "NCSI: removed vlan tag %u at index %d\n",
-		      vid, index + 1);
-	ncsi_remove_filter(nc, NCSI_FILTER_VLAN, index);
+	clear_bit(index, bitmap);
+	ncf->vids[index] = 0;
+	spin_unlock_irqrestore(&nc->lock, flags);
 
 	nca->type = NCSI_PKT_CMD_SVF;
 	nca->words[1] = vid;
@@ -711,45 +579,55 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
 static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
 		       struct ncsi_cmd_arg *nca)
 {
+	struct ncsi_channel_vlan_filter *ncf;
 	struct vlan_vid *vlan = NULL;
-	int index = 0;
+	unsigned long flags;
+	int i, index;
+	void *bitmap;
+	u16 vid;
 
+	if (list_empty(&ndp->vlan_vids))
+		return -1;
+
+	ncf = &nc->vlan_filter;
+	bitmap = &ncf->bitmap;
+
+	spin_lock_irqsave(&nc->lock, flags);
+
+	rcu_read_lock();
 	list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) {
-		index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan->vid);
-		if (index < 0) {
-			/* New tag to add */
-			netdev_printk(KERN_DEBUG, ndp->ndev.dev,
-				      "NCSI: new vlan id to set: %u\n",
-				      vlan->vid);
+		vid = vlan->vid;
+		for (i = 0; i < ncf->n_vids; i++)
+			if (ncf->vids[i] == vid) {
+				vid = 0;
+				break;
+			}
+		if (vid)
 			break;
-		}
-		netdev_printk(KERN_DEBUG, ndp->ndev.dev,
-			      "vid %u already at filter pos %d\n",
-			      vlan->vid, index);
 	}
+	rcu_read_unlock();
 
-	if (!vlan || index >= 0) {
-		netdev_printk(KERN_DEBUG, ndp->ndev.dev,
-			      "no vlan ids left to set\n");
+	if (!vid) {
+		/* No VLAN ID is not set */
+		spin_unlock_irqrestore(&nc->lock, flags);
 		return -1;
 	}
 
-	index = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan->vid);
-	if (index < 0) {
+	index = find_next_zero_bit(bitmap, ncf->n_vids, 0);
+	if (index < 0 || index >= ncf->n_vids) {
 		netdev_err(ndp->ndev.dev,
-			   "Failed to add new VLAN tag, error %d\n", index);
-		if (index == -ENOSPC)
-			netdev_err(ndp->ndev.dev,
-				   "Channel %u already has all VLAN filters set\n",
-				   nc->id);
+			   "Channel %u already has all VLAN filters set\n",
+			   nc->id);
+		spin_unlock_irqrestore(&nc->lock, flags);
 		return -1;
 	}
 
-	netdev_printk(KERN_DEBUG, ndp->ndev.dev,
-		      "NCSI: set vid %u in packet, index %u\n",
-		      vlan->vid, index + 1);
+	ncf->vids[index] = vid;
+	set_bit(index, bitmap);
+	spin_unlock_irqrestore(&nc->lock, flags);
+
 	nca->type = NCSI_PKT_CMD_SVF;
-	nca->words[1] = vlan->vid;
+	nca->words[1] = vid;
 	/* HW filter index starts at 1 */
 	nca->bytes[6] = index + 1;
 	nca->bytes[7] = 0x01;
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 8d7e849d4825..b09ef77bf4cd 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -58,10 +58,9 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
 				   struct ncsi_dev_priv *ndp,
 				   struct ncsi_channel *nc)
 {
-	struct nlattr *vid_nest;
-	struct ncsi_channel_filter *ncf;
+	struct ncsi_channel_vlan_filter *ncf;
 	struct ncsi_channel_mode *m;
-	u32 *data;
+	struct nlattr *vid_nest;
 	int i;
 
 	nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id);
@@ -79,18 +78,13 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
 	vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST);
 	if (!vid_nest)
 		return -ENOMEM;
-	ncf = nc->filters[NCSI_FILTER_VLAN];
+	ncf = &nc->vlan_filter;
 	i = -1;
-	if (ncf) {
-		while ((i = find_next_bit((void *)&ncf->bitmap, ncf->total,
-					  i + 1)) < ncf->total) {
-			data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, i);
-			/* Uninitialised channels will have 'zero' vlan ids */
-			if (!data || !*data)
-				continue;
+	while ((i = find_next_bit((void *)&ncf->bitmap, ncf->n_vids,
+				  i + 1)) < ncf->n_vids) {
+		if (ncf->vids[i])
 			nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID,
-				    *(u16 *)data);
-		}
+				    ncf->vids[i]);
 	}
 	nla_nest_end(skb, vid_nest);
 
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index efd933ff5570..ce9497966ebe 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -334,9 +334,9 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr)
 	struct ncsi_rsp_pkt *rsp;
 	struct ncsi_dev_priv *ndp = nr->ndp;
 	struct ncsi_channel *nc;
-	struct ncsi_channel_filter *ncf;
-	unsigned short vlan;
-	int ret;
+	struct ncsi_channel_vlan_filter *ncf;
+	unsigned long flags;
+	void *bitmap;
 
 	/* Find the package and channel */
 	rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
@@ -346,22 +346,23 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr)
 		return -ENODEV;
 
 	cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd);
-	ncf = nc->filters[NCSI_FILTER_VLAN];
-	if (!ncf)
-		return -ENOENT;
-	if (cmd->index >= ncf->total)
+	ncf = &nc->vlan_filter;
+	if (cmd->index > ncf->n_vids)
 		return -ERANGE;
 
-	/* Add or remove the VLAN filter */
+	/* Add or remove the VLAN filter. Remember HW indexes from 1 */
+	spin_lock_irqsave(&nc->lock, flags);
+	bitmap = &ncf->bitmap;
 	if (!(cmd->enable & 0x1)) {
-		/* HW indexes from 1 */
-		ret = ncsi_remove_filter(nc, NCSI_FILTER_VLAN, cmd->index - 1);
+		if (test_and_clear_bit(cmd->index - 1, bitmap))
+			ncf->vids[cmd->index - 1] = 0;
 	} else {
-		vlan = ntohs(cmd->vlan);
-		ret = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan);
+		set_bit(cmd->index - 1, bitmap);
+		ncf->vids[cmd->index - 1] = ntohs(cmd->vlan);
 	}
+	spin_unlock_irqrestore(&nc->lock, flags);
 
-	return ret;
+	return 0;
 }
 
 static int ncsi_rsp_handler_ev(struct ncsi_request *nr)
@@ -422,8 +423,12 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr)
 	struct ncsi_rsp_pkt *rsp;
 	struct ncsi_dev_priv *ndp = nr->ndp;
 	struct ncsi_channel *nc;
-	struct ncsi_channel_filter *ncf;
+	struct ncsi_channel_mac_filter *ncf;
+	unsigned long flags;
 	void *bitmap;
+	bool enabled;
+	int index;
+
 
 	/* Find the package and channel */
 	rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
@@ -436,31 +441,23 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr)
 	 * isn't supported yet.
 	 */
 	cmd = (struct ncsi_cmd_sma_pkt *)skb_network_header(nr->cmd);
-	switch (cmd->at_e >> 5) {
-	case 0x0:	/* UC address */
-		ncf = nc->filters[NCSI_FILTER_UC];
-		break;
-	case 0x1:	/* MC address */
-		ncf = nc->filters[NCSI_FILTER_MC];
-		break;
-	default:
-		return -EINVAL;
-	}
+	enabled = cmd->at_e & 0x1;
+	ncf = &nc->mac_filter;
+	bitmap = &ncf->bitmap;
 
-	/* Sanity check on the filter */
-	if (!ncf)
-		return -ENOENT;
-	else if (cmd->index >= ncf->total)
+	if (cmd->index > ncf->n_uc + ncf->n_mc + ncf->n_mixed)
 		return -ERANGE;
 
-	bitmap = &ncf->bitmap;
-	if (cmd->at_e & 0x1) {
-		set_bit(cmd->index, bitmap);
-		memcpy(ncf->data + 6 * cmd->index, cmd->mac, 6);
+	index = (cmd->index - 1) * ETH_ALEN;
+	spin_lock_irqsave(&nc->lock, flags);
+	if (enabled) {
+		set_bit(cmd->index - 1, bitmap);
+		memcpy(&ncf->addrs[index], cmd->mac, ETH_ALEN);
 	} else {
-		clear_bit(cmd->index, bitmap);
-		memset(ncf->data + 6 * cmd->index, 0, 6);
+		clear_bit(cmd->index - 1, bitmap);
+		memset(&ncf->addrs[index], 0, ETH_ALEN);
 	}
+	spin_unlock_irqrestore(&nc->lock, flags);
 
 	return 0;
 }
@@ -631,9 +628,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
 	struct ncsi_rsp_gc_pkt *rsp;
 	struct ncsi_dev_priv *ndp = nr->ndp;
 	struct ncsi_channel *nc;
-	struct ncsi_channel_filter *ncf;
-	size_t size, entry_size;
-	int cnt, i;
+	size_t size;
 
 	/* Find the channel */
 	rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp);
@@ -655,64 +650,40 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
 	nc->caps[NCSI_CAP_VLAN].cap = rsp->vlan_mode &
 				      NCSI_CAP_VLAN_MASK;
 
-	/* Build filters */
-	for (i = 0; i < NCSI_FILTER_MAX; i++) {
-		switch (i) {
-		case NCSI_FILTER_VLAN:
-			cnt = rsp->vlan_cnt;
-			entry_size = 2;
-			break;
-		case NCSI_FILTER_MIXED:
-			cnt = rsp->mixed_cnt;
-			entry_size = 6;
-			break;
-		case NCSI_FILTER_MC:
-			cnt = rsp->mc_cnt;
-			entry_size = 6;
-			break;
-		case NCSI_FILTER_UC:
-			cnt = rsp->uc_cnt;
-			entry_size = 6;
-			break;
-		default:
-			continue;
-		}
-
-		if (!cnt || nc->filters[i])
-			continue;
-
-		size = sizeof(*ncf) + cnt * entry_size;
-		ncf = kzalloc(size, GFP_ATOMIC);
-		if (!ncf) {
-			pr_warn("%s: Cannot alloc filter table (%d)\n",
-				__func__, i);
-			return -ENOMEM;
-		}
-
-		ncf->index = i;
-		ncf->total = cnt;
-		if (i == NCSI_FILTER_VLAN) {
-			/* Set VLAN filters active so they are cleared in
-			 * first configuration state
-			 */
-			ncf->bitmap = U64_MAX;
-		} else {
-			ncf->bitmap = 0x0ul;
-		}
-		nc->filters[i] = ncf;
-	}
+	size = (rsp->uc_cnt + rsp->mc_cnt + rsp->mixed_cnt) * ETH_ALEN;
+	nc->mac_filter.addrs = kzalloc(size, GFP_KERNEL);
+	if (!nc->mac_filter.addrs)
+		return -ENOMEM;
+	nc->mac_filter.n_uc = rsp->uc_cnt;
+	nc->mac_filter.n_mc = rsp->mc_cnt;
+	nc->mac_filter.n_mixed = rsp->mixed_cnt;
+
+	nc->vlan_filter.vids = kcalloc(rsp->vlan_cnt,
+				       sizeof(*nc->vlan_filter.vids),
+				       GFP_KERNEL);
+	if (!nc->vlan_filter.vids)
+		return -ENOMEM;
+	/* Set VLAN filters active so they are cleared in the first
+	 * configuration state
+	 */
+	nc->vlan_filter.bitmap = U64_MAX;
+	nc->vlan_filter.n_vids = rsp->vlan_cnt;
 
 	return 0;
 }
 
 static int ncsi_rsp_handler_gp(struct ncsi_request *nr)
 {
-	struct ncsi_rsp_gp_pkt *rsp;
+	struct ncsi_channel_vlan_filter *ncvf;
+	struct ncsi_channel_mac_filter *ncmf;
 	struct ncsi_dev_priv *ndp = nr->ndp;
+	struct ncsi_rsp_gp_pkt *rsp;
 	struct ncsi_channel *nc;
-	unsigned short enable, vlan;
+	unsigned short enable;
 	unsigned char *pdata;
-	int table, i;
+	unsigned long flags;
+	void *bitmap;
+	int i;
 
 	/* Find the channel */
 	rsp = (struct ncsi_rsp_gp_pkt *)skb_network_header(nr->rsp);
@@ -746,36 +717,33 @@ static int ncsi_rsp_handler_gp(struct ncsi_request *nr)
 	/* MAC addresses filter table */
 	pdata = (unsigned char *)rsp + 48;
 	enable = rsp->mac_enable;
+	ncmf = &nc->mac_filter;
+	spin_lock_irqsave(&nc->lock, flags);
+	bitmap = &ncmf->bitmap;
 	for (i = 0; i < rsp->mac_cnt; i++, pdata += 6) {
-		if (i >= (nc->filters[NCSI_FILTER_UC]->total +
-			  nc->filters[NCSI_FILTER_MC]->total))
-			table = NCSI_FILTER_MIXED;
-		else if (i >= nc->filters[NCSI_FILTER_UC]->total)
-			table = NCSI_FILTER_MC;
-		else
-			table = NCSI_FILTER_UC;
-
 		if (!(enable & (0x1 << i)))
-			continue;
-
-		if (ncsi_find_filter(nc, table, pdata) >= 0)
-			continue;
+			clear_bit(i, bitmap);
+		else
+			set_bit(i, bitmap);
 
-		ncsi_add_filter(nc, table, pdata);
+		memcpy(&ncmf->addrs[i * ETH_ALEN], pdata, ETH_ALEN);
 	}
+	spin_unlock_irqrestore(&nc->lock, flags);
 
 	/* VLAN filter table */
 	enable = ntohs(rsp->vlan_enable);
+	ncvf = &nc->vlan_filter;
+	bitmap = &ncvf->bitmap;
+	spin_lock_irqsave(&nc->lock, flags);
 	for (i = 0; i < rsp->vlan_cnt; i++, pdata += 2) {
 		if (!(enable & (0x1 << i)))
-			continue;
-
-		vlan = ntohs(*(__be16 *)pdata);
-		if (ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan) >= 0)
-			continue;
+			clear_bit(i, bitmap);
+		else
+			set_bit(i, bitmap);
 
-		ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan);
+		ncvf->vids[i] = ntohs(*(__be16 *)pdata);
 	}
+	spin_unlock_irqrestore(&nc->lock, flags);
 
 	return 0;
 }
-- 
2.17.0

^ permalink raw reply related

* general protection fault in encode_rpcb_string
From: syzbot @ 2018-04-17  4:02 UTC (permalink / raw)
  To: anna.schumaker, bfields, davem, jlayton, linux-kernel, linux-nfs,
	netdev, syzkaller-bugs, trond.myklebust

Hello,

syzbot hit the following crash on bpf-next commit
5d1365940a68dd57b031b6e3c07d7d451cd69daf (Thu Apr 12 18:09:05 2018 +0000)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
syzbot dashboard link:  
https://syzkaller.appspot.com/bug?extid=4b98281f2401ab849f4b

So far this crash happened 2 times on bpf-next.
C reproducer: https://syzkaller.appspot.com/x/repro.c?id=6433835633868800
syzkaller reproducer:  
https://syzkaller.appspot.com/x/repro.syz?id=6407311794896896
Raw console output:  
https://syzkaller.appspot.com/x/log.txt?id=5861511176126464
Kernel config:  
https://syzkaller.appspot.com/x/.config?id=-5947642240294114534
compiler: gcc (GCC) 8.0.1 20180413 (experimental)

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+4b98281f2401ab849f4b@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for  
details.
If you forward the report, please keep this part and the footer.

RBP: 00000000006dbc50 R08: 000000002000a000 R09: 0000000000003437
R10: 0000000000000000 R11: 0000000000000246 R12: 00007fe464ffed80
R13: 0030656c69662f2e R14: ffffffffffffffff R15: 0000000000000006
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
    (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 1861 Comm: kworker/u4:4 Not tainted 4.16.0+ #2
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Workqueue: rpciod rpc_async_schedule
RIP: 0010:strlen+0x1f/0xa0 lib/string.c:479
RSP: 0018:ffff8801cf75f318 EFLAGS: 00010296
RAX: dffffc0000000000 RBX: ffff8801cf68f200 RCX: ffffffff86a8c407
RDX: 0000000000000000 RSI: ffffffff86a84d7b RDI: 0000000000000000
RBP: ffff8801cf75f330 R08: ffff8801cf7de080 R09: ffffed0039ea3d43
R10: ffffed0039ea3d43 R11: ffff8801cf51ea1f R12: 0000000000000000
R13: 0000000002000000 R14: 0000000000000000 R15: ffff8801cf75f3e0
FS:  0000000000000000(0000) GS:ffff8801db000000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f64808a4000 CR3: 00000001b566a000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  strlen include/linux/string.h:267 [inline]
  encode_rpcb_string+0x23/0x70 net/sunrpc/rpcb_clnt.c:914
  rpcb_enc_getaddr+0x146/0x1f0 net/sunrpc/rpcb_clnt.c:940
  rpcauth_wrap_req_encode net/sunrpc/auth.c:777 [inline]
  rpcauth_wrap_req+0x1a8/0x230 net/sunrpc/auth.c:791
  rpc_xdr_encode net/sunrpc/clnt.c:1754 [inline]
  call_transmit+0x8a9/0xfe0 net/sunrpc/clnt.c:1949
  __rpc_execute+0x28a/0xfe0 net/sunrpc/sched.c:784
  rpc_async_schedule+0x16/0x20 net/sunrpc/sched.c:857
  process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
  worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
  kthread+0x345/0x410 kernel/kthread.c:238
  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:411
Code: 37 ff ff ff 0f 1f 84 00 00 00 00 00 48 b8 00 00 00 00 00 fc ff df 55  
48 89 fa 48 c1 ea 03 48 89 e5 41 54 49 89 fc 53 48 83 ec 08 <0f> b6 04 02  
48 89 fa 83 e2 07 38 d0 7f 04 84 c0 75 4d 41 80 3c
RIP: strlen+0x1f/0xa0 lib/string.c:479 RSP: ffff8801cf75f318
---[ end trace bd76ed0378a56845 ]---


---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkaller@googlegroups.com.

syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is  
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
If you want to test a patch for this bug, please reply with:
#syz test: git://repo/address.git branch
and provide the patch inline or as an attachment.
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug  
report.
Note: all commands must start from beginning of the line in the email body.

^ permalink raw reply

* Re: [PATCH net-next 3/5] ipv4: support sport, dport and ip protocol in RTM_GETROUTE
From: Roopa Prabhu @ 2018-04-17  3:57 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, David Ahern
In-Reply-To: <20180416.185855.2220085572245129235.davem@davemloft.net>

On Mon, Apr 16, 2018 at 3:58 PM, David Miller <davem@davemloft.net> wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
> Date: Mon, 16 Apr 2018 13:41:36 -0700
>
>> diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
>> index 9b15005..7947252 100644
>> --- a/include/uapi/linux/rtnetlink.h
>> +++ b/include/uapi/linux/rtnetlink.h
>> @@ -327,6 +327,9 @@ enum rtattr_type_t {
>>       RTA_PAD,
>>       RTA_UID,
>>       RTA_TTL_PROPAGATE,
>> +     RTA_SPORT,
>> +     RTA_DPORT,
>> +     RTA_IP_PROTO,
>>       __RTA_MAX
>>  };
>>
>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
>> index ccb25d8..ae55711 100644
>> --- a/net/ipv4/route.c
>> +++ b/net/ipv4/route.c
>> @@ -2663,6 +2663,18 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
>>                       from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
>>               goto nla_put_failure;
>>
>> +     if (fl4->fl4_sport &&
>> +         nla_put_u16(skb, RTA_SPORT, ntohs(fl4->fl4_sport)))
>> +             goto nla_put_failure;
>
> The addreeses are given over netlink in network byte order, so let's
> be consistent and do the same for the ports et al. as well.
>

will do, thanks.

^ permalink raw reply

* Re: tcp hang when socket fills up ?
From: Dominique Martinet @ 2018-04-17  3:52 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Eric Dumazet, Marcelo Ricardo Leitner, Michal Kubecek, netdev
In-Reply-To: <20180416110158.2cwr3pi3anpkzrw3@breakpoint.cc>

[-- Attachment #1: Type: text/plain, Size: 8673 bytes --]

Thank you for the hints. Out of order reply.

Florian Westphal wrote on Mon, Apr 16, 2018:
> echo 1 > /proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal
> 
> which stops conntrack from marking packets with out-of-window
> acks as invalid.

That worked, the connection stays established with this set.


> echo 6 > /proc/sys/net/netfilter/nf_conntrack_log_invalid

Took me a while to figure how to log that properly (why do distros not
ship ulogd?!) but that is helpful.

I have attached the full logs to this mail as text attachments in case I
missed something in my excerpts


Specifically, there are many outbound invalid packets, but as Marcelo
pointed out there is no conntrack netfilter for outbound packets...
The first invalid incoming packet is that one (json output seemed to be
the most convenient as it has tcp seq/ackseq and the oob.prefix message,
feel free to ask for some other format)

{"timestamp": "2018-04-17T10:29:14.956485", "dvc": "Netfilter",
"raw.pktlen": 52, "raw.pktcount": 1, "oob.prefix": "nf_ct_tcp: ACK is
over the upper bound (ACKed data not seen yet) ", "oob.time.sec":
1523928554, "oob.time.usec": 956485, "oob.mark": 0, "oob.hook": 0,
"oob.family": 2, "oob.protocol": 2048, "raw.label": 0, "ip.protocol": 6,
"ip.tos": 0, "ip.ttl": 52, "ip.totlen": 52, "ip.ihl": 5, "ip.csum":
50909, "ip.id": 30328, "ip.fragoff": 16384, "src_port": 30280,
"dest_port": 29543, "tcp.seq": 4048786673, "tcp.ackseq": 3024392506,
"tcp.window": 781, "tcp.offset": 0, "tcp.reserved": 0, "tcp.urg": 0,
"tcp.ack": 1, "tcp.psh": 0, "tcp.rst": 0, "tcp.syn": 0, "tcp.fin": 0,
"tcp.res1": 0, "tcp.res2": 0, "tcp.csum": 17752, "oob.in": "",
"oob.out": "", "src_ip": "client", "dest_ip": "server"}

which is the first ack that isn't seen alright (sent packet, blocked
ack, replays):
10:29:14.926115 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681965809 ecr 1634528339], length 1374
...
10:29:14.956485 IP client.30280 > server.29543: Flags [.], ack 3024392506, win 781, options [nop,nop,TS val 1634528369 ecr 681965809], length 0
...
10:29:15.255489 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681966138 ecr 1634528369], length 1374
10:29:15.281581 IP client.30280 > server.29543: Flags [.], ack 3024436474, win 1444, options [nop,nop,TS val 1634528697 ecr 681965904,nop,nop,sack 1 {3024391132:3024392506}], length 0
10:29:15.555466 IP client.30280 > server.29543: Flags [P.], seq 4048786673:4048786709, ack 3024436474, win 1444, options [nop,nop,TS val 1634528869 ecr 681965904], length 36
10:29:15.719510 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681966602 ecr 1634528369], length 1374

So now I guess the question is why netfilter think this is over the
upper bound.
..And surely enough the answer is within the invalid outbound packets
that aren't considered in the conntrack state machine, the first of whih
is this one:
{"timestamp": "2018-04-17T10:29:14.926143", "dvc": "Netfilter",
"raw.pktlen": 2800, "raw.pktcount": 1, "oob.prefix": "nf_ct_tcp: SEQ is
over the upper bound (over the window of the receiver)", "oob.time.sec":
1523928554, "oob.time.usec": 926143, "oob.mark": 0, "oob.hook": 0,
"oob.family": 2, "oob.protocol": 2048, "oob.uid": 1000, "oob.gid": 100,
"raw.label": 0, "ip.protocol": 6, "ip.tos": 8, "ip.ttl": 64,
"ip.totlen": 2800, "ip.ihl": 5, "ip.csum": 47600, "ip.id": 27809,
"ip.fragoff": 16384, "src_port": 29543, "dest_port": 30280, "tcp.seq":
3024391132, "tcp.ackseq": 4048786673, "tcp.window": 307, "tcp.offset":
0, "tcp.reserved": 0, "tcp.urg": 0, "tcp.ack": 1, "tcp.psh": 0,
"tcp.rst": 0, "tcp.syn": 0, "tcp.fin": 0, "tcp.res1": 0, "tcp.res2": 0,
"tcp.csum": 5201, "oob.in": "", "oob.out": "", "src_ip": "server",
"dest_ip": "client"}

which sequence matches the start seq of the first packet we don't ack,
so that's why conntrack doesn't recongnize that as a valid ack.


Here's some context before that packet:
10:29:14.921305 IP server.29543 > client.30280: Flags [.], seq 3024378314:3024379688, ack 4048786673, win 307, options [nop,nop,TS val 681965804 ecr 1634528336], length 1374
10:29:14.921311 IP server.29543 > client.30280: Flags [.], seq 3024379688:3024381062, ack 4048786673, win 307, options [nop,nop,TS val 681965804 ecr 1634528336], length 1374
10:29:14.922166 IP client.30280 > server.29543: Flags [.], ack 3024365948, win 329, options [nop,nop,TS val 1634528336 ecr 681965778], length 0
10:29:14.922178 IP server.29543 > client.30280: Flags [.], seq 3024381062:3024382436, ack 4048786673, win 307, options [nop,nop,TS val 681965805 ecr 1634528336], length 1374
10:29:14.922182 IP server.29543 > client.30280: Flags [.], seq 3024382436:3024383810, ack 4048786673, win 307, options [nop,nop,TS val 681965805 ecr 1634528336], length 1374
10:29:14.923848 IP client.30280 > server.29543: Flags [.], ack 3024367322, win 352, options [nop,nop,TS val 1634528338 ecr 681965778], length 0
10:29:14.923860 IP server.29543 > client.30280: Flags [.], seq 3024383810:3024385184, ack 4048786673, win 307, options [nop,nop,TS val 681965807 ecr 1634528338], length 1374
10:29:14.923863 IP server.29543 > client.30280: Flags [.], seq 3024385184:3024386558, ack 4048786673, win 307, options [nop,nop,TS val 681965807 ecr 1634528338], length 1374
10:29:14.924175 IP client.30280 > server.29543: Flags [.], ack 3024368696, win 375, options [nop,nop,TS val 1634528338 ecr 681965778], length 0
10:29:14.924187 IP server.29543 > client.30280: Flags [.], seq 3024386558:3024387932, ack 4048786673, win 307, options [nop,nop,TS val 681965807 ecr 1634528338], length 1374
10:29:14.924190 IP server.29543 > client.30280: Flags [.], seq 3024387932:3024389306, ack 4048786673, win 307, options [nop,nop,TS val 681965807 ecr 1634528338], length 1374
10:29:14.925410 IP client.30280 > server.29543: Flags [.], ack 3024370070, win 397, options [nop,nop,TS val 1634528339 ecr 681965779], length 0
10:29:14.925422 IP server.29543 > client.30280: Flags [.], seq 3024389306:3024390680, ack 4048786673, win 307, options [nop,nop,TS val 681965808 ecr 1634528339], length 1374
10:29:14.925425 IP server.29543 > client.30280: Flags [P.], seq 3024390680:3024391132, ack 4048786673, win 307, options [nop,nop,TS val 681965808 ecr 1634528339], length 452
10:29:14.926030 IP client.30280 > server.29543: Flags [.], ack 3024371444, win 420, options [nop,nop,TS val 1634528339 ecr 681965779], length 0
10:29:14.926115 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681965809 ecr 1634528339], length 1374

But the way I see it, (3024392506-3024371444) = 21064 is smaller than
420*64 = 26880.
Even looking at the previous (ack 3024370070, win 397) we get 22436 and
25408 which still work, same with ack 3024368696, win 375 (23810 <
24000)
So that means conntrack is looking at an ack at least as old as ack
3024367322, win 352?...
Is there any way to confirm that? In the code it looks like we match
! before(seq, sender->td_maxend + 1)

I turned pr_debug on in tcp_in_window() for another try and it's a bit
mangled because the information on multiple lines and the function is
called in parallel but it looks like I do have some seq > maxend +1

Although it's weird, the maxend was set WAY earlier apparently?
Apr 17 11:13:14 res=1 sender end=1913287798 maxend=1913316998 maxwin=29312 receiver end=505004284 maxend=505033596 maxwin=29200
then window decreased drastically e.g. previous ack just before refusal:
Apr 17 11:13:53 seq=1913292311 ack=505007789+(0) sack=505007789+(0) win=284 end=1913292311
Apr 17 11:13:53 sender end=1913292311 maxend=1913331607 maxwin=284 scale=0 receiver end=505020155 maxend=505033596 maxwin=39296 scale=7

first res=0 (refused) packet:
Apr 17 11:13:53 odin kernel: seq=505042139 ack=1913292311+(0) ack=1913292311+(0) win=307 end=505044887
Apr 17 11:13:53 res=0 sender end=505033895 maxend=505033596 maxwin=39296 receiver end=1913292311 maxend=1913331607 maxwin=674


> (Earlier email implies this is related to timestamps, but unfortunately
>  to best of my knowledge conntrack doesn't look at tcp timestamps).

tcp_timestamp introduces other changes in behaviour so it might still be
relateed.



Recap of attachments:
 - new tcpdump output of a failed attempt
 - matching invalid packet dumped in json format
 - logs of tcp_in_window function unfortunately of a different attempt

Happy to provide new/different logs if it would be useful. Once again
thanks for the extra eyes and useful commands.

-- 
Dominique Martinet | Asmadeus

[-- Attachment #2: tcpdump --]
[-- Type: text/plain, Size: 27334 bytes --]

10:29:13.429532 IP server.29543 > client.30280: Flags [S], seq 3024360198, win 29200, options [mss 1460,sackOK,TS val 681964312 ecr 0,nop,wscale 7], length 0
10:29:13.712771 IP client.30280 > server.29543: Flags [S], seq 4048782683, win 29200, options [mss 1386,sackOK,TS val 1634527123 ecr 0,nop,wscale 7], length 0
10:29:13.712791 IP server.29543 > client.30280: Flags [S.], seq 3024360198, ack 4048782684, win 29200, options [mss 1460,sackOK,TS val 681964596 ecr 1634527123,nop,wscale 7], length 0
10:29:13.738441 IP client.30280 > server.29543: Flags [.], ack 3024360199, win 229, options [nop,nop,TS val 1634527153 ecr 681964596], length 0
10:29:13.740587 IP client.30280 > server.29543: Flags [P.], seq 4048782684:4048782705, ack 3024360199, win 229, options [nop,nop,TS val 1634527154 ecr 681964596], length 21
10:29:13.740610 IP server.29543 > client.30280: Flags [.], ack 4048782705, win 229, options [nop,nop,TS val 681964623 ecr 1634527154], length 0
10:29:13.744143 IP server.29543 > client.30280: Flags [P.], seq 3024360199:3024360220, ack 4048782705, win 229, options [nop,nop,TS val 681964627 ecr 1634527154], length 21
10:29:13.769430 IP client.30280 > server.29543: Flags [.], ack 3024360220, win 229, options [nop,nop,TS val 1634527185 ecr 681964627], length 0
10:29:13.769458 IP server.29543 > client.30280: Flags [P.], seq 3024360220:3024360932, ack 4048782705, win 229, options [nop,nop,TS val 681964652 ecr 1634527185], length 712
10:29:13.772401 IP client.30280 > server.29543: Flags [P.], seq 4048782705:4048783953, ack 3024360220, win 229, options [nop,nop,TS val 1634527187 ecr 681964627], length 1248
10:29:13.813512 IP server.29543 > client.30280: Flags [.], ack 4048783953, win 248, options [nop,nop,TS val 681964696 ecr 1634527187], length 0
10:29:13.835970 IP client.30280 > server.29543: Flags [.], ack 3024360932, win 240, options [nop,nop,TS val 1634527251 ecr 681964652], length 0
10:29:13.841601 IP client.30280 > server.29543: Flags [P.], seq 4048783953:4048784001, ack 3024360932, win 240, options [nop,nop,TS val 1634527255 ecr 681964696], length 48
10:29:13.841626 IP server.29543 > client.30280: Flags [.], ack 4048784001, win 248, options [nop,nop,TS val 681964724 ecr 1634527255], length 0
10:29:13.847085 IP server.29543 > client.30280: Flags [P.], seq 3024360932:3024361320, ack 4048784001, win 248, options [nop,nop,TS val 681964730 ecr 1634527255], length 388
10:29:13.872177 IP client.30280 > server.29543: Flags [.], ack 3024361320, win 251, options [nop,nop,TS val 1634527287 ecr 681964730], length 0
10:29:13.906359 IP client.30280 > server.29543: Flags [P.], seq 4048784001:4048784017, ack 3024361320, win 251, options [nop,nop,TS val 1634527321 ecr 681964730], length 16
10:29:13.946509 IP server.29543 > client.30280: Flags [.], ack 4048784017, win 248, options [nop,nop,TS val 681964829 ecr 1634527321], length 0
10:29:13.979888 IP client.30280 > server.29543: Flags [P.], seq 4048784017:4048784069, ack 3024361320, win 251, options [nop,nop,TS val 1634527387 ecr 681964829], length 52
10:29:13.979896 IP server.29543 > client.30280: Flags [.], ack 4048784069, win 248, options [nop,nop,TS val 681964863 ecr 1634527387], length 0
10:29:13.980189 IP server.29543 > client.30280: Flags [P.], seq 3024361320:3024361372, ack 4048784069, win 248, options [nop,nop,TS val 681964863 ecr 1634527387], length 52
10:29:14.007950 IP client.30280 > server.29543: Flags [P.], seq 4048784069:4048784137, ack 3024361372, win 251, options [nop,nop,TS val 1634527422 ecr 681964863], length 68
10:29:14.008365 IP server.29543 > client.30280: Flags [P.], seq 3024361372:3024361456, ack 4048784137, win 248, options [nop,nop,TS val 681964891 ecr 1634527422], length 84
10:29:14.035877 IP client.30280 > server.29543: Flags [P.], seq 4048784137:4048784765, ack 3024361456, win 251, options [nop,nop,TS val 1634527449 ecr 681964891], length 628
10:29:14.036743 IP server.29543 > client.30280: Flags [P.], seq 3024361456:3024362036, ack 4048784765, win 268, options [nop,nop,TS val 681964920 ecr 1634527449], length 580
10:29:14.075014 IP client.30280 > server.29543: Flags [P.], seq 4048784765:4048785937, ack 3024362036, win 262, options [nop,nop,TS val 1634527489 ecr 681964920], length 1172
10:29:14.076317 IP server.29543 > client.30280: Flags [P.], seq 3024362036:3024362072, ack 4048785937, win 287, options [nop,nop,TS val 681964959 ecr 1634527489], length 36
10:29:14.102624 IP client.30280 > server.29543: Flags [P.], seq 4048785937:4048786057, ack 3024362072, win 262, options [nop,nop,TS val 1634527517 ecr 681964959], length 120
10:29:14.102857 IP server.29543 > client.30280: Flags [P.], seq 3024362072:3024362732, ack 4048786057, win 287, options [nop,nop,TS val 681964986 ecr 1634527517], length 660
10:29:14.171874 IP client.30280 > server.29543: Flags [.], ack 3024362732, win 273, options [nop,nop,TS val 1634527584 ecr 681964986], length 0
10:29:14.171890 IP server.29543 > client.30280: Flags [P.], seq 3024362732:3024362784, ack 4048786057, win 287, options [nop,nop,TS val 681965055 ecr 1634527584], length 52
10:29:14.199156 IP client.30280 > server.29543: Flags [.], ack 3024362784, win 273, options [nop,nop,TS val 1634527614 ecr 681965055], length 0
10:29:14.200359 IP client.30280 > server.29543: Flags [P.], seq 4048786057:4048786585, ack 3024362784, win 273, options [nop,nop,TS val 1634527615 ecr 681965055], length 528
10:29:14.201431 IP server.29543 > client.30280: Flags [P.], seq 3024362784:3024362892, ack 4048786585, win 307, options [nop,nop,TS val 681965084 ecr 1634527615], length 108
10:29:14.227696 IP client.30280 > server.29543: Flags [P.], seq 4048786585:4048786637, ack 3024362892, win 273, options [nop,nop,TS val 1634527643 ecr 681965084], length 52
10:29:14.227737 IP server.29543 > client.30280: Flags [P.], seq 3024362892:3024362992, ack 4048786637, win 307, options [nop,nop,TS val 681965111 ecr 1634527643], length 100
10:29:14.296663 IP client.30280 > server.29543: Flags [.], ack 3024362992, win 273, options [nop,nop,TS val 1634527709 ecr 681965111], length 0
10:29:14.296678 IP server.29543 > client.30280: Flags [P.], seq 3024362992:3024363164, ack 4048786637, win 307, options [nop,nop,TS val 681965179 ecr 1634527709], length 172
10:29:14.322190 IP client.30280 > server.29543: Flags [.], ack 3024363164, win 284, options [nop,nop,TS val 1634527738 ecr 681965179], length 0
10:29:14.891087 IP client.30280 > server.29543: Flags [P.], seq 4048786637:4048786673, ack 3024363164, win 284, options [nop,nop,TS val 1634528306 ecr 681965179], length 36
10:29:14.891605 IP server.29543 > client.30280: Flags [P.], seq 3024363164:3024363200, ack 4048786673, win 307, options [nop,nop,TS val 681965774 ecr 1634528306], length 36
10:29:14.895150 IP server.29543 > client.30280: Flags [.], seq 3024363200:3024364574, ack 4048786673, win 307, options [nop,nop,TS val 681965778 ecr 1634528306], length 1374
10:29:14.895201 IP server.29543 > client.30280: Flags [.], seq 3024364574:3024365948, ack 4048786673, win 307, options [nop,nop,TS val 681965778 ecr 1634528306], length 1374
10:29:14.895329 IP server.29543 > client.30280: Flags [.], seq 3024365948:3024367322, ack 4048786673, win 307, options [nop,nop,TS val 681965778 ecr 1634528306], length 1374
10:29:14.895334 IP server.29543 > client.30280: Flags [.], seq 3024367322:3024368696, ack 4048786673, win 307, options [nop,nop,TS val 681965778 ecr 1634528306], length 1374
10:29:14.896045 IP server.29543 > client.30280: Flags [.], seq 3024368696:3024370070, ack 4048786673, win 307, options [nop,nop,TS val 681965779 ecr 1634528306], length 1374
10:29:14.896056 IP server.29543 > client.30280: Flags [.], seq 3024370070:3024371444, ack 4048786673, win 307, options [nop,nop,TS val 681965779 ecr 1634528306], length 1374
10:29:14.896058 IP server.29543 > client.30280: Flags [.], seq 3024371444:3024372818, ack 4048786673, win 307, options [nop,nop,TS val 681965779 ecr 1634528306], length 1374
10:29:14.896380 IP server.29543 > client.30280: Flags [.], seq 3024372818:3024374192, ack 4048786673, win 307, options [nop,nop,TS val 681965779 ecr 1634528306], length 1374
10:29:14.896384 IP server.29543 > client.30280: Flags [.], seq 3024374192:3024375566, ack 4048786673, win 307, options [nop,nop,TS val 681965779 ecr 1634528306], length 1374
10:29:14.917086 IP client.30280 > server.29543: Flags [.], ack 3024363200, win 284, options [nop,nop,TS val 1634528332 ecr 681965774], length 0
10:29:14.917104 IP server.29543 > client.30280: Flags [.], seq 3024375566:3024376940, ack 4048786673, win 307, options [nop,nop,TS val 681965800 ecr 1634528332], length 1374
10:29:14.917110 IP server.29543 > client.30280: Flags [.], seq 3024376940:3024378314, ack 4048786673, win 307, options [nop,nop,TS val 681965800 ecr 1634528332], length 1374
10:29:14.921290 IP client.30280 > server.29543: Flags [.], ack 3024364574, win 307, options [nop,nop,TS val 1634528336 ecr 681965778], length 0
10:29:14.921305 IP server.29543 > client.30280: Flags [.], seq 3024378314:3024379688, ack 4048786673, win 307, options [nop,nop,TS val 681965804 ecr 1634528336], length 1374
10:29:14.921311 IP server.29543 > client.30280: Flags [.], seq 3024379688:3024381062, ack 4048786673, win 307, options [nop,nop,TS val 681965804 ecr 1634528336], length 1374
10:29:14.922166 IP client.30280 > server.29543: Flags [.], ack 3024365948, win 329, options [nop,nop,TS val 1634528336 ecr 681965778], length 0
10:29:14.922178 IP server.29543 > client.30280: Flags [.], seq 3024381062:3024382436, ack 4048786673, win 307, options [nop,nop,TS val 681965805 ecr 1634528336], length 1374
10:29:14.922182 IP server.29543 > client.30280: Flags [.], seq 3024382436:3024383810, ack 4048786673, win 307, options [nop,nop,TS val 681965805 ecr 1634528336], length 1374
10:29:14.923848 IP client.30280 > server.29543: Flags [.], ack 3024367322, win 352, options [nop,nop,TS val 1634528338 ecr 681965778], length 0
10:29:14.923860 IP server.29543 > client.30280: Flags [.], seq 3024383810:3024385184, ack 4048786673, win 307, options [nop,nop,TS val 681965807 ecr 1634528338], length 1374
10:29:14.923863 IP server.29543 > client.30280: Flags [.], seq 3024385184:3024386558, ack 4048786673, win 307, options [nop,nop,TS val 681965807 ecr 1634528338], length 1374
10:29:14.924175 IP client.30280 > server.29543: Flags [.], ack 3024368696, win 375, options [nop,nop,TS val 1634528338 ecr 681965778], length 0
10:29:14.924187 IP server.29543 > client.30280: Flags [.], seq 3024386558:3024387932, ack 4048786673, win 307, options [nop,nop,TS val 681965807 ecr 1634528338], length 1374
10:29:14.924190 IP server.29543 > client.30280: Flags [.], seq 3024387932:3024389306, ack 4048786673, win 307, options [nop,nop,TS val 681965807 ecr 1634528338], length 1374
10:29:14.925410 IP client.30280 > server.29543: Flags [.], ack 3024370070, win 397, options [nop,nop,TS val 1634528339 ecr 681965779], length 0
10:29:14.925422 IP server.29543 > client.30280: Flags [.], seq 3024389306:3024390680, ack 4048786673, win 307, options [nop,nop,TS val 681965808 ecr 1634528339], length 1374
10:29:14.925425 IP server.29543 > client.30280: Flags [P.], seq 3024390680:3024391132, ack 4048786673, win 307, options [nop,nop,TS val 681965808 ecr 1634528339], length 452
10:29:14.926030 IP client.30280 > server.29543: Flags [.], ack 3024371444, win 420, options [nop,nop,TS val 1634528339 ecr 681965779], length 0
10:29:14.926115 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681965809 ecr 1634528339], length 1374
10:29:14.926122 IP server.29543 > client.30280: Flags [.], seq 3024392506:3024393880, ack 4048786673, win 307, options [nop,nop,TS val 681965809 ecr 1634528339], length 1374
10:29:14.926353 IP client.30280 > server.29543: Flags [.], ack 3024372818, win 443, options [nop,nop,TS val 1634528339 ecr 681965779], length 0
10:29:14.926924 IP client.30280 > server.29543: Flags [.], ack 3024374192, win 465, options [nop,nop,TS val 1634528339 ecr 681965779], length 0
10:29:14.927677 IP client.30280 > server.29543: Flags [.], ack 3024375566, win 488, options [nop,nop,TS val 1634528340 ecr 681965779], length 0
10:29:14.928109 IP server.29543 > client.30280: Flags [.], seq 3024393880:3024395254, ack 4048786673, win 307, options [nop,nop,TS val 681965811 ecr 1634528340], length 1374
10:29:14.928113 IP server.29543 > client.30280: Flags [.], seq 3024395254:3024396628, ack 4048786673, win 307, options [nop,nop,TS val 681965811 ecr 1634528340], length 1374
10:29:14.928701 IP server.29543 > client.30280: Flags [.], seq 3024396628:3024398002, ack 4048786673, win 307, options [nop,nop,TS val 681965811 ecr 1634528340], length 1374
10:29:14.928705 IP server.29543 > client.30280: Flags [.], seq 3024398002:3024399376, ack 4048786673, win 307, options [nop,nop,TS val 681965811 ecr 1634528340], length 1374
10:29:14.929817 IP server.29543 > client.30280: Flags [.], seq 3024399376:3024400750, ack 4048786673, win 307, options [nop,nop,TS val 681965813 ecr 1634528340], length 1374
10:29:14.929821 IP server.29543 > client.30280: Flags [.], seq 3024400750:3024402124, ack 4048786673, win 307, options [nop,nop,TS val 681965813 ecr 1634528340], length 1374
10:29:14.942186 IP client.30280 > server.29543: Flags [.], ack 3024376940, win 510, options [nop,nop,TS val 1634528358 ecr 681965800], length 0
10:29:14.942203 IP server.29543 > client.30280: Flags [.], seq 3024402124:3024403498, ack 4048786673, win 307, options [nop,nop,TS val 681965825 ecr 1634528358], length 1374
10:29:14.942208 IP server.29543 > client.30280: Flags [.], seq 3024403498:3024404872, ack 4048786673, win 307, options [nop,nop,TS val 681965825 ecr 1634528358], length 1374
10:29:14.942782 IP client.30280 > server.29543: Flags [.], ack 3024378314, win 533, options [nop,nop,TS val 1634528358 ecr 681965800], length 0
10:29:14.942797 IP server.29543 > client.30280: Flags [.], seq 3024404872:3024406246, ack 4048786673, win 307, options [nop,nop,TS val 681965826 ecr 1634528358], length 1374
10:29:14.942800 IP server.29543 > client.30280: Flags [.], seq 3024406246:3024407620, ack 4048786673, win 307, options [nop,nop,TS val 681965826 ecr 1634528358], length 1374
10:29:14.946310 IP client.30280 > server.29543: Flags [.], ack 3024379688, win 556, options [nop,nop,TS val 1634528362 ecr 681965804], length 0
10:29:14.946326 IP server.29543 > client.30280: Flags [.], seq 3024407620:3024408994, ack 4048786673, win 307, options [nop,nop,TS val 681965829 ecr 1634528362], length 1374
10:29:14.946332 IP server.29543 > client.30280: Flags [.], seq 3024408994:3024410368, ack 4048786673, win 307, options [nop,nop,TS val 681965829 ecr 1634528362], length 1374
10:29:14.946922 IP client.30280 > server.29543: Flags [.], ack 3024381062, win 578, options [nop,nop,TS val 1634528363 ecr 681965804], length 0
10:29:14.946937 IP server.29543 > client.30280: Flags [.], seq 3024410368:3024411742, ack 4048786673, win 307, options [nop,nop,TS val 681965830 ecr 1634528363], length 1374
10:29:14.946940 IP server.29543 > client.30280: Flags [.], seq 3024411742:3024413116, ack 4048786673, win 307, options [nop,nop,TS val 681965830 ecr 1634528363], length 1374
10:29:14.947854 IP client.30280 > server.29543: Flags [.], ack 3024382436, win 601, options [nop,nop,TS val 1634528363 ecr 681965805], length 0
10:29:14.947870 IP server.29543 > client.30280: Flags [.], seq 3024413116:3024414490, ack 4048786673, win 307, options [nop,nop,TS val 681965831 ecr 1634528363], length 1374
10:29:14.947873 IP server.29543 > client.30280: Flags [.], seq 3024414490:3024415864, ack 4048786673, win 307, options [nop,nop,TS val 681965831 ecr 1634528363], length 1374
10:29:14.948437 IP client.30280 > server.29543: Flags [.], ack 3024383810, win 624, options [nop,nop,TS val 1634528364 ecr 681965805], length 0
10:29:14.949374 IP server.29543 > client.30280: Flags [.], seq 3024415864:3024417238, ack 4048786673, win 307, options [nop,nop,TS val 681965832 ecr 1634528364], length 1374
10:29:14.949378 IP server.29543 > client.30280: Flags [.], seq 3024417238:3024418612, ack 4048786673, win 307, options [nop,nop,TS val 681965832 ecr 1634528364], length 1374
10:29:14.950039 IP client.30280 > server.29543: Flags [.], ack 3024385184, win 646, options [nop,nop,TS val 1634528365 ecr 681965807], length 0
10:29:14.951071 IP server.29543 > client.30280: Flags [.], seq 3024418612:3024419986, ack 4048786673, win 307, options [nop,nop,TS val 681965834 ecr 1634528365], length 1374
10:29:14.951075 IP server.29543 > client.30280: Flags [.], seq 3024419986:3024421360, ack 4048786673, win 307, options [nop,nop,TS val 681965834 ecr 1634528365], length 1374
10:29:14.951334 IP client.30280 > server.29543: Flags [.], ack 3024386558, win 669, options [nop,nop,TS val 1634528366 ecr 681965807], length 0
10:29:14.951824 IP client.30280 > server.29543: Flags [.], ack 3024387932, win 691, options [nop,nop,TS val 1634528366 ecr 681965807], length 0
10:29:14.952159 IP client.30280 > server.29543: Flags [.], ack 3024389306, win 714, options [nop,nop,TS val 1634528367 ecr 681965807], length 0
10:29:14.953216 IP server.29543 > client.30280: Flags [.], seq 3024421360:3024422734, ack 4048786673, win 307, options [nop,nop,TS val 681965836 ecr 1634528367], length 1374
10:29:14.953220 IP server.29543 > client.30280: Flags [.], seq 3024422734:3024424108, ack 4048786673, win 307, options [nop,nop,TS val 681965836 ecr 1634528367], length 1374
10:29:14.953473 IP client.30280 > server.29543: Flags [.], ack 3024390680, win 737, options [nop,nop,TS val 1634528368 ecr 681965808], length 0
10:29:14.954411 IP server.29543 > client.30280: Flags [.], seq 3024424108:3024425482, ack 4048786673, win 307, options [nop,nop,TS val 681965837 ecr 1634528368], length 1374
10:29:14.954415 IP server.29543 > client.30280: Flags [.], seq 3024425482:3024426856, ack 4048786673, win 307, options [nop,nop,TS val 681965837 ecr 1634528368], length 1374
10:29:14.955336 IP client.30280 > server.29543: Flags [.], ack 3024391132, win 758, options [nop,nop,TS val 1634528369 ecr 681965808], length 0
10:29:14.955646 IP server.29543 > client.30280: Flags [.], seq 3024426856:3024428230, ack 4048786673, win 307, options [nop,nop,TS val 681965838 ecr 1634528369], length 1374
10:29:14.955650 IP server.29543 > client.30280: Flags [.], seq 3024428230:3024429604, ack 4048786673, win 307, options [nop,nop,TS val 681965838 ecr 1634528369], length 1374
10:29:14.956485 IP client.30280 > server.29543: Flags [.], ack 3024392506, win 781, options [nop,nop,TS val 1634528369 ecr 681965809], length 0
10:29:14.956836 IP server.29543 > client.30280: Flags [.], seq 3024429604:3024430978, ack 4048786673, win 307, options [nop,nop,TS val 681965840 ecr 1634528369], length 1374
10:29:14.956841 IP server.29543 > client.30280: Flags [.], seq 3024430978:3024432352, ack 4048786673, win 307, options [nop,nop,TS val 681965840 ecr 1634528369], length 1374
10:29:14.957123 IP client.30280 > server.29543: Flags [.], ack 3024393880, win 803, options [nop,nop,TS val 1634528370 ecr 681965809], length 0
10:29:14.958055 IP client.30280 > server.29543: Flags [.], ack 3024395254, win 826, options [nop,nop,TS val 1634528371 ecr 681965811], length 0
10:29:14.958541 IP client.30280 > server.29543: Flags [.], ack 3024396628, win 849, options [nop,nop,TS val 1634528371 ecr 681965811], length 0
10:29:14.958931 IP client.30280 > server.29543: Flags [.], ack 3024398002, win 871, options [nop,nop,TS val 1634528372 ecr 681965811], length 0
10:29:14.959265 IP client.30280 > server.29543: Flags [.], ack 3024399376, win 894, options [nop,nop,TS val 1634528373 ecr 681965811], length 0
10:29:14.960799 IP client.30280 > server.29543: Flags [.], ack 3024400750, win 917, options [nop,nop,TS val 1634528375 ecr 681965813], length 0
10:29:14.961117 IP client.30280 > server.29543: Flags [.], ack 3024402124, win 939, options [nop,nop,TS val 1634528376 ecr 681965813], length 0
10:29:14.961443 IP server.29543 > client.30280: Flags [.], seq 3024432352:3024433726, ack 4048786673, win 307, options [nop,nop,TS val 681965844 ecr 1634528369], length 1374
10:29:14.961449 IP server.29543 > client.30280: Flags [.], seq 3024433726:3024435100, ack 4048786673, win 307, options [nop,nop,TS val 681965844 ecr 1634528369], length 1374
10:29:14.969116 IP client.30280 > server.29543: Flags [.], ack 3024403498, win 962, options [nop,nop,TS val 1634528384 ecr 681965825], length 0
10:29:14.969924 IP client.30280 > server.29543: Flags [.], ack 3024404872, win 984, options [nop,nop,TS val 1634528384 ecr 681965825], length 0
10:29:14.970246 IP client.30280 > server.29543: Flags [.], ack 3024406246, win 1007, options [nop,nop,TS val 1634528384 ecr 681965826], length 0
10:29:14.970652 IP client.30280 > server.29543: Flags [.], ack 3024407620, win 1030, options [nop,nop,TS val 1634528385 ecr 681965826], length 0
10:29:14.971939 IP client.30280 > server.29543: Flags [.], ack 3024408994, win 1052, options [nop,nop,TS val 1634528388 ecr 681965829], length 0
10:29:14.973628 IP client.30280 > server.29543: Flags [.], ack 3024410368, win 1075, options [nop,nop,TS val 1634528389 ecr 681965829], length 0
10:29:14.974381 IP client.30280 > server.29543: Flags [.], ack 3024411742, win 1098, options [nop,nop,TS val 1634528390 ecr 681965830], length 0
10:29:14.975803 IP client.30280 > server.29543: Flags [.], ack 3024413116, win 1120, options [nop,nop,TS val 1634528391 ecr 681965830], length 0
10:29:14.976415 IP client.30280 > server.29543: Flags [.], ack 3024414490, win 1143, options [nop,nop,TS val 1634528392 ecr 681965831], length 0
10:29:14.977895 IP client.30280 > server.29543: Flags [.], ack 3024415864, win 1165, options [nop,nop,TS val 1634528393 ecr 681965831], length 0
10:29:14.978221 IP client.30280 > server.29543: Flags [.], ack 3024417238, win 1188, options [nop,nop,TS val 1634528393 ecr 681965832], length 0
10:29:14.978864 IP client.30280 > server.29543: Flags [.], ack 3024418612, win 1211, options [nop,nop,TS val 1634528394 ecr 681965832], length 0
10:29:14.979653 IP client.30280 > server.29543: Flags [.], ack 3024419986, win 1233, options [nop,nop,TS val 1634528395 ecr 681965834], length 0
10:29:14.979976 IP client.30280 > server.29543: Flags [.], ack 3024421360, win 1256, options [nop,nop,TS val 1634528396 ecr 681965834], length 0
10:29:14.980864 IP client.30280 > server.29543: Flags [.], ack 3024422734, win 1279, options [nop,nop,TS val 1634528396 ecr 681965836], length 0
10:29:14.981489 IP client.30280 > server.29543: Flags [.], ack 3024424108, win 1301, options [nop,nop,TS val 1634528397 ecr 681965836], length 0
10:29:14.981877 IP client.30280 > server.29543: Flags [.], ack 3024425482, win 1324, options [nop,nop,TS val 1634528397 ecr 681965837], length 0
10:29:14.985230 IP client.30280 > server.29543: Flags [.], ack 3024426856, win 1346, options [nop,nop,TS val 1634528401 ecr 681965837], length 0
10:29:14.986222 IP client.30280 > server.29543: Flags [P.], seq 4048786673:4048786709, ack 3024426856, win 1346, options [nop,nop,TS val 1634528401 ecr 681965837], length 36
10:29:14.987058 IP client.30280 > server.29543: Flags [.], ack 3024429604, win 1392, options [nop,nop,TS val 1634528402 ecr 681965838], length 0
10:29:14.988349 IP client.30280 > server.29543: Flags [.], ack 3024432352, win 1437, options [nop,nop,TS val 1634528403 ecr 681965840], length 0
10:29:14.989103 IP client.30280 > server.29543: Flags [.], ack 3024435100, win 1444, options [nop,nop,TS val 1634528404 ecr 681965844], length 0
10:29:15.021406 IP server.29543 > client.30280: Flags [.], seq 3024435100:3024436474, ack 4048786673, win 307, options [nop,nop,TS val 681965904 ecr 1634528369], length 1374
10:29:15.087837 IP client.30280 > server.29543: Flags [.], ack 3024436474, win 1444, options [nop,nop,TS val 1634528503 ecr 681965904], length 0
10:29:15.242785 IP client.30280 > server.29543: Flags [P.], seq 4048786673:4048786709, ack 3024436474, win 1444, options [nop,nop,TS val 1634528637 ecr 681965904], length 36
10:29:15.255489 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681966138 ecr 1634528369], length 1374
10:29:15.281581 IP client.30280 > server.29543: Flags [.], ack 3024436474, win 1444, options [nop,nop,TS val 1634528697 ecr 681965904,nop,nop,sack 1 {3024391132:3024392506}], length 0
10:29:15.555466 IP client.30280 > server.29543: Flags [P.], seq 4048786673:4048786709, ack 3024436474, win 1444, options [nop,nop,TS val 1634528869 ecr 681965904], length 36
10:29:15.719510 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681966602 ecr 1634528369], length 1374
10:29:15.746624 IP client.30280 > server.29543: Flags [.], ack 3024436474, win 1444, options [nop,nop,TS val 1634529162 ecr 681965904,nop,nop,sack 1 {3024391132:3024392506}], length 0
10:29:15.918363 IP client.30280 > server.29543: Flags [P.], seq 4048786673:4048786709, ack 3024436474, win 1444, options [nop,nop,TS val 1634529333 ecr 681965904], length 36
10:29:16.695512 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681967578 ecr 1634528369], length 1374
10:29:16.737481 IP client.30280 > server.29543: Flags [.], ack 3024436474, win 1444, options [nop,nop,TS val 1634530143 ecr 681965904,nop,nop,sack 1 {3024391132:3024392506}], length 0
10:29:16.886594 IP client.30280 > server.29543: Flags [P.], seq 4048786673:4048786709, ack 3024436474, win 1444, options [nop,nop,TS val 1634530301 ecr 681965904], length 36
10:29:18.551512 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681969434 ecr 1634528369], length 1374
10:29:18.578152 IP client.30280 > server.29543: Flags [.], ack 3024436474, win 1444, options [nop,nop,TS val 1634531993 ecr 681965904,nop,nop,sack 1 {3024391132:3024392506}], length 0
10:29:18.742471 IP client.30280 > server.29543: Flags [P.], seq 4048786673:4048786709, ack 3024436474, win 1444, options [nop,nop,TS val 1634532157 ecr 681965904], length 36
10:29:22.263513 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681973146 ecr 1634528369], length 1374
10:29:22.298045 IP client.30280 > server.29543: Flags [.], ack 3024436474, win 1444, options [nop,nop,TS val 1634535706 ecr 681965904,nop,nop,sack 1 {3024391132:3024392506}], length 0
10:29:22.456055 IP client.30280 > server.29543: Flags [P.], seq 4048786673:4048786709, ack 3024436474, win 1444, options [nop,nop,TS val 1634535869 ecr 681965904], length 36
10:29:22.698033 IP client.30280 > server.29543: Flags [FP.], seq 4048786709:4048786849, ack 3024436474, win 1444, options [nop,nop,TS val 1634536113 ecr 681965904], length 140
10:29:29.943515 IP server.29543 > client.30280: Flags [.], seq 3024391132:3024392506, ack 4048786673, win 307, options [nop,nop,TS val 681980826 ecr 1634528369], length 1374
10:29:29.976496 IP client.30280 > server.29543: Flags [.], ack 3024436474, win 1444, options [nop,nop,TS val 1634543386 ecr 681965904,nop,nop,sack 1 {3024391132:3024392506}], length 0
10:29:30.326520 IP client.30280 > server.29543: Flags [FP.], seq 4048786673:4048786849, ack 3024436474, win 1444, options [nop,nop,TS val 1634543741 ecr 681965904], length 176

[-- Attachment #3: ulogd.json --]
[-- Type: application/json, Size: 48712 bytes --]

[-- Attachment #4: dmesg.xz --]
[-- Type: application/octet-stream, Size: 500772 bytes --]

^ permalink raw reply

* [PATCH][net-next] net: ip tos cgroup
From: Li RongQing @ 2018-04-17  3:36 UTC (permalink / raw)
  To: netdev

ip tos segment can be changed by setsockopt(IP_TOS), or by iptables;
this patch creates a new method to change socket tos segment of
processes based on cgroup

The usage:

    1. mount ip_tos cgroup, and setting tos value
    mount -t cgroup -o ip_tos ip_tos /cgroups/tos
    echo tos_value >/cgroups/tos/ip_tos.tos
    2. then move processes to cgroup, or create processes in cgroup

Signed-off-by: jimyan <jimyan@baidu.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
 include/linux/cgroup_subsys.h |   4 ++
 include/net/tos_cgroup.h      |  35 ++++++++++++
 net/ipv4/Kconfig              |  10 ++++
 net/ipv4/Makefile             |   1 +
 net/ipv4/af_inet.c            |   2 +
 net/ipv4/tos_cgroup.c         | 128 ++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/af_inet6.c           |   2 +
 7 files changed, 182 insertions(+)
 create mode 100644 include/net/tos_cgroup.h
 create mode 100644 net/ipv4/tos_cgroup.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..1b86eda1c23e 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -61,6 +61,10 @@ SUBSYS(pids)
 SUBSYS(rdma)
 #endif
 
+#if IS_ENABLED(CONFIG_IP_TOS_CGROUP)
+SUBSYS(ip_tos)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/net/tos_cgroup.h b/include/net/tos_cgroup.h
new file mode 100644
index 000000000000..0868e921faf3
--- /dev/null
+++ b/include/net/tos_cgroup.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _IP_TOS_CGROUP_H
+#define _IP_TOS_CGROUP_H
+
+#include <linux/cgroup.h>
+#include <linux/hardirq.h>
+
+struct tos_cgroup_state {
+	struct cgroup_subsys_state css;
+	u32 tos;
+};
+
+#if IS_ENABLED(CONFIG_IP_TOS_CGROUP)
+static inline u32 task_ip_tos(struct task_struct *p)
+{
+	u32 tos;
+
+	if (in_interrupt())
+		return 0;
+
+	rcu_read_lock();
+	tos = container_of(task_css(p, ip_tos_cgrp_id),
+			struct tos_cgroup_state, css)->tos;
+	rcu_read_unlock();
+
+	return tos;
+}
+#else /* !CONFIG_IP_TOS_CGROUP */
+static inline u32 task_ip_tos(struct task_struct *p)
+{
+	return 0;
+}
+#endif /* CONFIG_IP_TOS_CGROUP */
+#endif  /* _IP_TOS_CGROUP_H */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 80dad301361d..57070bbb0394 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -753,3 +753,13 @@ config TCP_MD5SIG
 	  on the Internet.
 
 	  If unsure, say N.
+
+config IP_TOS_CGROUP
+	bool "ip tos cgroup"
+	depends on CGROUPS
+	default n
+	---help---
+	  Say Y here if you want to set ip packet tos based on the
+	  control cgroup of their process.
+
+	  This can set ip packet tos
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a07b7dd06def..12c708142d1f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_IP_TOS_CGROUP) += tos_cgroup.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eaed0367e669..e2dd902b06dd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -120,6 +120,7 @@
 #include <linux/mroute.h>
 #endif
 #include <net/l3mdev.h>
+#include <net/tos_cgroup.h>
 
 #include <trace/events/sock.h>
 
@@ -356,6 +357,7 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
 	inet->mc_index	= 0;
 	inet->mc_list	= NULL;
 	inet->rcv_tos	= 0;
+	inet->tos       = task_ip_tos(current);
 
 	sk_refcnt_debug_inc(sk);
 
diff --git a/net/ipv4/tos_cgroup.c b/net/ipv4/tos_cgroup.c
new file mode 100644
index 000000000000..dbb828f5b464
--- /dev/null
+++ b/net/ipv4/tos_cgroup.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/tos_cgroup.h>
+#include <linux/fdtable.h>
+#include <net/route.h>
+#include <net/inet_ecn.h>
+#include <linux/sched/task.h>
+
+static inline
+struct tos_cgroup_state *css_tos_cgroup(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct tos_cgroup_state, css) : NULL;
+}
+
+static inline struct tos_cgroup_state *task_tos_cgroup(struct task_struct *task)
+{
+	return css_tos_cgroup(task_css(task, ip_tos_cgrp_id));
+}
+
+static struct cgroup_subsys_state
+*cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct tos_cgroup_state *cs;
+
+	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+	if (!cs)
+		return ERR_PTR(-ENOMEM);
+
+	return &cs->css;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css_tos_cgroup(css));
+}
+
+static int update_tos(const void *v, struct file *file, unsigned int n)
+{
+	int err;
+	struct socket *sock = sock_from_file(file, &err);
+	unsigned char val = (unsigned char)*(u64 *)v;
+
+	if (sock && (sock->sk->sk_family == PF_INET ||
+				sock->sk->sk_family == PF_INET6)) {
+		struct inet_sock *inet = inet_sk(sock->sk);
+
+		lock_sock(sock->sk);
+		if (sock->sk->sk_type == SOCK_STREAM) {
+			val &= ~INET_ECN_MASK;
+			val |= inet->tos & INET_ECN_MASK;
+		}
+		if (inet->tos != val) {
+			inet->tos = val;
+			sock->sk->sk_priority = rt_tos2priority(val);
+			sk_dst_reset(sock->sk);
+		}
+		release_sock(sock->sk);
+	}
+	return 0;
+}
+
+static void cgrp_attach(struct cgroup_taskset *tset)
+{
+	struct task_struct *p;
+	struct cgroup_subsys_state *css;
+	u64 v;
+
+	cgroup_taskset_for_each(p, css, tset) {
+		task_lock(p);
+		v = task_tos_cgroup(p)->tos;
+		iterate_fd(p->files, 0, update_tos, (void *)&v);
+		task_unlock(p);
+	}
+}
+
+static u64 read_tos(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return css_tos_cgroup(css)->tos;
+}
+
+static int
+write_tos(struct cgroup_subsys_state *css, struct cftype *cft, u64 value)
+{
+	struct css_task_iter it;
+	struct task_struct *task = NULL;
+
+	if (value < 0 || value > 255) {
+		pr_info("Invalid TOS value\n");
+		return 0;
+	}
+
+	css_tos_cgroup(css)->tos = (u32)value;
+
+	css_task_iter_start(css, 0, &it);
+	while ((task = css_task_iter_next(&it))) {
+		task_lock(task);
+		iterate_fd(task->files, 0, update_tos, (void *)&value);
+		task_unlock(task);
+	}
+	css_task_iter_end(&it);
+
+	return 0;
+}
+
+static struct cftype ss_files[] = {
+	{
+		.name = "tos",
+		.read_u64 = read_tos,
+		.write_u64 = write_tos,
+	},
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys ip_tos_cgrp_subsys = {
+	.css_alloc	= cgrp_css_alloc,
+	.css_free	= cgrp_css_free,
+	.attach		= cgrp_attach,
+	.legacy_cftypes	= ss_files,
+};
+
+MODULE_LICENSE("GPL v2");
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 8da0b513f188..d33e240613e0 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -44,6 +44,7 @@
 #include <linux/icmpv6.h>
 #include <linux/netfilter_ipv6.h>
 
+#include <net/tos_cgroup.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/udp.h>
@@ -223,6 +224,7 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
 	inet->mc_index	= 0;
 	inet->mc_list	= NULL;
 	inet->rcv_tos	= 0;
+	inet->tos       = task_ip_tos(current);
 
 	if (net->ipv4.sysctl_ip_no_pmtu_disc)
 		inet->pmtudisc = IP_PMTUDISC_DONT;
-- 
2.11.0

^ permalink raw reply related

* Re: [RFC v2] virtio: support packed ring
From: Tiwei Bie @ 2018-04-17  2:51 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <8dee7d62-ac0b-54ba-6bec-4bc4a6fb34e9@redhat.com>

On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
> On 2018年04月13日 15:15, Tiwei Bie wrote:
> > On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> > > On 2018年04月01日 22:12, Tiwei Bie wrote:
[...]
> > > > +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > +			      void **ctx)
> > > > +{
> > > > +	struct vring_packed_desc *desc;
> > > > +	unsigned int i, j;
> > > > +
> > > > +	/* Clear data ptr. */
> > > > +	vq->desc_state[head].data = NULL;
> > > > +
> > > > +	i = head;
> > > > +
> > > > +	for (j = 0; j < vq->desc_state[head].num; j++) {
> > > > +		desc = &vq->vring_packed.desc[i];
> > > > +		vring_unmap_one_packed(vq, desc);
> > > > +		desc->flags = 0x0;
> > > Looks like this is unnecessary.
> > It's safer to zero it. If we don't zero it, after we
> > call virtqueue_detach_unused_buf_packed() which calls
> > this function, the desc is still available to the
> > device.
> 
> Well detach_unused_buf_packed() should be called after device is stopped,
> otherwise even if you try to clear, there will still be a window that device
> may use it.

This is not about whether the device has been stopped or
not. We don't have other places to re-initialize the ring
descriptors and wrap_counter. So they need to be set to
the correct values when doing detach_unused_buf.

Best regards,
Tiwei Bie
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [RFC v2] virtio: support packed ring
From: Michael S. Tsirkin @ 2018-04-17  2:37 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, virtualization, wexu
In-Reply-To: <665c828e-6699-7688-cfea-b23b2b0f5fe3@redhat.com>

On Tue, Apr 17, 2018 at 10:24:32AM +0800, Jason Wang wrote:
> 
> 
> On 2018年04月17日 10:17, Michael S. Tsirkin wrote:
> > On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
> > > 
> > > On 2018年04月13日 15:15, Tiwei Bie wrote:
> > > > On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> > > > > On 2018年04月01日 22:12, Tiwei Bie wrote:
> > > > > > Hello everyone,
> > > > > > 
> > > > > > This RFC implements packed ring support for virtio driver.
> > > > > > 
> > > > > > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> > > > > > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> > > > > > Minor changes are needed for the vhost code, e.g. to kick the guest.
> > > > > > 
> > > > > > TODO:
> > > > > > - Refinements and bug fixes;
> > > > > > - Split into small patches;
> > > > > > - Test indirect descriptor support;
> > > > > > - Test/fix event suppression support;
> > > > > > - Test devices other than net;
> > > > > > 
> > > > > > RFC v1 -> RFC v2:
> > > > > > - Add indirect descriptor support - compile test only;
> > > > > > - Add event suppression supprt - compile test only;
> > > > > > - Move vring_packed_init() out of uapi (Jason, MST);
> > > > > > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > > > > > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > > > > > - Avoid using '%' operator (Jason);
> > > > > > - Rename free_head -> next_avail_idx (Jason);
> > > > > > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > > > > > - Some other refinements and bug fixes;
> > > > > > 
> > > > > > Thanks!
> > > > > > 
> > > > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > > > ---
> > > > > >     drivers/virtio/virtio_ring.c       | 1094 +++++++++++++++++++++++++++++-------
> > > > > >     include/linux/virtio_ring.h        |    8 +-
> > > > > >     include/uapi/linux/virtio_config.h |   12 +-
> > > > > >     include/uapi/linux/virtio_ring.h   |   61 ++
> > > > > >     4 files changed, 980 insertions(+), 195 deletions(-)
> > > > [...]
> 
> [...]
> 
> > > > > It looks to me we should examine RING_EVENT_FLAGS_DESC in desc_event_flags
> > > > > instead of vq->event here. Spec does not forces to use evenf_off and
> > > > > event_wrap if event index is enabled.
> > > > > 
> > > > > > +		// FIXME: fix this!
> > > > > > +		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
> > > > > > +			     vring_need_event(off_wrap & ~(1<<15), new, old);
> > > > > Why need a & here?
> > > > Because wrap_counter (the most significant bit in off_wrap)
> > > > isn't part of the index.
> > > > 
> > > > > > +	} else {
> > > > > Need a smp_rmb() to make sure desc_event_flags was checked before flags.
> > > > I don't get your point, if my understanding is correct,
> > > > desc_event_flags is vq->vring_packed.device->flags. So
> > > > what's the "flags"?
> > > Sorry, I mean we need check device.flags before off_warp. So it needs an
> > > smp_rmb() in the middle.
> > It's best to just read all flags atomically as u32.
> 
> Yes it is.
> 
> > 
> > > It looks to me there's no guarantee that
> > > VRING_EVENT_F_DESC is set if event index is supported.
> > > 
> > > > > > +		needs_kick = (vq->vring_packed.device->flags !=
> > > > > > +			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
> > > > > > +	}
> > > > > > +	END_USE(vq);
> > > > > > +	return needs_kick;
> > > > > > +}
> > > > [...]
> > > > > > +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > > > +			      void **ctx)
> > > > > > +{
> > > > > > +	struct vring_packed_desc *desc;
> > > > > > +	unsigned int i, j;
> > > > > > +
> > > > > > +	/* Clear data ptr. */
> > > > > > +	vq->desc_state[head].data = NULL;
> > > > > > +
> > > > > > +	i = head;
> > > > > > +
> > > > > > +	for (j = 0; j < vq->desc_state[head].num; j++) {
> > > > > > +		desc = &vq->vring_packed.desc[i];
> > > > > > +		vring_unmap_one_packed(vq, desc);
> > > > > > +		desc->flags = 0x0;
> > > > > Looks like this is unnecessary.
> > > > It's safer to zero it. If we don't zero it, after we
> > > > call virtqueue_detach_unused_buf_packed() which calls
> > > > this function, the desc is still available to the
> > > > device.
> > > Well detach_unused_buf_packed() should be called after device is stopped,
> > > otherwise even if you try to clear, there will still be a window that device
> > > may use it.
> > > 
> > > > > > +		i++;
> > > > > > +		if (i >= vq->vring_packed.num)
> > > > > > +			i = 0;
> > > > > > +	}
> > > > [...]
> > > > > > +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > > > > > +{
> > > > > > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > +	u16 last_used_idx, wrap_counter, off_wrap;
> > > > > > +
> > > > > > +	START_USE(vq);
> > > > > > +
> > > > > > +	last_used_idx = vq->last_used_idx;
> > > > > > +	wrap_counter = vq->wrap_counter;
> > > > > > +
> > > > > > +	if (last_used_idx > vq->next_avail_idx)
> > > > > > +		wrap_counter ^= 1;
> > > > > > +
> > > > > > +	off_wrap = last_used_idx | (wrap_counter << 15);
> > > > > > +
> > > > > > +	/* We optimistically turn back on interrupts, then check if there was
> > > > > > +	 * more to do. */
> > > > > > +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> > > > > > +	 * either clear the flags bit or point the event index at the next
> > > > > > +	 * entry. Always do both to keep code simple. */
> > > > > > +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > > > +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> > > > > > +						     VRING_EVENT_F_ENABLE;
> > > > > > +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > > > +							vq->event_flags_shadow);
> > > > > > +	}
> > > > > A smp_wmb() is missed here?
> > > > > 
> > > > > > +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
> > > > > And according to the spec, it looks to me write a VRING_EVENT_F_ENABLE is
> > > > > sufficient here.
> > > > I didn't think much when implementing the event suppression
> > > > for packed ring previously. After I saw your comments, I found
> > > > something new. Indeed, unlike the split ring, for the packed
> > > > ring, spec doesn't say we must use VRING_EVENT_F_DESC when
> > > > EVENT_IDX is negotiated. So do you think below thought is
> > > > right or makes sense?
> > > > 
> > > > - For virtqueue_enable_cb_prepare(), we just need to enable
> > > >     the ring by setting flags to VRING_EVENT_F_ENABLE in any
> > > >     case.
> > > > 
> > > > - We will try to use VRING_EVENT_F_DESC (if EVENT_IDX is
> > > >     negotiated) only when we want to delay the interrupts
> > > >     virtqueue_enable_cb_delayed().
> > > This looks good to me.
> > I suspect this will lead to extra interrupts if host is fast.
> > So I think for now we should always use VRING_EVENT_F_DESC
> > if EVENT_IDX is negotiated.
> 
> Right, so if this is true, maybe we'd better force this in the spec?
> 
> Thanks

I did consider this.

Why it is the way it is:

- if we allow DISABLE it seems nicer to allow ENABLE as well
for symmetry.

- ENABLE is handy for hardware offload devices
where kicks do not trigger an exit so not worth bother
with, but interrupts still slow the system down so
event index might be worth it.

> > 
> > VRING_EVENT_F_DISABLE makes more sense to me.
> > 
> 
> [...]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [PATCH] samples/bpf: correct comment in sock_example.c
From: Wang Sheng-Hui @ 2018-04-17  2:25 UTC (permalink / raw)
  To: ast, daniel, netdev

The program run against loopback interace "lo", not "eth0".
Correct the comment.

Signed-off-by: Wang Sheng-Hui <shhuiw@foxmail.com>
---
 samples/bpf/sock_example.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index 6fc6e193ef1b..33a637507c00 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -9,10 +9,10 @@
  *   if (value)
  *        (*(u64*)value) += 1;
  *
- * - attaches this program to eth0 raw socket
+ * - attaches this program to loopback interface "lo" raw socket
  *
  * - every second user space reads map[tcp], map[udp], map[icmp] to see
- *   how many packets of given protocol were seen on eth0
+ *   how many packets of given protocol were seen on "lo"
  */
 #include <stdio.h>
 #include <unistd.h>
-- 
2.11.0

^ permalink raw reply related

* Re: [RFC v2] virtio: support packed ring
From: Jason Wang @ 2018-04-17  2:24 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Tiwei Bie, wexu, virtualization, linux-kernel, netdev, jfreimann
In-Reply-To: <20180417051343-mutt-send-email-mst@kernel.org>



On 2018年04月17日 10:17, Michael S. Tsirkin wrote:
> On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
>>
>> On 2018年04月13日 15:15, Tiwei Bie wrote:
>>> On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
>>>> On 2018年04月01日 22:12, Tiwei Bie wrote:
>>>>> Hello everyone,
>>>>>
>>>>> This RFC implements packed ring support for virtio driver.
>>>>>
>>>>> The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
>>>>> by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
>>>>> Minor changes are needed for the vhost code, e.g. to kick the guest.
>>>>>
>>>>> TODO:
>>>>> - Refinements and bug fixes;
>>>>> - Split into small patches;
>>>>> - Test indirect descriptor support;
>>>>> - Test/fix event suppression support;
>>>>> - Test devices other than net;
>>>>>
>>>>> RFC v1 -> RFC v2:
>>>>> - Add indirect descriptor support - compile test only;
>>>>> - Add event suppression supprt - compile test only;
>>>>> - Move vring_packed_init() out of uapi (Jason, MST);
>>>>> - Merge two loops into one in virtqueue_add_packed() (Jason);
>>>>> - Split vring_unmap_one() for packed ring and split ring (Jason);
>>>>> - Avoid using '%' operator (Jason);
>>>>> - Rename free_head -> next_avail_idx (Jason);
>>>>> - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
>>>>> - Some other refinements and bug fixes;
>>>>>
>>>>> Thanks!
>>>>>
>>>>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>>>> ---
>>>>>     drivers/virtio/virtio_ring.c       | 1094 +++++++++++++++++++++++++++++-------
>>>>>     include/linux/virtio_ring.h        |    8 +-
>>>>>     include/uapi/linux/virtio_config.h |   12 +-
>>>>>     include/uapi/linux/virtio_ring.h   |   61 ++
>>>>>     4 files changed, 980 insertions(+), 195 deletions(-)
>>> [...]

[...]

>>>> It looks to me we should examine RING_EVENT_FLAGS_DESC in desc_event_flags
>>>> instead of vq->event here. Spec does not forces to use evenf_off and
>>>> event_wrap if event index is enabled.
>>>>
>>>>> +		// FIXME: fix this!
>>>>> +		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
>>>>> +			     vring_need_event(off_wrap & ~(1<<15), new, old);
>>>> Why need a & here?
>>> Because wrap_counter (the most significant bit in off_wrap)
>>> isn't part of the index.
>>>
>>>>> +	} else {
>>>> Need a smp_rmb() to make sure desc_event_flags was checked before flags.
>>> I don't get your point, if my understanding is correct,
>>> desc_event_flags is vq->vring_packed.device->flags. So
>>> what's the "flags"?
>> Sorry, I mean we need check device.flags before off_warp. So it needs an
>> smp_rmb() in the middle.
> It's best to just read all flags atomically as u32.

Yes it is.

>
>> It looks to me there's no guarantee that
>> VRING_EVENT_F_DESC is set if event index is supported.
>>
>>>>> +		needs_kick = (vq->vring_packed.device->flags !=
>>>>> +			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
>>>>> +	}
>>>>> +	END_USE(vq);
>>>>> +	return needs_kick;
>>>>> +}
>>> [...]
>>>>> +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>>>> +			      void **ctx)
>>>>> +{
>>>>> +	struct vring_packed_desc *desc;
>>>>> +	unsigned int i, j;
>>>>> +
>>>>> +	/* Clear data ptr. */
>>>>> +	vq->desc_state[head].data = NULL;
>>>>> +
>>>>> +	i = head;
>>>>> +
>>>>> +	for (j = 0; j < vq->desc_state[head].num; j++) {
>>>>> +		desc = &vq->vring_packed.desc[i];
>>>>> +		vring_unmap_one_packed(vq, desc);
>>>>> +		desc->flags = 0x0;
>>>> Looks like this is unnecessary.
>>> It's safer to zero it. If we don't zero it, after we
>>> call virtqueue_detach_unused_buf_packed() which calls
>>> this function, the desc is still available to the
>>> device.
>> Well detach_unused_buf_packed() should be called after device is stopped,
>> otherwise even if you try to clear, there will still be a window that device
>> may use it.
>>
>>>>> +		i++;
>>>>> +		if (i >= vq->vring_packed.num)
>>>>> +			i = 0;
>>>>> +	}
>>> [...]
>>>>> +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>>>>> +{
>>>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
>>>>> +	u16 last_used_idx, wrap_counter, off_wrap;
>>>>> +
>>>>> +	START_USE(vq);
>>>>> +
>>>>> +	last_used_idx = vq->last_used_idx;
>>>>> +	wrap_counter = vq->wrap_counter;
>>>>> +
>>>>> +	if (last_used_idx > vq->next_avail_idx)
>>>>> +		wrap_counter ^= 1;
>>>>> +
>>>>> +	off_wrap = last_used_idx | (wrap_counter << 15);
>>>>> +
>>>>> +	/* We optimistically turn back on interrupts, then check if there was
>>>>> +	 * more to do. */
>>>>> +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
>>>>> +	 * either clear the flags bit or point the event index at the next
>>>>> +	 * entry. Always do both to keep code simple. */
>>>>> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
>>>>> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
>>>>> +						     VRING_EVENT_F_ENABLE;
>>>>> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
>>>>> +							vq->event_flags_shadow);
>>>>> +	}
>>>> A smp_wmb() is missed here?
>>>>
>>>>> +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
>>>> And according to the spec, it looks to me write a VRING_EVENT_F_ENABLE is
>>>> sufficient here.
>>> I didn't think much when implementing the event suppression
>>> for packed ring previously. After I saw your comments, I found
>>> something new. Indeed, unlike the split ring, for the packed
>>> ring, spec doesn't say we must use VRING_EVENT_F_DESC when
>>> EVENT_IDX is negotiated. So do you think below thought is
>>> right or makes sense?
>>>
>>> - For virtqueue_enable_cb_prepare(), we just need to enable
>>>     the ring by setting flags to VRING_EVENT_F_ENABLE in any
>>>     case.
>>>
>>> - We will try to use VRING_EVENT_F_DESC (if EVENT_IDX is
>>>     negotiated) only when we want to delay the interrupts
>>>     virtqueue_enable_cb_delayed().
>> This looks good to me.
> I suspect this will lead to extra interrupts if host is fast.
> So I think for now we should always use VRING_EVENT_F_DESC
> if EVENT_IDX is negotiated.

Right, so if this is true, maybe we'd better force this in the spec?

Thanks

>
> VRING_EVENT_F_DISABLE makes more sense to me.
>

[...]

^ permalink raw reply

* Re: [RFC v2] virtio: support packed ring
From: Michael S. Tsirkin @ 2018-04-17  2:17 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, virtualization, wexu
In-Reply-To: <8dee7d62-ac0b-54ba-6bec-4bc4a6fb34e9@redhat.com>

On Tue, Apr 17, 2018 at 10:11:58AM +0800, Jason Wang wrote:
> 
> 
> On 2018年04月13日 15:15, Tiwei Bie wrote:
> > On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
> > > On 2018年04月01日 22:12, Tiwei Bie wrote:
> > > > Hello everyone,
> > > > 
> > > > This RFC implements packed ring support for virtio driver.
> > > > 
> > > > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> > > > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> > > > Minor changes are needed for the vhost code, e.g. to kick the guest.
> > > > 
> > > > TODO:
> > > > - Refinements and bug fixes;
> > > > - Split into small patches;
> > > > - Test indirect descriptor support;
> > > > - Test/fix event suppression support;
> > > > - Test devices other than net;
> > > > 
> > > > RFC v1 -> RFC v2:
> > > > - Add indirect descriptor support - compile test only;
> > > > - Add event suppression supprt - compile test only;
> > > > - Move vring_packed_init() out of uapi (Jason, MST);
> > > > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > > > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > > > - Avoid using '%' operator (Jason);
> > > > - Rename free_head -> next_avail_idx (Jason);
> > > > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > > > - Some other refinements and bug fixes;
> > > > 
> > > > Thanks!
> > > > 
> > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > ---
> > > >    drivers/virtio/virtio_ring.c       | 1094 +++++++++++++++++++++++++++++-------
> > > >    include/linux/virtio_ring.h        |    8 +-
> > > >    include/uapi/linux/virtio_config.h |   12 +-
> > > >    include/uapi/linux/virtio_ring.h   |   61 ++
> > > >    4 files changed, 980 insertions(+), 195 deletions(-)
> > [...]
> > > > +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> > > > +						       unsigned int total_sg,
> > > > +						       gfp_t gfp)
> > > > +{
> > > > +	struct vring_packed_desc *desc;
> > > > +
> > > > +	/*
> > > > +	 * We require lowmem mappings for the descriptors because
> > > > +	 * otherwise virt_to_phys will give us bogus addresses in the
> > > > +	 * virtqueue.
> > > > +	 */
> > > > +	gfp &= ~__GFP_HIGHMEM;
> > > > +
> > > > +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
> > > Can we simply check vq->packed here to avoid duplicating helpers?
> > Then it would be something like this:
> > 
> > static void *alloc_indirect(struct virtqueue *_vq, unsigned int total_sg,
> > 			    gfp_t gfp)
> > {
> > 	struct vring_virtqueue *vq = to_vvq(_vq);
> > 	void *data;
> > 
> > 	/*
> > 	 * We require lowmem mappings for the descriptors because
> > 	 * otherwise virt_to_phys will give us bogus addresses in the
> > 	 * virtqueue.
> > 	 */
> > 	gfp &= ~__GFP_HIGHMEM;
> > 
> > 	if (vq->packed) {
> > 		data = kmalloc(total_sg * sizeof(struct vring_packed_desc),
> > 				gfp);
> > 		if (!data)
> > 			return NULL;
> > 	} else {
> > 		struct vring_desc *desc;
> > 		unsigned int i;
> > 
> > 		desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
> > 		if (!desc)
> > 			return NULL;
> > 
> > 		for (i = 0; i < total_sg; i++)
> > 			desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
> > 
> > 		data = desc;
> > 	}
> > 
> > 	return data;
> > }
> > 
> > I would prefer to have two simpler helpers (and to the callers,
> > it's already very clear about which one they should call), i.e.
> > the current implementation:
> > 
> > static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> > 						       unsigned int total_sg,
> > 						       gfp_t gfp)
> > {
> > 	struct vring_packed_desc *desc;
> > 
> > 	/*
> > 	 * We require lowmem mappings for the descriptors because
> > 	 * otherwise virt_to_phys will give us bogus addresses in the
> > 	 * virtqueue.
> > 	 */
> > 	gfp &= ~__GFP_HIGHMEM;
> > 
> > 	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
> > 
> > 	return desc;
> > }
> > 
> > static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> > 					       unsigned int total_sg,
> > 					       gfp_t gfp)
> > {
> > 	struct vring_desc *desc;
> > 	unsigned int i;
> > 
> > 	/*
> > 	 * We require lowmem mappings for the descriptors because
> > 	 * otherwise virt_to_phys will give us bogus addresses in the
> > 	 * virtqueue.
> > 	 */
> > 	gfp &= ~__GFP_HIGHMEM;
> > 
> > 	desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
> > 	if (!desc)
> > 		return NULL;
> > 
> > 	for (i = 0; i < total_sg; i++)
> > 		desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
> > 	return desc;
> > }
> 
> Yeah, I miss that split version needs a desc list.
> 
> > 
> > > > +
> > > > +	return desc;
> > > > +}
> > [...]
> > > > +static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > > > +				       struct scatterlist *sgs[],
> > > > +				       unsigned int total_sg,
> > > > +				       unsigned int out_sgs,
> > > > +				       unsigned int in_sgs,
> > > > +				       void *data,
> > > > +				       void *ctx,
> > > > +				       gfp_t gfp)
> > > > +{
> > > > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > > > +	struct vring_packed_desc *desc;
> > > > +	struct scatterlist *sg;
> > > > +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> > > > +	__virtio16 uninitialized_var(head_flags), flags;
> > > > +	int head, wrap_counter;
> > > > +	bool indirect;
> > > > +
> > > > +	START_USE(vq);
> > > > +
> > > > +	BUG_ON(data == NULL);
> > > > +	BUG_ON(ctx && vq->indirect);
> > > > +
> > > > +	if (unlikely(vq->broken)) {
> > > > +		END_USE(vq);
> > > > +		return -EIO;
> > > > +	}
> > > > +
> > > > +#ifdef DEBUG
> > > > +	{
> > > > +		ktime_t now = ktime_get();
> > > > +
> > > > +		/* No kick or get, with .1 second between?  Warn. */
> > > > +		if (vq->last_add_time_valid)
> > > > +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> > > > +					    > 100);
> > > > +		vq->last_add_time = now;
> > > > +		vq->last_add_time_valid = true;
> > > > +	}
> > > > +#endif
> > > > +
> > > > +	BUG_ON(total_sg == 0);
> > > > +
> > > > +	head = vq->next_avail_idx;
> > > > +	wrap_counter = vq->wrap_counter;
> > > > +
> > > > +	/* If the host supports indirect descriptor tables, and we have multiple
> > > > +	 * buffers, then go indirect. FIXME: tune this threshold */
> > > > +	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> > > Let's introduce a helper like virtqueue_need_indirect() to avoid duplicating
> > > codes and FIXME.
> > Okay.
> > 
> > > > +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
> > > > +	else {
> > > > +		desc = NULL;
> > > > +		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
> > > > +	}
> > > > +
> > > > +	if (desc) {
> > > > +		/* Use a single buffer which doesn't continue */
> > > > +		indirect = true;
> > > > +		/* Set up rest to use this indirect table. */
> > > > +		i = 0;
> > > > +		descs_used = 1;
> > > > +	} else {
> > > > +		indirect = false;
> > > > +		desc = vq->vring_packed.desc;
> > > > +		i = head;
> > > > +		descs_used = total_sg;
> > > > +	}
> > > > +
> > > > +	if (vq->vq.num_free < descs_used) {
> > > > +		pr_debug("Can't add buf len %i - avail = %i\n",
> > > > +			 descs_used, vq->vq.num_free);
> > > > +		/* FIXME: for historical reasons, we force a notify here if
> > > > +		 * there are outgoing parts to the buffer.  Presumably the
> > > > +		 * host should service the ring ASAP. */
> > > > +		if (out_sgs)
> > > > +			vq->notify(&vq->vq);
> > > > +		if (indirect)
> > > > +			kfree(desc);
> > > > +		END_USE(vq);
> > > > +		return -ENOSPC;
> > > > +	}
> > > > +
> > > > +	for (n = 0; n < out_sgs + in_sgs; n++) {
> > > > +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > > > +			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> > > > +						DMA_TO_DEVICE : DMA_FROM_DEVICE);
> > > > +			if (vring_mapping_error(vq, addr))
> > > > +				goto unmap_release;
> > > > +
> > > > +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> > > > +					(n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
> > > > +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> > > > +					VRING_DESC_F_USED(!vq->wrap_counter));
> > > > +			if (!indirect && i == head)
> > > > +				head_flags = flags;
> > > > +			else
> > > > +				desc[i].flags = flags;
> > > > +
> > > > +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > > > +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > > > +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> > > Similar to V1, we only need this for the last descriptor.
> > Okay, will just set it for the last desc.
> > 
> > > > +			prev = i;
> > > It looks to me there's no need to track prev inside the loop here.
> > > 
> > > > +			i++;
> > > > +			if (!indirect && i >= vq->vring_packed.num) {
> > > > +				i = 0;
> > > > +				vq->wrap_counter ^= 1;
> > > > +			}
> > > > +		}
> > > > +	}
> > > > +	/* Last one doesn't continue. */
> > > > +	if (total_sg == 1)
> > > > +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > > > +	else
> > > > +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > > The only case when prev != i - 1 is i == 0, we can add a if here.
> > It's just a mirror of the existing implementation in split ring.
> > It seems that split ring implementation needs this just because
> > it's much harder for it to find the prev, which is not true for
> > packed ring. So I'll take your suggestion. Thanks!
> > 
> > [...]
> > > > +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> > > > +{
> > > > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > > > +	u16 new, old, off_wrap;
> > > > +	bool needs_kick;
> > > > +
> > > > +	START_USE(vq);
> > > > +	/* We need to expose the new flags value before checking notification
> > > > +	 * suppressions. */
> > > > +	virtio_mb(vq->weak_barriers);
> > > > +
> > > > +	old = vq->next_avail_idx - vq->num_added;
> > > > +	new = vq->next_avail_idx;
> > > > +	vq->num_added = 0;
> > > > +
> > > > +#ifdef DEBUG
> > > > +	if (vq->last_add_time_valid) {
> > > > +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> > > > +					      vq->last_add_time)) > 100);
> > > > +	}
> > > > +	vq->last_add_time_valid = false;
> > > > +#endif
> > > > +
> > > > +	off_wrap = virtio16_to_cpu(_vq->vdev, vq->vring_packed.device->off_wrap);
> > > > +
> > > > +	if (vq->event) {
> > > It looks to me we should examine RING_EVENT_FLAGS_DESC in desc_event_flags
> > > instead of vq->event here. Spec does not forces to use evenf_off and
> > > event_wrap if event index is enabled.
> > > 
> > > > +		// FIXME: fix this!
> > > > +		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
> > > > +			     vring_need_event(off_wrap & ~(1<<15), new, old);
> > > Why need a & here?
> > Because wrap_counter (the most significant bit in off_wrap)
> > isn't part of the index.
> > 
> > > > +	} else {
> > > Need a smp_rmb() to make sure desc_event_flags was checked before flags.
> > I don't get your point, if my understanding is correct,
> > desc_event_flags is vq->vring_packed.device->flags. So
> > what's the "flags"?
> 
> Sorry, I mean we need check device.flags before off_warp. So it needs an
> smp_rmb() in the middle.

It's best to just read all flags atomically as u32.

> It looks to me there's no guarantee that
> VRING_EVENT_F_DESC is set if event index is supported.
> 
> > 
> > > > +		needs_kick = (vq->vring_packed.device->flags !=
> > > > +			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
> > > > +	}
> > > > +	END_USE(vq);
> > > > +	return needs_kick;
> > > > +}
> > [...]
> > > > +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > +			      void **ctx)
> > > > +{
> > > > +	struct vring_packed_desc *desc;
> > > > +	unsigned int i, j;
> > > > +
> > > > +	/* Clear data ptr. */
> > > > +	vq->desc_state[head].data = NULL;
> > > > +
> > > > +	i = head;
> > > > +
> > > > +	for (j = 0; j < vq->desc_state[head].num; j++) {
> > > > +		desc = &vq->vring_packed.desc[i];
> > > > +		vring_unmap_one_packed(vq, desc);
> > > > +		desc->flags = 0x0;
> > > Looks like this is unnecessary.
> > It's safer to zero it. If we don't zero it, after we
> > call virtqueue_detach_unused_buf_packed() which calls
> > this function, the desc is still available to the
> > device.
> 
> Well detach_unused_buf_packed() should be called after device is stopped,
> otherwise even if you try to clear, there will still be a window that device
> may use it.
> 
> > 
> > > > +		i++;
> > > > +		if (i >= vq->vring_packed.num)
> > > > +			i = 0;
> > > > +	}
> > [...]
> > > > +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > > > +{
> > > > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > > > +	u16 last_used_idx, wrap_counter, off_wrap;
> > > > +
> > > > +	START_USE(vq);
> > > > +
> > > > +	last_used_idx = vq->last_used_idx;
> > > > +	wrap_counter = vq->wrap_counter;
> > > > +
> > > > +	if (last_used_idx > vq->next_avail_idx)
> > > > +		wrap_counter ^= 1;
> > > > +
> > > > +	off_wrap = last_used_idx | (wrap_counter << 15);
> > > > +
> > > > +	/* We optimistically turn back on interrupts, then check if there was
> > > > +	 * more to do. */
> > > > +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> > > > +	 * either clear the flags bit or point the event index at the next
> > > > +	 * entry. Always do both to keep code simple. */
> > > > +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > > > +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
> > > > +						     VRING_EVENT_F_ENABLE;
> > > > +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > > > +							vq->event_flags_shadow);
> > > > +	}
> > > A smp_wmb() is missed here?
> > > 
> > > > +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
> > > And according to the spec, it looks to me write a VRING_EVENT_F_ENABLE is
> > > sufficient here.
> > I didn't think much when implementing the event suppression
> > for packed ring previously. After I saw your comments, I found
> > something new. Indeed, unlike the split ring, for the packed
> > ring, spec doesn't say we must use VRING_EVENT_F_DESC when
> > EVENT_IDX is negotiated. So do you think below thought is
> > right or makes sense?
> > 
> > - For virtqueue_enable_cb_prepare(), we just need to enable
> >    the ring by setting flags to VRING_EVENT_F_ENABLE in any
> >    case.
> > 
> > - We will try to use VRING_EVENT_F_DESC (if EVENT_IDX is
> >    negotiated) only when we want to delay the interrupts
> >    virtqueue_enable_cb_delayed().
> 
> This looks good to me.

I suspect this will lead to extra interrupts if host is fast.
So I think for now we should always use VRING_EVENT_F_DESC
if EVENT_IDX is negotiated.

VRING_EVENT_F_DISABLE makes more sense to me.


> > 
> > > > +	END_USE(vq);
> > > > +	return last_used_idx;
> > > > +}
> > > > +
> > [...]
> > > > @@ -1157,14 +1852,18 @@ void vring_transport_features(struct virtio_device *vdev)
> > > >    	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
> > > >    		switch (i) {
> > > > -		case VIRTIO_RING_F_INDIRECT_DESC:
> > > > +#if 0
> > > > +		case VIRTIO_RING_F_INDIRECT_DESC: // FIXME not tested yet.
> > > >    			break;
> > > > -		case VIRTIO_RING_F_EVENT_IDX:
> > > > +		case VIRTIO_RING_F_EVENT_IDX: // FIXME probably not work.
> > > >    			break;
> > > > +#endif
> > > It would be better if you can split EVENT_IDX and INDIRECT_DESC into
> > > separate patches too.
> > Sure. Will do it in the next version.
> > 
> > Thanks for the review!
> 
> Thanks.
> 
> > > Thanks
> > > 
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net,stable] tun: fix vlan packet truncation
From: Jason Wang @ 2018-04-17  2:13 UTC (permalink / raw)
  To: Bjørn Mork, netdev
In-Reply-To: <20180416220038.21743-1-bjorn@mork.no>



On 2018年04月17日 06:00, Bjørn Mork wrote:
> Bogus trimming in tun_net_xmit() causes truncated vlan packets.
>
> skb->len is correct whether or not skb_vlan_tag_present() is true. There
> is no more reason to adjust the skb length on xmit in this driver than
> any other driver. tun_put_user() adds 4 bytes to the total for tagged
> packets because it transmits the tag inline to userspace.  This is
> similar to a nic transmitting the tag inline on the wire.
>
> Reproducing the bug by sending any tagged packet through back-to-back
> connected tap interfaces:
>
>   socat TUN,tun-type=tap,iff-up,tun-name=in TUN,tun-type=tap,iff-up,tun-name=out &
>   ip link add link in name in.20 type vlan id 20
>   ip addr add 10.9.9.9/24 dev in.20
>   ip link set in.20 up
>   tshark -nxxi in -f arp -c1 2>/dev/null &
>   tshark -nxxi out -f arp -c1 2>/dev/null &
>   ping -c 1 10.9.9.5 >/dev/null 2>&1
>
> The output from the 'in' and 'out' interfaces are different when the
> bug is present:
>
>   Capturing on 'in'
>   0000  ff ff ff ff ff ff 76 cf 76 37 d5 0a 81 00 00 14   ......v.v7......
>   0010  08 06 00 01 08 00 06 04 00 01 76 cf 76 37 d5 0a   ..........v.v7..
>   0020  0a 09 09 09 00 00 00 00 00 00 0a 09 09 05         ..............
>
>   Capturing on 'out'
>   0000  ff ff ff ff ff ff 76 cf 76 37 d5 0a 81 00 00 14   ......v.v7......
>   0010  08 06 00 01 08 00 06 04 00 01 76 cf 76 37 d5 0a   ..........v.v7..
>   0020  0a 09 09 09 00 00 00 00 00 00                     ..........
>
> Fixes: aff3d70a07ff ("tun: allow to attach ebpf socket filter")
> Cc: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Bjørn Mork <bjorn@mork.no>
> ---
>   drivers/net/tun.c | 7 -------
>   1 file changed, 7 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 28583aa0c17d..01cf8e3d8edc 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1103,13 +1103,6 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>   
>   	len = run_ebpf_filter(tun, skb, len);
>   
> -	/* Trim extra bytes since we may insert vlan proto & TCI
> -	 * in tun_put_user().
> -	 */
> -	len -= skb_vlan_tag_present(skb) ? sizeof(struct veth) : 0;
> -	if (len <= 0 || pskb_trim(skb, len))
> -		goto drop;
> -
>   	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
>   		goto drop;
>   

Good catch, but I still think we should do the truncation in 
run_ebpf_filter to match the behavior socket ebpf filter.

Thanks

^ permalink raw reply

* Re: [RFC v2] virtio: support packed ring
From: Jason Wang @ 2018-04-17  2:11 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: mst, wexu, virtualization, linux-kernel, netdev, jfreimann
In-Reply-To: <20180413071529.f4esh654dakodf4f@debian>



On 2018年04月13日 15:15, Tiwei Bie wrote:
> On Fri, Apr 13, 2018 at 12:30:24PM +0800, Jason Wang wrote:
>> On 2018年04月01日 22:12, Tiwei Bie wrote:
>>> Hello everyone,
>>>
>>> This RFC implements packed ring support for virtio driver.
>>>
>>> The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
>>> by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
>>> Minor changes are needed for the vhost code, e.g. to kick the guest.
>>>
>>> TODO:
>>> - Refinements and bug fixes;
>>> - Split into small patches;
>>> - Test indirect descriptor support;
>>> - Test/fix event suppression support;
>>> - Test devices other than net;
>>>
>>> RFC v1 -> RFC v2:
>>> - Add indirect descriptor support - compile test only;
>>> - Add event suppression supprt - compile test only;
>>> - Move vring_packed_init() out of uapi (Jason, MST);
>>> - Merge two loops into one in virtqueue_add_packed() (Jason);
>>> - Split vring_unmap_one() for packed ring and split ring (Jason);
>>> - Avoid using '%' operator (Jason);
>>> - Rename free_head -> next_avail_idx (Jason);
>>> - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
>>> - Some other refinements and bug fixes;
>>>
>>> Thanks!
>>>
>>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>> ---
>>>    drivers/virtio/virtio_ring.c       | 1094 +++++++++++++++++++++++++++++-------
>>>    include/linux/virtio_ring.h        |    8 +-
>>>    include/uapi/linux/virtio_config.h |   12 +-
>>>    include/uapi/linux/virtio_ring.h   |   61 ++
>>>    4 files changed, 980 insertions(+), 195 deletions(-)
> [...]
>>> +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
>>> +						       unsigned int total_sg,
>>> +						       gfp_t gfp)
>>> +{
>>> +	struct vring_packed_desc *desc;
>>> +
>>> +	/*
>>> +	 * We require lowmem mappings for the descriptors because
>>> +	 * otherwise virt_to_phys will give us bogus addresses in the
>>> +	 * virtqueue.
>>> +	 */
>>> +	gfp &= ~__GFP_HIGHMEM;
>>> +
>>> +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
>> Can we simply check vq->packed here to avoid duplicating helpers?
> Then it would be something like this:
>
> static void *alloc_indirect(struct virtqueue *_vq, unsigned int total_sg,
> 			    gfp_t gfp)
> {
> 	struct vring_virtqueue *vq = to_vvq(_vq);
> 	void *data;
>
> 	/*
> 	 * We require lowmem mappings for the descriptors because
> 	 * otherwise virt_to_phys will give us bogus addresses in the
> 	 * virtqueue.
> 	 */
> 	gfp &= ~__GFP_HIGHMEM;
>
> 	if (vq->packed) {
> 		data = kmalloc(total_sg * sizeof(struct vring_packed_desc),
> 				gfp);
> 		if (!data)
> 			return NULL;
> 	} else {
> 		struct vring_desc *desc;
> 		unsigned int i;
>
> 		desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
> 		if (!desc)
> 			return NULL;
>
> 		for (i = 0; i < total_sg; i++)
> 			desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
>
> 		data = desc;
> 	}
>
> 	return data;
> }
>
> I would prefer to have two simpler helpers (and to the callers,
> it's already very clear about which one they should call), i.e.
> the current implementation:
>
> static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> 						       unsigned int total_sg,
> 						       gfp_t gfp)
> {
> 	struct vring_packed_desc *desc;
>
> 	/*
> 	 * We require lowmem mappings for the descriptors because
> 	 * otherwise virt_to_phys will give us bogus addresses in the
> 	 * virtqueue.
> 	 */
> 	gfp &= ~__GFP_HIGHMEM;
>
> 	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
>
> 	return desc;
> }
>
> static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> 					       unsigned int total_sg,
> 					       gfp_t gfp)
> {
> 	struct vring_desc *desc;
> 	unsigned int i;
>
> 	/*
> 	 * We require lowmem mappings for the descriptors because
> 	 * otherwise virt_to_phys will give us bogus addresses in the
> 	 * virtqueue.
> 	 */
> 	gfp &= ~__GFP_HIGHMEM;
>
> 	desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
> 	if (!desc)
> 		return NULL;
>
> 	for (i = 0; i < total_sg; i++)
> 		desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
> 	return desc;
> }

Yeah, I miss that split version needs a desc list.

>
>>> +
>>> +	return desc;
>>> +}
> [...]
>>> +static inline int virtqueue_add_packed(struct virtqueue *_vq,
>>> +				       struct scatterlist *sgs[],
>>> +				       unsigned int total_sg,
>>> +				       unsigned int out_sgs,
>>> +				       unsigned int in_sgs,
>>> +				       void *data,
>>> +				       void *ctx,
>>> +				       gfp_t gfp)
>>> +{
>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
>>> +	struct vring_packed_desc *desc;
>>> +	struct scatterlist *sg;
>>> +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
>>> +	__virtio16 uninitialized_var(head_flags), flags;
>>> +	int head, wrap_counter;
>>> +	bool indirect;
>>> +
>>> +	START_USE(vq);
>>> +
>>> +	BUG_ON(data == NULL);
>>> +	BUG_ON(ctx && vq->indirect);
>>> +
>>> +	if (unlikely(vq->broken)) {
>>> +		END_USE(vq);
>>> +		return -EIO;
>>> +	}
>>> +
>>> +#ifdef DEBUG
>>> +	{
>>> +		ktime_t now = ktime_get();
>>> +
>>> +		/* No kick or get, with .1 second between?  Warn. */
>>> +		if (vq->last_add_time_valid)
>>> +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
>>> +					    > 100);
>>> +		vq->last_add_time = now;
>>> +		vq->last_add_time_valid = true;
>>> +	}
>>> +#endif
>>> +
>>> +	BUG_ON(total_sg == 0);
>>> +
>>> +	head = vq->next_avail_idx;
>>> +	wrap_counter = vq->wrap_counter;
>>> +
>>> +	/* If the host supports indirect descriptor tables, and we have multiple
>>> +	 * buffers, then go indirect. FIXME: tune this threshold */
>>> +	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
>> Let's introduce a helper like virtqueue_need_indirect() to avoid duplicating
>> codes and FIXME.
> Okay.
>
>>> +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
>>> +	else {
>>> +		desc = NULL;
>>> +		WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
>>> +	}
>>> +
>>> +	if (desc) {
>>> +		/* Use a single buffer which doesn't continue */
>>> +		indirect = true;
>>> +		/* Set up rest to use this indirect table. */
>>> +		i = 0;
>>> +		descs_used = 1;
>>> +	} else {
>>> +		indirect = false;
>>> +		desc = vq->vring_packed.desc;
>>> +		i = head;
>>> +		descs_used = total_sg;
>>> +	}
>>> +
>>> +	if (vq->vq.num_free < descs_used) {
>>> +		pr_debug("Can't add buf len %i - avail = %i\n",
>>> +			 descs_used, vq->vq.num_free);
>>> +		/* FIXME: for historical reasons, we force a notify here if
>>> +		 * there are outgoing parts to the buffer.  Presumably the
>>> +		 * host should service the ring ASAP. */
>>> +		if (out_sgs)
>>> +			vq->notify(&vq->vq);
>>> +		if (indirect)
>>> +			kfree(desc);
>>> +		END_USE(vq);
>>> +		return -ENOSPC;
>>> +	}
>>> +
>>> +	for (n = 0; n < out_sgs + in_sgs; n++) {
>>> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
>>> +			dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
>>> +						DMA_TO_DEVICE : DMA_FROM_DEVICE);
>>> +			if (vring_mapping_error(vq, addr))
>>> +				goto unmap_release;
>>> +
>>> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
>>> +					(n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
>>> +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
>>> +					VRING_DESC_F_USED(!vq->wrap_counter));
>>> +			if (!indirect && i == head)
>>> +				head_flags = flags;
>>> +			else
>>> +				desc[i].flags = flags;
>>> +
>>> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>>> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>>> +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>> Similar to V1, we only need this for the last descriptor.
> Okay, will just set it for the last desc.
>
>>> +			prev = i;
>> It looks to me there's no need to track prev inside the loop here.
>>
>>> +			i++;
>>> +			if (!indirect && i >= vq->vring_packed.num) {
>>> +				i = 0;
>>> +				vq->wrap_counter ^= 1;
>>> +			}
>>> +		}
>>> +	}
>>> +	/* Last one doesn't continue. */
>>> +	if (total_sg == 1)
>>> +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>>> +	else
>>> +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>> The only case when prev != i - 1 is i == 0, we can add a if here.
> It's just a mirror of the existing implementation in split ring.
> It seems that split ring implementation needs this just because
> it's much harder for it to find the prev, which is not true for
> packed ring. So I'll take your suggestion. Thanks!
>
> [...]
>>> +static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>>> +{
>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
>>> +	u16 new, old, off_wrap;
>>> +	bool needs_kick;
>>> +
>>> +	START_USE(vq);
>>> +	/* We need to expose the new flags value before checking notification
>>> +	 * suppressions. */
>>> +	virtio_mb(vq->weak_barriers);
>>> +
>>> +	old = vq->next_avail_idx - vq->num_added;
>>> +	new = vq->next_avail_idx;
>>> +	vq->num_added = 0;
>>> +
>>> +#ifdef DEBUG
>>> +	if (vq->last_add_time_valid) {
>>> +		WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
>>> +					      vq->last_add_time)) > 100);
>>> +	}
>>> +	vq->last_add_time_valid = false;
>>> +#endif
>>> +
>>> +	off_wrap = virtio16_to_cpu(_vq->vdev, vq->vring_packed.device->off_wrap);
>>> +
>>> +	if (vq->event) {
>> It looks to me we should examine RING_EVENT_FLAGS_DESC in desc_event_flags
>> instead of vq->event here. Spec does not forces to use evenf_off and
>> event_wrap if event index is enabled.
>>
>>> +		// FIXME: fix this!
>>> +		needs_kick = ((off_wrap >> 15) == vq->wrap_counter) &&
>>> +			     vring_need_event(off_wrap & ~(1<<15), new, old);
>> Why need a & here?
> Because wrap_counter (the most significant bit in off_wrap)
> isn't part of the index.
>
>>> +	} else {
>> Need a smp_rmb() to make sure desc_event_flags was checked before flags.
> I don't get your point, if my understanding is correct,
> desc_event_flags is vq->vring_packed.device->flags. So
> what's the "flags"?

Sorry, I mean we need check device.flags before off_warp. So it needs an 
smp_rmb() in the middle. It looks to me there's no guarantee that 
VRING_EVENT_F_DESC is set if event index is supported.

>
>>> +		needs_kick = (vq->vring_packed.device->flags !=
>>> +			      cpu_to_virtio16(_vq->vdev, VRING_EVENT_F_DISABLE));
>>> +	}
>>> +	END_USE(vq);
>>> +	return needs_kick;
>>> +}
> [...]
>>> +static int detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>> +			      void **ctx)
>>> +{
>>> +	struct vring_packed_desc *desc;
>>> +	unsigned int i, j;
>>> +
>>> +	/* Clear data ptr. */
>>> +	vq->desc_state[head].data = NULL;
>>> +
>>> +	i = head;
>>> +
>>> +	for (j = 0; j < vq->desc_state[head].num; j++) {
>>> +		desc = &vq->vring_packed.desc[i];
>>> +		vring_unmap_one_packed(vq, desc);
>>> +		desc->flags = 0x0;
>> Looks like this is unnecessary.
> It's safer to zero it. If we don't zero it, after we
> call virtqueue_detach_unused_buf_packed() which calls
> this function, the desc is still available to the
> device.

Well detach_unused_buf_packed() should be called after device is 
stopped, otherwise even if you try to clear, there will still be a 
window that device may use it.

>
>>> +		i++;
>>> +		if (i >= vq->vring_packed.num)
>>> +			i = 0;
>>> +	}
> [...]
>>> +static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>>> +{
>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
>>> +	u16 last_used_idx, wrap_counter, off_wrap;
>>> +
>>> +	START_USE(vq);
>>> +
>>> +	last_used_idx = vq->last_used_idx;
>>> +	wrap_counter = vq->wrap_counter;
>>> +
>>> +	if (last_used_idx > vq->next_avail_idx)
>>> +		wrap_counter ^= 1;
>>> +
>>> +	off_wrap = last_used_idx | (wrap_counter << 15);
>>> +
>>> +	/* We optimistically turn back on interrupts, then check if there was
>>> +	 * more to do. */
>>> +	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
>>> +	 * either clear the flags bit or point the event index at the next
>>> +	 * entry. Always do both to keep code simple. */
>>> +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
>>> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC:
>>> +						     VRING_EVENT_F_ENABLE;
>>> +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
>>> +							vq->event_flags_shadow);
>>> +	}
>> A smp_wmb() is missed here?
>>
>>> +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev, off_wrap);
>> And according to the spec, it looks to me write a VRING_EVENT_F_ENABLE is
>> sufficient here.
> I didn't think much when implementing the event suppression
> for packed ring previously. After I saw your comments, I found
> something new. Indeed, unlike the split ring, for the packed
> ring, spec doesn't say we must use VRING_EVENT_F_DESC when
> EVENT_IDX is negotiated. So do you think below thought is
> right or makes sense?
>
> - For virtqueue_enable_cb_prepare(), we just need to enable
>    the ring by setting flags to VRING_EVENT_F_ENABLE in any
>    case.
>
> - We will try to use VRING_EVENT_F_DESC (if EVENT_IDX is
>    negotiated) only when we want to delay the interrupts
>    virtqueue_enable_cb_delayed().

This looks good to me.

>
>>> +	END_USE(vq);
>>> +	return last_used_idx;
>>> +}
>>> +
> [...]
>>> @@ -1157,14 +1852,18 @@ void vring_transport_features(struct virtio_device *vdev)
>>>    	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
>>>    		switch (i) {
>>> -		case VIRTIO_RING_F_INDIRECT_DESC:
>>> +#if 0
>>> +		case VIRTIO_RING_F_INDIRECT_DESC: // FIXME not tested yet.
>>>    			break;
>>> -		case VIRTIO_RING_F_EVENT_IDX:
>>> +		case VIRTIO_RING_F_EVENT_IDX: // FIXME probably not work.
>>>    			break;
>>> +#endif
>> It would be better if you can split EVENT_IDX and INDIRECT_DESC into
>> separate patches too.
> Sure. Will do it in the next version.
>
> Thanks for the review!

Thanks.

>> Thanks
>>

^ permalink raw reply

* Re: SRIOV switchdev mode BoF minutes
From: Samudrala, Sridhar @ 2018-04-17  2:08 UTC (permalink / raw)
  To: Andy Gospodarek, Or Gerlitz
  Cc: David Miller, Anjali Singhai Jain, Michael Chan, Simon Horman,
	Jakub Kicinski, John Fastabend, Saeed Mahameed, Jiri Pirko,
	Rony Efraim, Linux Netdev List
In-Reply-To: <20180416123936.GH33938@C02RW35GFVH8.dhcp.broadcom.net>


On 4/16/2018 5:39 AM, Andy Gospodarek wrote:
> On Sun, Apr 15, 2018 at 09:01:16AM +0300, Or Gerlitz wrote:
>> On Sat, Apr 14, 2018 at 2:03 AM, Samudrala, Sridhar
>> <sridhar.samudrala@intel.com> wrote:
>>
>>> I meant between PFs on 2 compute nodes.
>> If the PF serves as uplink rep, it functions as  a switch port -- applications
>> don't run on switch ports. One way to get apps to run on the host in switchdev
>> mode is probe one of the VFs there.
>>
>>
>>
So once a pci device is configured in 'switchdev' mode,  only port representor netdevs are
seen on the host, no more PF netdev.

Are you going to expose another way to change sriov_num_vfs when the device is in
'switchdev' mode OR do we need to switch to 'legacy' mode to increase/decrease the number of
VFs?

Even in switchdev mode, i guess it will be possible for host apps to use the IP configured
on the uplink rep to talk externally.

In case of multiple uplinks, are you exposing one uplink-rep netdev per uplink?

^ permalink raw reply

* [PATCH v2 8/8] net: New ax88796 platform driver for Amiga X-Surf 100 Zorro board (m68k)
From: Michael Schmitz @ 2018-04-17  2:08 UTC (permalink / raw)
  To: netdev
  Cc: andrew, linux-m68k, Michael.Karcher, Michael Schmitz,
	Michael Karcher
In-Reply-To: <1523916285-6057-1-git-send-email-schmitzmic@gmail.com>

Add platform device driver to populate the ax88796 platform data from
information provided by the XSurf100 zorro device driver.
This driver will have to be loaded before loading the ax88796 module,
or compiled as built-in.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
---
 drivers/net/ethernet/8390/Kconfig    |   14 +-
 drivers/net/ethernet/8390/Makefile   |    1 +
 drivers/net/ethernet/8390/xsurf100.c |  411 ++++++++++++++++++++++++++++++++++
 3 files changed, 425 insertions(+), 1 deletions(-)
 create mode 100644 drivers/net/ethernet/8390/xsurf100.c

diff --git a/drivers/net/ethernet/8390/Kconfig b/drivers/net/ethernet/8390/Kconfig
index fdc6734..0cadd45 100644
--- a/drivers/net/ethernet/8390/Kconfig
+++ b/drivers/net/ethernet/8390/Kconfig
@@ -30,7 +30,7 @@ config PCMCIA_AXNET
 
 config AX88796
 	tristate "ASIX AX88796 NE2000 clone support"
-	depends on (ARM || MIPS || SUPERH)
+	depends on (ARM || MIPS || SUPERH || AMIGA)
 	select CRC32
 	select PHYLIB
 	select MDIO_BITBANG
@@ -45,6 +45,18 @@ config AX88796_93CX6
 	---help---
 	  Select this if your platform comes with an external 93CX6 eeprom.
 
+config XSURF100
+	tristate "Amiga XSurf 100 AX88796/NE2000 clone support"
+	depends on ZORRO
+	depends on AX88796
+	---help---
+	  This driver is for the Individual Computers X-Surf 100 Ethernet
+	  card (based on the Asix AX88796 chip). If you have such a card,
+	  say Y. Otherwise, say N.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called xsurf100.
+
 config HYDRA
 	tristate "Hydra support"
 	depends on ZORRO
diff --git a/drivers/net/ethernet/8390/Makefile b/drivers/net/ethernet/8390/Makefile
index f975c2f..3715f8d 100644
--- a/drivers/net/ethernet/8390/Makefile
+++ b/drivers/net/ethernet/8390/Makefile
@@ -16,4 +16,5 @@ obj-$(CONFIG_PCMCIA_PCNET) += pcnet_cs.o 8390.o
 obj-$(CONFIG_STNIC) += stnic.o 8390.o
 obj-$(CONFIG_ULTRA) += smc-ultra.o 8390.o
 obj-$(CONFIG_WD80x3) += wd.o 8390.o
+obj-$(CONFIG_XSURF100) += xsurf100.o
 obj-$(CONFIG_ZORRO8390) += zorro8390.o 8390.o
diff --git a/drivers/net/ethernet/8390/xsurf100.c b/drivers/net/ethernet/8390/xsurf100.c
new file mode 100644
index 0000000..3caece0
--- /dev/null
+++ b/drivers/net/ethernet/8390/xsurf100.c
@@ -0,0 +1,411 @@
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/zorro.h>
+#include <net/ax88796.h>
+#include <asm/amigaints.h>
+
+#define ZORRO_PROD_INDIVIDUAL_COMPUTERS_X_SURF100 \
+		ZORRO_ID(INDIVIDUAL_COMPUTERS, 0x64, 0)
+
+#define XS100_IRQSTATUS_BASE 0x40
+#define XS100_8390_BASE 0x800
+
+/* Longword-access area. Translated to 2 16-bit access cycles by the
+ * X-Surf 100 FPGA
+ */
+#define XS100_8390_DATA32_BASE 0x8000
+#define XS100_8390_DATA32_SIZE 0x2000
+/* Sub-Areas for fast data register access; addresses relative to area begin */
+#define XS100_8390_DATA_READ32_BASE 0x0880
+#define XS100_8390_DATA_WRITE32_BASE 0x0C80
+#define XS100_8390_DATA_AREA_SIZE 0x80
+
+#define __NS8390_init ax_NS8390_init
+
+/* force unsigned long back to 'void __iomem *' */
+#define ax_convert_addr(_a) ((void __force __iomem *)(_a))
+
+#define ei_inb(_a) z_readb(ax_convert_addr(_a))
+#define ei_outb(_v, _a) z_writeb(_v, ax_convert_addr(_a))
+
+#define ei_inw(_a) z_readw(ax_convert_addr(_a))
+#define ei_outw(_v, _a) z_writew(_v, ax_convert_addr(_a))
+
+#define ei_inb_p(_a) ei_inb(_a)
+#define ei_outb_p(_v, _a) ei_outb(_v, _a)
+
+/* define EI_SHIFT() to take into account our register offsets */
+#define EI_SHIFT(x) (ei_local->reg_offset[(x)])
+
+/* Ensure we have our RCR base value */
+#define AX88796_PLATFORM
+
+static unsigned char version[] =
+		"ax88796.c: Copyright 2005,2007 Simtec Electronics\n";
+
+#include "lib8390.c"
+
+/* from ne.c */
+#define NE_CMD		EI_SHIFT(0x00)
+#define NE_RESET	EI_SHIFT(0x1f)
+#define NE_DATAPORT	EI_SHIFT(0x10)
+
+/* Hard reset the card. This used to pause for the same period that a
+ * 8390 reset command required, but that shouldn't be necessary.
+ */
+static void ax_reset_8390(struct net_device *dev)
+{
+	struct ei_device *ei_local = netdev_priv(dev);
+	unsigned long reset_start_time = jiffies;
+	void __iomem *addr = (void __iomem *)dev->base_addr;
+
+	netif_dbg(ei_local, hw, dev, "resetting the 8390 t=%ld...\n", jiffies);
+
+	ei_outb(ei_inb(addr + NE_RESET), addr + NE_RESET);
+
+	ei_local->txing = 0;
+	ei_local->dmaing = 0;
+
+	/* This check _should_not_ be necessary, omit eventually. */
+	while ((ei_inb(addr + EN0_ISR) & ENISR_RESET) == 0) {
+		if (time_after(jiffies, reset_start_time + 2 * HZ / 100)) {
+			netdev_warn(dev, "%s: did not complete.\n", __func__);
+			break;
+		}
+	}
+
+	ei_outb(ENISR_RESET, addr + EN0_ISR);	/* Ack intr. */
+}
+
+struct xsurf100_ax_plat_data {
+	struct ax_plat_data ax;
+	void __iomem *base_regs;
+	void __iomem *data_area;
+};
+
+static int is_xsurf100_network_irq(struct platform_device *pdev)
+{
+	struct xsurf100_ax_plat_data *xs100 = dev_get_platdata(&pdev->dev);
+
+	return (readw(xs100->base_regs + XS100_IRQSTATUS_BASE) & 0xaaaa) != 0;
+}
+
+/* These functions guarantee that the iomem is accessed with 32 bit
+ * cycles only. z_memcpy_fromio / z_memcpy_toio don't
+ */
+static void z_memcpy_fromio32(void *dst, const void __iomem *src, size_t bytes)
+{
+	while (bytes > 32) {
+		asm __volatile__
+		   ("movem.l (%0)+,%%d0-%%d7\n"
+		    "movem.l %%d0-%%d7,(%1)\n"
+		    "adda.l #32,%1" : "=a"(src), "=a"(dst)
+		    : "0"(src), "1"(dst) : "d0", "d1", "d2", "d3", "d4",
+					   "d5", "d6", "d7", "memory");
+		bytes -= 32;
+	}
+	while (bytes) {
+		*(uint32_t *)dst = z_readl(src);
+		src += 4;
+		dst += 4;
+		bytes -= 4;
+	}
+}
+
+static void z_memcpy_toio32(void __iomem *dst, const void *src, size_t bytes)
+{
+	while (bytes) {
+		z_writel(*(const uint32_t *)src, dst);
+		src += 4;
+		dst += 4;
+		bytes -= 4;
+	}
+}
+
+static void xs100_write(struct net_device *dev, const void *src,
+			unsigned int count)
+{
+	struct ei_device *ei_local = netdev_priv(dev);
+	struct platform_device *pdev = to_platform_device(dev->dev.parent);
+	struct xsurf100_ax_plat_data *xs100 = dev_get_platdata(&pdev->dev);
+
+	/* copy whole blocks */
+	while (count > XS100_8390_DATA_AREA_SIZE) {
+		z_memcpy_toio32(xs100->data_area +
+				XS100_8390_DATA_WRITE32_BASE, src,
+				XS100_8390_DATA_AREA_SIZE);
+		src += XS100_8390_DATA_AREA_SIZE;
+		count -= XS100_8390_DATA_AREA_SIZE;
+	}
+	/* copy whole dwords */
+	z_memcpy_toio32(xs100->data_area + XS100_8390_DATA_WRITE32_BASE,
+			src, count & ~3);
+	src += count & ~3;
+	if (count & 2) {
+		ei_outw(*(uint16_t *)src, ei_local->mem + NE_DATAPORT);
+		src += 2;
+	}
+	if (count & 1)
+		ei_outb(*(uint8_t *)src, ei_local->mem + NE_DATAPORT);
+}
+
+static void xs100_read(struct net_device *dev, void *dst, unsigned int count)
+{
+	struct ei_device *ei_local = netdev_priv(dev);
+	struct platform_device *pdev = to_platform_device(dev->dev.parent);
+	struct xsurf100_ax_plat_data *xs100 = dev_get_platdata(&pdev->dev);
+
+	/* copy whole blocks */
+	while (count > XS100_8390_DATA_AREA_SIZE) {
+		z_memcpy_fromio32(dst, xs100->data_area +
+				  XS100_8390_DATA_READ32_BASE,
+				  XS100_8390_DATA_AREA_SIZE);
+		dst += XS100_8390_DATA_AREA_SIZE;
+		count -= XS100_8390_DATA_AREA_SIZE;
+	}
+	/* copy whole dwords */
+	z_memcpy_fromio32(dst, xs100->data_area + XS100_8390_DATA_READ32_BASE,
+			  count & ~3);
+	dst += count & ~3;
+	if (count & 2) {
+		*(uint16_t *)dst = ei_inw(ei_local->mem + NE_DATAPORT);
+		dst += 2;
+	}
+	if (count & 1)
+		*(uint8_t *)dst = ei_inb(ei_local->mem + NE_DATAPORT);
+}
+
+/* Block input and output, similar to the Crynwr packet driver. If
+ * you are porting to a new ethercard, look at the packet driver
+ * source for hints. The NEx000 doesn't share the on-board packet
+ * memory -- you have to put the packet out through the "remote DMA"
+ * dataport using ei_outb.
+ */
+static void xs100_block_input(struct net_device *dev, int count,
+			      struct sk_buff *skb, int ring_offset)
+{
+	struct ei_device *ei_local = netdev_priv(dev);
+	void __iomem *nic_base = ei_local->mem;
+	char *buf = skb->data;
+
+	if (ei_local->dmaing) {
+		netdev_err(dev,
+			   "DMAing conflict in %s "
+			   "[DMAstat:%d][irqlock:%d].\n",
+			   __func__,
+			   ei_local->dmaing, ei_local->irqlock);
+		return;
+	}
+
+	ei_local->dmaing |= 0x01;
+
+	ei_outb(E8390_NODMA + E8390_PAGE0 + E8390_START, nic_base + NE_CMD);
+	ei_outb(count & 0xff, nic_base + EN0_RCNTLO);
+	ei_outb(count >> 8, nic_base + EN0_RCNTHI);
+	ei_outb(ring_offset & 0xff, nic_base + EN0_RSARLO);
+	ei_outb(ring_offset >> 8, nic_base + EN0_RSARHI);
+	ei_outb(E8390_RREAD + E8390_START, nic_base + NE_CMD);
+
+	xs100_read(dev, buf, count);
+
+	ei_local->dmaing &= ~1;
+}
+
+static void xs100_block_output(struct net_device *dev, int count,
+			       const unsigned char *buf, const int start_page)
+{
+	struct ei_device *ei_local = netdev_priv(dev);
+	void __iomem *nic_base = ei_local->mem;
+	unsigned long dma_start;
+
+	/* Round the count up for word writes. Do we need to do this?
+	 * What effect will an odd byte count have on the 8390?  I
+	 * should check someday.
+	 */
+	if (ei_local->word16 && (count & 0x01))
+		count++;
+
+	/* This *shouldn't* happen. If it does, it's the last thing
+	 * you'll see
+	 */
+	if (ei_local->dmaing) {
+		netdev_err(dev, "DMAing conflict in %s "
+			"[DMAstat:%d][irqlock:%d]\n",
+			__func__,
+		       ei_local->dmaing, ei_local->irqlock);
+		return;
+	}
+
+	ei_local->dmaing |= 0x01;
+	/* We should already be in page 0, but to be safe... */
+	ei_outb(E8390_PAGE0 + E8390_START + E8390_NODMA, nic_base + NE_CMD);
+
+	ei_outb(ENISR_RDC, nic_base + EN0_ISR);
+
+	/* Now the normal output. */
+	ei_outb(count & 0xff, nic_base + EN0_RCNTLO);
+	ei_outb(count >> 8, nic_base + EN0_RCNTHI);
+	ei_outb(0x00, nic_base + EN0_RSARLO);
+	ei_outb(start_page, nic_base + EN0_RSARHI);
+
+	ei_outb(E8390_RWRITE + E8390_START, nic_base + NE_CMD);
+
+	xs100_write(dev, buf, count);
+
+	dma_start = jiffies;
+
+	while ((ei_inb(nic_base + EN0_ISR) & ENISR_RDC) == 0) {
+		if (jiffies - dma_start > 2 * HZ / 100) {	/* 20ms */
+			netdev_warn(dev, "timeout waiting for Tx RDC.\n");
+			ax_reset_8390(dev);
+			ax_NS8390_init(dev, 1);
+			break;
+		}
+	}
+
+	ei_outb(ENISR_RDC, nic_base + EN0_ISR);	/* Ack intr. */
+	ei_local->dmaing &= ~0x01;
+}
+
+static int xsurf100_probe(struct zorro_dev *zdev,
+			  const struct zorro_device_id *ent)
+{
+	struct platform_device *pdev;
+	struct xsurf100_ax_plat_data ax88796_data;
+	struct resource res[2] = {
+		DEFINE_RES_NAMED(IRQ_AMIGA_PORTS, 1, NULL,
+				 IORESOURCE_IRQ | IORESOURCE_IRQ_SHAREABLE),
+		DEFINE_RES_MEM(zdev->resource.start + XS100_8390_BASE,
+			       4 * 0x20)
+	};
+	int reg;
+	/* This table is referenced in the device structure, so it must
+	 * outlive the scope of xsurf100_probe.
+	 */
+	static u32 reg_offsets[32];
+	int ret = 0;
+
+	/* X-Surf 100 control and 32 bit ring buffer data access areas.
+	 * These resources are not used by the ax88796 driver, so must
+	 * be requested here and passed via platform data.
+	 */
+
+	if (!request_mem_region(zdev->resource.start, 0x100, zdev->name)) {
+		dev_err(&zdev->dev, "cannot reserve X-Surf 100 control registers\n");
+		return -ENXIO;
+	}
+
+	if (!request_mem_region(zdev->resource.start +
+				XS100_8390_DATA32_BASE,
+				XS100_8390_DATA32_SIZE,
+				"X-Surf 100 32-bit data access")) {
+		dev_err(&zdev->dev, "cannot reserve 32-bit area\n");
+		ret = -ENXIO;
+		goto exit_req;
+	}
+
+	for (reg = 0; reg < 0x20; reg++)
+		reg_offsets[reg] = 4 * reg;
+
+	memset(&ax88796_data, 0, sizeof(ax88796_data));
+	ax88796_data.ax.flags = AXFLG_HAS_EEPROM;
+	ax88796_data.ax.wordlength = 2;
+	ax88796_data.ax.dcr_val = 0x48;
+	ax88796_data.ax.rcr_val = 0x40;
+	ax88796_data.ax.reg_offsets = reg_offsets;
+	ax88796_data.ax.check_irq = is_xsurf100_network_irq;
+	ax88796_data.base_regs = ioremap(zdev->resource.start, 0x100);
+
+	/* error handling for ioremap regs */
+	if (!ax88796_data.base_regs) {
+		dev_err(&zdev->dev, "Cannot ioremap area %p (registers)\n",
+			(void *)zdev->resource.start);
+
+		ret = -ENXIO;
+		goto exit_req2;
+	}
+
+	ax88796_data.data_area = ioremap(zdev->resource.start +
+			XS100_8390_DATA32_BASE, XS100_8390_DATA32_SIZE);
+
+	/* error handling for ioremap data */
+	if (!ax88796_data.data_area) {
+		dev_err(&zdev->dev, "Cannot ioremap area %p (32-bit access)\n",
+			(void *)zdev->resource.start + XS100_8390_DATA32_BASE);
+
+		ret = -ENXIO;
+		goto exit_mem;
+	}
+
+	ax88796_data.ax.block_output = xs100_block_output;
+	ax88796_data.ax.block_input = xs100_block_input;
+
+	pdev = platform_device_register_resndata(&zdev->dev, "ax88796",
+						 zdev->slotaddr, res, 2,
+						 &ax88796_data,
+						 sizeof(ax88796_data));
+
+	if (IS_ERR(pdev)) {
+		dev_err(&zdev->dev, "cannot register platform device\n");
+		ret = -ENXIO;
+		goto exit_mem2;
+	}
+
+	zorro_set_drvdata(zdev, pdev);
+
+	if (!ret)
+		return 0;
+
+ exit_mem2:
+	iounmap(ax88796_data.data_area);
+
+ exit_mem:
+	iounmap(ax88796_data.base_regs);
+
+ exit_req2:
+	release_mem_region(zdev->resource.start + XS100_8390_DATA32_BASE,
+			   XS100_8390_DATA32_SIZE);
+
+ exit_req:
+	release_mem_region(zdev->resource.start, 0x100);
+
+	return ret;
+}
+
+static void xsurf100_remove(struct zorro_dev *zdev)
+{
+	struct platform_device *pdev;
+	struct xsurf100_ax_plat_data *xs100;
+
+	pdev = zorro_get_drvdata(zdev);
+	xs100 = dev_get_platdata(&pdev->dev);
+
+	platform_device_unregister(pdev);
+
+	iounmap(xs100->base_regs);
+	release_mem_region(zdev->resource.start, 0x100);
+	iounmap(xs100->data_area);
+	release_mem_region(zdev->resource.start + XS100_8390_DATA32_BASE,
+			   XS100_8390_DATA32_SIZE);
+}
+
+static const struct zorro_device_id xsurf100_zorro_tbl[] = {
+	{ ZORRO_PROD_INDIVIDUAL_COMPUTERS_X_SURF100, },
+	{ 0 }
+};
+
+MODULE_DEVICE_TABLE(zorro, xsurf100_zorro_tbl);
+
+static struct zorro_driver xsurf100_driver = {
+	.name           = "xsurf100",
+	.id_table       = xsurf100_zorro_tbl,
+	.probe          = xsurf100_probe,
+	.remove         = xsurf100_remove,
+};
+
+module_driver(xsurf100_driver, zorro_register_driver, zorro_unregister_driver);
+
+MODULE_DESCRIPTION("X-Surf 100 driver");
+MODULE_AUTHOR("Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>");
+MODULE_LICENSE("GPL v2");
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH v2 6/8] net: ax88796: set IRQF_SHARED flag when IRQ resource is marked as shareable
From: Michael Schmitz @ 2018-04-17  2:08 UTC (permalink / raw)
  To: netdev
  Cc: andrew, linux-m68k, Michael.Karcher, John Paul Adrian Glaubitz,
	Michael Karcher, Michael Schmitz
In-Reply-To: <1523916285-6057-1-git-send-email-schmitzmic@gmail.com>

From: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>

On the Amiga X-Surf100, the network card interrupt is shared with many
other interrupt sources, so requires the IRQF_SHARED flag to register.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
---
 drivers/net/ethernet/8390/ax88796.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index c799441..a72dfbc 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -875,6 +875,9 @@ static int ax_probe(struct platform_device *pdev)
 	dev->irq = irq->start;
 	ax->irqflags = irq->flags & IRQF_TRIGGER_MASK;
 
+	if (irq->flags &  IORESOURCE_IRQ_SHAREABLE)
+		ax->irqflags |= IRQF_SHARED;
+
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!mem) {
 		dev_err(&pdev->dev, "no MEM specified\n");
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH v2 7/8] net: ax88796: release platform device drvdata on probe error and module remove
From: Michael Schmitz @ 2018-04-17  2:08 UTC (permalink / raw)
  To: netdev; +Cc: andrew, linux-m68k, Michael.Karcher, Michael Schmitz
In-Reply-To: <1523916285-6057-1-git-send-email-schmitzmic@gmail.com>

The net device struct pointer is stored as platform device drvdata on
module probe - clear the drvdata entry on probe fail there, as well as
when unloading the module.

Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
---
 drivers/net/ethernet/8390/ax88796.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index a72dfbc..eb72282 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -829,6 +829,7 @@ static int ax_remove(struct platform_device *pdev)
 		release_mem_region(mem->start, resource_size(mem));
 	}
 
+	platform_set_drvdata(pdev, NULL);
 	free_netdev(dev);
 
 	return 0;
@@ -962,6 +963,7 @@ static int ax_probe(struct platform_device *pdev)
 	release_mem_region(mem->start, mem_size);
 
  exit_mem:
+	platform_set_drvdata(pdev, NULL);
 	free_netdev(dev);
 
 	return ret;
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH v2 5/8] net: ax88796: add interrupt status callback to platform data
From: Michael Schmitz @ 2018-04-17  2:08 UTC (permalink / raw)
  To: netdev
  Cc: andrew, linux-m68k, Michael.Karcher, Michael Schmitz,
	Michael Karcher
In-Reply-To: <1523916285-6057-1-git-send-email-schmitzmic@gmail.com>

To be able to tell the ax88796 driver whether it is sensible to enter
the 8390 interrupt handler, an "is this interrupt caused by the 88796"
callback has been added to the ax_plat_data structure (with NULL being
compatible to the previous behaviour).

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
---
 drivers/net/ethernet/8390/ax88796.c |   23 +++++++++++++++++++++--
 include/net/ax88796.h               |    5 +++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index 29cde38..c799441 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -165,6 +165,21 @@ static void ax_reset_8390(struct net_device *dev)
 	ei_outb(ENISR_RESET, addr + EN0_ISR);	/* Ack intr. */
 }
 
+/* Wrapper for __ei_interrupt for platforms that have a platform-specific
+ * way to find out whether the interrupt request might be caused by
+ * the ax88796 chip.
+ */
+static irqreturn_t ax_ei_interrupt_filtered(int irq, void *dev_id)
+{
+	struct net_device *dev = dev_id;
+	struct ax_device *ax = to_ax_dev(dev);
+	struct platform_device *pdev = to_platform_device(dev->dev.parent);
+
+	if (!ax->plat->check_irq(pdev))
+		return IRQ_NONE;
+
+	return ax_ei_interrupt(irq, dev_id);
+}
 
 static void ax_get_8390_hdr(struct net_device *dev, struct e8390_pkt_hdr *hdr,
 			    int ring_page)
@@ -484,8 +499,12 @@ static int ax_open(struct net_device *dev)
 	if (ret)
 		goto failed_mii;
 
-	ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags,
-			  dev->name, dev);
+	if (ax->plat->check_irq)
+		ret = request_irq(dev->irq, ax_ei_interrupt_filtered,
+				  ax->irqflags, dev->name, dev);
+	else
+		ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags,
+				  dev->name, dev);
 	if (ret)
 		goto failed_request_irq;
 
diff --git a/include/net/ax88796.h b/include/net/ax88796.h
index 26cc459..26412cd 100644
--- a/include/net/ax88796.h
+++ b/include/net/ax88796.h
@@ -12,6 +12,7 @@
 #define __NET_AX88796_PLAT_H
 
 struct net_device;
+struct platform_device;
 
 #define AXFLG_HAS_EEPROM		(1<<0)
 #define AXFLG_MAC_FROMDEV		(1<<1)	/* device already has MAC */
@@ -33,6 +34,10 @@ struct ax_plat_data {
 			const unsigned char *buf, int star_page);
 	void (*block_input)(struct net_device *dev, int count,
 			struct sk_buff *skb, int ring_offset);
+	/* returns nonzero if a pending interrupt request might by caused by
+	 * the ax88786. Handles all interrupts if set to NULL
+	 */
+	int (*check_irq)(struct platform_device *pdev);
 };
 
 #endif /* __NET_AX88796_PLAT_H */
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH v2 1/8] net: ax88796: Fix MAC address reading
From: Michael Schmitz @ 2018-04-17  2:08 UTC (permalink / raw)
  To: netdev
  Cc: andrew, linux-m68k, Michael.Karcher, Michael Karcher,
	Michael Karcher, Michael Schmitz
In-Reply-To: <1523916285-6057-1-git-send-email-schmitzmic@gmail.com>

From: Michael Karcher <debian@mkarcher.dialup.fu-berlin.de>

To read the MAC address from the (virtual) SAprom, the remote DMA
unit needs to be set up like for every other process access to card-local
memory.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
---
 drivers/net/ethernet/8390/ax88796.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index 2455547..2a256aa 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -671,10 +671,16 @@ static int ax_init_dev(struct net_device *dev)
 	if (ax->plat->flags & AXFLG_HAS_EEPROM) {
 		unsigned char SA_prom[32];
 
+		ei_outb(6, ioaddr + EN0_RCNTLO);
+		ei_outb(0, ioaddr + EN0_RCNTHI);
+		ei_outb(0, ioaddr + EN0_RSARLO);
+		ei_outb(0, ioaddr + EN0_RSARHI);
+		ei_outb(E8390_RREAD + E8390_START, ioaddr + NE_CMD);
 		for (i = 0; i < sizeof(SA_prom); i += 2) {
 			SA_prom[i] = ei_inb(ioaddr + NE_DATAPORT);
 			SA_prom[i + 1] = ei_inb(ioaddr + NE_DATAPORT);
 		}
+		ei_outb(ENISR_RDC, ioaddr + EN0_ISR);	/* Ack intr. */
 
 		if (ax->plat->wordlength == 2)
 			for (i = 0; i < 16; i++)
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH v2 4/8] net: ax88796: Add block_input/output hooks to ax_plat_data
From: Michael Schmitz @ 2018-04-17  2:08 UTC (permalink / raw)
  To: netdev
  Cc: andrew, linux-m68k, Michael.Karcher, Michael Schmitz,
	Michael Karcher
In-Reply-To: <1523916285-6057-1-git-send-email-schmitzmic@gmail.com>

Add platform specific hooks for block transfer reads/writes of packet
buffer data, superseding the default provided ax_block_input/output.
Currently used for m68k Amiga XSurf100.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
---
 drivers/net/ethernet/8390/ax88796.c |   10 ++++++++--
 include/net/ax88796.h               |    9 ++++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index ecf104c..29cde38 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -760,8 +760,14 @@ static int ax_init_dev(struct net_device *dev)
 #endif
 
 	ei_local->reset_8390 = &ax_reset_8390;
-	ei_local->block_input = &ax_block_input;
-	ei_local->block_output = &ax_block_output;
+	if (ax->plat->block_input)
+		ei_local->block_input = ax->plat->block_input;
+	else
+		ei_local->block_input = &ax_block_input;
+	if (ax->plat->block_output)
+		ei_local->block_output = ax->plat->block_output;
+	else
+		ei_local->block_output = &ax_block_output;
 	ei_local->get_8390_hdr = &ax_get_8390_hdr;
 	ei_local->priv = 0;
 	ei_local->msg_enable = ax_msg_enable;
diff --git a/include/net/ax88796.h b/include/net/ax88796.h
index b9a3bec..26cc459 100644
--- a/include/net/ax88796.h
+++ b/include/net/ax88796.h
@@ -8,10 +8,11 @@
  * published by the Free Software Foundation.
  *
 */
-
 #ifndef __NET_AX88796_PLAT_H
 #define __NET_AX88796_PLAT_H
 
+struct net_device;
+
 #define AXFLG_HAS_EEPROM		(1<<0)
 #define AXFLG_MAC_FROMDEV		(1<<1)	/* device already has MAC */
 #define AXFLG_HAS_93CX6			(1<<2)	/* use eeprom_93cx6 driver */
@@ -26,6 +27,12 @@ struct ax_plat_data {
 	u32		*reg_offsets;	/* register offsets */
 	u8		*mac_addr;	/* MAC addr (only used when
 					   AXFLG_MAC_FROMPLATFORM is used */
+
+	/* uses default ax88796 buffer if set to NULL */
+	void (*block_output)(struct net_device *dev, int count,
+			const unsigned char *buf, int star_page);
+	void (*block_input)(struct net_device *dev, int count,
+			struct sk_buff *skb, int ring_offset);
 };
 
 #endif /* __NET_AX88796_PLAT_H */
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH v2 2/8] net: ax88796: Attach MII bus only when open
From: Michael Schmitz @ 2018-04-17  2:08 UTC (permalink / raw)
  To: netdev
  Cc: andrew, linux-m68k, Michael.Karcher, Michael Karcher,
	Michael Karcher, Michael Schmitz
In-Reply-To: <1523916285-6057-1-git-send-email-schmitzmic@gmail.com>

From: Michael Karcher <debian@mkarcher.dialup.fu-berlin.de>

Call ax_mii_init in ax_open(), and unregister/remove mdiobus resources
in ax_close().

This is needed to be able to unload the module, as the module is busy
while the MII bus is attached.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
---
 drivers/net/ethernet/8390/ax88796.c |  183 ++++++++++++++++++-----------------
 1 files changed, 95 insertions(+), 88 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index 2a256aa..83e59ae 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -389,6 +389,90 @@ static void ax_phy_switch(struct net_device *dev, int on)
 	ei_outb(reg_gpoc, ei_local->mem + EI_SHIFT(0x17));
 }
 
+static void ax_bb_mdc(struct mdiobb_ctrl *ctrl, int level)
+{
+	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
+
+	if (level)
+		ax->reg_memr |= AX_MEMR_MDC;
+	else
+		ax->reg_memr &= ~AX_MEMR_MDC;
+
+	ei_outb(ax->reg_memr, ax->addr_memr);
+}
+
+static void ax_bb_dir(struct mdiobb_ctrl *ctrl, int output)
+{
+	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
+
+	if (output)
+		ax->reg_memr &= ~AX_MEMR_MDIR;
+	else
+		ax->reg_memr |= AX_MEMR_MDIR;
+
+	ei_outb(ax->reg_memr, ax->addr_memr);
+}
+
+static void ax_bb_set_data(struct mdiobb_ctrl *ctrl, int value)
+{
+	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
+
+	if (value)
+		ax->reg_memr |= AX_MEMR_MDO;
+	else
+		ax->reg_memr &= ~AX_MEMR_MDO;
+
+	ei_outb(ax->reg_memr, ax->addr_memr);
+}
+
+static int ax_bb_get_data(struct mdiobb_ctrl *ctrl)
+{
+	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
+	int reg_memr = ei_inb(ax->addr_memr);
+
+	return reg_memr & AX_MEMR_MDI ? 1 : 0;
+}
+
+static const struct mdiobb_ops bb_ops = {
+	.owner = THIS_MODULE,
+	.set_mdc = ax_bb_mdc,
+	.set_mdio_dir = ax_bb_dir,
+	.set_mdio_data = ax_bb_set_data,
+	.get_mdio_data = ax_bb_get_data,
+};
+
+static int ax_mii_init(struct net_device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev->dev.parent);
+	struct ei_device *ei_local = netdev_priv(dev);
+	struct ax_device *ax = to_ax_dev(dev);
+	int err;
+
+	ax->bb_ctrl.ops = &bb_ops;
+	ax->addr_memr = ei_local->mem + AX_MEMR;
+	ax->mii_bus = alloc_mdio_bitbang(&ax->bb_ctrl);
+	if (!ax->mii_bus) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	ax->mii_bus->name = "ax88796_mii_bus";
+	ax->mii_bus->parent = dev->dev.parent;
+	snprintf(ax->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x",
+		pdev->name, pdev->id);
+
+	err = mdiobus_register(ax->mii_bus);
+	if (err)
+		goto out_free_mdio_bitbang;
+
+	return 0;
+
+ out_free_mdio_bitbang:
+	free_mdio_bitbang(ax->mii_bus);
+ out:
+	return err;
+}
+
 static int ax_open(struct net_device *dev)
 {
 	struct ax_device *ax = to_ax_dev(dev);
@@ -396,6 +480,10 @@ static int ax_open(struct net_device *dev)
 
 	netdev_dbg(dev, "open\n");
 
+	ret = ax_mii_init(dev);
+	if (ret)
+		goto failed_mii;
+
 	ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags,
 			  dev->name, dev);
 	if (ret)
@@ -423,6 +511,10 @@ static int ax_open(struct net_device *dev)
 	ax_phy_switch(dev, 0);
 	free_irq(dev->irq, dev);
  failed_request_irq:
+	/* unregister mdiobus */
+	mdiobus_unregister(ax->mii_bus);
+	free_mdio_bitbang(ax->mii_bus);
+ failed_mii:
 	return ret;
 }
 
@@ -442,6 +534,9 @@ static int ax_close(struct net_device *dev)
 	phy_disconnect(dev->phydev);
 
 	free_irq(dev->irq, dev);
+
+	mdiobus_unregister(ax->mii_bus);
+	free_mdio_bitbang(ax->mii_bus);
 	return 0;
 }
 
@@ -541,92 +636,8 @@ static void ax_eeprom_register_write(struct eeprom_93cx6 *eeprom)
 #endif
 };
 
-static void ax_bb_mdc(struct mdiobb_ctrl *ctrl, int level)
-{
-	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
-
-	if (level)
-		ax->reg_memr |= AX_MEMR_MDC;
-	else
-		ax->reg_memr &= ~AX_MEMR_MDC;
-
-	ei_outb(ax->reg_memr, ax->addr_memr);
-}
-
-static void ax_bb_dir(struct mdiobb_ctrl *ctrl, int output)
-{
-	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
-
-	if (output)
-		ax->reg_memr &= ~AX_MEMR_MDIR;
-	else
-		ax->reg_memr |= AX_MEMR_MDIR;
-
-	ei_outb(ax->reg_memr, ax->addr_memr);
-}
-
-static void ax_bb_set_data(struct mdiobb_ctrl *ctrl, int value)
-{
-	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
-
-	if (value)
-		ax->reg_memr |= AX_MEMR_MDO;
-	else
-		ax->reg_memr &= ~AX_MEMR_MDO;
-
-	ei_outb(ax->reg_memr, ax->addr_memr);
-}
-
-static int ax_bb_get_data(struct mdiobb_ctrl *ctrl)
-{
-	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
-	int reg_memr = ei_inb(ax->addr_memr);
-
-	return reg_memr & AX_MEMR_MDI ? 1 : 0;
-}
-
-static const struct mdiobb_ops bb_ops = {
-	.owner = THIS_MODULE,
-	.set_mdc = ax_bb_mdc,
-	.set_mdio_dir = ax_bb_dir,
-	.set_mdio_data = ax_bb_set_data,
-	.get_mdio_data = ax_bb_get_data,
-};
-
 /* setup code */
 
-static int ax_mii_init(struct net_device *dev)
-{
-	struct platform_device *pdev = to_platform_device(dev->dev.parent);
-	struct ei_device *ei_local = netdev_priv(dev);
-	struct ax_device *ax = to_ax_dev(dev);
-	int err;
-
-	ax->bb_ctrl.ops = &bb_ops;
-	ax->addr_memr = ei_local->mem + AX_MEMR;
-	ax->mii_bus = alloc_mdio_bitbang(&ax->bb_ctrl);
-	if (!ax->mii_bus) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	ax->mii_bus->name = "ax88796_mii_bus";
-	ax->mii_bus->parent = dev->dev.parent;
-	snprintf(ax->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x",
-		pdev->name, pdev->id);
-
-	err = mdiobus_register(ax->mii_bus);
-	if (err)
-		goto out_free_mdio_bitbang;
-
-	return 0;
-
- out_free_mdio_bitbang:
-	free_mdio_bitbang(ax->mii_bus);
- out:
-	return err;
-}
-
 static void ax_initial_setup(struct net_device *dev, struct ei_device *ei_local)
 {
 	void __iomem *ioaddr = ei_local->mem;
@@ -758,10 +769,6 @@ static int ax_init_dev(struct net_device *dev)
 	dev->netdev_ops = &ax_netdev_ops;
 	dev->ethtool_ops = &ax_ethtool_ops;
 
-	ret = ax_mii_init(dev);
-	if (ret)
-		goto err_out;
-
 	ax_NS8390_init(dev, 0);
 
 	ret = register_netdev(dev);
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH v2 0/8] New network driver for Amiga X-Surf 100 (m68k)
From: Michael Schmitz @ 2018-04-17  2:08 UTC (permalink / raw)
  To: netdev; +Cc: andrew, linux-m68k, Michael.Karcher
In-Reply-To: <1523916285-6057-1-git-send-email-schmitzmic@gmail.com>

This patch series adds support for the Individual Computers X-Surf 100
network card for m68k Amiga, a network adapter based on the AX88796 chip set.

The driver was originally written for kernel version 3.19 by Michael Karcher
(see CC:), and adapted to 4.16 for submission to netdev by me. Questions 
regarding motivation for some of the changes are probably best directed at
Michael Karcher.

The driver has been tested by Adrian <glaubitz@physik.fu-berlin.de> who will
send his Tested-by tag separately.

A few changes to the ax88796 driver were required:
- to read the MAC address, some setup of the ax99796 chip must be done,
- attach to the MII bus only on device open to allow module unloading,
- allow to supersede ax_block_input/ax_block_output by card-specific
  optimized code,
- use an optional interrupt status callback to allow easier sharing of the
  card interrupt,
- set IRQF_SHARED if platform IRQ resource is marked shareable,

Some additional cleanup:
- do not attempt to free IRQ in ax_remove (complements 82533ad9a1c),
- clear platform drvdata on probe fail and module remove.

Changes since v1:

Raised in review by Andrew Lunn:
- move MII code around to avoid need for forward declaration
- combine patches 2 and 7 to add cleanup in error path

The patch series, in order:

1/8 net: ax88796: Fix MAC address reading
2/8 net: ax88796: Attach MII bus only when open
3/8 net: ax88796: Do not free IRQ in ax_remove() (already freed in ax_close()).
4/8 net: ax88796: Add block_input/output hooks to ax_plat_data
5/8 net: ax88796: add interrupt status callback to platform data
6/8 net: ax88796: set IRQF_SHARED flag when IRQ resource is marked as shareable
7/8 net: ax88796: release platform device drvdata on probe error and module remove
8/8 net: New ax88796 platform driver for Amiga X-Surf 100 Zorro board (m68k)

 drivers/net/ethernet/8390/Kconfig    |   14 +-
 drivers/net/ethernet/8390/Makefile   |    1 +
 drivers/net/ethernet/8390/ax88796.c  |  228 +++++++++++--------
 drivers/net/ethernet/8390/xsurf100.c |  411 ++++++++++++++++++++++++++++++++++
 include/net/ax88796.h                |   14 +-
 5 files changed, 573 insertions(+), 95 deletions(-)

Cheers,

  Michael

^ permalink raw reply

* [PATCH v2 3/8] net: ax88796: Do not free IRQ in ax_remove() (already freed in ax_close()).
From: Michael Schmitz @ 2018-04-17  2:08 UTC (permalink / raw)
  To: netdev
  Cc: andrew, linux-m68k, Michael.Karcher, John Paul Adrian Glaubitz,
	Michael Karcher
In-Reply-To: <1523916285-6057-1-git-send-email-schmitzmic@gmail.com>

From: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>

This complements the fix in 82533ad9a1c that removed the free_irq
call in the error path of probe, to also not call free_irq when
remove is called to revert the effects of probe.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
---
 drivers/net/ethernet/8390/ax88796.c |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index 83e59ae..ecf104c 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -793,7 +793,6 @@ static int ax_remove(struct platform_device *pdev)
 	struct resource *mem;
 
 	unregister_netdev(dev);
-	free_irq(dev->irq, dev);
 
 	iounmap(ei_local->mem);
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-- 
1.7.0.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox