Netdev List

Netdev List
 help / color / mirror / Atom feed

* [patch iproute2-next v2 2/2] devlink: add support for network namespace change
From: Jiri Pirko @ 2019-07-30  8:59 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, sthemmin, dsahern, mlxsw
In-Reply-To: <20190730085734.31504-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 devlink/devlink.c            | 54 +++++++++++++++++++++++++++++++++++-
 include/uapi/linux/devlink.h |  4 +++
 man/man8/devlink-dev.8       | 12 ++++++++
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 9242cc05ad0c..a39bd8771d8b 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -235,6 +235,7 @@ static void ifname_map_free(struct ifname_map *ifname_map)
 #define DL_OPT_HEALTH_REPORTER_NAME	BIT(27)
 #define DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD	BIT(27)
 #define DL_OPT_HEALTH_REPORTER_AUTO_RECOVER	BIT(28)
+#define DL_OPT_NETNS	BIT(29)
 
 struct dl_opts {
 	uint32_t present; /* flags of present items */
@@ -271,6 +272,8 @@ struct dl_opts {
 	const char *reporter_name;
 	uint64_t reporter_graceful_period;
 	bool reporter_auto_recover;
+	bool netns_is_pid;
+	uint32_t netns;
 };
 
 struct dl {
@@ -1331,6 +1334,22 @@ static int dl_argv_parse(struct dl *dl, uint32_t o_required,
 			if (err)
 				return err;
 			o_found |= DL_OPT_HEALTH_REPORTER_AUTO_RECOVER;
+		} else if (dl_argv_match(dl, "netns") &&
+			(o_all & DL_OPT_NETNS)) {
+			const char *netns_str;
+
+			dl_arg_inc(dl);
+			err = dl_argv_str(dl, &netns_str);
+			if (err)
+				return err;
+			opts->netns = netns_get_fd(netns_str);
+			if (opts->netns < 0) {
+				err = dl_argv_uint32_t(dl, &opts->netns);
+				if (err)
+					return err;
+				opts->netns_is_pid = true;
+			}
+			o_found |= DL_OPT_NETNS;
 		} else {
 			pr_err("Unknown option \"%s\"\n", dl_argv(dl));
 			return -EINVAL;
@@ -1444,7 +1463,11 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl)
 	if (opts->present & DL_OPT_HEALTH_REPORTER_AUTO_RECOVER)
 		mnl_attr_put_u8(nlh, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
 				opts->reporter_auto_recover);
-
+	if (opts->present & DL_OPT_NETNS)
+		mnl_attr_put_u32(nlh,
+				 opts->netns_is_pid ? DEVLINK_ATTR_NETNS_PID :
+						      DEVLINK_ATTR_NETNS_FD,
+				 opts->netns);
 }
 
 static int dl_argv_parse_put(struct nlmsghdr *nlh, struct dl *dl,
@@ -1499,6 +1522,7 @@ static bool dl_dump_filter(struct dl *dl, struct nlattr **tb)
 static void cmd_dev_help(void)
 {
 	pr_err("Usage: devlink dev show [ DEV ]\n");
+	pr_err("       devlink dev set DEV netns { PID | NAME | ID }\n");
 	pr_err("       devlink dev eswitch set DEV [ mode { legacy | switchdev } ]\n");
 	pr_err("                               [ inline-mode { none | link | network | transport } ]\n");
 	pr_err("                               [ encap { disable | enable } ]\n");
@@ -2551,6 +2575,31 @@ static int cmd_dev_show(struct dl *dl)
 	return err;
 }
 
+static void cmd_dev_set_help(void)
+{
+	pr_err("Usage: devlink dev set DEV netns { PID | NAME | ID }\n");
+}
+
+static int cmd_dev_set(struct dl *dl)
+{
+	struct nlmsghdr *nlh;
+	int err;
+
+	if (dl_argv_match(dl, "help") || dl_no_arg(dl)) {
+		cmd_dev_set_help();
+		return 0;
+	}
+
+	nlh = mnlg_msg_prepare(dl->nlg, DEVLINK_CMD_SET,
+			       NLM_F_REQUEST | NLM_F_ACK);
+
+	err = dl_argv_parse_put(nlh, dl, DL_OPT_HANDLE, DL_OPT_NETNS);
+	if (err)
+		return err;
+
+	return _mnlg_socket_sndrcv(dl->nlg, nlh, NULL, NULL);
+}
+
 static void cmd_dev_reload_help(void)
 {
 	pr_err("Usage: devlink dev reload [ DEV ]\n");
@@ -2747,6 +2796,9 @@ static int cmd_dev(struct dl *dl)
 		   dl_argv_match(dl, "list") || dl_no_arg(dl)) {
 		dl_arg_inc(dl);
 		return cmd_dev_show(dl);
+	} else if (dl_argv_match(dl, "set")) {
+		dl_arg_inc(dl);
+		return cmd_dev_set(dl);
 	} else if (dl_argv_match(dl, "eswitch")) {
 		dl_arg_inc(dl);
 		return cmd_dev_eswitch(dl);
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index fc195cbd66f4..bc1869993e20 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -348,6 +348,10 @@ enum devlink_attr {
 	DEVLINK_ATTR_PORT_PCI_PF_NUMBER,	/* u16 */
 	DEVLINK_ATTR_PORT_PCI_VF_NUMBER,	/* u16 */
 
+	DEVLINK_ATTR_NETNS_FD,			/* u32 */
+	DEVLINK_ATTR_NETNS_PID,			/* u32 */
+	DEVLINK_ATTR_NETNS_ID,			/* u32 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/man/man8/devlink-dev.8 b/man/man8/devlink-dev.8
index 1804463b2321..0e1a5523fa7b 100644
--- a/man/man8/devlink-dev.8
+++ b/man/man8/devlink-dev.8
@@ -25,6 +25,13 @@ devlink-dev \- devlink device configuration
 .ti -8
 .B devlink dev help
 
+.ti -8
+.BR "devlink dev set"
+.IR DEV
+.RI "[ "
+.BI "netns { " PID " | " NAME " | " ID " }
+.RI "]"
+
 .ti -8
 .BR "devlink dev eswitch set"
 .IR DEV
@@ -92,6 +99,11 @@ Format is:
 .in +2
 BUS_NAME/BUS_ADDRESS
 
+.SS devlink dev set  - sets devlink device attributes
+
+.TP
+.BI "netns { " PID " | " NAME " | " ID " }
+
 .SS devlink dev eswitch show - display devlink device eswitch attributes
 .SS devlink dev eswitch set  - sets devlink device eswitch attributes
 
-- 
2.21.0


^ permalink raw reply related

* RE: [PATCH] net/socket: fix GCC8+ Wpacked-not-aligned warnings
From: David Laight @ 2019-07-30  9:01 UTC (permalink / raw)
  To: 'Qian Cai', davem@davemloft.net
  Cc: vyasevich@gmail.com, nhorman@tuxdriver.com,
	marcelo.leitner@gmail.com, linux-sctp@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <1564431838-23051-1-git-send-email-cai@lca.pw>

From: Qian Cai
> Sent: 29 July 2019 21:24
..
> To fix this, "struct sockaddr_storage" needs to be aligned to 4-byte as
> it is only used in those packed sctp structure which is part of UAPI,
> and "struct __kernel_sockaddr_storage" is used in some other
> places of UAPI that need not to change alignments in order to not
> breaking userspace.
> 
> One option is use typedef between "sockaddr_storage" and
> "__kernel_sockaddr_storage" but it needs to change 35 or 370 lines of
> codes. The other option is to duplicate this simple 2-field structure to
> have a separate "struct sockaddr_storage" so it can use a different
> alignment than "__kernel_sockaddr_storage". Also the structure seems
> stable enough, so it will be low-maintenance to update two structures in
> the future in case of changes.

Does it all work if the 8 byte alignment is implicit, not explicit?
For instance if unnamed union and struct are used eg:

struct sockaddr_storage {
	union {
		void * __ptr;  /* Force alignment */
		struct {
			__kernel_sa_family_t	ss_family;		/* address family */
			/* Following field(s) are implementation specific */
			char	__data[_K_SS_MAXSIZE - sizeof(unsigned short)];
					/* space to achieve desired size, */
					/* _SS_MAXSIZE value minus size of ss_family */
		};
	};
};

I suspect unnamed unions and structs have to be allowed by the compiler.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply

* Re: [PATCH 0/5] staging: fsl-dpaa2/ethsw: add the .ndo_fdb_dump callback
From: Ioana Ciornei @ 2019-07-30  9:13 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: gregkh@linuxfoundation.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, davem@davemloft.net, f.fainelli@gmail.com,
	jiri@mellanox.com
In-Reply-To: <20190729163506.GJ4110@lunn.ch>

On 7/29/19 7:35 PM, Andrew Lunn wrote:
> On Mon, Jul 29, 2019 at 07:11:47PM +0300, Ioana Ciornei wrote:
>> This patch set adds some features and small fixes in the
>> FDB table manipulation area.
>>
>> First of all, we implement the .ndo_fdb_dump netdev callback so that all
>> offloaded FDB entries, either static or learnt, are available to the user.
>> This is necessary because the DPAA2 switch does not emit interrupts when a
>> new FDB is learnt or deleted, thus we are not able to keep the software
>> bridge state and the HW in sync by calling the switchdev notifiers.
>>
>> The patch set also adds the .ndo_fdb_[add|del] callbacks in order to
>> facilitate adding FDB entries not associated with any master device.
>>
>> One interesting thing that I observed is that when adding an FDB entry
>> associated with a bridge (ie using the 'master' keywork appended to the
>> bridge command) and then dumping the FDB entries, there will be duplicates
>> of the same entry: one listed by the bridge device and one by the
>> driver's .ndo_fdb_dump).
>> It raises the question whether this is the expected behavior or not.
> 
> DSA devices are the same, they don't provide an interrupt when a new
> entry is added by the hardware. So we can have two entries, or just
> the SW bridge entry, or just the HW entry, depending on ageing.
>

This also happens when dealing with static entries (not just dynamic 
ones that can be affected by ageing). All in all, the basic actions of 
adding/deleting entries and then dumping them works. It was just a 
question about switchdev's architecture.


>> Another concern is regarding the correct/desired machanism for drivers to
>> signal errors back to switchdev on adding or deleting an FDB entry.
>> In the switchdev documentation, there is a TODO in the place of this topic.
> 
> It used to be a two state prepare/commit transaction, but that was
> changed a while back.
> 
> Maybe the DSA core code can give you ideas?
>

I looked in the DSA core before sending these patches out and it's doing 
the exact same thing as ethsw - even though it notifies switchdev if the 
entry could be offloaded (ie no error) all entries will still be present 
in the 'bridge fdb' output. In the SWITCHDEV_FDB_DEL_TO_DEVICE case, it 
seems that it just closes the netdev without any further action.

On the other hand, the mlxsw_spectrum also calls the notifiers when an 
offloaded entry is deleted (on SWITCHDEV_FDB_DEL_TO_DEVICE). This seems 
like a reasonable thing to do, maybe in another patch set.

Ioana C


^ permalink raw reply

* Re: [PATCH v4 1/5] vsock/virtio: limit the memory used per-socket
From: Stefano Garzarella @ 2019-07-30  9:35 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, linux-kernel, Stefan Hajnoczi, David S. Miller,
	virtualization, Jason Wang, kvm
In-Reply-To: <20190729143622-mutt-send-email-mst@kernel.org>

On Mon, Jul 29, 2019 at 03:10:15PM -0400, Michael S. Tsirkin wrote:
> On Mon, Jul 29, 2019 at 06:50:56PM +0200, Stefano Garzarella wrote:
> > On Mon, Jul 29, 2019 at 06:19:03PM +0200, Stefano Garzarella wrote:
> > > On Mon, Jul 29, 2019 at 11:49:02AM -0400, Michael S. Tsirkin wrote:
> > > > On Mon, Jul 29, 2019 at 05:36:56PM +0200, Stefano Garzarella wrote:
> > > > > On Mon, Jul 29, 2019 at 10:04:29AM -0400, Michael S. Tsirkin wrote:
> > > > > > On Wed, Jul 17, 2019 at 01:30:26PM +0200, Stefano Garzarella wrote:
> > > > > > > Since virtio-vsock was introduced, the buffers filled by the host
> > > > > > > and pushed to the guest using the vring, are directly queued in
> > > > > > > a per-socket list. These buffers are preallocated by the guest
> > > > > > > with a fixed size (4 KB).
> > > > > > > 
> > > > > > > The maximum amount of memory used by each socket should be
> > > > > > > controlled by the credit mechanism.
> > > > > > > The default credit available per-socket is 256 KB, but if we use
> > > > > > > only 1 byte per packet, the guest can queue up to 262144 of 4 KB
> > > > > > > buffers, using up to 1 GB of memory per-socket. In addition, the
> > > > > > > guest will continue to fill the vring with new 4 KB free buffers
> > > > > > > to avoid starvation of other sockets.
> > > > > > > 
> > > > > > > This patch mitigates this issue copying the payload of small
> > > > > > > packets (< 128 bytes) into the buffer of last packet queued, in
> > > > > > > order to avoid wasting memory.
> > > > > > > 
> > > > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > > > 
> > > > > > This is good enough for net-next, but for net I think we
> > > > > > should figure out how to address the issue completely.
> > > > > > Can we make the accounting precise? What happens to
> > > > > > performance if we do?
> > > > > > 
> > > > > 
> > > > > In order to do more precise accounting maybe we can use the buffer size,
> > > > > instead of payload size when we update the credit available.
> > > > > In this way, the credit available for each socket will reflect the memory
> > > > > actually used.
> > > > > 
> > > > > I should check better, because I'm not sure what happen if the peer sees
> > > > > 1KB of space available, then it sends 1KB of payload (using a 4KB
> > > > > buffer).
> > > > > 
> > > > > The other option is to copy each packet in a new buffer like I did in
> > > > > the v2 [2], but this forces us to make a copy for each packet that does
> > > > > not fill the entire buffer, perhaps too expensive.
> > > > > 
> > > > > [2] https://patchwork.kernel.org/patch/10938741/
> > > > > 
> > > > > 
> > > > > Thanks,
> > > > > Stefano
> > > > 
> > > > Interesting. You are right, and at some level the protocol forces copies.
> > > > 
> > > > We could try to detect that the actual memory is getting close to
> > > > admin limits and force copies on queued packets after the fact.
> > > > Is that practical?
> > > 
> > > Yes, I think it is doable!
> > > We can decrease the credit available with the buffer size queued, and
> > > when the buffer size of packet to queue is bigger than the credit
> > > available, we can copy it.
> > > 
> > > > 
> > > > And yes we can extend the credit accounting to include buffer size.
> > > > That's a protocol change but maybe it makes sense.
> > > 
> > > Since we send to the other peer the credit available, maybe this
> > > change can be backwards compatible (I'll check better this).
> > 
> > What I said was wrong.
> > 
> > We send a counter (increased when the user consumes the packets) and the
> > "buf_alloc" (the max memory allowed) to the other peer.
> > It makes a difference between a local counter (increased when the
> > packets are sent) and the remote counter to calculate the credit available:
> > 
> >     u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
> >     {
> >     	u32 ret;
> > 
> >     	spin_lock_bh(&vvs->tx_lock);
> >     	ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
> >     	if (ret > credit)
> >     		ret = credit;
> >     	vvs->tx_cnt += ret;
> >     	spin_unlock_bh(&vvs->tx_lock);
> > 
> >     	return ret;
> >     }
> > 
> > Maybe I can play with "buf_alloc" to take care of bytes queued but not
> > used.
> > 
> > Thanks,
> > Stefano
> 
> Right. And the idea behind it all was that if we send a credit
> to remote then we have space for it.

Yes.

> I think the basic idea was that if we have actual allocated
> memory and can copy data there, then we send the credit to
> remote.
> 
> Of course that means an extra copy every packet.
> So as an optimization, it seems that we just assume
> that we will be able to allocate a new buffer.

Yes, we refill the virtqueue when half of the buffers were used.

> 
> First this is not the best we can do. We can actually do
> allocate memory in the socket before sending credit.

In this case, IIUC we should allocate an entire buffer (4KB),
so we can reuse it if the packet is big.

> If packet is small then we copy it there.
> If packet is big then we queue the packet,
> take the buffer out of socket and add it to the virtqueue.
> 
> Second question is what to do about medium sized packets.
> Packet is 1K but buffer is 4K, what do we do?
> And here I wonder - why don't we add the 3K buffer
> to the vq?

This would allow us to have an accurate credit account.

The problem here is the compatibility. Before this series virtio-vsock
and vhost-vsock modules had the RX buffer size hard-coded
(VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
of 4K, there might be issues.

Maybe it is the time to add add 'features' to virtio-vsock device.

Thanks,
Stefano

^ permalink raw reply

* Re: [PATCH] rtw88: pci: Use general byte arrays as the elements of RX ring
From: Stanislaw Gruszka @ 2019-07-30  9:35 UTC (permalink / raw)
  To: Jian-Hong Pan
  Cc: Yan-Hsuan Chuang, Kalle Valo, David S . Miller, David Laight,
	linux-wireless, netdev, linux-kernel, linux, stable
In-Reply-To: <20190725080925.6575-1-jian-hong@endlessm.com>

On Thu, Jul 25, 2019 at 04:09:26PM +0800, Jian-Hong Pan wrote:
> Each skb as the element in RX ring was expected with sized buffer 8216
> (RTK_PCI_RX_BUF_SIZE) bytes. However, the skb buffer's true size is
> 16640 bytes for alignment after allocated, x86_64 for example. And, the

rtw88 advertise IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454, so maximum AMSDU
packet can be approximately 12kB. This might be accidental, but having
16kB skb's allow to handle such big AMSDUs. If you shrink buf size,
you can probably override memory after buffer end.

> difference will be enlarged 512 times (RTK_MAX_RX_DESC_NUM).
> To prevent that much wasted memory, this patch follows David's
> suggestion [1] and uses general buffer arrays, instead of skbs as the
> elements in RX ring.
> 
> [1] https://www.spinics.net/lists/linux-wireless/msg187870.html
> 
> Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
> Cc: <stable@vger.kernel.org>

This does not fix any serious problem, it actually most likely 
introduce memory corruption problem described above. Should not
be targeted to stable anyway.

> -			dev_kfree_skb_any(skb);
> +			devm_kfree(rtwdev->dev, buf);

For what this is needed? devm_ allocations are used exactly to avoid
manual freeing.

> +		len = pkt_stat.pkt_len + pkt_offset;
> +		skb = dev_alloc_skb(len);
> +		if (WARN_ONCE(!skb, "rx routine starvation\n"))
>  			goto next_rp;
>  
>  		/* put the DMA data including rx_desc from phy to new skb */
> -		skb_put_data(new, skb->data, new_len);
> +		skb_put_data(skb, rx_desc, len);

Coping big packets it quite inefficient. What drivers usually do is 
copy only for small packets and for big ones allocate new rx buf
(drop packet alloc if fail) and pas old buf to network stack via
skb_add_rx_frag(). See iwlmvm as example.

Stanislaw

^ permalink raw reply

* KMSAN: uninit-value in read_eprom_word
From: syzbot @ 2019-07-30  9:38 UTC (permalink / raw)
  To: davem, glider, linux-kernel, linux-usb, netdev, petkan,
	syzkaller-bugs

Hello,

syzbot found the following crash on:

HEAD commit:    3351e2b9 usb-fuzzer: main usb gadget fuzzer driver
git tree:       kmsan
console output: https://syzkaller.appspot.com/x/log.txt?x=13105d85a00000
kernel config:  https://syzkaller.appspot.com/x/.config?x=40511ad0c5945201
dashboard link: https://syzkaller.appspot.com/bug?extid=3499a83b2d062ae409d4
compiler:       clang version 9.0.0 (/home/glider/llvm/clang  
80fee25776c2fb61e74c1ecb1a523375c2500b69)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=1257755ea00000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1327e5a5a00000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+3499a83b2d062ae409d4@syzkaller.appspotmail.com

usb 1-1: Using ep0 maxpacket: 8
usb 1-1: config 0 has an invalid interface number: 150 but max is 0
usb 1-1: config 0 has no interface number 0
usb 1-1: New USB device found, idVendor=050d, idProduct=0122,  
bcdDevice=c1.69
usb 1-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0
usb 1-1: config 0 descriptor??
==================================================================
BUG: KMSAN: uninit-value in read_eprom_word+0x947/0xdd0  
drivers/net/usb/pegasus.c:298
CPU: 0 PID: 12 Comm: kworker/0:1 Not tainted 5.2.0-rc4+ #5
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Workqueue: usb_hub_wq hub_event
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x191/0x1f0 lib/dump_stack.c:113
  kmsan_report+0x162/0x2d0 mm/kmsan/kmsan.c:611
  __msan_warning+0x75/0xe0 mm/kmsan/kmsan_instr.c:304
  read_eprom_word+0x947/0xdd0 drivers/net/usb/pegasus.c:298
  get_interrupt_interval drivers/net/usb/pegasus.c:758 [inline]
  pegasus_probe+0xf2b/0x4be0 drivers/net/usb/pegasus.c:1192
  usb_probe_interface+0xd19/0x1310 drivers/usb/core/driver.c:361
  really_probe+0x1344/0x1d90 drivers/base/dd.c:513
  driver_probe_device+0x1ba/0x510 drivers/base/dd.c:670
  __device_attach_driver+0x5b8/0x790 drivers/base/dd.c:777
  bus_for_each_drv+0x28e/0x3b0 drivers/base/bus.c:454
  __device_attach+0x489/0x750 drivers/base/dd.c:843
  device_initial_probe+0x4a/0x60 drivers/base/dd.c:890
  bus_probe_device+0x131/0x390 drivers/base/bus.c:514
  device_add+0x25b5/0x2df0 drivers/base/core.c:2111
  usb_set_configuration+0x309f/0x3710 drivers/usb/core/message.c:2027
  generic_probe+0xe7/0x280 drivers/usb/core/generic.c:210
  usb_probe_device+0x146/0x200 drivers/usb/core/driver.c:266
  really_probe+0x1344/0x1d90 drivers/base/dd.c:513
  driver_probe_device+0x1ba/0x510 drivers/base/dd.c:670
  __device_attach_driver+0x5b8/0x790 drivers/base/dd.c:777
  bus_for_each_drv+0x28e/0x3b0 drivers/base/bus.c:454
  __device_attach+0x489/0x750 drivers/base/dd.c:843
  device_initial_probe+0x4a/0x60 drivers/base/dd.c:890
  bus_probe_device+0x131/0x390 drivers/base/bus.c:514
  device_add+0x25b5/0x2df0 drivers/base/core.c:2111
  usb_new_device+0x23e5/0x2fb0 drivers/usb/core/hub.c:2534
  hub_port_connect drivers/usb/core/hub.c:5089 [inline]
  hub_port_connect_change drivers/usb/core/hub.c:5204 [inline]
  port_event drivers/usb/core/hub.c:5350 [inline]
  hub_event+0x5853/0x7320 drivers/usb/core/hub.c:5432
  process_one_work+0x1572/0x1f00 kernel/workqueue.c:2269
  worker_thread+0x111b/0x2460 kernel/workqueue.c:2415
  kthread+0x4b5/0x4f0 kernel/kthread.c:256
  ret_from_fork+0x35/0x40 arch/x86/entry/entry_64.S:355

Local variable description: ----data.addr.i13@read_eprom_word
Variable was created at:
  set_register drivers/net/usb/pegasus.c:174 [inline]
  read_eprom_word+0x498/0xdd0 drivers/net/usb/pegasus.c:294
  get_interrupt_interval drivers/net/usb/pegasus.c:758 [inline]
  pegasus_probe+0xf2b/0x4be0 drivers/net/usb/pegasus.c:1192
==================================================================

---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [PATCH] net: ceph: Fix a possible null-pointer dereference in ceph_crypto_key_destroy()
From: Ilya Dryomov @ 2019-07-30  9:41 UTC (permalink / raw)
  To: Jia-Ju Bai
  Cc: Jeff Layton, Sage Weil, David S. Miller, Ceph Development, netdev,
	LKML
In-Reply-To: <20190724094306.1866-1-baijiaju1990@gmail.com>

On Wed, Jul 24, 2019 at 11:43 AM Jia-Ju Bai <baijiaju1990@gmail.com> wrote:
>
> In set_secret(), key->tfm is assigned to NULL on line 55, and then
> ceph_crypto_key_destroy(key) is executed.
>
> ceph_crypto_key_destroy(key)
>     crypto_free_sync_skcipher(key->tfm)
>         crypto_skcipher_tfm(tfm)
>             return &tfm->base;
>
> Thus, a possible null-pointer dereference may occur.
>
> To fix this bug, key->tfm is checked before calling
> crypto_free_sync_skcipher().
>
> This bug is found by a static analysis tool STCheck written by us.
>
> Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
> ---
>  net/ceph/crypto.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
> index 5d6724cee38f..ac28463bcfd8 100644
> --- a/net/ceph/crypto.c
> +++ b/net/ceph/crypto.c
> @@ -136,7 +136,8 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
>         if (key) {
>                 kfree(key->key);
>                 key->key = NULL;
> -               crypto_free_sync_skcipher(key->tfm);
> +               if (key->tfm)
> +                       crypto_free_sync_skcipher(key->tfm);
>                 key->tfm = NULL;
>         }
>  }

Hi Jia-Ju,

Yeah, looks like the only reason this continued to work after
69d6302b65a8 ("libceph: Remove VLA usage of skcipher") is because
crypto_sync_skcipher is a trivial wrapper around crypto_skcipher
added just for type checking AFAICT.

struct crypto_sync_skcipher {
    struct crypto_skcipher base;
};

Before that ceph_crypto_key_destroy() used crypto_free_skcipher(),
which is safe to call on a NULL tfm.

Applied with a slight modification -- I moved key->tfm = NULL under
the new if and amended the changelog.

https://github.com/ceph/ceph-client/commit/b3d79916ff99074d289d66f1643b423ae0008c50

Thanks,

                Ilya

^ permalink raw reply

* RE: [PATCH net-next 3/3] net: stmmac: Introducing support for Page Pool
From: Jose Abreu @ 2019-07-30  9:39 UTC (permalink / raw)
  To: Jon Hunter, Jose Abreu, Robin Murphy,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	linux-stm32@st-md-mailman.stormreply.com,
	linux-arm-kernel@lists.infradead.org, Catalin Marinas,
	Will Deacon
  Cc: Joao Pinto, Alexandre Torgue, Maxime Ripard, Chen-Yu Tsai,
	Maxime Coquelin, linux-tegra, Giuseppe Cavallaro,
	David S . Miller
In-Reply-To: <8a60361f-b914-93ef-0d80-92ae4ad8b808@nvidia.com>

[-- Attachment #1: Type: text/plain, Size: 2703 bytes --]

From: Jon Hunter <jonathanh@nvidia.com>
Date: Jul/29/2019, 22:33:04 (UTC+00:00)

> 
> On 29/07/2019 15:08, Jose Abreu wrote:
> 
> ...
> 
> >>> Hi Catalin and Will,
> >>>
> >>> Sorry to add you in such a long thread but we are seeing a DMA issue
> >>> with stmmac driver in an ARM64 platform with IOMMU enabled.
> >>>
> >>> The issue seems to be solved when buffers allocation for DMA based
> >>> transfers are *not* mapped with the DMA_ATTR_SKIP_CPU_SYNC flag *OR*
> >>> when IOMMU is disabled.
> >>>
> >>> Notice that after transfer is done we do use
> >>> dma_sync_single_for_{cpu,device} and then we reuse *the same* page for
> >>> another transfer.
> >>>
> >>> Can you please comment on whether DMA_ATTR_SKIP_CPU_SYNC can not be used
> >>> in ARM64 platforms with IOMMU ?
> >>
> >> In terms of what they do, there should be no difference on arm64 between:
> >>
> >> dma_map_page(..., dir);
> >> ...
> >> dma_unmap_page(..., dir);
> >>
> >> and:
> >>
> >> dma_map_page_attrs(..., dir, DMA_ATTR_SKIP_CPU_SYNC);
> >> dma_sync_single_for_device(..., dir);
> >> ...
> >> dma_sync_single_for_cpu(..., dir);
> >> dma_unmap_page_attrs(..., dir, DMA_ATTR_SKIP_CPU_SYNC);
> >>
> >> provided that the first sync covers the whole buffer and any subsequent 
> >> ones cover at least the parts of the buffer which may have changed. Plus 
> >> for coherent hardware it's entirely moot either way.
> > 
> > Thanks for confirming. That's indeed what stmmac is doing when buffer is 
> > received by syncing the packet size to CPU.
> > 
> >>
> >> Given Jon's previous findings, I would lean towards the idea that 
> >> performing the extra (redundant) cache maintenance plus barrier in 
> >> dma_unmap is mostly just perturbing timing in the same way as the debug 
> >> print which also made things seem OK.
> > 
> > Mikko said that Tegra186 is not coherent so we have to explicit flush 
> > pipeline but I don't understand why sync_single() is not doing it ...
> > 
> > Jon, can you please remove *all* debug prints, hacks, etc ... and test 
> > this one in attach with plain -net tree ?
> 
> So far I have just been testing on the mainline kernel branch. The issue
> still persists after applying this on mainline. I can test on the -net
> tree, but I am not sure that will make a difference.
> 
> Cheers
> Jon
> 
> -- 
> nvpublic

I looked at netsec implementation and I noticed that we are syncing the 
old buffer for device instead of the new one. netsec syncs the buffer 
for device immediately after the allocation which may be what we have to 
do. Maybe the attached patch can make things work for you ?

---
Thanks,
Jose Miguel Abreu

[-- Attachment #2: 0001-net-stmmac-Sync-RX-Buffer-upon-allocation.patch --]
[-- Type: application/octet-stream, Size: 2465 bytes --]

From 3601e3ae4357d48b3294f42781d0f19095d1b00e Mon Sep 17 00:00:00 2001
Message-Id: <3601e3ae4357d48b3294f42781d0f19095d1b00e.1564479382.git.joabreu@synopsys.com>
From: Jose Abreu <joabreu@synopsys.com>
Date: Tue, 30 Jul 2019 11:36:13 +0200
Subject: [PATCH net] net: stmmac: Sync RX Buffer upon allocation

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
---
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Jose Abreu <joabreu@synopsys.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Maxime Coquelin <mcoquelin.stm32@gmail.com>
Cc: netdev@vger.kernel.org
Cc: linux-stm32@st-md-mailman.stormreply.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 98b1a5c6d537..9a4a56ad35cd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3271,9 +3271,11 @@ static inline int stmmac_rx_threshold_count(struct stmmac_rx_queue *rx_q)
 static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
 {
 	struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
-	int dirty = stmmac_rx_dirty(priv, queue);
+	int len, dirty = stmmac_rx_dirty(priv, queue);
 	unsigned int entry = rx_q->dirty_rx;
 
+	len = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE) * PAGE_SIZE;
+
 	while (dirty-- > 0) {
 		struct stmmac_rx_buffer *buf = &rx_q->buf_pool[entry];
 		struct dma_desc *p;
@@ -3291,6 +3293,13 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
 		}
 
 		buf->addr = page_pool_get_dma_addr(buf->page);
+
+		/* Sync whole allocation to device. This will invalidate old
+		 * data.
+		 */
+		dma_sync_single_for_device(priv->device, buf->addr, len,
+					   DMA_FROM_DEVICE);
+
 		stmmac_set_desc_addr(priv, p, buf->addr);
 		stmmac_refill_desc3(priv, rx_q, p);
 
@@ -3425,8 +3434,6 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 			skb_copy_to_linear_data(skb, page_address(buf->page),
 						frame_len);
 			skb_put(skb, frame_len);
-			dma_sync_single_for_device(priv->device, buf->addr,
-						   frame_len, DMA_FROM_DEVICE);
 
 			if (netif_msg_pktdata(priv)) {
 				netdev_dbg(priv->dev, "frame received (%dbytes)",
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH v4 0/5] vsock/virtio: optimizations to increase the throughput
From: Stefano Garzarella @ 2019-07-30  9:40 UTC (permalink / raw)
  To: Michael S. Tsirkin, Stefan Hajnoczi, David S. Miller
  Cc: netdev, linux-kernel, virtualization, Jason Wang, kvm
In-Reply-To: <20190729095743-mutt-send-email-mst@kernel.org>

On Mon, Jul 29, 2019 at 09:59:23AM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 17, 2019 at 01:30:25PM +0200, Stefano Garzarella wrote:
> > This series tries to increase the throughput of virtio-vsock with slight
> > changes.
> > While I was testing the v2 of this series I discovered an huge use of memory,
> > so I added patch 1 to mitigate this issue. I put it in this series in order
> > to better track the performance trends.
> 
> Series:
> 
> Acked-by: Michael S. Tsirkin <mst@redhat.com>
> 
> Can this go into net-next?
> 

I think so.
Michael, Stefan thanks to ack the series!

Should I resend it with net-next tag?

Thanks,
Stefano

^ permalink raw reply

* Re: net: ipv6: Fix a bug in ndisc_send_ns when netdev only has a global address
From: Su Yanjun @ 2019-07-30  9:39 UTC (permalink / raw)
  To: Mark Smith, netdev
In-Reply-To: <CAO42Z2yN=FfueKAjb0KjY8-CdiKuvkJDr2iJdJR4XdKM90HJRg@mail.gmail.com>


在 2019/7/30 16:15, Mark Smith 写道:
> Hi,
>
> I'm not subscribed to the Linux netdev mailing list, so I can't
> directly reply to the patch email.
>
> This patch is not the correct solution to this issue.
>
> Per RFC 4291 "IP Version 6 Addressing Architecture", all IPv6
> interfaces are required to have Link-Local addresses, so therefore
> there should always be a link-local address available to use as the
> source address for an ND NS.

In linux implementation, one interface may have no link local address if 
kernel config

*addr_gen_mode* is set to IN6_ADDR_GEN_MODE_NONE. My patch is to fix 
this problem.

And what you say is related to the lo interface.  I'm not sure whether 
the lo interface needs a ll adreess.

IMO the ll address is used to get l2 address by sending ND ns. The lo is 
very special.

Thanks

Su

>
> "2.1. Addressing Model"
>
> ...
>
> "All interfaces are required to have at least one Link-Local unicast
>     address (see Section 2.8 for additional required addresses)."
>
> I have submitted a more specific bug regarding no Link-Local
> address/prefix on the Linux kernel loopback interface through RedHat
> bugzilla as I use Fedora 30, however it doesn't seem to have been
> looked at yet.
>
> "Loopback network interface does not have a Link Local address,
> contrary to RFC 4291"
> https://bugzilla.redhat.com/show_bug.cgi?id=1706709
>
>
> Thanks very much,
> Mark.
>
>



^ permalink raw reply

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Allan W. Nielsen @ 2019-07-30  9:21 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: Ido Schimmel, Horatiu Vultur, roopa, davem, bridge, netdev,
	linux-kernel
In-Reply-To: <13f66ebe-4173-82d7-604b-08e9d33d9aff@cumulusnetworks.com>

The 07/30/2019 11:58, Nikolay Aleksandrov wrote:
> On 30/07/2019 11:30, Allan W. Nielsen wrote:
> > The 07/30/2019 10:06, Ido Schimmel wrote:
> >> On Tue, Jul 30, 2019 at 08:27:22AM +0200, Allan W. Nielsen wrote:
> >>> The 07/29/2019 20:51, Ido Schimmel wrote:
> >>>> Can you please clarify what you're trying to achieve? I just read the
> >>>> thread again and my impression is that you're trying to locally receive
> >>>> packets with a certain link layer multicast address.
> >>> Yes. The thread is also a bit confusing because we half way through realized
> >>> that we misunderstood how the multicast packets should be handled (sorry about
> >>> that). To begin with we had a driver where multicast packets was only copied to
> >>> the CPU if someone needed it. Andrew and Nikolay made us aware that this is not
> >>> how other drivers are doing it, so we changed the driver to include the CPU in
> >>> the default multicast flood-mask.
> >> OK, so what prevents you from removing all other ports from the
> >> flood-mask and letting the CPU handle the flooding? Then you can install
> >> software tc filters to limit the flooding.
> > I do not have the bandwidth to forward the multicast traffic in the CPU.
> > 
> > It will also cause enormous latency on the forwarding of L2 multicast packets.
> > 
> >>> This changes the objective a bit. To begin with we needed to get more packets to
> >>> the CPU (which could have been done using tc ingress rules and a trap action).
> >>>
> >>> Now after we changed the driver, we realized that we need something to limit the
> >>> flooding of certain L2 multicast packets. This is the new problem we are trying
> >>> to solve!
> >>>
> >>> Example: Say we have a bridge with 4 slave interfaces, then we want to install a
> >>> forwarding rule saying that packets to a given L2-multicast MAC address, should
> >>> only be flooded to 2 of the 4 ports.
> >>>
> >>> (instead of adding rules to get certain packets to the CPU, we are now adding
> >>> other rules to prevent other packets from going to the CPU and other ports where
> >>> they are not needed/wanted).
> >>>
> >>> This is exactly the same thing as IGMP snooping does dynamically, but only for
> >>> IP multicast.
> >>>
> >>> The "bridge mdb" allow users to manually/static add/del a port to a multicast
> >>> group, but still it operates on IP multicast address (not L2 multicast
> >>> addresses).
> >>>
> >>>> Nik suggested SIOCADDMULTI.
> >>> It is not clear to me how this should be used to limit the flooding, maybe we
> >>> can make some hacks, but as far as I understand the intend of this is maintain
> >>> the list of addresses an interface should receive. I'm not sure this should
> >>> influence how for forwarding decisions are being made.
> >>>
> >>>> and I suggested a tc filter to get the packet to the CPU.
> >>> The TC solution is a good solution to the original problem where wanted to copy
> >>> more frames to the CPU. But we were convinced that this is not the right
> >>> approach, and that the CPU by default should receive all multicast packets, and
> >>> we should instead try to find a way to limit the flooding of certain frames as
> >>> an optimization.
> >>
> >> This can still work. In Linux, ingress tc filters are executed before the
> >> bridge's Rx handler. The same happens in every sane HW. Ingress ACL is
> >> performed before L2 forwarding. Assuming you have eth0-eth3 bridged and
> >> you want to prevent packets with DMAC 01:21:6C:00:00:01 from egressing
> >> eth2:
> >>
> >> # tc filter add dev eth0 ingress pref 1 flower skip_sw \
> >> 	dst_mac 01:21:6C:00:00:01 action trap
> >> # tc filter add dev eth2 egress pref 1 flower skip_hw \
> >> 	dst_mac 01:21:6C:00:00:01 action drop
> >>
> >> The first filter is only present in HW ('skip_sw') and should result in
> >> your HW passing you the sole copy of the packet.
> > Agree.
> > 
> >> The second filter is only present in SW ('skip_hw', not using HW egress
> >> ACL that you don't have) and drops the packet after it was flooded by
> >> the SW bridge.
> > Agree.
> > 
> >> As I mentioned earlier, you can install the filter once in your HW and
> >> share it between different ports using a shared block. This means you
> >> only consume one TCAM entry.
> >>
> >> Note that this allows you to keep flooding all other multicast packets
> >> in HW.
> > Yes, but the frames we want to limit the flood-mask on are the exact frames
> > which occurs at a very high rate, and where latency is important.
> > 
> > I really do not consider it as an option to forward this in SW, when it is
> > something that can easily be offloaded in HW.
> > 
> >>>> If you now want to limit the ports to which this packet is flooded, then
> >>>> you can use tc filters in *software*:
> >>>>
> >>>> # tc qdisc add dev eth2 clsact
> >>>> # tc filter add dev eth2 egress pref 1 flower skip_hw \
> >>>> 	dst_mac 01:21:6C:00:00:01 action drop
> >>> Yes. This can work in the SW bridge.
> >>>
> >>>> If you want to forward the packet in hardware and locally receive it,
> >>>> you can chain several mirred action and then a trap action.
> >>> I'm not I fully understand how this should be done, but it does sound like it
> >>> becomes quite complicated. Also, as far as I understand it will mean that we
> >>> will be using TCAM/ACL resources to do something that could have been done with
> >>> a simple MAC entry.
> >>>
> >>>> Both options avoid HW egress ACLs which your design does not support.
> >>> True, but what is wrong with expanding the functionality of the normal
> >>> forwarding/MAC operations to allow multiple destinations?
> >>>
> >>> It is not an uncommon feature (I just browsed the manual of some common L2
> >>> switches and they all has this feature).
> >>>
> >>> It seems to fit nicely into the existing user-interface:
> >>>
> >>> bridge fdb add    01:21:6C:00:00:01 port eth0
> >>> bridge fdb append 01:21:6C:00:00:01 port eth1
> >>
> >> Wouldn't it be better to instead extend the MDB entries so that they are
> >> either keyed by IP or MAC? I believe FDB should remain as unicast-only.
> > 
> > You might be right, it was not clear to me which of the two would fit the
> > purpose best.
> > 
> > From a user-space iproute2 perspective I prefer using the "bridge fdb" command
> > as it already supports the needed syntax, and I do not think it will be too
> > pretty if we squeeze this into the "bridge mdb" command syntax.
> > 
> 
> MDB is a much better fit as Ido already suggested. FDB should remain unicast
> and mixing them is not a good idea, we already have a good ucast/mcast separation
> and we'd like to keep it that way.
Okay. We will explore that option.


> > But that does not mean that it need to go into the FDB database in the
> > implementation.
> > 
> > Last evening when I looked at it again, I was considering keeping the
> > net_bridge_fdb_entry structure as is, and add a new hashtable with the
> > following:
> > 
> > struct net_bridge_fdbmc_entry {
> > 	struct rhash_head		rhnode;
> > 	struct net_bridge_fdbmc_ports   *dst;
> > 
> > 	struct net_bridge_fdb_key	key;
> > 	struct hlist_node		fdb_node;
> > 	unsigned char			offloaded:1;
> > 
> > 	struct rcu_head			rcu;
> > };
> > 
> 
> What would the notification for this look like ?
Not sure. But we will change the direction and use the MDB structures instead.

> > If we go with this approach then we can look at the MAC address and see if it is
> > a unicast which will cause a lookup in the fdb, l3-multicast (33:33:* or
> > 01:00:5e:*) which will cause a lookup in the mdb, or finally a fdbmc which will
> > need to do a lookup in this new hashtable.
> 
> That sounds wrong, you will change the current default behaviour of flooding these
> packets. This will have to be well hidden behind a new option and enabled only on user
> request.
It will only affect users who install a static L2-multicast entry. If no entry
is found, it will default to flooding, which will be the same as before.

> > Alternative it would be like this:
> > 
> > struct net_bridge_fdb_entry {
> > 	struct rhash_head		rhnode;
> > 	union net_bridge_port_or_list	*dst;
> > 
> > 	struct net_bridge_fdb_key	key;
> > 	struct hlist_node		fdb_node;
> > 	unsigned char			is_local:1,
> > 					is_static:1,
> > 					is_sticky:1,
> > 					added_by_user:1,
> > 					added_by_external_learn:1,
> > 					offloaded:1;
> > 					multi_dst:1;
> > 
> > 	/* write-heavy members should not affect lookups */
> > 	unsigned long			updated ____cacheline_aligned_in_smp;
> > 	unsigned long			used;
> > 
> > 	struct rcu_head			rcu;
> > };
> > 
> > Both solutions should require fairly few changes, and should not cause any
> > measurable performance hit.
> > 
> 
> You'll have to convert these bits to use the proper atomic bitops if you go with
> the second solution. That has to be done even today, but the second case would
> make it a must.
Good to know.

Just for my understanding, is this because this is the "current" guide lines on
how things should be done, or is this because the multi_dst as a special need.

The multi_dst flag will never be changed in the life-cycle of the structure, and
the structure is protected by rcu. If this is causeing a raise, then I do not
see it.

> > Making it fit into the net_bridge_mdb_entry seems to be harder.
> > 
> 
> But it is the correct abstraction from bridge POV, so please stop trying to change
> the FDB code and try to keep to the multicast code.
We are planning on letting the net_bridge_port_or_list union use the
net_bridge_port_group structure, which will mean that we can re-use the
br_multicast_flood function (if we change the signatire to accept the ports
instead of the entry).

> >> As a bonus, existing drivers could benefit from it, as MDB entries are already
> >> notified by MAC.
> > Not sure I follow. When FDB entries are added, it also generates notification
> > events.
> > 
> 
> Could you please show fdb event with multiple ports ?
We will get to that. Maybe this is an argument for converting to mdb. We have
not looked into the details of this yet.

> >>> It seems that it can be added to the existing implementation with out adding
> >>> significant complexity.
> >>>
> >>> It will be easy to offload in HW.
> >>>
> >>> I do not believe that it will be a performance issue, if this is a concern then
> >>> we may have to do a bit of benchmarking, or we can make it a configuration
> >>> option.
> >>>
> >>> Long story short, we (Horatiu and I) learned a lot from the discussion here, and
> >>> I think we should try do a new patch with the learning we got. Then it is easier
> >>> to see what it actually means to the exiting code, complexity, exiting drivers,
> >>> performance, default behavioral, backwards compatibly, and other valid concerns.
> >>>
> >>> If the patch is no good, and cannot be fixed, then we will go back and look
> >>> further into alternative solutions.
> >> Overall, I tend to agree with Nik. I think your use case is too specific
> >> to justify the amount of changes you want to make in the bridge driver.
> >> We also provided other alternatives. That being said, you're more than
> >> welcome to send the patches and we can continue the discussion then.
> > Okay, good to know. I'm not sure I agree that the alternative solutions really
> > solves the issue this is trying to solve, nor do I agree that this is specific
> > to our needs.
> > 
> > But lets take a look at a new patch, and see what is the amount of changes we
> > are talking about. Without having the patch it is really hard to know for sure.
> Please keep in mind that this case is the exception, not the norm, thus it should
> not under any circumstance affect the standard deployments.
Understood - no surprises.

-- 
/Allan

^ permalink raw reply

* RE: [PATCH] powerpc/kmcent2: update the ethernet devices' phy properties
From: Madalin-cristian Bucur @ 2019-07-30  9:44 UTC (permalink / raw)
  To: Scott Wood, Valentin Longchamp, linuxppc-dev@lists.ozlabs.org,
	galak@kernel.crashing.org
  Cc: netdev@vger.kernel.org
In-Reply-To: <2243421e574c72c5e75d27cc0122338e2e0bde63.camel@buserror.net>

> -----Original Message-----
> From: Scott Wood <oss@buserror.net>
> Sent: Sunday, July 28, 2019 10:27 PM
> To: Valentin Longchamp <valentin@longchamp.me>; linuxppc-
> dev@lists.ozlabs.org; galak@kernel.crashing.org
> Cc: Madalin-cristian Bucur <madalin.bucur@nxp.com>
> Subject: Re: [PATCH] powerpc/kmcent2: update the ethernet devices' phy
> properties
> 
> On Sun, 2019-07-28 at 18:01 +0200, Valentin Longchamp wrote:
> > Hi Scott, Kumar,
> >
> > Looking at this patch I have realised that I had already submitted it
> > to the mailing list nearly 2 years ago:
> >
> > https://patchwork.ozlabs.org/patch/842944/
> >
> > Could you please make sure that this one gets merged in the next
> > window, so that I avoid forgetting such a patch a 2nd time ?
> >
> > Thanks a lot
> 
> I added it to my patchwork todo list; thanks for the reminder.
> 
> > Le dim. 14 juil. 2019 à 22:05, Valentin Longchamp
> > <valentin@longchamp.me> a écrit :
> > >
> > > Change all phy-connection-type properties to phy-mode that are better
> > > supported by the fman driver.
> > >
> > > Use the more readable fixed-link node for the 2 sgmii links.
> > >
> > > Change the RGMII link to rgmii-id as the clock delays are added by the
> > > phy.
> > >
> > > Signed-off-by: Valentin Longchamp <valentin@longchamp.me>
> 
> I don't see any other uses of phy-mode in arch/powerpc/boot/dts/fsl, and I see
> lots of phy-connection-type with fman.  Madalin, does this patch look OK?
> 
> -Scott

Hi,

we are using "phy-connection-type" not "phy-mode" for the NXP (former Freescale)
DPAA platforms. While the two seem to be interchangeable ("phy-mode" seems to be
more recent, looking at the device tree bindings), the driver code in Linux seems
to use one or the other, not both so one should stick with the variant the driver
is using. To make things more complex, there may be dependencies in bootloaders,
I see code in u-boot using only "phy-connection-type" or only "phy-mode".

I'd leave "phy-connection-type" as is.

Madalin

> > > ---
> > >  arch/powerpc/boot/dts/fsl/kmcent2.dts | 16 +++++++++++-----
> > >  1 file changed, 11 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > b/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > index 48b7f9797124..c3e0741cafb1 100644
> > > --- a/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > +++ b/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > @@ -210,13 +210,19 @@
> > >
> > >                 fman@400000 {
> > >                         ethernet@e0000 {
> > > -                               fixed-link = <0 1 1000 0 0>;
> > > -                               phy-connection-type = "sgmii";
> > > +                               phy-mode = "sgmii";
> > > +                               fixed-link {
> > > +                                       speed = <1000>;
> > > +                                       full-duplex;
> > > +                               };
> > >                         };
> > >
> > >                         ethernet@e2000 {
> > > -                               fixed-link = <1 1 1000 0 0>;
> > > -                               phy-connection-type = "sgmii";
> > > +                               phy-mode = "sgmii";
> > > +                               fixed-link {
> > > +                                       speed = <1000>;
> > > +                                       full-duplex;
> > > +                               };
> > >                         };
> > >
> > >                         ethernet@e4000 {
> > > @@ -229,7 +235,7 @@
> > >
> > >                         ethernet@e8000 {
> > >                                 phy-handle = <&front_phy>;
> > > -                               phy-connection-type = "rgmii";
> > > +                               phy-mode = "rgmii-id";
> > >                         };
> > >
> > >                         mdio0: mdio@fc000 {
> > > --
> > > 2.17.1
> > >
> >
> >


^ permalink raw reply

* [PATCH net-next v4 1/4] enetc: Clean up local mdio bus allocation
From: Claudiu Manoil @ 2019-07-30  9:45 UTC (permalink / raw)
  To: David S . Miller
  Cc: andrew, Rob Herring, Li Yang, alexandru.marginean, netdev,
	devicetree, linux-arm-kernel, linux-kernel
In-Reply-To: <1564479919-18835-1-git-send-email-claudiu.manoil@nxp.com>

What's needed is basically a pointer to the mdio registers.
This is one way to store it inside bus->priv allocated space,
without upsetting sparse.
Reworked accessors to avoid __iomem casting.
Used devm_* variant to further clean up the init error /
remove paths.

Fixes following sparse warning:
 warning: incorrect type in assignment (different address spaces)
    expected void *priv
    got struct enetc_mdio_regs [noderef] <asn:2>*[assigned] regs

Fixes: ebfcb23d62ab ("enetc: Add ENETC PF level external MDIO support")

Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
---
v1 - added this patch
v2 - reworked accessors as per Andrew Lunn's request
v3 - cleaned up commit message
v4 - none

 .../net/ethernet/freescale/enetc/enetc_mdio.c | 94 +++++++++----------
 1 file changed, 46 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
index 77b9cd10ba2b..05094601ece8 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
@@ -8,16 +8,22 @@
 
 #include "enetc_pf.h"
 
-struct enetc_mdio_regs {
-	u32	mdio_cfg;	/* MDIO configuration and status */
-	u32	mdio_ctl;	/* MDIO control */
-	u32	mdio_data;	/* MDIO data */
-	u32	mdio_addr;	/* MDIO address */
+#define	ENETC_MDIO_REG_OFFSET	0x1c00
+#define	ENETC_MDIO_CFG	0x0	/* MDIO configuration and status */
+#define	ENETC_MDIO_CTL	0x4	/* MDIO control */
+#define	ENETC_MDIO_DATA	0x8	/* MDIO data */
+#define	ENETC_MDIO_ADDR	0xc	/* MDIO address */
+
+#define enetc_mdio_rd(hw, off) \
+	enetc_port_rd(hw, ENETC_##off + ENETC_MDIO_REG_OFFSET)
+#define enetc_mdio_wr(hw, off, val) \
+	enetc_port_wr(hw, ENETC_##off + ENETC_MDIO_REG_OFFSET, val)
+#define enetc_mdio_rd_reg(off)	enetc_mdio_rd(hw, off)
+
+struct enetc_mdio_priv {
+	struct enetc_hw *hw;
 };
 
-#define bus_to_enetc_regs(bus)	(struct enetc_mdio_regs __iomem *)((bus)->priv)
-
-#define ENETC_MDIO_REG_OFFSET	0x1c00
 #define ENETC_MDC_DIV		258
 
 #define MDIO_CFG_CLKDIV(x)	((((x) >> 1) & 0xff) << 8)
@@ -33,18 +39,19 @@ struct enetc_mdio_regs {
 #define MDIO_DATA(x)		((x) & 0xffff)
 
 #define TIMEOUT	1000
-static int enetc_mdio_wait_complete(struct enetc_mdio_regs __iomem *regs)
+static int enetc_mdio_wait_complete(struct enetc_hw *hw)
 {
 	u32 val;
 
-	return readx_poll_timeout(enetc_rd_reg, &regs->mdio_cfg, val,
+	return readx_poll_timeout(enetc_mdio_rd_reg, MDIO_CFG, val,
 				  !(val & MDIO_CFG_BSY), 10, 10 * TIMEOUT);
 }
 
 static int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum,
 			    u16 value)
 {
-	struct enetc_mdio_regs __iomem *regs = bus_to_enetc_regs(bus);
+	struct enetc_mdio_priv *mdio_priv = bus->priv;
+	struct enetc_hw *hw = mdio_priv->hw;
 	u32 mdio_ctl, mdio_cfg;
 	u16 dev_addr;
 	int ret;
@@ -59,29 +66,29 @@ static int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum,
 		mdio_cfg &= ~MDIO_CFG_ENC45;
 	}
 
-	enetc_wr_reg(&regs->mdio_cfg, mdio_cfg);
+	enetc_mdio_wr(hw, MDIO_CFG, mdio_cfg);
 
-	ret = enetc_mdio_wait_complete(regs);
+	ret = enetc_mdio_wait_complete(hw);
 	if (ret)
 		return ret;
 
 	/* set port and dev addr */
 	mdio_ctl = MDIO_CTL_PORT_ADDR(phy_id) | MDIO_CTL_DEV_ADDR(dev_addr);
-	enetc_wr_reg(&regs->mdio_ctl, mdio_ctl);
+	enetc_mdio_wr(hw, MDIO_CTL, mdio_ctl);
 
 	/* set the register address */
 	if (regnum & MII_ADDR_C45) {
-		enetc_wr_reg(&regs->mdio_addr, regnum & 0xffff);
+		enetc_mdio_wr(hw, MDIO_ADDR, regnum & 0xffff);
 
-		ret = enetc_mdio_wait_complete(regs);
+		ret = enetc_mdio_wait_complete(hw);
 		if (ret)
 			return ret;
 	}
 
 	/* write the value */
-	enetc_wr_reg(&regs->mdio_data, MDIO_DATA(value));
+	enetc_mdio_wr(hw, MDIO_DATA, MDIO_DATA(value));
 
-	ret = enetc_mdio_wait_complete(regs);
+	ret = enetc_mdio_wait_complete(hw);
 	if (ret)
 		return ret;
 
@@ -90,7 +97,8 @@ static int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum,
 
 static int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum)
 {
-	struct enetc_mdio_regs __iomem *regs = bus_to_enetc_regs(bus);
+	struct enetc_mdio_priv *mdio_priv = bus->priv;
+	struct enetc_hw *hw = mdio_priv->hw;
 	u32 mdio_ctl, mdio_cfg;
 	u16 dev_addr, value;
 	int ret;
@@ -104,41 +112,41 @@ static int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum)
 		mdio_cfg &= ~MDIO_CFG_ENC45;
 	}
 
-	enetc_wr_reg(&regs->mdio_cfg, mdio_cfg);
+	enetc_mdio_wr(hw, MDIO_CFG, mdio_cfg);
 
-	ret = enetc_mdio_wait_complete(regs);
+	ret = enetc_mdio_wait_complete(hw);
 	if (ret)
 		return ret;
 
 	/* set port and device addr */
 	mdio_ctl = MDIO_CTL_PORT_ADDR(phy_id) | MDIO_CTL_DEV_ADDR(dev_addr);
-	enetc_wr_reg(&regs->mdio_ctl, mdio_ctl);
+	enetc_mdio_wr(hw, MDIO_CTL, mdio_ctl);
 
 	/* set the register address */
 	if (regnum & MII_ADDR_C45) {
-		enetc_wr_reg(&regs->mdio_addr, regnum & 0xffff);
+		enetc_mdio_wr(hw, MDIO_ADDR, regnum & 0xffff);
 
-		ret = enetc_mdio_wait_complete(regs);
+		ret = enetc_mdio_wait_complete(hw);
 		if (ret)
 			return ret;
 	}
 
 	/* initiate the read */
-	enetc_wr_reg(&regs->mdio_ctl, mdio_ctl | MDIO_CTL_READ);
+	enetc_mdio_wr(hw, MDIO_CTL, mdio_ctl | MDIO_CTL_READ);
 
-	ret = enetc_mdio_wait_complete(regs);
+	ret = enetc_mdio_wait_complete(hw);
 	if (ret)
 		return ret;
 
 	/* return all Fs if nothing was there */
-	if (enetc_rd_reg(&regs->mdio_cfg) & MDIO_CFG_RD_ER) {
+	if (enetc_mdio_rd(hw, MDIO_CFG) & MDIO_CFG_RD_ER) {
 		dev_dbg(&bus->dev,
 			"Error while reading PHY%d reg at %d.%hhu\n",
 			phy_id, dev_addr, regnum);
 		return 0xffff;
 	}
 
-	value = enetc_rd_reg(&regs->mdio_data) & 0xffff;
+	value = enetc_mdio_rd(hw, MDIO_DATA) & 0xffff;
 
 	return value;
 }
@@ -146,12 +154,12 @@ static int enetc_mdio_read(struct mii_bus *bus, int phy_id, int regnum)
 int enetc_mdio_probe(struct enetc_pf *pf)
 {
 	struct device *dev = &pf->si->pdev->dev;
-	struct enetc_mdio_regs __iomem *regs;
+	struct enetc_mdio_priv *mdio_priv;
 	struct device_node *np;
 	struct mii_bus *bus;
-	int ret;
+	int err;
 
-	bus = mdiobus_alloc_size(sizeof(regs));
+	bus = devm_mdiobus_alloc_size(dev, sizeof(*mdio_priv));
 	if (!bus)
 		return -ENOMEM;
 
@@ -159,41 +167,31 @@ int enetc_mdio_probe(struct enetc_pf *pf)
 	bus->read = enetc_mdio_read;
 	bus->write = enetc_mdio_write;
 	bus->parent = dev;
+	mdio_priv = bus->priv;
+	mdio_priv->hw = &pf->si->hw;
 	snprintf(bus->id, MII_BUS_ID_SIZE, "%s", dev_name(dev));
 
-	/* store the enetc mdio base address for this bus */
-	regs = pf->si->hw.port + ENETC_MDIO_REG_OFFSET;
-	bus->priv = regs;
-
 	np = of_get_child_by_name(dev->of_node, "mdio");
 	if (!np) {
 		dev_err(dev, "MDIO node missing\n");
-		ret = -EINVAL;
-		goto err_registration;
+		return -EINVAL;
 	}
 
-	ret = of_mdiobus_register(bus, np);
-	if (ret) {
+	err = of_mdiobus_register(bus, np);
+	if (err) {
 		of_node_put(np);
 		dev_err(dev, "cannot register MDIO bus\n");
-		goto err_registration;
+		return err;
 	}
 
 	of_node_put(np);
 	pf->mdio = bus;
 
 	return 0;
-
-err_registration:
-	mdiobus_free(bus);
-
-	return ret;
 }
 
 void enetc_mdio_remove(struct enetc_pf *pf)
 {
-	if (pf->mdio) {
+	if (pf->mdio)
 		mdiobus_unregister(pf->mdio);
-		mdiobus_free(pf->mdio);
-	}
 }
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next v4 4/4] arm64: dts: fsl: ls1028a: Enable eth port1 on the ls1028a QDS board
From: Claudiu Manoil @ 2019-07-30  9:45 UTC (permalink / raw)
  To: David S . Miller
  Cc: andrew, Rob Herring, Li Yang, alexandru.marginean, netdev,
	devicetree, linux-arm-kernel, linux-kernel
In-Reply-To: <1564479919-18835-1-git-send-email-claudiu.manoil@nxp.com>

LS1028a has one Ethernet management interface. On the QDS board, the
MDIO signals are multiplexed to either on-board AR8035 PHY device or
to 4 PCIe slots allowing for SGMII cards.
To enable the Ethernet ENETC Port 1, which can only be connected to a
RGMII PHY, the multiplexer needs to be configured to route the MDIO to
the AR8035 PHY.  The MDIO/MDC routing is controlled by bits 7:4 of FPGA
board config register 0x54, and value 0 selects the on-board RGMII PHY.
The FPGA board config registers are accessible on the i2c bus, at address
0x66.

The PF3 MDIO PCIe integrated endpoint device allows for centralized access
to the MDIO bus.  Add the corresponding devicetree node and set it to be
the MDIO bus parent.

Signed-off-by: Alex Marginean <alexandru.marginean@nxp.com>
Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
---
v1-v4 - none

 .../boot/dts/freescale/fsl-ls1028a-qds.dts    | 40 +++++++++++++++++++
 .../arm64/boot/dts/freescale/fsl-ls1028a.dtsi |  6 +++
 2 files changed, 46 insertions(+)

diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts
index de6ef39f3118..663c4b728c07 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-qds.dts
@@ -85,6 +85,26 @@
 			system-clock-frequency = <25000000>;
 		};
 	};
+
+	mdio-mux {
+		compatible = "mdio-mux-multiplexer";
+		mux-controls = <&mux 0>;
+		mdio-parent-bus = <&enetc_mdio_pf3>;
+		#address-cells=<1>;
+		#size-cells = <0>;
+
+		/* on-board RGMII PHY */
+		mdio@0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			reg = <0>;
+
+			qds_phy1: ethernet-phy@5 {
+				/* Atheros 8035 */
+				reg = <5>;
+			};
+		};
+	};
 };
 
 &duart0 {
@@ -164,6 +184,26 @@
 			};
 		};
 	};
+
+	fpga@66 {
+		#address-cells = <1>;
+		#size-cells = <0>;
+		compatible = "fsl,ls1028aqds-fpga", "fsl,fpga-qixis-i2c",
+			     "simple-mfd";
+		reg = <0x66>;
+
+		mux: mux-controller {
+			compatible = "reg-mux";
+			#mux-control-cells = <1>;
+			mux-reg-masks = <0x54 0xf0>; /* 0: reg 0x54, bits 7:4 */
+		};
+	};
+
+};
+
+&enetc_port1 {
+	phy-handle = <&qds_phy1>;
+	phy-connection-type = "rgmii-id";
 };
 
 &sai1 {
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
index 7975519b4f56..de71153fda00 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
@@ -536,6 +536,12 @@
 				compatible = "fsl,enetc";
 				reg = <0x000100 0 0 0 0>;
 			};
+			enetc_mdio_pf3: mdio@0,3 {
+				compatible = "fsl,enetc-mdio";
+				reg = <0x000300 0 0 0 0>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+			};
 			ethernet@0,4 {
 				compatible = "fsl,enetc-ptp";
 				reg = <0x000400 0 0 0 0>;
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next v4 3/4] dt-bindings: net: fsl: enetc: Add bindings for the central MDIO PCIe endpoint
From: Claudiu Manoil @ 2019-07-30  9:45 UTC (permalink / raw)
  To: David S . Miller
  Cc: andrew, Rob Herring, Li Yang, alexandru.marginean, netdev,
	devicetree, linux-arm-kernel, linux-kernel
In-Reply-To: <1564479919-18835-1-git-send-email-claudiu.manoil@nxp.com>

The on-chip PCIe root complex that integrates the ENETC ethernet
controllers also integrates a PCIe endpoint for the MDIO controller
providing for centralized control of the ENETC mdio bus.
Add bindings for this "central" MDIO Integrated PCIe Endpoint.

Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
---
v1 - none
v2 - none
v3 - fixed spelling (commit message mostly)
v4 - none

 .../devicetree/bindings/net/fsl-enetc.txt     | 42 +++++++++++++++++--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/fsl-enetc.txt b/Documentation/devicetree/bindings/net/fsl-enetc.txt
index 25fc687419db..b7034ccbc1bd 100644
--- a/Documentation/devicetree/bindings/net/fsl-enetc.txt
+++ b/Documentation/devicetree/bindings/net/fsl-enetc.txt
@@ -11,7 +11,9 @@ Required properties:
 		  to parent node bindings.
 - compatible	: Should be "fsl,enetc".
 
-1) The ENETC external port is connected to a MDIO configurable phy:
+1. The ENETC external port is connected to a MDIO configurable phy
+
+1.1. Using the local ENETC Port MDIO interface
 
 In this case, the ENETC node should include a "mdio" sub-node
 that in turn should contain the "ethernet-phy" node describing the
@@ -47,8 +49,42 @@ Example:
 		};
 	};
 
-2) The ENETC port is an internal port or has a fixed-link external
-connection:
+1.2. Using the central MDIO PCIe endpoint device
+
+In this case, the mdio node should be defined as another PCIe
+endpoint node, at the same level with the ENETC port nodes.
+
+Required properties:
+
+- reg		: Specifies PCIe Device Number and Function
+		  Number of the ENETC endpoint device, according
+		  to parent node bindings.
+- compatible	: Should be "fsl,enetc-mdio".
+
+The remaining required mdio bus properties are standard, their bindings
+already defined in Documentation/devicetree/bindings/net/mdio.txt.
+
+Example:
+
+	ethernet@0,0 {
+		compatible = "fsl,enetc";
+		reg = <0x000000 0 0 0 0>;
+		phy-handle = <&sgmii_phy0>;
+		phy-connection-type = "sgmii";
+	};
+
+	mdio@0,3 {
+		compatible = "fsl,enetc-mdio";
+		reg = <0x000300 0 0 0 0>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+		sgmii_phy0: ethernet-phy@2 {
+			reg = <0x2>;
+		};
+	};
+
+2. The ENETC port is an internal port or has a fixed-link external
+connection
 
 In this case, the ENETC port node defines a fixed link connection,
 as specified by Documentation/devicetree/bindings/net/fixed-link.txt.
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next v4 2/4] enetc: Add mdio bus driver for the PCIe MDIO endpoint
From: Claudiu Manoil @ 2019-07-30  9:45 UTC (permalink / raw)
  To: David S . Miller
  Cc: andrew, Rob Herring, Li Yang, alexandru.marginean, netdev,
	devicetree, linux-arm-kernel, linux-kernel
In-Reply-To: <1564479919-18835-1-git-send-email-claudiu.manoil@nxp.com>

ENETC ports can manage the MDIO bus via local register
interface.  However there's also a centralized way
to manage the MDIO bus, via the MDIO PCIe endpoint
device integrated by the same root complex that also
integrates the ENETC ports (eth controllers).

Depending on board design and use case, centralized
access to MDIO may be better than using local ENETC
port registers.  For instance, on the LS1028A QDS board
where MDIO muxing is required.  Also, the LS1028A on-chip
switch doesn't have a local MDIO register interface.

The current patch registers the above PCIe endpoint as a
separate MDIO bus and provides a driver for it by re-using
the code used for local MDIO access.  It also allows the
ENETC port PHYs to be managed by this driver if the local
"mdio" node is missing from the ENETC port node.

Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
---
v1 - fixed mdio bus allocation
   - requested only BAR0 region, as it's the only one used by the driver
v2 - reworked accessors as per Andrew Lunn's request
v3 - none
v4 - err path check fix

 .../net/ethernet/freescale/enetc/enetc_mdio.c | 98 +++++++++++++++++++
 .../net/ethernet/freescale/enetc/enetc_pf.c   |  5 +-
 2 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
index 05094601ece8..d2a0f910af9b 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
@@ -195,3 +195,101 @@ void enetc_mdio_remove(struct enetc_pf *pf)
 	if (pf->mdio)
 		mdiobus_unregister(pf->mdio);
 }
+
+#define ENETC_MDIO_DEV_ID	0xee01
+#define ENETC_MDIO_DEV_NAME	"FSL PCIe IE Central MDIO"
+#define ENETC_MDIO_BUS_NAME	ENETC_MDIO_DEV_NAME " Bus"
+#define ENETC_MDIO_DRV_NAME	ENETC_MDIO_DEV_NAME " driver"
+#define ENETC_MDIO_DRV_ID	"fsl_enetc_mdio"
+
+static int enetc_pci_mdio_probe(struct pci_dev *pdev,
+				const struct pci_device_id *ent)
+{
+	struct enetc_mdio_priv *mdio_priv;
+	struct device *dev = &pdev->dev;
+	struct enetc_hw *hw;
+	struct mii_bus *bus;
+	int err;
+
+	hw = devm_kzalloc(dev, sizeof(*hw), GFP_KERNEL);
+	if (!hw)
+		return -ENOMEM;
+
+	bus = devm_mdiobus_alloc_size(dev, sizeof(*mdio_priv));
+	if (!bus)
+		return -ENOMEM;
+
+	bus->name = ENETC_MDIO_BUS_NAME;
+	bus->read = enetc_mdio_read;
+	bus->write = enetc_mdio_write;
+	bus->parent = dev;
+	mdio_priv = bus->priv;
+	mdio_priv->hw = hw;
+	snprintf(bus->id, MII_BUS_ID_SIZE, "%s", dev_name(dev));
+
+	pcie_flr(pdev);
+	err = pci_enable_device_mem(pdev);
+	if (err) {
+		dev_err(dev, "device enable failed\n");
+		return err;
+	}
+
+	err = pci_request_region(pdev, 0, ENETC_MDIO_DRV_ID);
+	if (err) {
+		dev_err(dev, "pci_request_region failed\n");
+		goto err_pci_mem_reg;
+	}
+
+	hw->port = pci_iomap(pdev, 0, 0);
+	if (!hw->port) {
+		err = -ENXIO;
+		dev_err(dev, "iomap failed\n");
+		goto err_ioremap;
+	}
+
+	err = of_mdiobus_register(bus, dev->of_node);
+	if (err)
+		goto err_mdiobus_reg;
+
+	pci_set_drvdata(pdev, bus);
+
+	return 0;
+
+err_mdiobus_reg:
+	iounmap(mdio_priv->hw->port);
+err_ioremap:
+	pci_release_mem_regions(pdev);
+err_pci_mem_reg:
+	pci_disable_device(pdev);
+
+	return err;
+}
+
+static void enetc_pci_mdio_remove(struct pci_dev *pdev)
+{
+	struct mii_bus *bus = pci_get_drvdata(pdev);
+	struct enetc_mdio_priv *mdio_priv;
+
+	mdiobus_unregister(bus);
+	mdio_priv = bus->priv;
+	iounmap(mdio_priv->hw->port);
+	pci_release_mem_regions(pdev);
+	pci_disable_device(pdev);
+}
+
+static const struct pci_device_id enetc_pci_mdio_id_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, ENETC_MDIO_DEV_ID) },
+	{ 0, } /* End of table. */
+};
+MODULE_DEVICE_TABLE(pci, enetc_mdio_id_table);
+
+static struct pci_driver enetc_pci_mdio_driver = {
+	.name = ENETC_MDIO_DRV_ID,
+	.id_table = enetc_pci_mdio_id_table,
+	.probe = enetc_pci_mdio_probe,
+	.remove = enetc_pci_mdio_remove,
+};
+module_pci_driver(enetc_pci_mdio_driver);
+
+MODULE_DESCRIPTION(ENETC_MDIO_DRV_NAME);
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
index 258b3cb38a6f..7d6513ff8507 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
@@ -750,6 +750,7 @@ static int enetc_of_get_phy(struct enetc_ndev_priv *priv)
 {
 	struct enetc_pf *pf = enetc_si_priv(priv->si);
 	struct device_node *np = priv->dev->of_node;
+	struct device_node *mdio_np;
 	int err;
 
 	if (!np) {
@@ -773,7 +774,9 @@ static int enetc_of_get_phy(struct enetc_ndev_priv *priv)
 		priv->phy_node = of_node_get(np);
 	}
 
-	if (!of_phy_is_fixed_link(np)) {
+	mdio_np = of_get_child_by_name(np, "mdio");
+	if (mdio_np) {
+		of_node_put(mdio_np);
 		err = enetc_mdio_probe(pf);
 		if (err) {
 			of_node_put(priv->phy_node);
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next v4 0/4] enetc: Add mdio bus driver for the PCIe MDIO endpoint
From: Claudiu Manoil @ 2019-07-30  9:45 UTC (permalink / raw)
  To: David S . Miller
  Cc: andrew, Rob Herring, Li Yang, alexandru.marginean, netdev,
	devicetree, linux-arm-kernel, linux-kernel

First patch fixes a sparse issue and cleans up accessors to avoid
casting to __iomem.
Second patch just registers the PCIe endpoint device containing
the MDIO registers as a standalone MDIO bus driver, to allow
an alternative way to control the MDIO bus.  The same code used
by the ENETC ports (eth controllers) to manage MDIO via local
registers applies and is reused.

Bindings are provided for the new MDIO node, similarly to ENETC
port nodes bindings.

Last patch enables the ENETC port 1 and its RGMII PHY on the
LS1028A QDS board, where the MDIO muxing configuration relies
on the MDIO support provided in the first patch.

Changes since v0:
v1 - fixed mdio bus allocation
v2 - cleaned up accessors to avoid casting
v3 - fixed spelling (mostly commit message)
v4 - fixed err path check blunder

Claudiu Manoil (4):
  enetc: Clean up local mdio bus allocation
  enetc: Add mdio bus driver for the PCIe MDIO endpoint
  dt-bindings: net: fsl: enetc: Add bindings for the central MDIO PCIe
    endpoint
  arm64: dts: fsl: ls1028a: Enable eth port1 on the ls1028a QDS board

 .../devicetree/bindings/net/fsl-enetc.txt     |  42 +++-
 .../boot/dts/freescale/fsl-ls1028a-qds.dts    |  40 ++++
 .../arm64/boot/dts/freescale/fsl-ls1028a.dtsi |   6 +
 .../net/ethernet/freescale/enetc/enetc_mdio.c | 190 +++++++++++++-----
 .../net/ethernet/freescale/enetc/enetc_pf.c   |   5 +-
 5 files changed, 232 insertions(+), 51 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [PATCH] net: phy: fixed_phy: print gpio error only if gpio node is present
From: Hubert Feurstein @ 2019-07-30  9:46 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: Hubert Feurstein, Andrew Lunn, Florian Fainelli, Heiner Kallweit,
	David S. Miller

It is perfectly ok to not have an gpio attached to the fixed-link node. So
the driver should not throw an error message when the gpio is missing.

Signed-off-by: Hubert Feurstein <h.feurstein@gmail.com>
---
 drivers/net/phy/fixed_phy.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 3ffe46df249e..7c5265fd2b94 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -216,8 +216,10 @@ static struct gpio_desc *fixed_phy_get_gpiod(struct device_node *np)
 	if (IS_ERR(gpiod)) {
 		if (PTR_ERR(gpiod) == -EPROBE_DEFER)
 			return gpiod;
-		pr_err("error getting GPIO for fixed link %pOF, proceed without\n",
-		       fixed_link_node);
+
+		if (PTR_ERR(gpiod) != -ENOENT)
+			pr_err("error getting GPIO for fixed link %pOF, proceed without\n",
+			       fixed_link_node);
 		gpiod = NULL;
 	}
 
-- 
2.22.0


^ permalink raw reply related

* RE: [PATCH] rtw88: pci: Use general byte arrays as the elements of RX ring
From: David Laight @ 2019-07-30  9:48 UTC (permalink / raw)
  To: 'Stanislaw Gruszka', Jian-Hong Pan
  Cc: Yan-Hsuan Chuang, Kalle Valo, David S . Miller,
	linux-wireless@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux@endlessm.com,
	stable@vger.kernel.org
In-Reply-To: <20190730093533.GC3174@redhat.com>

From: Stanislaw Gruszka
> Sent: 30 July 2019 10:36
...
> > +		len = pkt_stat.pkt_len + pkt_offset;
> > +		skb = dev_alloc_skb(len);
> > +		if (WARN_ONCE(!skb, "rx routine starvation\n"))
> >  			goto next_rp;
> >
> >  		/* put the DMA data including rx_desc from phy to new skb */
> > -		skb_put_data(new, skb->data, new_len);
> > +		skb_put_data(skb, rx_desc, len);
> 
> Coping big packets it quite inefficient. What drivers usually do is
> copy only for small packets and for big ones allocate new rx buf
> (drop packet alloc if fail) and pas old buf to network stack via
> skb_add_rx_frag(). See iwlmvm as example.

If you have to do iommu setup/teardown then the breakeven point
for (not) copying may be surprisingly large.
You do need to do the measurements on a range of hardware.
Coping is also likely to affect the L1 cache - unless you can
copy quickly without polluting the cache.

It is all 'swings and roundabouts'.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Nikolay Aleksandrov @ 2019-07-30  9:55 UTC (permalink / raw)
  To: Allan W. Nielsen
  Cc: Ido Schimmel, Horatiu Vultur, roopa, davem, bridge, netdev,
	linux-kernel
In-Reply-To: <20190730092118.key2ygh3ggpd3tkq@lx-anielsen.microsemi.net>

On 30/07/2019 12:21, Allan W. Nielsen wrote:
> The 07/30/2019 11:58, Nikolay Aleksandrov wrote:
>> On 30/07/2019 11:30, Allan W. Nielsen wrote:
>>> The 07/30/2019 10:06, Ido Schimmel wrote:
>>>> On Tue, Jul 30, 2019 at 08:27:22AM +0200, Allan W. Nielsen wrote:
>>>>> The 07/29/2019 20:51, Ido Schimmel wrote:
>>>>>> Can you please clarify what you're trying to achieve? I just read the
>>>>>> thread again and my impression is that you're trying to locally receive
>>>>>> packets with a certain link layer multicast address.
>>>>> Yes. The thread is also a bit confusing because we half way through realized
>>>>> that we misunderstood how the multicast packets should be handled (sorry about
>>>>> that). To begin with we had a driver where multicast packets was only copied to
>>>>> the CPU if someone needed it. Andrew and Nikolay made us aware that this is not
>>>>> how other drivers are doing it, so we changed the driver to include the CPU in
>>>>> the default multicast flood-mask.
>>>> OK, so what prevents you from removing all other ports from the
>>>> flood-mask and letting the CPU handle the flooding? Then you can install
>>>> software tc filters to limit the flooding.
>>> I do not have the bandwidth to forward the multicast traffic in the CPU.
>>>
>>> It will also cause enormous latency on the forwarding of L2 multicast packets.
>>>
>>>>> This changes the objective a bit. To begin with we needed to get more packets to
>>>>> the CPU (which could have been done using tc ingress rules and a trap action).
>>>>>
>>>>> Now after we changed the driver, we realized that we need something to limit the
>>>>> flooding of certain L2 multicast packets. This is the new problem we are trying
>>>>> to solve!
>>>>>
>>>>> Example: Say we have a bridge with 4 slave interfaces, then we want to install a
>>>>> forwarding rule saying that packets to a given L2-multicast MAC address, should
>>>>> only be flooded to 2 of the 4 ports.
>>>>>
>>>>> (instead of adding rules to get certain packets to the CPU, we are now adding
>>>>> other rules to prevent other packets from going to the CPU and other ports where
>>>>> they are not needed/wanted).
>>>>>
>>>>> This is exactly the same thing as IGMP snooping does dynamically, but only for
>>>>> IP multicast.
>>>>>
>>>>> The "bridge mdb" allow users to manually/static add/del a port to a multicast
>>>>> group, but still it operates on IP multicast address (not L2 multicast
>>>>> addresses).
>>>>>
>>>>>> Nik suggested SIOCADDMULTI.
>>>>> It is not clear to me how this should be used to limit the flooding, maybe we
>>>>> can make some hacks, but as far as I understand the intend of this is maintain
>>>>> the list of addresses an interface should receive. I'm not sure this should
>>>>> influence how for forwarding decisions are being made.
>>>>>
>>>>>> and I suggested a tc filter to get the packet to the CPU.
>>>>> The TC solution is a good solution to the original problem where wanted to copy
>>>>> more frames to the CPU. But we were convinced that this is not the right
>>>>> approach, and that the CPU by default should receive all multicast packets, and
>>>>> we should instead try to find a way to limit the flooding of certain frames as
>>>>> an optimization.
>>>>
>>>> This can still work. In Linux, ingress tc filters are executed before the
>>>> bridge's Rx handler. The same happens in every sane HW. Ingress ACL is
>>>> performed before L2 forwarding. Assuming you have eth0-eth3 bridged and
>>>> you want to prevent packets with DMAC 01:21:6C:00:00:01 from egressing
>>>> eth2:
>>>>
>>>> # tc filter add dev eth0 ingress pref 1 flower skip_sw \
>>>> 	dst_mac 01:21:6C:00:00:01 action trap
>>>> # tc filter add dev eth2 egress pref 1 flower skip_hw \
>>>> 	dst_mac 01:21:6C:00:00:01 action drop
>>>>
>>>> The first filter is only present in HW ('skip_sw') and should result in
>>>> your HW passing you the sole copy of the packet.
>>> Agree.
>>>
>>>> The second filter is only present in SW ('skip_hw', not using HW egress
>>>> ACL that you don't have) and drops the packet after it was flooded by
>>>> the SW bridge.
>>> Agree.
>>>
>>>> As I mentioned earlier, you can install the filter once in your HW and
>>>> share it between different ports using a shared block. This means you
>>>> only consume one TCAM entry.
>>>>
>>>> Note that this allows you to keep flooding all other multicast packets
>>>> in HW.
>>> Yes, but the frames we want to limit the flood-mask on are the exact frames
>>> which occurs at a very high rate, and where latency is important.
>>>
>>> I really do not consider it as an option to forward this in SW, when it is
>>> something that can easily be offloaded in HW.
>>>
>>>>>> If you now want to limit the ports to which this packet is flooded, then
>>>>>> you can use tc filters in *software*:
>>>>>>
>>>>>> # tc qdisc add dev eth2 clsact
>>>>>> # tc filter add dev eth2 egress pref 1 flower skip_hw \
>>>>>> 	dst_mac 01:21:6C:00:00:01 action drop
>>>>> Yes. This can work in the SW bridge.
>>>>>
>>>>>> If you want to forward the packet in hardware and locally receive it,
>>>>>> you can chain several mirred action and then a trap action.
>>>>> I'm not I fully understand how this should be done, but it does sound like it
>>>>> becomes quite complicated. Also, as far as I understand it will mean that we
>>>>> will be using TCAM/ACL resources to do something that could have been done with
>>>>> a simple MAC entry.
>>>>>
>>>>>> Both options avoid HW egress ACLs which your design does not support.
>>>>> True, but what is wrong with expanding the functionality of the normal
>>>>> forwarding/MAC operations to allow multiple destinations?
>>>>>
>>>>> It is not an uncommon feature (I just browsed the manual of some common L2
>>>>> switches and they all has this feature).
>>>>>
>>>>> It seems to fit nicely into the existing user-interface:
>>>>>
>>>>> bridge fdb add    01:21:6C:00:00:01 port eth0
>>>>> bridge fdb append 01:21:6C:00:00:01 port eth1
>>>>
>>>> Wouldn't it be better to instead extend the MDB entries so that they are
>>>> either keyed by IP or MAC? I believe FDB should remain as unicast-only.
>>>
>>> You might be right, it was not clear to me which of the two would fit the
>>> purpose best.
>>>
>>> From a user-space iproute2 perspective I prefer using the "bridge fdb" command
>>> as it already supports the needed syntax, and I do not think it will be too
>>> pretty if we squeeze this into the "bridge mdb" command syntax.
>>>
>>
>> MDB is a much better fit as Ido already suggested. FDB should remain unicast
>> and mixing them is not a good idea, we already have a good ucast/mcast separation
>> and we'd like to keep it that way.
> Okay. We will explore that option.
> 
> 
Great, thanks.

>>> But that does not mean that it need to go into the FDB database in the
>>> implementation.
>>>
>>> Last evening when I looked at it again, I was considering keeping the
>>> net_bridge_fdb_entry structure as is, and add a new hashtable with the
>>> following:
>>>
>>> struct net_bridge_fdbmc_entry {
>>> 	struct rhash_head		rhnode;
>>> 	struct net_bridge_fdbmc_ports   *dst;
>>>
>>> 	struct net_bridge_fdb_key	key;
>>> 	struct hlist_node		fdb_node;
>>> 	unsigned char			offloaded:1;
>>>
>>> 	struct rcu_head			rcu;
>>> };
>>>
>>
>> What would the notification for this look like ?
> Not sure. But we will change the direction and use the MDB structures instead.
> 

Ack

>>> If we go with this approach then we can look at the MAC address and see if it is
>>> a unicast which will cause a lookup in the fdb, l3-multicast (33:33:* or
>>> 01:00:5e:*) which will cause a lookup in the mdb, or finally a fdbmc which will
>>> need to do a lookup in this new hashtable.
>>
>> That sounds wrong, you will change the current default behaviour of flooding these
>> packets. This will have to be well hidden behind a new option and enabled only on user
>> request.
> It will only affect users who install a static L2-multicast entry. If no entry
> is found, it will default to flooding, which will be the same as before.
> 

Ack

>>> Alternative it would be like this:
>>>
>>> struct net_bridge_fdb_entry {
>>> 	struct rhash_head		rhnode;
>>> 	union net_bridge_port_or_list	*dst;
>>>
>>> 	struct net_bridge_fdb_key	key;
>>> 	struct hlist_node		fdb_node;
>>> 	unsigned char			is_local:1,
>>> 					is_static:1,
>>> 					is_sticky:1,
>>> 					added_by_user:1,
>>> 					added_by_external_learn:1,
>>> 					offloaded:1;
>>> 					multi_dst:1;
>>>
>>> 	/* write-heavy members should not affect lookups */
>>> 	unsigned long			updated ____cacheline_aligned_in_smp;
>>> 	unsigned long			used;
>>>
>>> 	struct rcu_head			rcu;
>>> };
>>>
>>> Both solutions should require fairly few changes, and should not cause any
>>> measurable performance hit.
>>>
>>
>> You'll have to convert these bits to use the proper atomic bitops if you go with
>> the second solution. That has to be done even today, but the second case would
>> make it a must.
> Good to know.
> 
> Just for my understanding, is this because this is the "current" guide lines on
> how things should be done, or is this because the multi_dst as a special need.
> 
> The multi_dst flag will never be changed in the life-cycle of the structure, and
> the structure is protected by rcu. If this is causeing a raise, then I do not
> see it.
> 

These flags are changed from different contexts without any locking and you can end
up with wrong values since these are not atomic ops. We need to move to atomic ops
to guarantee consistent results. It is not only multi_dst issue, it's a problem
for all of them, it's just not critical today since you'll end up with wrong
flag values in such cases, but with multi_dst it will be important because the union
pointer will have to be treated different based on the multi_dst value.

>>> Making it fit into the net_bridge_mdb_entry seems to be harder.
>>>
>>
>> But it is the correct abstraction from bridge POV, so please stop trying to change
>> the FDB code and try to keep to the multicast code.
> We are planning on letting the net_bridge_port_or_list union use the
> net_bridge_port_group structure, which will mean that we can re-use the
> br_multicast_flood function (if we change the signatire to accept the ports
> instead of the entry).
> 

That sounds great, definitely a step in the right direction.

>>>> As a bonus, existing drivers could benefit from it, as MDB entries are already
>>>> notified by MAC.
>>> Not sure I follow. When FDB entries are added, it also generates notification
>>> events.
>>>
>>
>> Could you please show fdb event with multiple ports ?
> We will get to that. Maybe this is an argument for converting to mdb. We have
> not looked into the details of this yet.
> 

I can see a few potential problems, the important thing here would be to keep
backwards compatibility.

>>>>> It seems that it can be added to the existing implementation with out adding
>>>>> significant complexity.
>>>>>
>>>>> It will be easy to offload in HW.
>>>>>
>>>>> I do not believe that it will be a performance issue, if this is a concern then
>>>>> we may have to do a bit of benchmarking, or we can make it a configuration
>>>>> option.
>>>>>
>>>>> Long story short, we (Horatiu and I) learned a lot from the discussion here, and
>>>>> I think we should try do a new patch with the learning we got. Then it is easier
>>>>> to see what it actually means to the exiting code, complexity, exiting drivers,
>>>>> performance, default behavioral, backwards compatibly, and other valid concerns.
>>>>>
>>>>> If the patch is no good, and cannot be fixed, then we will go back and look
>>>>> further into alternative solutions.
>>>> Overall, I tend to agree with Nik. I think your use case is too specific
>>>> to justify the amount of changes you want to make in the bridge driver.
>>>> We also provided other alternatives. That being said, you're more than
>>>> welcome to send the patches and we can continue the discussion then.
>>> Okay, good to know. I'm not sure I agree that the alternative solutions really
>>> solves the issue this is trying to solve, nor do I agree that this is specific
>>> to our needs.
>>>
>>> But lets take a look at a new patch, and see what is the amount of changes we
>>> are talking about. Without having the patch it is really hard to know for sure.
>> Please keep in mind that this case is the exception, not the norm, thus it should
>> not under any circumstance affect the standard deployments.
> Understood - no surprises.
> 

Great, thanks again. Will continue discussing when the new patch arrives.

Cheers,
 Nik

^ permalink raw reply

* Re: DSA Rate Limiting in 88E6390
From: Benjamin Beckmeyer @ 2019-07-30  9:58 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev
In-Reply-To: <20190729150158.GE4110@lunn.ch>


>> Hi all,
>> is there a possibility to set the rate limiting for the 6390 with linux tools?
>> I've seen that the driver will init all to zero, so rate limiting is disabled, 
>> but there is no solution for it in the ip tool?
>>
>> The only thing I found for rate limiting is the tc tool, but I guess this is 
>> only a software solution?
> Hi Benjamin
>
> In Linux, we accelerate the software solution by offloading it to the
> hardware. So TC is what you need here.
>  
>> Furthermore, does exist a table or a tutorial which functions DSA supports?
>> The background is that we are using the DSDT driver (in future maybe the UMSD
>> driver) but we would like to switch to the in kernel DSA entirely. And our
>> software is using some of the DSDT functions, so I have to find an 
>> alternative to these functions.
> The DSA framework supports offloading TC. There was some patches a
> while back adding ingress rate limiting to one of the DSA drivers, via
> TC. I forget which, and i don't think they have been merged yet. If
> you can find the patchset, it should give you a good idea how you can
> implement support in the mv88e6xxx driver.
>
> 	  Andrew

Hi Andrew,
I've searched the netdev mailing list for DSA and traffic, but can't find anything
about rate limiting till 2016. Do you have a hint, how I can find it?

Do you know if the patchset was for Marvell or maybe for another company?

Cheers,
Benjamin


^ permalink raw reply

* Re: [PATCH v4 0/5] vsock/virtio: optimizations to increase the throughput
From: Jason Wang @ 2019-07-30 10:03 UTC (permalink / raw)
  To: Stefano Garzarella, Michael S. Tsirkin, Stefan Hajnoczi,
	David S. Miller
  Cc: netdev, linux-kernel, virtualization, kvm
In-Reply-To: <20190730094013.ruqjllqrjmkdnh5y@steredhat>


On 2019/7/30 下午5:40, Stefano Garzarella wrote:
> On Mon, Jul 29, 2019 at 09:59:23AM -0400, Michael S. Tsirkin wrote:
>> On Wed, Jul 17, 2019 at 01:30:25PM +0200, Stefano Garzarella wrote:
>>> This series tries to increase the throughput of virtio-vsock with slight
>>> changes.
>>> While I was testing the v2 of this series I discovered an huge use of memory,
>>> so I added patch 1 to mitigate this issue. I put it in this series in order
>>> to better track the performance trends.
>> Series:
>>
>> Acked-by: Michael S. Tsirkin <mst@redhat.com>
>>
>> Can this go into net-next?
>>
> I think so.
> Michael, Stefan thanks to ack the series!
>
> Should I resend it with net-next tag?
>
> Thanks,
> Stefano


I think so.

Thanks


^ permalink raw reply

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Ido Schimmel @ 2019-07-30 10:04 UTC (permalink / raw)
  To: Allan W. Nielsen
  Cc: Nikolay Aleksandrov, Horatiu Vultur, roopa, davem, bridge, netdev,
	linux-kernel
In-Reply-To: <20190730083027.biuzy7h5dbq7pik3@lx-anielsen.microsemi.net>

On Tue, Jul 30, 2019 at 10:30:28AM +0200, Allan W. Nielsen wrote:
> The 07/30/2019 10:06, Ido Schimmel wrote:
> > As a bonus, existing drivers could benefit from it, as MDB entries are already
> > notified by MAC.
> Not sure I follow. When FDB entries are added, it also generates notification
> events.

I meant the switchdev notification sent to drivers:

/* SWITCHDEV_OBJ_ID_PORT_MDB */
struct switchdev_obj_port_mdb {
	struct switchdev_obj obj;
	unsigned char addr[ETH_ALEN];
	u16 vid;
};

By extending MDB entries to also be keyed by MAC you basically get a lot
of things for free without duplicating the same code for multicast FDBs.

AFAICS, then only change in the fast path is in br_mdb_get() where you
need to use DMAC as key in case Ethertype is not IPv4/IPv6.

^ permalink raw reply

* [PATCH 0/4] net: dsa: mv88e6xxx: add support for MV88E6220
From: Hubert Feurstein @ 2019-07-30 10:04 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: Hubert Feurstein, Andrew Lunn, Vivien Didelot, Florian Fainelli,
	David S. Miller, Rasmus Villemoes

This patch series adds support for the MV88E6220 chip to the mv88e6xxx driver.
The MV88E6220 is almost the same as MV88E6250 except that the ports 2-4 are
not routed to pins.

Furthermore, PTP support is added to the MV88E6250 family.

Hubert Feurstein (4):
  net: dsa: mv88e6xxx: add support for MV88E6220
  dt-bindings: net: dsa: marvell: add 6220 model to the 6250 family
  net: dsa: mv88e6xxx: setup message port is not supported in the 6250
    family
  net: dsa: mv88e6xxx: add PTP support for MV88E6250 family

 .../devicetree/bindings/net/dsa/marvell.txt   |  2 +-
 drivers/net/dsa/mv88e6xxx/chip.c              | 63 +++++++++++++++-
 drivers/net/dsa/mv88e6xxx/chip.h              |  8 +-
 drivers/net/dsa/mv88e6xxx/port.h              |  1 +
 drivers/net/dsa/mv88e6xxx/ptp.c               | 73 +++++++++++++------
 5 files changed, 120 insertions(+), 27 deletions(-)

-- 
2.22.0


^ permalink raw reply

* [PATCH 4/4] net: dsa: mv88e6xxx: add PTP support for MV88E6250 family
From: Hubert Feurstein @ 2019-07-30 10:04 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: Hubert Feurstein, Andrew Lunn, Vivien Didelot, Florian Fainelli,
	David S. Miller, Rasmus Villemoes
In-Reply-To: <20190730100429.32479-1-h.feurstein@gmail.com>

This adds PTP support for the MV88E6250 family.

Signed-off-by: Hubert Feurstein <h.feurstein@gmail.com>
---
 drivers/net/dsa/mv88e6xxx/chip.c |  4 ++
 drivers/net/dsa/mv88e6xxx/chip.h |  4 ++
 drivers/net/dsa/mv88e6xxx/ptp.c  | 73 ++++++++++++++++++++++----------
 3 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index c2fb4ea66434..58e298cc90e0 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3514,6 +3514,8 @@ static const struct mv88e6xxx_ops mv88e6250_ops = {
 	.reset = mv88e6250_g1_reset,
 	.vtu_getnext = mv88e6250_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6250_g1_vtu_loadpurge,
+	.avb_ops = &mv88e6352_avb_ops,
+	.ptp_ops = &mv88e6352_ptp_ops,
 	.phylink_validate = mv88e6065_phylink_validate,
 };
 
@@ -4333,6 +4335,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.atu_move_port_mask = 0xf,
 		.dual_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_DSA,
+		.ptp_support = true,
 		.ops = &mv88e6250_ops,
 	},
 
@@ -4354,6 +4357,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = {
 		.atu_move_port_mask = 0xf,
 		.dual_chip = true,
 		.tag_protocol = DSA_TAG_PROTO_DSA,
+		.ptp_support = true,
 		.ops = &mv88e6250_ops,
 	},
 
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 720cace3db4e..64872251e479 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -273,6 +273,10 @@ struct mv88e6xxx_chip {
 	u16 trig_config;
 	u16 evcap_config;
 	u16 enable_count;
+	u32 ptp_cc_shift;
+	u32 ptp_cc_mult;
+	u32 ptp_cc_mult_num;
+	u32 ptp_cc_mult_dem;
 
 	/* Per-port timestamping resources. */
 	struct mv88e6xxx_port_hwtstamp port_hwtstamp[DSA_MAX_PORTS];
diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c
index 768d256f7c9f..51cdf4712517 100644
--- a/drivers/net/dsa/mv88e6xxx/ptp.c
+++ b/drivers/net/dsa/mv88e6xxx/ptp.c
@@ -15,11 +15,38 @@
 #include "hwtstamp.h"
 #include "ptp.h"
 
-/* Raw timestamps are in units of 8-ns clock periods. */
-#define CC_SHIFT	28
-#define CC_MULT		(8 << CC_SHIFT)
-#define CC_MULT_NUM	(1 << 9)
-#define CC_MULT_DEM	15625ULL
+/* The adjfine API clamps ppb between [-32,768,000, 32,768,000], and
+ * therefore scaled_ppm between [-2,147,483,648, 2,147,483,647].
+ * Set the maximum supported ppb to a round value smaller than the maximum.
+ *
+ * Percentually speaking, this is a +/- 0.032x adjustment of the
+ * free-running counter (0.968x to 1.032x).
+ */
+#define MV88E6XXX_MAX_ADJ_PPB	32000000
+
+/* Family MV88E6250:
+ * Raw timestamps are in units of 10-ns clock periods.
+ *
+ * clkadj = scaled_ppm * 10*2^28 / (10^6 * 2^16)
+ * simplifies to
+ * clkadj = scaled_ppm * 2^7 / 5^5
+ */
+#define MV88E6250_CC_SHIFT	28
+#define MV88E6250_CC_MULT	(10 << MV88E6250_CC_SHIFT)
+#define MV88E6250_CC_MULT_NUM	(1 << 7)
+#define MV88E6250_CC_MULT_DEM	3125ULL
+
+/* Other families:
+ * Raw timestamps are in units of 8-ns clock periods.
+ *
+ * clkadj = scaled_ppm * 8*2^28 / (10^6 * 2^16)
+ * simplifies to
+ * clkadj = scaled_ppm * 2^9 / 5^6
+ */
+#define MV88E6XXX_CC_SHIFT	28
+#define MV88E6XXX_CC_MULT	(8 << MV88E6XXX_CC_SHIFT)
+#define MV88E6XXX_CC_MULT_NUM	(1 << 9)
+#define MV88E6XXX_CC_MULT_DEM	15625ULL
 
 #define TAI_EVENT_WORK_INTERVAL msecs_to_jiffies(100)
 
@@ -179,24 +206,14 @@ static void mv88e6352_tai_event_work(struct work_struct *ugly)
 static int mv88e6xxx_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
 {
 	struct mv88e6xxx_chip *chip = ptp_to_chip(ptp);
-	int neg_adj = 0;
-	u32 diff, mult;
-	u64 adj;
+	s64 adj;
 
-	if (scaled_ppm < 0) {
-		neg_adj = 1;
-		scaled_ppm = -scaled_ppm;
-	}
-	mult = CC_MULT;
-	adj = CC_MULT_NUM;
-	adj *= scaled_ppm;
-	diff = div_u64(adj, CC_MULT_DEM);
+	adj = (s64)scaled_ppm * chip->ptp_cc_mult_num;
+	adj = div_s64(adj, chip->ptp_cc_mult_dem);
 
 	mv88e6xxx_reg_lock(chip);
-
 	timecounter_read(&chip->tstamp_tc);
-	chip->tstamp_cc.mult = neg_adj ? mult - diff : mult + diff;
-
+	chip->tstamp_cc.mult = chip->ptp_cc_mult + adj;
 	mv88e6xxx_reg_unlock(chip);
 
 	return 0;
@@ -380,12 +397,24 @@ int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip)
 	const struct mv88e6xxx_ptp_ops *ptp_ops = chip->info->ops->ptp_ops;
 	int i;
 
+	if (chip->info->family == MV88E6XXX_FAMILY_6250) {
+		chip->ptp_cc_shift	= MV88E6250_CC_SHIFT;
+		chip->ptp_cc_mult	= MV88E6250_CC_MULT;
+		chip->ptp_cc_mult_num	= MV88E6250_CC_MULT_NUM;
+		chip->ptp_cc_mult_dem	= MV88E6250_CC_MULT_DEM;
+	} else {
+		chip->ptp_cc_shift	= MV88E6XXX_CC_SHIFT;
+		chip->ptp_cc_mult	= MV88E6XXX_CC_MULT;
+		chip->ptp_cc_mult_num	= MV88E6XXX_CC_MULT_NUM;
+		chip->ptp_cc_mult_dem	= MV88E6XXX_CC_MULT_DEM;
+	}
+
 	/* Set up the cycle counter */
 	memset(&chip->tstamp_cc, 0, sizeof(chip->tstamp_cc));
 	chip->tstamp_cc.read	= mv88e6xxx_ptp_clock_read;
 	chip->tstamp_cc.mask	= CYCLECOUNTER_MASK(32);
-	chip->tstamp_cc.mult	= CC_MULT;
-	chip->tstamp_cc.shift	= CC_SHIFT;
+	chip->tstamp_cc.mult	= chip->ptp_cc_mult;
+	chip->tstamp_cc.shift	= chip->ptp_cc_shift;
 
 	timecounter_init(&chip->tstamp_tc, &chip->tstamp_cc,
 			 ktime_to_ns(ktime_get_real()));
@@ -397,7 +426,6 @@ int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip)
 	chip->ptp_clock_info.owner = THIS_MODULE;
 	snprintf(chip->ptp_clock_info.name, sizeof(chip->ptp_clock_info.name),
 		 "%s", dev_name(chip->dev));
-	chip->ptp_clock_info.max_adj	= 1000000;
 
 	chip->ptp_clock_info.n_ext_ts	= ptp_ops->n_ext_ts;
 	chip->ptp_clock_info.n_per_out	= 0;
@@ -413,6 +441,7 @@ int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip)
 	}
 	chip->ptp_clock_info.pin_config = chip->pin_config;
 
+	chip->ptp_clock_info.max_adj    = MV88E6XXX_MAX_ADJ_PPB;
 	chip->ptp_clock_info.adjfine	= mv88e6xxx_ptp_adjfine;
 	chip->ptp_clock_info.adjtime	= mv88e6xxx_ptp_adjtime;
 	chip->ptp_clock_info.gettime64	= mv88e6xxx_ptp_gettime;
-- 
2.22.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox