Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next] r8152: divide the tx and rx bottom functions
From: Hayes Wang @ 2019-08-14  8:30 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, Hayes Wang

Move the tx bottom function from NAPI to a new tasklet. Then, for
multi-cores, the bottom functions of tx and rx may be run at same
time with different cores. This is used to improve performance.

Signed-off-by: Hayes Wang <hayeswang@realtek.com>
---
 drivers/net/usb/r8152.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 40d18e866269..3ed9f8e082c9 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -619,7 +619,7 @@ enum rtl8152_flags {
 	RTL8152_LINK_CHG,
 	SELECTIVE_SUSPEND,
 	PHY_RESET,
-	SCHEDULE_NAPI,
+	SCHEDULE_TASKLET,
 	GREEN_ETHERNET,
 	DELL_TB_RX_AGG_BUG,
 };
@@ -733,6 +733,7 @@ struct r8152 {
 #ifdef CONFIG_PM_SLEEP
 	struct notifier_block pm_notifier;
 #endif
+	struct tasklet_struct tx_tl;
 
 	struct rtl_ops {
 		void (*init)(struct r8152 *);
@@ -1401,7 +1402,7 @@ static void write_bulk_callback(struct urb *urb)
 		return;
 
 	if (!skb_queue_empty(&tp->tx_queue))
-		napi_schedule(&tp->napi);
+		tasklet_schedule(&tp->tx_tl);
 }
 
 static void intr_callback(struct urb *urb)
@@ -2179,8 +2180,12 @@ static void tx_bottom(struct r8152 *tp)
 	} while (res == 0);
 }
 
-static void bottom_half(struct r8152 *tp)
+static void bottom_half(unsigned long data)
 {
+	struct r8152 *tp;
+
+	tp = (struct r8152 *)data;
+
 	if (test_bit(RTL8152_UNPLUG, &tp->flags))
 		return;
 
@@ -2192,7 +2197,7 @@ static void bottom_half(struct r8152 *tp)
 	if (!netif_carrier_ok(tp->netdev))
 		return;
 
-	clear_bit(SCHEDULE_NAPI, &tp->flags);
+	clear_bit(SCHEDULE_TASKLET, &tp->flags);
 
 	tx_bottom(tp);
 }
@@ -2203,16 +2208,12 @@ static int r8152_poll(struct napi_struct *napi, int budget)
 	int work_done;
 
 	work_done = rx_bottom(tp, budget);
-	bottom_half(tp);
 
 	if (work_done < budget) {
 		if (!napi_complete_done(napi, work_done))
 			goto out;
 		if (!list_empty(&tp->rx_done))
 			napi_schedule(napi);
-		else if (!skb_queue_empty(&tp->tx_queue) &&
-			 !list_empty(&tp->tx_free))
-			napi_schedule(napi);
 	}
 
 out:
@@ -2366,11 +2367,11 @@ static netdev_tx_t rtl8152_start_xmit(struct sk_buff *skb,
 
 	if (!list_empty(&tp->tx_free)) {
 		if (test_bit(SELECTIVE_SUSPEND, &tp->flags)) {
-			set_bit(SCHEDULE_NAPI, &tp->flags);
+			set_bit(SCHEDULE_TASKLET, &tp->flags);
 			schedule_delayed_work(&tp->schedule, 0);
 		} else {
 			usb_mark_last_busy(tp->udev);
-			napi_schedule(&tp->napi);
+			tasklet_schedule(&tp->tx_tl);
 		}
 	} else if (skb_queue_len(&tp->tx_queue) > tp->tx_qlen) {
 		netif_stop_queue(netdev);
@@ -4020,9 +4021,11 @@ static void set_carrier(struct r8152 *tp)
 	} else {
 		if (netif_carrier_ok(netdev)) {
 			netif_carrier_off(netdev);
+			tasklet_disable(&tp->tx_tl);
 			napi_disable(napi);
 			tp->rtl_ops.disable(tp);
 			napi_enable(napi);
+			tasklet_enable(&tp->tx_tl);
 			netif_info(tp, link, netdev, "carrier off\n");
 		}
 	}
@@ -4055,10 +4058,10 @@ static void rtl_work_func_t(struct work_struct *work)
 	if (test_and_clear_bit(RTL8152_SET_RX_MODE, &tp->flags))
 		_rtl8152_set_rx_mode(tp->netdev);
 
-	/* don't schedule napi before linking */
-	if (test_and_clear_bit(SCHEDULE_NAPI, &tp->flags) &&
+	/* don't schedule tasket before linking */
+	if (test_and_clear_bit(SCHEDULE_TASKLET, &tp->flags) &&
 	    netif_carrier_ok(tp->netdev))
-		napi_schedule(&tp->napi);
+		tasklet_schedule(&tp->tx_tl);
 
 	mutex_unlock(&tp->control);
 
@@ -4144,6 +4147,7 @@ static int rtl8152_open(struct net_device *netdev)
 		goto out_unlock;
 	}
 	napi_enable(&tp->napi);
+	tasklet_enable(&tp->tx_tl);
 
 	mutex_unlock(&tp->control);
 
@@ -4171,6 +4175,7 @@ static int rtl8152_close(struct net_device *netdev)
 #ifdef CONFIG_PM_SLEEP
 	unregister_pm_notifier(&tp->pm_notifier);
 #endif
+	tasklet_disable(&tp->tx_tl);
 	if (!test_bit(RTL8152_UNPLUG, &tp->flags))
 		napi_disable(&tp->napi);
 	clear_bit(WORK_ENABLE, &tp->flags);
@@ -4440,6 +4445,7 @@ static int rtl8152_pre_reset(struct usb_interface *intf)
 		return 0;
 
 	netif_stop_queue(netdev);
+	tasklet_disable(&tp->tx_tl);
 	napi_disable(&tp->napi);
 	clear_bit(WORK_ENABLE, &tp->flags);
 	usb_kill_urb(tp->intr_urb);
@@ -4483,6 +4489,7 @@ static int rtl8152_post_reset(struct usb_interface *intf)
 	}
 
 	napi_enable(&tp->napi);
+	tasklet_enable(&tp->tx_tl);
 	netif_wake_queue(netdev);
 	usb_submit_urb(tp->intr_urb, GFP_KERNEL);
 
@@ -4636,10 +4643,12 @@ static int rtl8152_system_suspend(struct r8152 *tp)
 
 		clear_bit(WORK_ENABLE, &tp->flags);
 		usb_kill_urb(tp->intr_urb);
+		tasklet_disable(&tp->tx_tl);
 		napi_disable(napi);
 		cancel_delayed_work_sync(&tp->schedule);
 		tp->rtl_ops.down(tp);
 		napi_enable(napi);
+		tasklet_enable(&tp->tx_tl);
 	}
 
 	return 0;
@@ -5499,6 +5508,8 @@ static int rtl8152_probe(struct usb_interface *intf,
 	mutex_init(&tp->control);
 	INIT_DELAYED_WORK(&tp->schedule, rtl_work_func_t);
 	INIT_DELAYED_WORK(&tp->hw_phy_work, rtl_hw_phy_work_func_t);
+	tasklet_init(&tp->tx_tl, bottom_half, (unsigned long)tp);
+	tasklet_disable(&tp->tx_tl);
 
 	netdev->netdev_ops = &rtl8152_netdev_ops;
 	netdev->watchdog_timeo = RTL8152_TX_TIMEOUT;
@@ -5585,6 +5596,7 @@ static int rtl8152_probe(struct usb_interface *intf,
 
 out1:
 	netif_napi_del(&tp->napi);
+	tasklet_kill(&tp->tx_tl);
 	usb_set_intfdata(intf, NULL);
 out:
 	free_netdev(netdev);
@@ -5601,6 +5613,7 @@ static void rtl8152_disconnect(struct usb_interface *intf)
 
 		netif_napi_del(&tp->napi);
 		unregister_netdev(tp->netdev);
+		tasklet_kill(&tp->tx_tl);
 		cancel_delayed_work_sync(&tp->hw_phy_work);
 		tp->rtl_ops.unload(tp);
 		free_netdev(tp->netdev);
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH net-next v2 6/9] net: macsec: hardware offloading infrastructure
From: Antoine Tenart @ 2019-08-14  8:31 UTC (permalink / raw)
  To: Igor Russkikh
  Cc: Andrew Lunn, Antoine Tenart, davem@davemloft.net,
	sd@queasysnail.net, f.fainelli@gmail.com, hkallweit1@gmail.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	thomas.petazzoni@bootlin.com, alexandre.belloni@bootlin.com,
	allan.nielsen@microchip.com, camelia.groza@nxp.com,
	Simon Edelhaus, Pavel Belous
In-Reply-To: <2e3c2307-d414-a531-26cb-064e05fa01fc@aquantia.com>

Hi Igor,

On Tue, Aug 13, 2019 at 04:18:40PM +0000, Igor Russkikh wrote:
> On 13.08.2019 16:17, Andrew Lunn wrote:
> > On Tue, Aug 13, 2019 at 10:58:17AM +0200, Antoine Tenart wrote:
> >> I think this question is linked to the use of a MACsec virtual interface
> >> when using h/w offloading. The starting point for me was that I wanted
> >> to reuse the data structures and the API exposed to the userspace by the
> >> s/w implementation of MACsec. I then had two choices: keeping the exact
> >> same interface for the user (having a virtual MACsec interface), or
> >> registering the MACsec genl ops onto the real net devices (and making
> >> the s/w implementation a virtual net dev and a provider of the MACsec
> >> "offloading" ops).
> >>
> >> The advantages of the first option were that nearly all the logic of the
> >> s/w implementation could be kept and especially that it would be
> >> transparent for the user to use both implementations of MACsec.
> > 
> > We have always talked about offloading operations to the hardware,
> > accelerating what the linux stack can do by making use of hardware
> > accelerators. The basic user API should not change because of
> > acceleration. Those are the general guidelines.
> > 
> > It would however be interesting to get comments from those who did the
> > software implementation and what they think of this architecture. I've
> > no personal experience with MACSec, so it is hard for me to say if the
> > current architecture makes sense when using accelerators.
> 
> In terms of overall concepts, I'd add the following:
> 
> 1) With current implementation it's impossible to install SW macsec engine onto
> the device which supports HW offload. That could be a strong limitation in
> cases when user sees HW macsec offload is broken or work differently, and he/she
> wants to replace it with SW one.
> MACSec is a complex feature, and it may happen something is missing in HW.
> Trivial example is 256bit encryption, which is not always a musthave in HW
> implementations.

Agreed. I'm not sure it would be possible to have both used at the same
time but there should be a way to switch between the two
implementations. That is not supported for now, but I think that would
be a good thing, and can probably come later on.

> 2) I think, Antoine, its not totally true that otherwise the user macsec API
> will be broken/changed. netlink api is the same, the only thing we may want to
> add is an optional parameter to force selection of SW macsec engine.

I meant that we can either have a virtual net device representing the
MACsec feature and being the iface used to configure it, or we could
have it only when s/w MACsec is used. That to me is part of the "API",
or at least part of what's exposed to the user.

> I'm also eager to hear from sw macsec users/devs on whats better here.

I'd like more comments as well :)

Thanks!
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [PATCH net-next v2 6/9] net: macsec: hardware offloading infrastructure
From: Antoine Tenart @ 2019-08-14  8:32 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Igor Russkikh, Antoine Tenart, davem@davemloft.net,
	sd@queasysnail.net, f.fainelli@gmail.com, hkallweit1@gmail.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	thomas.petazzoni@bootlin.com, alexandre.belloni@bootlin.com,
	allan.nielsen@microchip.com, camelia.groza@nxp.com,
	Simon Edelhaus, Pavel Belous
In-Reply-To: <20190813162823.GH15047@lunn.ch>

Hi Andrew,

On Tue, Aug 13, 2019 at 06:28:23PM +0200, Andrew Lunn wrote:
> 
> It would also be nice to add extra information to the netlink API to
> indicate if HW or SW is being used. In other places where we offload
> to accelerators we have such additional information.

Yes, that would be very nice to have.

Thanks!
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [PATCH 3/3] ocelot_ace: fix action of trap
From: Allan W. Nielsen @ 2019-08-14  8:57 UTC (permalink / raw)
  To: Y.b. Lu
  Cc: Andrew Lunn, netdev@vger.kernel.org, David S . Miller,
	Alexandre Belloni, Microchip Linux Driver Support
In-Reply-To: <VI1PR0401MB2237D9358AA17400E72A776EF8AD0@VI1PR0401MB2237.eurprd04.prod.outlook.com>

Hi Y.b. and Andrew,

The 08/14/2019 04:28, Y.b. Lu wrote:
> > > I'd like to trap all IEEE 1588 PTP Ethernet frames to CPU through etype
> > 0x88f7.
> > 
> > Is this the correct way to handle PTP for this switch? For other switches we
> > don't need such traps. The switch itself identifies PTP frames and forwards
> > them to the CPU so it can process them.
> > 
> > I'm just wondering if your general approach is wrong?
> 
> [Y.b. Lu] PTP messages over Ethernet will use two multicast addresses.
> 01-80-C2-00-00-0E for peer delay messages.
Yes, and as you write, this is a BPDU which must not be forwarded (and they are
not).

> 01-1B-19-00-00-00 for other messages.
Yes, this is a normal L2 multicast address, which by default are broadcastet.

> But only 01-80-C2-00-00-0E could be handled by hardware filter for BPDU frames
> (01-80-C2-00-00-0x).  For PTP messages handling, trapping them to CPU through
> VCAP IS2 is the suggested way by Ocelot/Felix.
As I see it there are at least 3 scenarios which needs to be considered and
ideally supported:

1) Operate as a PTP-unaware switch. This means that Peer-delays messages are
   trapped to the CPU and not handled. End-to-End PTP sessions can still run on
   the network as 01-1B-19-00-00-00 frames are forwarded normally.

   This is what we have by default today.

2) A "passive" PTP switch (in MSCC/MCHP we call this an end-to-end transparent
   clock) where 01-80-C2-00-00-0E frames are still trapped to the CPU (and not
   handled), 01-1B-19-00-00-00 frames are forwarded, but we use the TCAM to add
   the residence time to the correction field in Sync and Delay request
   messages.

   This is a simple mechanism which allow end-to-end PTP sessions to synchronize
   their time, and compensate for the variable delay caused by the switch.

   Compared to implement a complete boundary clock, this is much simpler, and
   cause a much lower work load on the CPU (even small switches may be serving
   many many PTP sessions).

3) Full PTP aware switch. In this mode we need all PTP frames trapped (on the
   ports where PTP are running) to the CPU, and we need a PTP daemon in
   user-space to process them in-order for things to work.

   I guess this is what you are trying to achieve.

Eventually, I hope we can get to a point where all (and maybe more) scenarios
are supported.

Lets consider them case by case:

1) This is what we have today.

2) To support this, we need a SW implementation of this, and then we can add
   hooks to offload this in HW.

   We would certainly be interested in supporting this in both SW and HW.

3) It can be done via 'tc' using the trap action, but I do not know if this is
   the desired way of doing it. Ocelot will be using a TCAM rule to do this,
   which align nicely with the 'tc' approach, but other chips may be have
   dedicated HW for doing this.

   Also, in the current implementation we will be using a rule per port, and
   ideally we could have done it with a single rule (this is what Y.B. prepared
   in this patch series).

I'm very much against configuring option 3 in the driver initialization, as it
will prevent us from having a conforming switch if a PTP daemon is not running
in user-space, and it give us very little room for supporting other ways in the
future without breaking backwards compatibility.

> I have a question since you are experts.
I'm not really an expert on this, but I have access to some good guidance from
collages knowing PTP very well :-D

> For other switches, whether they are always trapping PTP messages to CPU?
Good question, I could not find anything in the SW bridge forcing option 3.

My understanding is that the SW bridge is implementing option 1, but I could be
wrong.

> Is there any common method in linux to configure switch to select trapping or
> just forwarding PTP messages?
You should be able to use TC for this. But due to the port vs port-mask
limitation you will need to install a rule per port.

I do not know if this is what others are doing, but would like to learn about
that.

/Allan


^ permalink raw reply

* Re: [PATCH v4 13/14] net: phy: adin: add ethtool get_stats support
From: Ardelean, Alexandru @ 2019-08-14  9:08 UTC (permalink / raw)
  To: andrew@lunn.ch
  Cc: davem@davemloft.net, hkallweit1@gmail.com,
	devicetree@vger.kernel.org, mark.rutland@arm.com,
	linux-kernel@vger.kernel.org, f.fainelli@gmail.com,
	netdev@vger.kernel.org, robh+dt@kernel.org
In-Reply-To: <c3fdb21c40900dae0e52b02b98fe27924a76c256.camel@analog.com>

On Tue, 2019-08-13 at 08:48 +0300, Alexandru Ardelean wrote:
> On Mon, 2019-08-12 at 16:33 +0200, Andrew Lunn wrote:
> > [External]
> > 
> > > +static int adin_read_mmd_stat_regs(struct phy_device *phydev,
> > > +				   struct adin_hw_stat *stat,
> > > +				   u32 *val)
> > > +{
> > > +	int ret;
> > > +
> > > +	ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg1);
> > > +	if (ret < 0)
> > > +		return ret;
> > > +
> > > +	*val = (ret & 0xffff);
> > > +
> > > +	if (stat->reg2 == 0)
> > > +		return 0;
> > > +
> > > +	ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg2);
> > > +	if (ret < 0)
> > > +		return ret;
> > > +
> > > +	*val <<= 16;
> > > +	*val |= (ret & 0xffff);
> > > +
> > > +	return 0;
> > > +}
> > 
> > It still looks like you have not dealt with overflow from the LSB into
> > the MSB between the two reads.
> 
> Apologies for forgetting to address this.
> I did not intentionally leave it out; this item got lost after V1 [which had the most remarks].
> Changelog V1 -> V2 was quite bulky, and I did not look at V1 remarks after I finished V2.
> 
> Thanks for snippet.

So, I have to apologize again here.
I guess I was an idiot/n00b about this.

The PHY stats do support snapshot, and I sync-ed with someone from the chip-team to confirm.

Also, from the datasheet[1] (page 29 - FRAME GENERATOR AND CHECKER - 5th paragraph):
--------------------------------------------------------------
The frame checker counts the number of CRC errors and these 
are reported in the receive error counter register (RxErrCnt 
register,  address 0x0014). To ensure synchronization between 
the frame checker error counter and frame checker frame counters,
all of the counters are latched once the receive error counter
register is read. Hence when using the frame checker, the
receive error counter should be read first and then all the other
frame counters and error counters should be read. A latched copy
of the receive frame counter register is available in the
FcFrmCntH and FcFrmCntL registers (register addresses 0x1E.0x940A
and 0x1E.0x940B).
-------------------------------------------------------------

Then in the description of these regs, it mentions (repeteadly):
-------------------------------------------------------------
This register is a latched copy of bits 31:16 of the 32-bit
receive frame counter register. When the receive error counter
(RxErrCnt register address 0x0014) is read, the receive
frame
counter register is latched. A copy of the receive frame counter
register is latched when RxErrCnt is read so that
the error count
and receive frame count are synchronized
-------------------------------------------------------------

I'll re-spin this with the rename of the strings, and maybe do a minor polish of the code.

Thanks & sorry for the noise/trouble
Alex

[1] https://www.analog.com/media/en/technical-documentation/data-sheets/ADIN1300.pdf


> 
> > 	do {
> > 		hi1 = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg2);
> > 		if (hi1 < 0)
> > 			return hi1;
> > 		
> > 		low = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg1);
> > 		if (low < 0)
> > 			return low;
> > 
> > 		hi2 = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg2);
> > 		if (hi2 < 0)
> > 			return hi1;
> > 	} while (hi1 != hi2)
> > 
> > 	return low | (hi << 16);
> > 
> > 	Andrew

^ permalink raw reply

* [PATCH net] net/packet: fix race in tpacket_snd()
From: Eric Dumazet @ 2019-08-14  9:11 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet, syzbot

packet_sendmsg() checks tx_ring.pg_vec to decide
if it must call tpacket_snd().

Problem is that the check is lockless, meaning another thread
can issue a concurrent setsockopt(PACKET_TX_RING ) to flip
tx_ring.pg_vec back to NULL.

Given that tpacket_snd() grabs pg_vec_lock mutex, we can
perform the check again to solve the race.

syzbot reported :

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 11429 Comm: syz-executor394 Not tainted 5.3.0-rc4+ #101
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:packet_lookup_frame+0x8d/0x270 net/packet/af_packet.c:474
Code: c1 ee 03 f7 73 0c 80 3c 0e 00 0f 85 cb 01 00 00 48 8b 0b 89 c0 4c 8d 24 c1 48 b8 00 00 00 00 00 fc ff df 4c 89 e1 48 c1 e9 03 <80> 3c 01 00 0f 85 94 01 00 00 48 8d 7b 10 4d 8b 3c 24 48 b8 00 00
RSP: 0018:ffff88809f82f7b8 EFLAGS: 00010246
RAX: dffffc0000000000 RBX: ffff8880a45c7030 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 1ffff110148b8e06 RDI: ffff8880a45c703c
RBP: ffff88809f82f7e8 R08: ffff888087aea200 R09: fffffbfff134ae50
R10: fffffbfff134ae4f R11: ffffffff89a5727f R12: 0000000000000000
R13: 0000000000000001 R14: ffff8880a45c6ac0 R15: 0000000000000000
FS:  00007fa04716f700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fa04716edb8 CR3: 0000000091eb4000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 packet_current_frame net/packet/af_packet.c:487 [inline]
 tpacket_snd net/packet/af_packet.c:2667 [inline]
 packet_sendmsg+0x590/0x6250 net/packet/af_packet.c:2975
 sock_sendmsg_nosec net/socket.c:637 [inline]
 sock_sendmsg+0xd7/0x130 net/socket.c:657
 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311
 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413
 __do_sys_sendmmsg net/socket.c:2442 [inline]
 __se_sys_sendmmsg net/socket.c:2439 [inline]
 __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2439
 do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/packet/af_packet.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8d54f3047768d2272cbd28a7bcda33df800aa589..e2742b006d255f598fc98953dbb823f615d2bf9a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2618,6 +2618,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 
 	mutex_lock(&po->pg_vec_lock);
 
+	/* packet_sendmsg() check on tx_ring.pg_vec was lockless,
+	 * we need to confirm it under protection of pg_vec_lock.
+	 */
+	if (unlikely(!po->tx_ring.pg_vec)) {
+		err = -EBUSY;
+		goto out;
+	}
 	if (likely(saddr == NULL)) {
 		dev	= packet_cached_dev_get(po);
 		proto	= po->num;
-- 
2.23.0.rc1.153.gdeed80330f-goog


^ permalink raw reply related

* Re: [v2, 4/4] ocelot: add VCAP IS2 rule to trap PTP Ethernet frames
From: Allan W . Nielsen @ 2019-08-14  9:16 UTC (permalink / raw)
  To: Y.b. Lu
  Cc: Andrew Lunn, netdev@vger.kernel.org, David S . Miller,
	Alexandre Belloni, Microchip Linux Driver Support
In-Reply-To: <VI1PR0401MB2237E0F32D6CC719682E8C1AF8AD0@VI1PR0401MB2237.eurprd04.prod.outlook.com>

The 08/14/2019 04:56, Y.b. Lu wrote:
> > -----Original Message-----
> > From: Allan W . Nielsen <allan.nielsen@microchip.com>
> > Sent: Tuesday, August 13, 2019 2:25 PM
> > To: Y.b. Lu <yangbo.lu@nxp.com>
> > Cc: netdev@vger.kernel.org; David S . Miller <davem@davemloft.net>;
> > Alexandre Belloni <alexandre.belloni@bootlin.com>; Microchip Linux Driver
> > Support <UNGLinuxDriver@microchip.com>
> > Subject: Re: [v2, 4/4] ocelot: add VCAP IS2 rule to trap PTP Ethernet frames
> > 
> > The 08/13/2019 10:52, Yangbo Lu wrote:
> > > All the PTP messages over Ethernet have etype 0x88f7 on them.
> > > Use etype as the key to trap PTP messages.
> > >
> > > Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
> > > ---
> > > Changes for v2:
> > > 	- Added this patch.
> > > ---
> > >  drivers/net/ethernet/mscc/ocelot.c | 28 ++++++++++++++++++++++++++++
> > >  1 file changed, 28 insertions(+)
> > >
> > > diff --git a/drivers/net/ethernet/mscc/ocelot.c
> > > b/drivers/net/ethernet/mscc/ocelot.c
> > > index 6932e61..40f4e0d 100644
> > > --- a/drivers/net/ethernet/mscc/ocelot.c
> > > +++ b/drivers/net/ethernet/mscc/ocelot.c
> > > @@ -1681,6 +1681,33 @@ int ocelot_probe_port(struct ocelot *ocelot, u8
> > > port,  }  EXPORT_SYMBOL(ocelot_probe_port);
> > >
> > > +static int ocelot_ace_add_ptp_rule(struct ocelot *ocelot) {
> > > +	struct ocelot_ace_rule *rule;
> > > +
> > > +	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
> > > +	if (!rule)
> > > +		return -ENOMEM;
> > > +
> > > +	/* Entry for PTP over Ethernet (etype 0x88f7)
> > > +	 * Action: trap to CPU port
> > > +	 */
> > > +	rule->ocelot = ocelot;
> > > +	rule->prio = 1;
> > > +	rule->type = OCELOT_ACE_TYPE_ETYPE;
> > > +	/* Available on all ingress port except CPU port */
> > > +	rule->ingress_port = ~BIT(ocelot->num_phys_ports);
> > > +	rule->dmac_mc = OCELOT_VCAP_BIT_1;
> > > +	rule->frame.etype.etype.value[0] = 0x88;
> > > +	rule->frame.etype.etype.value[1] = 0xf7;
> > > +	rule->frame.etype.etype.mask[0] = 0xff;
> > > +	rule->frame.etype.etype.mask[1] = 0xff;
> > > +	rule->action = OCELOT_ACL_ACTION_TRAP;
> > > +
> > > +	ocelot_ace_rule_offload_add(rule);
> > > +	return 0;
> > > +}
> > > +
> > >  int ocelot_init(struct ocelot *ocelot)  {
> > >  	u32 port;
> > > @@ -1708,6 +1735,7 @@ int ocelot_init(struct ocelot *ocelot)
> > >  	ocelot_mact_init(ocelot);
> > >  	ocelot_vlan_init(ocelot);
> > >  	ocelot_ace_init(ocelot);
> > > +	ocelot_ace_add_ptp_rule(ocelot);
> > >
> > >  	for (port = 0; port < ocelot->num_phys_ports; port++) {
> > >  		/* Clear all counters (5 groups) */
> > This seems really wrong to me, and much too hard-coded...
> > 
> > What if I want to forward the PTP frames to be forwarded like a normal
> > non-aware PTP switch?
> 
> [Y.b. Lu] As Andrew said, other switches could identify PTP messages and forward to CPU for processing.
> https://patchwork.ozlabs.org/patch/1145627/
Yes, it would be good to see some exampels to understand this better.

> I'm also wondering whether there is common method in linux to address your questions.
Me too.

> If no, I think trapping all PTP messages on all ports to CPU could be used for now.
> If users require PTP synchronization, they actually don’t want a non-aware PTP switch.
Can we continue this discussion in the other thread where I listed the 3
scenarios?

> I once see other ocelot code configure ptp trap rules in ioctl timestamping
> setting. But I don’t think it's proper either.  Enable timestamping doesn’t
> mean we want to trap PTP messages.
Where did you see this?

The effort in [1] is just about the time-stamping and does not really consider
the bridge part of it, and it should not be installing any TCAM rules (I believe
it did in earlier versions, but this has been changed).

[1] https://patchwork.ozlabs.org/patch/1145777/

> > What if do not want this on all ports?
> [Y.b. Lu] Actually I don’t think there should be difference of handling PTP messages on each port.
> You don’t need to run PTP protocol application on the specific port if you don’t want.
What if you want some vlans or some ports to be PTP unaware, and other to be PTP
aware.

> > If you do not have an application behind this implementing a boundary or
> > transparent clock, then you are breaking PTP on the network.
> [Y.b. Lu] You're right. But actually for PTP network, all PTP devices should run PTP protocol on it.
> Of course, it's better to have a way to configure it as non-aware PTP switch.
I think we agree.

In my point of view, it is the PTP daemon who should configure frames to be
trapped. Then the switch will be PTP unaware until the PTP daemon starts up and
is ready to make it aware.

If we put it in the init function, then it will be of PTP broken until the PTP
daemon starts.

/Allan


^ permalink raw reply

* [PATCH] can: rcar_can: Remove unused platform data support
From: Geert Uytterhoeven @ 2019-08-14  9:22 UTC (permalink / raw)
  To: Wolfgang Grandegger, Marc Kleine-Budde, Sergei Shtylyov
  Cc: David S . Miller, Wolfram Sang, linux-can, linux-renesas-soc,
	netdev, Geert Uytterhoeven

All R-Car platforms use DT for describing CAN controllers.
R-Car CAN platform data support was never used in any upstream kernel.

Move the Clock Select Register settings enum into the driver, and remove
platform data support and the corresponding header file.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/net/can/rcar/rcar_can.c       | 22 +++++++++-------------
 include/linux/can/platform/rcar_can.h | 18 ------------------
 2 files changed, 9 insertions(+), 31 deletions(-)
 delete mode 100644 include/linux/can/platform/rcar_can.h

diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c
index cf218949a8fb52d5..3c5e9c2c5342147f 100644
--- a/drivers/net/can/rcar/rcar_can.c
+++ b/drivers/net/can/rcar/rcar_can.c
@@ -15,11 +15,17 @@
 #include <linux/can/led.h>
 #include <linux/can/dev.h>
 #include <linux/clk.h>
-#include <linux/can/platform/rcar_can.h>
 #include <linux/of.h>
 
 #define RCAR_CAN_DRV_NAME	"rcar_can"
 
+/* Clock Select Register settings */
+enum CLKR {
+	CLKR_CLKP1 = 0,	/* Peripheral clock (clkp1) */
+	CLKR_CLKP2 = 1,	/* Peripheral clock (clkp2) */
+	CLKR_CLKEXT = 3	/* Externally input clock */
+};
+
 #define RCAR_SUPPORTED_CLOCKS	(BIT(CLKR_CLKP1) | BIT(CLKR_CLKP2) | \
 				 BIT(CLKR_CLKEXT))
 
@@ -736,7 +742,6 @@ static const char * const clock_names[] = {
 
 static int rcar_can_probe(struct platform_device *pdev)
 {
-	struct rcar_can_platform_data *pdata;
 	struct rcar_can_priv *priv;
 	struct net_device *ndev;
 	struct resource *mem;
@@ -745,17 +750,8 @@ static int rcar_can_probe(struct platform_device *pdev)
 	int err = -ENODEV;
 	int irq;
 
-	if (pdev->dev.of_node) {
-		of_property_read_u32(pdev->dev.of_node,
-				     "renesas,can-clock-select", &clock_select);
-	} else {
-		pdata = dev_get_platdata(&pdev->dev);
-		if (!pdata) {
-			dev_err(&pdev->dev, "No platform data provided!\n");
-			goto fail;
-		}
-		clock_select = pdata->clock_select;
-	}
+	of_property_read_u32(pdev->dev.of_node, "renesas,can-clock-select",
+			     &clock_select);
 
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
diff --git a/include/linux/can/platform/rcar_can.h b/include/linux/can/platform/rcar_can.h
deleted file mode 100644
index a43dcd0cf79ee3ec..0000000000000000
--- a/include/linux/can/platform/rcar_can.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _CAN_PLATFORM_RCAR_CAN_H_
-#define _CAN_PLATFORM_RCAR_CAN_H_
-
-#include <linux/types.h>
-
-/* Clock Select Register settings */
-enum CLKR {
-	CLKR_CLKP1 = 0,	/* Peripheral clock (clkp1) */
-	CLKR_CLKP2 = 1,	/* Peripheral clock (clkp2) */
-	CLKR_CLKEXT = 3	/* Externally input clock */
-};
-
-struct rcar_can_platform_data {
-	enum CLKR clock_select;	/* Clock source select */
-};
-
-#endif	/* !_CAN_PLATFORM_RCAR_CAN_H_ */
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH bpf-next 1/3] libbpf: add asm/unistd.h to xsk to get __NR_mmap2
From: Ivan Khoronzhuk @ 2019-08-14  9:24 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Magnus Karlsson, Björn Töpel, David S. Miller,
	Jesper Dangaard Brouer, john fastabend, Jakub Kicinski,
	Daniel Borkmann, Networking, bpf, xdp-newbies, open list
In-Reply-To: <CAEf4BzZ2y_DmTXkVqFh6Hdcquo6UvntvCygw5h5WwrWYXRRg_g@mail.gmail.com>

On Tue, Aug 13, 2019 at 04:38:13PM -0700, Andrii Nakryiko wrote:

Hi, Andrii

>On Tue, Aug 13, 2019 at 3:24 AM Ivan Khoronzhuk
><ivan.khoronzhuk@linaro.org> wrote:
>>
>> That's needed to get __NR_mmap2 when mmap2 syscall is used.
>>
>> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>> ---
>>  tools/lib/bpf/xsk.c | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
>> index 5007b5d4fd2c..f2fc40f9804c 100644
>> --- a/tools/lib/bpf/xsk.c
>> +++ b/tools/lib/bpf/xsk.c
>> @@ -12,6 +12,7 @@
>>  #include <stdlib.h>
>>  #include <string.h>
>>  #include <unistd.h>
>> +#include <asm/unistd.h>
>
>asm/unistd.h is not present in Github libbpf projection. Is there any

Look on includes from
tools/lib/bpf/libpf.c
tools/lib/bpf/bpf.c

That's how it's done... Copping headers to arch/arm will not
solve this, it includes both of them anyway, and anyway it needs
asm/unistd.h inclusion here, only because xsk.c needs __NR_*


>way to avoid including this header? Generally, libbpf can't easily use
>all of kernel headers, we need to re-implemented all the extra used
>stuff for Github version of libbpf, so we try to minimize usage of new
>headers that are not just plain uapi headers from include/uapi.

Yes I know, it's far away from real number of changes needed.
I faced enough about this already and kernel headers, especially
for arm32 it's a bit decency problem. But this patch it's part of
normal one. I have couple issues despite this normally fixed mmap2
that is the same even if uapi includes are coppied to tools/arch/arm.

In continuation of kernel headers inclusion and arm build:

For instance, what about this rough "kernel headers" hack:
https://github.com/ikhorn/af_xdp_stuff/commit/aa645ccca4d844f404ec3c2b27402d4d7848d1b5

or this one related for arm32 only:
https://github.com/ikhorn/af_xdp_stuff/commit/2c6c6d538605aac39600dcb3c9b66de11c70b963

I have more...

>
>>  #include <arpa/inet.h>
>>  #include <asm/barrier.h>
>>  #include <linux/compiler.h>
>> --
>> 2.17.1
>>

-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply

* [PATCH 0/7] Netfilter fixes for net
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

Hi,

This patchset contains Netfilter fixes for net:

1) Extend selftest to cover flowtable with ipsec, from Florian Westphal.

2) Fix interaction of ipsec with flowtable, also from Florian.

3) User-after-free with bound set to rule that fails to load.

4) Adjust state and timeout for flows that expire.

5) Timeout update race with flows in teardown state.

6) Ensure conntrack id hash calculation use invariants as input,
   from Dirk Morris.

7) Do not push flows into flowtable for TCP fin/rst packets.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks.

----------------------------------------------------------------

The following changes since commit 5e5412c365a32e452daa762eac36121cb8a370bb:

  net/socket: fix GCC8+ Wpacked-not-aligned warnings (2019-08-03 11:02:46 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to dfe42be15fde16232340b8b2a57c359f51cc10d9:

  netfilter: nft_flow_offload: skip tcp rst and fin packets (2019-08-14 11:09:07 +0200)

----------------------------------------------------------------
Dirk Morris (1):
      netfilter: conntrack: Use consistent ct id hash calculation

Florian Westphal (2):
      selftests: netfilter: extend flowtable test script for ipsec
      netfilter: nf_flow_table: fix offload for flows that are subject to xfrm

Pablo Neira Ayuso (4):
      netfilter: nf_tables: use-after-free in failing rule with bound set
      netfilter: nf_flow_table: conntrack picks up expired flows
      netfilter: nf_flow_table: teardown flow timeout race
      netfilter: nft_flow_offload: skip tcp rst and fin packets

 include/net/netfilter/nf_tables.h                  |  9 +++-
 net/netfilter/nf_conntrack_core.c                  | 16 ++++----
 net/netfilter/nf_flow_table_core.c                 | 43 +++++++++++++------
 net/netfilter/nf_flow_table_ip.c                   | 43 +++++++++++++++++++
 net/netfilter/nf_tables_api.c                      | 15 ++++---
 net/netfilter/nft_flow_offload.c                   |  9 ++--
 tools/testing/selftests/netfilter/nft_flowtable.sh | 48 ++++++++++++++++++++++
 7 files changed, 153 insertions(+), 30 deletions(-)


^ permalink raw reply

* [PATCH 1/7] selftests: netfilter: extend flowtable test script for ipsec
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190814092440.20087-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

'flow offload' expression should not offload flows that will be subject
to ipsec, but it does.

This results in a connectivity blackhole for the affected flows -- first
packets will go through (offload happens after established state is
reached), but all remaining ones bypass ipsec encryption and are thus
discarded by the peer.

This can be worked around by adding "rt ipsec exists accept"
before the 'flow offload' rule matches.

This test case will fail, support for such flows is added in
next patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/nft_flowtable.sh | 48 ++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh
index fe52488a6f72..16571ac1dab4 100755
--- a/tools/testing/selftests/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/netfilter/nft_flowtable.sh
@@ -321,4 +321,52 @@ else
 	ip netns exec nsr1 nft list ruleset
 fi
 
+KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1)
+KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1)
+SPI1=$RANDOM
+SPI2=$RANDOM
+
+if [ $SPI1 -eq $SPI2 ]; then
+	SPI2=$((SPI2+1))
+fi
+
+do_esp() {
+    local ns=$1
+    local me=$2
+    local remote=$3
+    local lnet=$4
+    local rnet=$5
+    local spi_out=$6
+    local spi_in=$7
+
+    ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in  enc aes $KEY_AES  auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet
+    ip -net $ns xfrm state add src $me  dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet
+
+    # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
+    ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow
+    # to fwd decrypted packets after esp processing:
+    ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow
+
+}
+
+do_esp nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2
+
+do_esp nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1
+
+ip netns exec nsr1 nft delete table ip nat
+
+# restore default routes
+ip -net ns2 route del 192.168.10.1 via 10.0.2.1
+ip -net ns2 route add default via 10.0.2.1
+ip -net ns2 route add default via dead:2::1
+
+test_tcp_forwarding ns1 ns2
+if [ $? -eq 0 ] ;then
+	echo "PASS: ipsec tunnel mode for ns1/ns2"
+else
+	echo "FAIL: ipsec tunnel mode for ns1/ns2"
+	ip netns exec nsr1 nft list ruleset 1>&2
+	ip netns exec nsr1 cat /proc/net/xfrm_stat 1>&2
+fi
+
 exit $ret
-- 
2.11.0



^ permalink raw reply related

* [PATCH 2/7] netfilter: nf_flow_table: fix offload for flows that are subject to xfrm
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190814092440.20087-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

This makes the previously added 'encap test' pass.
Because its possible that the xfrm dst entry becomes stale while such
a flow is offloaded, we need to call dst_check() -- the notifier that
handles this for non-tunneled traffic isn't sufficient, because SA or
or policies might have changed.

If dst becomes stale the flow offload entry will be tagged for teardown
and packets will be passed to 'classic' forwarding path.

Removing the entry right away is problematic, as this would
introduce a race condition with the gc worker.

In case flow is long-lived, it could eventually be offloaded again
once the gc worker removes the entry from the flow table.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_ip.c | 43 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index cdfc33517e85..d68c801dd614 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -214,6 +214,25 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	return true;
 }
 
+static int nf_flow_offload_dst_check(struct dst_entry *dst)
+{
+	if (unlikely(dst_xfrm(dst)))
+		return dst_check(dst, 0) ? 0 : -1;
+
+	return 0;
+}
+
+static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
+				      const struct nf_hook_state *state,
+				      struct dst_entry *dst)
+{
+	skb_orphan(skb);
+	skb_dst_set_noref(skb, dst);
+	skb->tstamp = 0;
+	dst_output(state->net, state->sk, skb);
+	return NF_STOLEN;
+}
+
 unsigned int
 nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 			const struct nf_hook_state *state)
@@ -254,6 +273,11 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
 		return NF_ACCEPT;
 
+	if (nf_flow_offload_dst_check(&rt->dst)) {
+		flow_offload_teardown(flow);
+		return NF_ACCEPT;
+	}
+
 	if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
 		return NF_DROP;
 
@@ -261,6 +285,13 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	iph = ip_hdr(skb);
 	ip_decrease_ttl(iph);
 
+	if (unlikely(dst_xfrm(&rt->dst))) {
+		memset(skb->cb, 0, sizeof(struct inet_skb_parm));
+		IPCB(skb)->iif = skb->dev->ifindex;
+		IPCB(skb)->flags = IPSKB_FORWARDED;
+		return nf_flow_xmit_xfrm(skb, state, &rt->dst);
+	}
+
 	skb->dev = outdev;
 	nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
 	skb_dst_set_noref(skb, &rt->dst);
@@ -467,6 +498,11 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 				sizeof(*ip6h)))
 		return NF_ACCEPT;
 
+	if (nf_flow_offload_dst_check(&rt->dst)) {
+		flow_offload_teardown(flow);
+		return NF_ACCEPT;
+	}
+
 	if (skb_try_make_writable(skb, sizeof(*ip6h)))
 		return NF_DROP;
 
@@ -477,6 +513,13 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	ip6h = ipv6_hdr(skb);
 	ip6h->hop_limit--;
 
+	if (unlikely(dst_xfrm(&rt->dst))) {
+		memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+		IP6CB(skb)->iif = skb->dev->ifindex;
+		IP6CB(skb)->flags = IP6SKB_FORWARDED;
+		return nf_flow_xmit_xfrm(skb, state, &rt->dst);
+	}
+
 	skb->dev = outdev;
 	nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
 	skb_dst_set_noref(skb, &rt->dst);
-- 
2.11.0



^ permalink raw reply related

* [PATCH 7/7] netfilter: nft_flow_offload: skip tcp rst and fin packets
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190814092440.20087-1-pablo@netfilter.org>

TCP rst and fin packets do not qualify to place a flow into the
flowtable. Most likely there will be no more packets after connection
closure. Without this patch, this flow entry expires and connection
tracking picks up the entry in ESTABLISHED state using the fixup
timeout, which makes this look inconsistent to the user for a connection
that is actually already closed.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_flow_offload.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index aa5f571d4361..060a4ed46d5e 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -72,11 +72,11 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
 {
 	struct nft_flow_offload *priv = nft_expr_priv(expr);
 	struct nf_flowtable *flowtable = &priv->flowtable->data;
+	struct tcphdr _tcph, *tcph = NULL;
 	enum ip_conntrack_info ctinfo;
 	struct nf_flow_route route;
 	struct flow_offload *flow;
 	enum ip_conntrack_dir dir;
-	bool is_tcp = false;
 	struct nf_conn *ct;
 	int ret;
 
@@ -89,7 +89,10 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
 
 	switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
 	case IPPROTO_TCP:
-		is_tcp = true;
+		tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff,
+					  sizeof(_tcph), &_tcph);
+		if (unlikely(!tcph || tcph->fin || tcph->rst))
+			goto out;
 		break;
 	case IPPROTO_UDP:
 		break;
@@ -115,7 +118,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
 	if (!flow)
 		goto err_flow_alloc;
 
-	if (is_tcp) {
+	if (tcph) {
 		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
 		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
 	}
-- 
2.11.0



^ permalink raw reply related

* [PATCH 6/7] netfilter: conntrack: Use consistent ct id hash calculation
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190814092440.20087-1-pablo@netfilter.org>

From: Dirk Morris <dmorris@metaloft.com>

Change ct id hash calculation to only use invariants.

Currently the ct id hash calculation is based on some fields that can
change in the lifetime on a conntrack entry in some corner cases. The
current hash uses the whole tuple which contains an hlist pointer which
will change when the conntrack is placed on the dying list resulting in
a ct id change.

This patch also removes the reply-side tuple and extension pointer from
the hash calculation so that the ct id will will not change from
initialization until confirmation.

Fixes: 3c79107631db1f7 ("netfilter: ctnetlink: don't use conntrack/expect object addresses as id")
Signed-off-by: Dirk Morris <dmorris@metaloft.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index a542761e90d1..81a8ef42b88d 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -453,13 +453,12 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
  * table location, we assume id gets exposed to userspace.
  *
  * Following nf_conn items do not change throughout lifetime
- * of the nf_conn after it has been committed to main hash table:
+ * of the nf_conn:
  *
  * 1. nf_conn address
- * 2. nf_conn->ext address
- * 3. nf_conn->master address (normally NULL)
- * 4. tuple
- * 5. the associated net namespace
+ * 2. nf_conn->master address (normally NULL)
+ * 3. the associated net namespace
+ * 4. the original direction tuple
  */
 u32 nf_ct_get_id(const struct nf_conn *ct)
 {
@@ -469,9 +468,10 @@ u32 nf_ct_get_id(const struct nf_conn *ct)
 	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
 
 	a = (unsigned long)ct;
-	b = (unsigned long)ct->master ^ net_hash_mix(nf_ct_net(ct));
-	c = (unsigned long)ct->ext;
-	d = (unsigned long)siphash(&ct->tuplehash, sizeof(ct->tuplehash),
+	b = (unsigned long)ct->master;
+	c = (unsigned long)nf_ct_net(ct);
+	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
 				   &ct_id_seed);
 #ifdef CONFIG_64BIT
 	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
-- 
2.11.0



^ permalink raw reply related

* [PATCH 3/7] netfilter: nf_tables: use-after-free in failing rule with bound set
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190814092440.20087-1-pablo@netfilter.org>

If a rule that has already a bound anonymous set fails to be added, the
preparation phase releases the rule and the bound set. However, the
transaction object from the abort path still has a reference to the set
object that is stale, leading to a use-after-free when checking for the
set->bound field. Add a new field to the transaction that specifies if
the set is bound, so the abort path can skip releasing it since the rule
command owns it and it takes care of releasing it. After this update,
the set->bound field is removed.

[   24.649883] Unable to handle kernel paging request at virtual address 0000000000040434
[   24.657858] Mem abort info:
[   24.660686]   ESR = 0x96000004
[   24.663769]   Exception class = DABT (current EL), IL = 32 bits
[   24.669725]   SET = 0, FnV = 0
[   24.672804]   EA = 0, S1PTW = 0
[   24.675975] Data abort info:
[   24.678880]   ISV = 0, ISS = 0x00000004
[   24.682743]   CM = 0, WnR = 0
[   24.685723] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000428952000
[   24.692207] [0000000000040434] pgd=0000000000000000
[   24.697119] Internal error: Oops: 96000004 [#1] SMP
[...]
[   24.889414] Call trace:
[   24.891870]  __nf_tables_abort+0x3f0/0x7a0
[   24.895984]  nf_tables_abort+0x20/0x40
[   24.899750]  nfnetlink_rcv_batch+0x17c/0x588
[   24.904037]  nfnetlink_rcv+0x13c/0x190
[   24.907803]  netlink_unicast+0x18c/0x208
[   24.911742]  netlink_sendmsg+0x1b0/0x350
[   24.915682]  sock_sendmsg+0x4c/0x68
[   24.919185]  ___sys_sendmsg+0x288/0x2c8
[   24.923037]  __sys_sendmsg+0x7c/0xd0
[   24.926628]  __arm64_sys_sendmsg+0x2c/0x38
[   24.930744]  el0_svc_common.constprop.0+0x94/0x158
[   24.935556]  el0_svc_handler+0x34/0x90
[   24.939322]  el0_svc+0x8/0xc
[   24.942216] Code: 37280300 f9404023 91014262 aa1703e0 (f9401863)
[   24.948336] ---[ end trace cebbb9dcbed3b56f ]---

Fixes: f6ac85858976 ("netfilter: nf_tables: unbind set in rule from commit path")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  9 +++++++--
 net/netfilter/nf_tables_api.c     | 15 ++++++++++-----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 9b624566b82d..475d6f28ca67 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -421,8 +421,7 @@ struct nft_set {
 	unsigned char			*udata;
 	/* runtime data below here */
 	const struct nft_set_ops	*ops ____cacheline_aligned;
-	u16				flags:13,
-					bound:1,
+	u16				flags:14,
 					genmask:2;
 	u8				klen;
 	u8				dlen;
@@ -1348,12 +1347,15 @@ struct nft_trans_rule {
 struct nft_trans_set {
 	struct nft_set			*set;
 	u32				set_id;
+	bool				bound;
 };
 
 #define nft_trans_set(trans)	\
 	(((struct nft_trans_set *)trans->data)->set)
 #define nft_trans_set_id(trans)	\
 	(((struct nft_trans_set *)trans->data)->set_id)
+#define nft_trans_set_bound(trans)	\
+	(((struct nft_trans_set *)trans->data)->bound)
 
 struct nft_trans_chain {
 	bool				update;
@@ -1384,12 +1386,15 @@ struct nft_trans_table {
 struct nft_trans_elem {
 	struct nft_set			*set;
 	struct nft_set_elem		elem;
+	bool				bound;
 };
 
 #define nft_trans_elem_set(trans)	\
 	(((struct nft_trans_elem *)trans->data)->set)
 #define nft_trans_elem(trans)	\
 	(((struct nft_trans_elem *)trans->data)->elem)
+#define nft_trans_elem_set_bound(trans)	\
+	(((struct nft_trans_elem *)trans->data)->bound)
 
 struct nft_trans_obj {
 	struct nft_object		*obj;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 605a7cfe7ca7..88abbddf8967 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -138,9 +138,14 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
 		return;
 
 	list_for_each_entry_reverse(trans, &net->nft.commit_list, list) {
-		if (trans->msg_type == NFT_MSG_NEWSET &&
-		    nft_trans_set(trans) == set) {
-			set->bound = true;
+		switch (trans->msg_type) {
+		case NFT_MSG_NEWSET:
+			if (nft_trans_set(trans) == set)
+				nft_trans_set_bound(trans) = true;
+			break;
+		case NFT_MSG_NEWSETELEM:
+			if (nft_trans_elem_set(trans) == set)
+				nft_trans_elem_set_bound(trans) = true;
 			break;
 		}
 	}
@@ -6906,7 +6911,7 @@ static int __nf_tables_abort(struct net *net)
 			break;
 		case NFT_MSG_NEWSET:
 			trans->ctx.table->use--;
-			if (nft_trans_set(trans)->bound) {
+			if (nft_trans_set_bound(trans)) {
 				nft_trans_destroy(trans);
 				break;
 			}
@@ -6918,7 +6923,7 @@ static int __nf_tables_abort(struct net *net)
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_NEWSETELEM:
-			if (nft_trans_elem_set(trans)->bound) {
+			if (nft_trans_elem_set_bound(trans)) {
 				nft_trans_destroy(trans);
 				break;
 			}
-- 
2.11.0



^ permalink raw reply related

* [PATCH 5/7] netfilter: nf_flow_table: teardown flow timeout race
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190814092440.20087-1-pablo@netfilter.org>

Flows that are in teardown state (due to RST / FIN TCP packet) still
have their offload flag set on. Hence, the conntrack garbage collector
may race to undo the timeout adjustment that the fixup routine performs,
leaving the conntrack entry in place with the internal offload timeout
(one day).

Update teardown flow state to ESTABLISHED and set tracking to liberal,
then once the offload bit is cleared, adjust timeout if it is more than
the default fixup timeout (conntrack might already have set a lower
timeout from the packet path).

Fixes: da5984e51063 ("netfilter: nf_flow_table: add support for sending flows back to the slow path")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_core.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 68a24471ffee..80a8f9ae4c93 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -111,15 +111,16 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
 #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
 #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)
 
-static void flow_offload_fixup_ct(struct nf_conn *ct)
+static inline __s32 nf_flow_timeout_delta(unsigned int timeout)
+{
+	return (__s32)(timeout - (u32)jiffies);
+}
+
+static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 {
 	const struct nf_conntrack_l4proto *l4proto;
+	int l4num = nf_ct_protonum(ct);
 	unsigned int timeout;
-	int l4num;
-
-	l4num = nf_ct_protonum(ct);
-	if (l4num == IPPROTO_TCP)
-		flow_offload_fixup_tcp(&ct->proto.tcp);
 
 	l4proto = nf_ct_l4proto_find(l4num);
 	if (!l4proto)
@@ -132,7 +133,20 @@ static void flow_offload_fixup_ct(struct nf_conn *ct)
 	else
 		return;
 
-	ct->timeout = nfct_time_stamp + timeout;
+	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
+		ct->timeout = nfct_time_stamp + timeout;
+}
+
+static void flow_offload_fixup_ct_state(struct nf_conn *ct)
+{
+	if (nf_ct_protonum(ct) == IPPROTO_TCP)
+		flow_offload_fixup_tcp(&ct->proto.tcp);
+}
+
+static void flow_offload_fixup_ct(struct nf_conn *ct)
+{
+	flow_offload_fixup_ct_state(ct);
+	flow_offload_fixup_ct_timeout(ct);
 }
 
 void flow_offload_free(struct flow_offload *flow)
@@ -210,7 +224,7 @@ EXPORT_SYMBOL_GPL(flow_offload_add);
 
 static inline bool nf_flow_has_expired(const struct flow_offload *flow)
 {
-	return (__s32)(flow->timeout - (u32)jiffies) <= 0;
+	return nf_flow_timeout_delta(flow->timeout) <= 0;
 }
 
 static void flow_offload_del(struct nf_flowtable *flow_table,
@@ -230,6 +244,8 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 
 	if (nf_flow_has_expired(flow))
 		flow_offload_fixup_ct(e->ct);
+	else if (flow->flags & FLOW_OFFLOAD_TEARDOWN)
+		flow_offload_fixup_ct_timeout(e->ct);
 
 	flow_offload_free(flow);
 }
@@ -241,7 +257,7 @@ void flow_offload_teardown(struct flow_offload *flow)
 	flow->flags |= FLOW_OFFLOAD_TEARDOWN;
 
 	e = container_of(flow, struct flow_offload_entry, flow);
-	flow_offload_fixup_ct(e->ct);
+	flow_offload_fixup_ct_state(e->ct);
 }
 EXPORT_SYMBOL_GPL(flow_offload_teardown);
 
-- 
2.11.0



^ permalink raw reply related

* [PATCH 4/7] netfilter: nf_flow_table: conntrack picks up expired flows
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190814092440.20087-1-pablo@netfilter.org>

Update conntrack entry to pick up expired flows, otherwise the conntrack
entry gets stuck with the internal offload timeout (one day). The TCP
state also needs to be adjusted to ESTABLISHED state and tracking is set
to liberal mode in order to give conntrack a chance to pick up the
expired flow.

Fixes: ac2a66665e23 ("netfilter: add generic flow table infrastructure")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_core.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index e3d797252a98..68a24471ffee 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -111,7 +111,7 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
 #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
 #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)
 
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct nf_conn *ct)
 {
 	const struct nf_conntrack_l4proto *l4proto;
 	unsigned int timeout;
@@ -208,6 +208,11 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
 }
 EXPORT_SYMBOL_GPL(flow_offload_add);
 
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+	return (__s32)(flow->timeout - (u32)jiffies) <= 0;
+}
+
 static void flow_offload_del(struct nf_flowtable *flow_table,
 			     struct flow_offload *flow)
 {
@@ -223,6 +228,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 	e = container_of(flow, struct flow_offload_entry, flow);
 	clear_bit(IPS_OFFLOAD_BIT, &e->ct->status);
 
+	if (nf_flow_has_expired(flow))
+		flow_offload_fixup_ct(e->ct);
+
 	flow_offload_free(flow);
 }
 
@@ -233,7 +241,7 @@ void flow_offload_teardown(struct flow_offload *flow)
 	flow->flags |= FLOW_OFFLOAD_TEARDOWN;
 
 	e = container_of(flow, struct flow_offload_entry, flow);
-	flow_offload_fixup_ct_state(e->ct);
+	flow_offload_fixup_ct(e->ct);
 }
 EXPORT_SYMBOL_GPL(flow_offload_teardown);
 
@@ -298,11 +306,6 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
 	return err;
 }
 
-static inline bool nf_flow_has_expired(const struct flow_offload *flow)
-{
-	return (__s32)(flow->timeout - (u32)jiffies) <= 0;
-}
-
 static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
 {
 	struct nf_flowtable *flow_table = data;
-- 
2.11.0



^ permalink raw reply related

* tc - mirred ingress not supported at the moment
From: Martin Olsson @ 2019-08-14  9:25 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev
In-Reply-To: <CAAT+qEYG5=5ny+t0VcqiYjDUQLrcj9sBR=2w-fdsE7Jjf4xOkQ@mail.gmail.com>

Hi Cong!

Ah sorry.
Already implemented. Great!

Hmmm. Then why don't the manual at
https://www.linux.org/docs/man8/tc-mirred.html to reflect the changes?
That was the place I checked to see if ingress was still not implemented.
In the commit you point at, the sentence "Currently only egress is
implemented" has been removed.


Question:
Is there any form of performance penalty if I send the mirrored
traffic to the ingress queue of the destination interface rather than
to the egress queue?
I mean, in the kernel there is the possibility to perform far more
actions on the ingress queue than on the egress, but if I leave both
queues at their defaults, will mirrored packets to ingress use more
CPU cycles than to the egress destination, or are they more or less
identical?


Question 2:
Given the commit
https://git.kernel.org/pub/scm/network/iproute2/iproute2.git/commit/?id=5eca0a3701223619a513c7209f7d9335ca1b4cfa,
how can I see in what kernel version it was added?

/Martin


Den tis 13 aug. 2019 kl 18:47 skrev Cong Wang <xiyou.wangcong@gmail.com>:
>
> On Tue, Aug 13, 2019 at 4:05 AM Martin Olsson
> <martin.olsson+netdev@sentorsecurity.com> wrote:
> > Q1: Why was 'ingress' not implemented at the same time as 'egress'?
>
> Because you are using an old iproute2.
>
> ingress support is added by:
> https://git.kernel.org/pub/scm/network/iproute2/iproute2.git/commit/?id=5eca0a3701223619a513c7209f7d9335ca1b4cfa
>
>
> > 2)
> > Ok, so I have to use 'egress':
> > # tc filter add dev eno2 parent ffff: prio 999  protocol all matchall
> > action mirred egress redirect dev mon0
>
>
> So you redirect packets from eno2's ingress to mon0's egress.
>
>
> >
> > Since the mirred action forces me to use 'egress' as the direction on
> > the dest interface, all kinds of network statistics tools show
> > incorrect counters. :-(
> > eno2 is a pure sniffer interface (it is connected to the SPAN dest
> > port of a switch).
> > All packets (matchall) on eno2 are mirrored to mon0.
> >
> > # ip -s link show dev eno2
> >     ...
> >     ...
> >     RX: bytes  packets  errors  dropped overrun mcast
> >     13660757   16329    0       0       0       0
> >     TX: bytes  packets  errors  dropped carrier collsns
> >     0          0        0       0       0       0
> > # ip -s link show dev mon0
> >     ...
> >     ...
> >     RX: bytes  packets  errors  dropped overrun mcast
> >     0          0        0       0       0       0
> >     TX: bytes  packets  errors  dropped carrier collsns
> >     13660757   16329    0       0       0       0
> >
> > eno2 and mon0 should be identical, but they are inverted.
>
> Yes, this behavior is correct. The keyword "egress" in your cmdline
> already says so.
>
> >
> > Q2: So... Can the 'ingress' option please be implemented? (I'm no
> > programmer, so unfortunetly I can't do it myself).
>
> It is completed, you need to update your iproute2 and kernel.
>
> Thanks.

^ permalink raw reply

* Re: [PATCH] net: ethernet: mediatek: Add MT7628/88 SoC support
From: René van Dorst @ 2019-08-14  9:26 UTC (permalink / raw)
  To: Stefan Roese
  Cc: netdev, linux-mediatek, Sean Wang, Felix Fietkau, John Crispin,
	Daniel Golle
In-Reply-To: <a92d7207-80b2-e88d-d869-64c9758ef1da@denx.de>

Hi Stefan,

Quoting Stefan Roese <sr@denx.de>:

> Hi Rene,
>
> On 17.07.19 14:53, René van Dorst wrote:
>
> <snip>
>
>>> +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
>>> @@ -39,7 +39,8 @@
>>>  				 NETIF_F_SG | NETIF_F_TSO | \
>>>  				 NETIF_F_TSO6 | \
>>>  				 NETIF_F_IPV6_CSUM)
>>> -#define NEXT_RX_DESP_IDX(X, Y)	(((X) + 1) & ((Y) - 1))
>>> +#define MTK_HW_FEATURES_MT7628	(NETIF_F_SG | NETIF_F_RXCSUM)
>>> +#define NEXT_DESP_IDX(X, Y)	(((X) + 1) & ((Y) - 1))
>>>
>>>  #define MTK_MAX_RX_RING_NUM	4
>>>  #define MTK_HW_LRO_DMA_SIZE	8
>>> @@ -118,6 +119,7 @@
>>>  /* PDMA Global Configuration Register */
>>>  #define MTK_PDMA_GLO_CFG	0xa04
>>>  #define MTK_MULTI_EN		BIT(10)
>>> +#define MTK_PDMA_SIZE_8DWORDS	(1 << 4)
>>>
>>>  /* PDMA Reset Index Register */
>>>  #define MTK_PDMA_RST_IDX	0xa08
>>> @@ -276,11 +278,18 @@
>>>  #define TX_DMA_OWNER_CPU	BIT(31)
>>>  #define TX_DMA_LS0		BIT(30)
>>>  #define TX_DMA_PLEN0(_x)	(((_x) & MTK_TX_DMA_BUF_LEN) << 16)
>>> +#define TX_DMA_PLEN1(_x)	((_x) & MTK_TX_DMA_BUF_LEN)
>>>  #define TX_DMA_SWC		BIT(14)
>>>  #define TX_DMA_SDL(_x)		(((_x) & 0x3fff) << 16)
>>>
>>> +/* PDMA on MT7628 */
>>> +#define TX_DMA_DONE		BIT(31)
>>> +#define TX_DMA_LS1		BIT(14)
>>> +#define TX_DMA_DESP2_DEF	(TX_DMA_LS0 | TX_DMA_DONE)
>>> +
>>>  /* QDMA descriptor rxd2 */
>>>  #define RX_DMA_DONE		BIT(31)
>>> +#define RX_DMA_LSO		BIT(30)
>>>  #define RX_DMA_PLEN0(_x)	(((_x) & 0x3fff) << 16)
>>>  #define RX_DMA_GET_PLEN0(_x)	(((_x) >> 16) & 0x3fff)
>>>
>>> @@ -289,6 +298,7 @@
>>>
>>>  /* QDMA descriptor rxd4 */
>>>  #define RX_DMA_L4_VALID		BIT(24)
>>> +#define RX_DMA_L4_VALID_PDMA	BIT(30)		/* when PDMA is used */
>>>  #define RX_DMA_FPORT_SHIFT	19
>>>  #define RX_DMA_FPORT_MASK	0x7
>>>
>>> @@ -412,6 +422,19 @@
>>>  #define CO_QPHY_SEL            BIT(0)
>>>  #define GEPHY_MAC_SEL          BIT(1)
>>>
>>> +/* MT7628/88 specific stuff */
>>> +#define MT7628_PDMA_OFFSET	0x0800
>>> +#define MT7628_SDM_OFFSET	0x0c00
>>> +
>>> +#define MT7628_TX_BASE_PTR0	(MT7628_PDMA_OFFSET + 0x00)
>>> +#define MT7628_TX_MAX_CNT0	(MT7628_PDMA_OFFSET + 0x04)
>>> +#define MT7628_TX_CTX_IDX0	(MT7628_PDMA_OFFSET + 0x08)
>>> +#define MT7628_TX_DTX_IDX0	(MT7628_PDMA_OFFSET + 0x0c)
>>> +#define MT7628_PST_DTX_IDX0	BIT(0)
>>> +
>>> +#define MT7628_SDM_MAC_ADRL	(MT7628_SDM_OFFSET + 0x0c)
>>> +#define MT7628_SDM_MAC_ADRH	(MT7628_SDM_OFFSET + 0x10)
>>> +
>>>  struct mtk_rx_dma {
>>>  	unsigned int rxd1;
>>>  	unsigned int rxd2;
>>> @@ -509,6 +532,7 @@ enum mtk_clks_map {
>>>  				 BIT(MTK_CLK_SGMII_CK) | \
>>>  				 BIT(MTK_CLK_ETH2PLL))
>>>  #define MT7621_CLKS_BITMAP	(0)
>>> +#define MT7628_CLKS_BITMAP	(0)
>>>  #define MT7629_CLKS_BITMAP	(BIT(MTK_CLK_ETHIF) | BIT(MTK_CLK_ESW) |  \
>>>  				 BIT(MTK_CLK_GP0) | BIT(MTK_CLK_GP1) | \
>>>  				 BIT(MTK_CLK_GP2) | BIT(MTK_CLK_FE) | \
>>> @@ -563,6 +587,10 @@ struct mtk_tx_ring {
>>>  	struct mtk_tx_dma *last_free;
>>>  	u16 thresh;
>>>  	atomic_t free_count;
>>> +	int dma_size;
>>> +	struct mtk_tx_dma *dma_pdma;	/* For MT7628/88 PDMA handling */
>>> +	dma_addr_t phys_pdma;
>>> +	int cpu_idx;
>>>  };
>>>
>>>  /* PDMA rx ring mode */
>>> @@ -604,6 +632,7 @@ enum mkt_eth_capabilities {
>>>  	MTK_HWLRO_BIT,
>>>  	MTK_SHARED_INT_BIT,
>>>  	MTK_TRGMII_MT7621_CLK_BIT,
>>> +	MTK_SOC_MT7628,
>>
>> This should be MTK_SOC_MT7628_BIT, this only defines the bit number!
>>
>> and futher on #define MTK_SOC_MT7628 BIT(MTK_SOC_MT7628_BIT)
>
> Okay, thanks.
>
>> Based on this commit [0], MT7621 also needs the PDMA for the RX path.
>> I know that is not your issue but I think it is better to add a extra
>> capability bit for the PDMA bits so it can also be used on other socs.
>
> Yes, MT7621 also uses PDMA for RX. The code for RX is pretty much
> shared (re-used), with slight changes for the MT7628/88 to work
> correctly on this SoC.
>
> I'll work on a capability bit for PDMA vs QDMA on TX though. This
> might make things a little more transparent.

Great, Thanks for addressing this issue.

I hope we can collaborate to also support mt76x8 in my PHYLINK patches [0][1].
I am close to posting V2 of the patches but I am currently waiting on some
fiber modules to test the changes better.

Greats,

René

[0] https://patchwork.ozlabs.org/patch/1136551/
[1] https://patchwork.ozlabs.org/patch/1136519/

>
>> Greats,
>>
>> René
>>
>> [0] https://lkml.org/lkml/2018/3/14/1038
>
> Thanks,
> Stefan




^ permalink raw reply

* Re: [RFC bpf-next 0/3] tools: bpftool: add subcommand to count map entries
From: Quentin Monnet @ 2019-08-14  9:42 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, Daniel Borkmann, bpf, netdev, oss-drivers
In-Reply-To: <20190814015149.b4pmubo3s4ou5yek@ast-mbp>

2019-08-13 18:51 UTC-0700 ~ Alexei Starovoitov
<alexei.starovoitov@gmail.com>
> On Tue, Aug 13, 2019 at 02:09:18PM +0100, Quentin Monnet wrote:
>> This series adds a "bpftool map count" subcommand to count the number of
>> entries present in a BPF map. This results from a customer request for a
>> tool to count the number of entries in BPF maps used in production (for
>> example, to know how many free entries are left in a given map).
>>
>> The first two commits actually contain some clean-up in preparation for the
>> new subcommand.
>>
>> The third commit adds the new subcommand. Because what data should count as
>> an entry is not entirely clear for all map types, we actually dump several
>> counters, and leave it to the users to interpret the values.
>>
>> Sending as a RFC because I'm looking for feedback on the approach. Is
>> printing several values the good thing to do? Also, note that some map
>> types such as queue/stack maps do not support any type of counting, this
>> would need to be implemented in the kernel I believe.
>>
>> More generally, we have a use case where (hash) maps are under pressure
>> (many additions/deletions from the BPF program), and counting the entries
>> by iterating other the different keys is not at all reliable. Would that
>> make sense to add a new bpf() subcommand to count the entries on the kernel
>> side instead of cycling over the entries in bpftool? If so, we would need
>> to agree on what makes an entry for each kind of map.
> 
> I don't mind new bpftool sub-command, but against adding kernel interface.
> Can you elaborate what is the actual use case?

Hi Alexei, thanks for your feedback.

The use case is a network processing application (close to a NAT), where
a hash map is used to keep track of flows, many of them being
short-lived. The BPF program spends a good chunk of time adding and
deleting entries to/from the map. The overall size (number of entries)
increases slowly, and when it grows past a certain threshold some action
must be taken (some flows are deleted from user space, possibly copied
to another map or whatever) to ensure we still have some room for new
incoming flows.

> The same can be achieved by 'bpftool map dump|grep key|wc -l', no?

To some extent (with subtleties for some other map types); and we use a
similar command line as a workaround for now. But because of the rate of
inserts/deletes in the map, the process often reports a number higher
than the max number of entries (we observed up to ~750k when max_entries
is 500k), even is the map is only half-full on average during the count.
On the worst case (though not frequent), an entry is deleted just before
we get the next key from it, and iteration starts all over again. This
is not reliable to determine how much space is left in the map.

I cannot see a solution that would provide a more accurate count from
user space, when the map is under pressure?

> 
>> Note that we are also facing similar issues for purging map from their
>> entries (deleting all entries at once). We can iterate on the keys and
>> delete elements one by one, but this is very inefficient when entries are
>> being added/removed in parallel from the BPF program, and having another
>> dedicated command accessible from the bpf() system call might help here as
>> well.
> 
> I think that fits into the batch processing of map commands discussion.
> 

This is also what we do at the moment, but we hit similar limitations
when iterating over the keys.

Thanks,
Quentin

^ permalink raw reply

* can: tcan4x5x: spi bits_per_word issue on Raspberry PI
From: FIXED-TERM Buecheler Konstantin (ETAS-SEC/ECT-Mu) @ 2019-08-14  9:43 UTC (permalink / raw)
  To: linux-can@vger.kernel.org, netdev@vger.kernel.org; +Cc: Dan Murphy

Hi all,

I am trying to use a tcan4550 together with a Raspberry PI 3 B. I am using the tcan4x5x driver from net-next. 
I always get the following error during startup.
	tcan4x5x spi0.0: Probe failed, err=-22
	tcan4x5x: probe of spi0.0 failed with error -22

I realized that this happens because the Raspberry PI does only support 8/9 bit words. https://elinux.org/index.php?title=RPi_SPI#Supported_bits_per_word
In the driver it is set to 32.
	spi->bits_per_word = 32;

Setting this to 8 does not help of course since the tcan chip expects a multiple of 32 per spi transaction. 
I don't know if this is a Raspberry Pi specific problem or if there are more devices with this hardware limitation. 

Does anyone have a workaround for that? 

If this a common issue it might be a good idea to patch the driver. I will check if I can find proper a way to do so. 

Regards, 
Konstantin 

^ permalink raw reply

* Re: [PATCH] can: rcar_can: Remove unused platform data support
From: Wolfram Sang @ 2019-08-14  9:46 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Wolfgang Grandegger, Marc Kleine-Budde, Sergei Shtylyov,
	David S . Miller, Wolfram Sang, linux-can, linux-renesas-soc,
	netdev
In-Reply-To: <20190814092221.12959-1-geert+renesas@glider.be>

[-- Attachment #1: Type: text/plain, Size: 655 bytes --]

On Wed, Aug 14, 2019 at 11:22:21AM +0200, Geert Uytterhoeven wrote:
> All R-Car platforms use DT for describing CAN controllers.
> R-Car CAN platform data support was never used in any upstream kernel.
> 
> Move the Clock Select Register settings enum into the driver, and remove
> platform data support and the corresponding header file.
> 
> Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>

Not tested on HW, but double-checked there are no users in-tree, there
is no dangling pdata pointer left in the driver, visual review of the
moved block, and build-tested.

Reviewed-by: Wolfram Sang <wsa+renesas@sang-engineering.com>


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: KMSAN: uninit-value in smsc75xx_bind
From: Oliver Neukum @ 2019-08-14 10:16 UTC (permalink / raw)
  To: Andrey Konovalov
  Cc: David S. Miller, Alexander Potapenko, syzkaller-bugs,
	steve.glendinning, syzbot, LKML, USB list, netdev
In-Reply-To: <CAAeHK+zzj_+Qvm217KO2OHnfBMWbepP0Y-OYTWpOw3ghe5ji=g@mail.gmail.com>

Am Dienstag, den 13.08.2019, 17:08 +0200 schrieb Andrey Konovalov:
> On Tue, Aug 13, 2019 at 2:43 PM Oliver Neukum <oneukum@suse.com> wrote:
> > 
> > 
> > Hi,
> > 
> > this looks like a false positive to me.
> > The offending code is likely this:
> > 
> >         if (size) {
> >                 buf = kmalloc(size, GFP_KERNEL);
> >                 if (!buf)
> >                         goto out;
> >         }
> > 
> >         err = usb_control_msg(dev->udev, usb_rcvctrlpipe(dev->udev, 0),
> >                               cmd, reqtype, value, index, buf, size,
> >                               USB_CTRL_GET_TIMEOUT);
> > 
> > which uses 'buf' uninitialized. But it is used for input.
> > What is happening here?
> 
> AFAICS, the uninitialized use of buf that KMSAN points out is in the
> "if (buf & PMT_CTL_DEV_RDY)"  statement in smsc75xx_wait_ready(). Does
> __smsc75xx_read_reg/usb_control_msg() always initialize buf? Can it
> just initialize the first few bytes for example?
> 

Hi,

you are unfortunately right and this is not the only driver vulnerable
in this way. I am going through them.

	Regards
		Oliver


^ permalink raw reply

* Re: [PATCH AUTOSEL 4.19 04/42] netfilter: conntrack: always store window size un-scaled
From: Reindl Harald @ 2019-08-14 10:19 UTC (permalink / raw)
  To: Thomas Jarosch, Sasha Levin
  Cc: linux-kernel, stable, Florian Westphal, Jakub Jankowski,
	Jozsef Kadlecsik, Pablo Neira Ayuso, netfilter-devel, coreteam,
	netdev
In-Reply-To: <20190808090209.wb63n6ibii4ivvba@intra2net.com>

that's still not in 5.2.8

without the exception and "nf_conntrack_tcp_timeout_max_retrans = 60" a
vnc-over-ssh session having the VNC view in the background freezes
within 60 secods

-----------------------------------------------------------------------------------------------
IPV4 TABLE MANGLE (STATEFUL PRE-NAT/FILTER)
-----------------------------------------------------------------------------------------------
Chain PREROUTING (policy ACCEPT 100 packets, 9437 bytes)
num   pkts bytes target     prot opt in     out     source
 destination
1     6526 3892K ACCEPT     all  --  *      *       0.0.0.0/0
 0.0.0.0/0            ctstate RELATED,ESTABLISHED
2      125  6264 ACCEPT     all  --  lo     *       0.0.0.0/0
 0.0.0.0/0
3       64  4952 ACCEPT     all  --  vmnet8 *       0.0.0.0/0
 0.0.0.0/0
4        1    40 DROP       all  --  *      *       0.0.0.0/0
 0.0.0.0/0            ctstate INVALID

-------- Weitergeleitete Nachricht --------
Betreff: [PATCH AUTOSEL 5.2 07/76] netfilter: conntrack: always store
window size un-scaled

Am 08.08.19 um 11:02 schrieb Thomas Jarosch:
> Hello together,
> 
> You wrote on Fri, Aug 02, 2019 at 09:22:24AM -0400:
>> From: Florian Westphal <fw@strlen.de>
>>
>> [ Upstream commit 959b69ef57db00cb33e9c4777400ae7183ebddd3 ]
>>
>> Jakub Jankowski reported following oddity:
>>
>> After 3 way handshake completes, timeout of new connection is set to
>> max_retrans (300s) instead of established (5 days).
>>
>> shortened excerpt from pcap provided:
>> 25.070622 IP (flags [DF], proto TCP (6), length 52)
>> 10.8.5.4.1025 > 10.8.1.2.80: Flags [S], seq 11, win 64240, [wscale 8]
>> 26.070462 IP (flags [DF], proto TCP (6), length 48)
>> 10.8.1.2.80 > 10.8.5.4.1025: Flags [S.], seq 82, ack 12, win 65535, [wscale 3]
>> 27.070449 IP (flags [DF], proto TCP (6), length 40)
>> 10.8.5.4.1025 > 10.8.1.2.80: Flags [.], ack 83, win 512, length 0
>>
>> Turns out the last_win is of u16 type, but we store the scaled value:
>> 512 << 8 (== 0x20000) becomes 0 window.
>>
>> The Fixes tag is not correct, as the bug has existed forever, but
>> without that change all that this causes might cause is to mistake a
>> window update (to-nonzero-from-zero) for a retransmit.
>>
>> Fixes: fbcd253d2448b8 ("netfilter: conntrack: lower timeout to RETRANS seconds if window is 0")
>> Reported-by: Jakub Jankowski <shasta@toxcorp.com>
>> Tested-by: Jakub Jankowski <shasta@toxcorp.com>
>> Signed-off-by: Florian Westphal <fw@strlen.de>
>> Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
>> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
>> Signed-off-by: Sasha Levin <sashal@kernel.org>
> 
> Also:
> Tested-by: Thomas Jarosch <thomas.jarosch@intra2net.com>
> 
> ;)
> 
> We've hit the issue with the wrong conntrack timeout at two different sites,
> long-lived connections to a SAP server over IPSec VPN were constantly dropping.
> 
> For us this was a regression after updating from kernel 3.14 to 4.19.
> Yesterday I've applied the patch to kernel 4.19.57 and the problem is fixed.
> 
> The issue was extra hard to debug as we could just boot the new kernel
> for twenty minutes in the evening on these productive systems.
> 
> The stable kernel patch from last Friday came right on time. I was just
> about the replay the TCP connection with tcpreplay, so this saved
> me from another week of debugging. Thanks everyone!

^ permalink raw reply

* Re: [PATCH bpf-next 1/3] libbpf: add asm/unistd.h to xsk to get __NR_mmap2
From: Ivan Khoronzhuk @ 2019-08-14 10:19 UTC (permalink / raw)
  To: Yonghong Song
  Cc: magnus.karlsson@intel.com, bjorn.topel@intel.com,
	davem@davemloft.net, hawk@kernel.org, john.fastabend@gmail.com,
	jakub.kicinski@netronome.com, daniel@iogearbox.net,
	netdev@vger.kernel.org, bpf@vger.kernel.org,
	xdp-newbies@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <05e5d15b-5ef9-b4dc-a76c-0d423fb2f15d@fb.com>

On Wed, Aug 14, 2019 at 12:32:41AM +0000, Yonghong Song wrote:

Hi, Yonghong Song

>
>
>On 8/13/19 3:23 AM, Ivan Khoronzhuk wrote:
>> That's needed to get __NR_mmap2 when mmap2 syscall is used.
>
>It seems I did not have this issue on x64 machine e.g., Fedora 29.
>My glibc version is 2.28. gcc 8.2.1.

On 64 there is no the issue.

>
>What is your particular system glibc version?
>So needing kernel asm/unistd.h is because of older glibc on your
>system, or something else? Could you clarify?

It doesn't fix build issues, only runtime one on 32bits.

If no such inclusion -> no __NR_mmap2 definition - just mmap() is used ->
no problems on x64.

Is the inclusion -> no NR_mmap2 or is NR_mmap2 -> no problems on x64


-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox