Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next RFC 2/9] net: dsa: mv88e6xxx: expose switch time as a PTP hardware clock
From: Richard Cochran @ 2017-10-08 14:52 UTC (permalink / raw)
  To: Brandon Streiff
  Cc: netdev, linux-kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Erik Hons
In-Reply-To: <1506612341-18061-3-git-send-email-brandon.streiff@ni.com>

On Thu, Sep 28, 2017 at 10:25:34AM -0500, Brandon Streiff wrote:
> +static int mv88e6xxx_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
> +{
> +	if (scaled_ppm == 0)
> +		return 0;
> +
> +	return -EOPNOTSUPP;
> +}

We really want to have an adjustable clock here.  More below.

> +int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip)
> +{
> +	/* Set up the cycle counter */
> +	memset(&chip->tstamp_cc, 0, sizeof(chip->tstamp_cc));
> +	chip->tstamp_cc.read	= mv88e6xxx_ptp_clock_read;
> +	chip->tstamp_cc.mask	= CYCLECOUNTER_MASK(32);
> +	/* Raw timestamps are in units of 8-ns clock periods. */
> +	chip->tstamp_cc.mult	= 8;
> +	chip->tstamp_cc.shift	= 0;

First of all, the switch can use an external clock, and so at the very
least, the period should be a macro so that if and when we support the
external clock, the macro may be converted into a variable.

Secondly, the mult/shift should be chosen to allow the finest possible
frequency adjustment.  Here is what I did:

---
#define N 28
#define CC_MULT (8 << N)

int mv88e635x_setup(struct dsa_switch *ds)
{
	struct mv88e6xxx_chip *ps = ds->priv;

	ps->cc.read = mv88e635x_global_time_read;
	ps->cc.mask = CLOCKSOURCE_MASK(32);
	ps->cc.mult = CC_MULT;
	ps->cc.shift = N;
	timecounter_init(&ps->tc, &ps->cc, ktime_to_ns(ktime_get_real()));
	...
}

static int mv88e635x_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
{
	u64 adj;
	u32 diff, mult;
	int neg_adj = 0;
	struct mv88e6xxx_chip *ps =
		container_of(ptp, struct mv88e6xxx_chip, ptp_info);

	if (ppb < 0) {
		neg_adj = 1;
		ppb = -ppb;
	}
	mult = CC_MULT;
	adj = mult;
	adj *= ppb;
	diff = div_u64(adj, 1000000000ULL);

	mutex_lock(&ps->clock_mutex);
	timecounter_read(&ps->tc);
	ps->cc.mult = neg_adj ? mult - diff : mult + diff;
	mutex_unlock(&ps->clock_mutex);

	return 0;
}
---

(This is the legacy adjfreq method, but you can easily convert it into
 the adjfine method.)

Of course, this means that you'll have to drop the periodic output
signal code.

Thanks,
Richard

^ permalink raw reply

* Re: devlink dump of mlxsw_adj table triggers a panic
From: David Ahern @ 2017-10-08 15:04 UTC (permalink / raw)
  To: Arkadi Sharshevsky, Jiri Pirko; +Cc: netdev@vger.kernel.org
In-Reply-To: <c7286c2b-09de-1221-303a-0aed3ed77c21@mellanox.com>

On 10/8/17 1:43 AM, Arkadi Sharshevsky wrote:
> Thanks, will check it out. How many nexthops groups & overall number of
> nexthops you configured?

8 ports with 62 VLANs on each (496 total vlan devices) and 62 VRFs. BGP
is exchanging routes with neighbors. No multipath routes.

^ permalink raw reply

* Re: [PATCH net-next RFC 4/9] net: dsa: mv88e6xxx: add support for event capture
From: Richard Cochran @ 2017-10-08 15:06 UTC (permalink / raw)
  To: Brandon Streiff
  Cc: netdev, linux-kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Erik Hons
In-Reply-To: <1506612341-18061-5-git-send-email-brandon.streiff@ni.com>


There are some issues here.

On Thu, Sep 28, 2017 at 10:25:36AM -0500, Brandon Streiff wrote:
> +static int mv88e6xxx_config_periodic_trig(struct mv88e6xxx_chip *chip,
> +					  u32 ns, u16 picos)
> +{
> +	int err;
> +	u16 global_config;
> +
> +	if (picos >= 1000)
> +		return -ERANGE;
> +
> +	/* TRIG generation is in units of 8 ns clock periods. Convert ns
> +	 * and ps into 8 ns clock periods and up to 8000 additional ps
> +	 */
> +	picos += (ns & 0x7) * 1000;
> +	ns = ns >> 3;

Again, the 8 nanosecounds shouldn't be hard coded.

	...

> +	return err;
> +}

> +static void mv88e6xxx_tai_event_work(struct work_struct *ugly)
> +{
> +	struct delayed_work *dw = to_delayed_work(ugly);
> +	struct mv88e6xxx_chip *chip =
> +		container_of(dw, struct mv88e6xxx_chip, tai_event_work);
> +	u16 ev_status[4];
> +	int err;
> +
> +	mutex_lock(&chip->reg_lock);
> +
> +	err = mv88e6xxx_tai_read(chip, MV88E6XXX_TAI_EVENT_STATUS,
> +				 ev_status, ARRAY_SIZE(ev_status));
> +	if (err) {
> +		mutex_unlock(&chip->reg_lock);
> +		return;
> +	}
> +
> +	if (ev_status[0] & MV88E6XXX_TAI_EVENT_STATUS_ERROR)
> +		dev_warn(chip->dev, "missed event capture\n");
> +
> +	if (ev_status[0] & MV88E6XXX_TAI_EVENT_STATUS_VALID) {

Avoid IfOk.

> +		struct ptp_clock_event ev;
> +		u32 raw_ts = ((u32)ev_status[2] << 16) | ev_status[1];
> +
> +		/* Clear the valid bit so the next timestamp can come in */
> +		ev_status[0] &= ~MV88E6XXX_TAI_EVENT_STATUS_VALID;
> +		err = mv88e6xxx_tai_write(chip, MV88E6XXX_TAI_EVENT_STATUS,
> +					  ev_status[0]);
> +
> +		if (ev_status[0] & MV88E6XXX_TAI_EVENT_STATUS_CAP_TRIG) {
> +			/* TAI is configured to timestamp internal events.
> +			 * This will be a PPS event.
> +			 */
> +			ev.type = PTP_CLOCK_PPS;
> +		} else {
> +			/* Otherwise this is an external timestamp */
> +			ev.type = PTP_CLOCK_EXTTS;
> +		}
> +		/* We only have one timestamping channel. */
> +		ev.index = 0;
> +		ev.timestamp = timecounter_cyc2time(&chip->tstamp_tc, raw_ts);
> +
> +		ptp_clock_event(chip->ptp_clock, &ev);
> +	}
> +
> +	mutex_unlock(&chip->reg_lock);
> +
> +	schedule_delayed_work(&chip->tai_event_work, TAI_EVENT_WORK_INTERVAL);
> +}
> +

> +static int mv88e6xxx_ptp_enable_perout(struct mv88e6xxx_chip *chip,
> +				       struct ptp_clock_request *rq, int on)
> +{
> +	struct timespec ts;
> +	u64 ns;
> +	int pin;
> +	int err;
> +
> +	pin = ptp_find_pin(chip->ptp_clock, PTP_PF_PEROUT, rq->extts.index);
> +
> +	if (pin < 0)
> +		return -EBUSY;
> +
> +	ts.tv_sec = rq->perout.period.sec;
> +	ts.tv_nsec = rq->perout.period.nsec;
> +	ns = timespec_to_ns(&ts);
> +
> +	if (ns > U32_MAX)
> +		return -ERANGE;
> +
> +	mutex_lock(&chip->reg_lock);
> +
> +	err = mv88e6xxx_config_periodic_trig(chip, (u32)ns, 0);

Here you ignore the phase of the signal given in the trq->perout.start
field.  That is not what the user expects.  For periodic outputs where
the phase cannot be set, we really would need a new ioctl.

However, in this case, you should just drop this functionality.  I
understand that this works with your adjustable external oscillator,
but we cannot support that in mainline (at least, not yet).

Thanks,
Richard


> +	if (err)
> +		goto out;
> +
> +	if (on) {
> +		err = mv88e6xxx_g2_set_gpio_config(
> +			chip, pin, MV88E6XXX_G2_SCRATCH_GPIO_MODE_TRIG,
> +			MV88E6XXX_G2_SCRATCH_GPIO_DIR_OUT);
> +	} else {
> +		err = mv88e6xxx_g2_set_gpio_config(
> +			chip, pin, MV88E6XXX_G2_SCRATCH_GPIO_MODE_GPIO,
> +			MV88E6XXX_G2_SCRATCH_GPIO_DIR_IN);
> +	}
> +
> +out:
> +	mutex_unlock(&chip->reg_lock);
> +
> +	return err;
> +}

^ permalink raw reply

* Re: [PATCH 41/47] netfilter: convert hook list to an array
From: Tariq Toukan @ 2017-10-08 15:07 UTC (permalink / raw)
  To: Pablo Neira Ayuso, netfilter-devel, Aaron Conole,
	Florian Westphal
  Cc: davem, netdev
In-Reply-To: <1504478574-13281-6-git-send-email-pablo@netfilter.org>



On 04/09/2017 1:42 AM, Pablo Neira Ayuso wrote:
> From: Aaron Conole <aconole@bytheb.org>
> 
> This converts the storage and layout of netfilter hook entries from a
> linked list to an array.  After this commit, hook entries will be
> stored adjacent in memory.  The next pointer is no longer required.
> 
> The ops pointers are stored at the end of the array as they are only
> used in the register/unregister path and in the legacy br_netfilter code.
> 
> nf_unregister_net_hooks() is slower than needed as it just calls
> nf_unregister_net_hook in a loop (i.e. at least n synchronize_net()
> calls), this will be addressed in followup patch.
> 
> Test setup:
>   - ixgbe 10gbit
>   - netperf UDP_STREAM, 64 byte packets
>   - 5 hooks: (raw + mangle prerouting, mangle+filter input, inet filter):
> empty mangle and raw prerouting, mangle and filter input hooks:
> 353.9
> this patch:
> 364.2
> 
> Signed-off-by: Aaron Conole <aconole@bytheb.org>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---

Hi,

We experience a regression in server with iommu enabled.
After installing kernel and rebooting the server, it crashes during boot.
Please see trace below.

Bisecting points to this patch.

Any idea what's wrong?

Regards,
Tariq Toukan

[   25.590816] RIP: 0010:_raw_read_lock_bh+0x15/0x40
[   25.596160] RSP: 0018:ffffc90007db77a0 EFLAGS: 00010286
[   25.602089] RAX: 0000000000000100 RBX: 0000000000000003 RCX: 
0000000000000000
[   25.610152] RDX: 0000000000000000 RSI: ffffc90007db7898 RDI: 
000000000000003c
[   25.618470] RBP: ffffc90007db7840 R08: 0000000000000001 R09: 
0000000087c10eef
[   25.626786] R10: ffff88180f21f040 R11: ffffea005feeaf00 R12: 
0000000000000000
[   25.635103] R13: ffffc90007db7898 R14: ffff8817fbabdc00 R15: 
ffff8817fbabdc00
[   25.643421] FS:  00007fcdb7771740(0000) GS:ffff88180f200000(0000) 
knlGS:0000000000000000
[   25.653056] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   25.659818] CR2: 000000000000003c CR3: 0000001809ae0001 CR4: 
00000000001606e0
[   25.668136] Call Trace:
[   25.671215]  ? ebt_do_table+0x3d/0x6e8 [ebtables]
[   25.676817]  ebt_nat_out+0x1f/0x30 [ebtable_nat]
[   25.682326]  nf_hook_slow+0x3c/0xb0
[   25.686576]  __br_forward+0xb1/0x1b0 [bridge]
[   25.691786]  ? br_dev_queue_push_xmit+0x170/0x170 [bridge]
[   25.704333]  br_flood+0x130/0x1b0 [bridge]
[   25.709254]  br_dev_xmit+0x1e5/0x2a0 [bridge]
[   25.714468]  dev_hard_start_xmit+0xa1/0x210
[   25.719485]  __dev_queue_xmit+0x4f6/0x610
[   25.724304]  dev_queue_xmit+0x10/0x20
[   25.728739]  ip_finish_output2+0x233/0x320
[   25.733656]  ip_finish_output+0x12a/0x1d0
[   25.738474]  ? netif_rx_ni+0x33/0x80
[   25.742805]  ip_mc_output+0x84/0x250
[   25.747140]  ip_local_out+0x35/0x40
[   25.751377]  ip_send_skb+0x19/0x40
[   25.755583]  udp_send_skb+0x172/0x280
[   25.760013]  udp_sendmsg+0x2c0/0xa30
[   25.764348]  ? ip_reply_glue_bits+0x50/0x50
[   25.769366]  ? import_iovec+0x2c/0xc0
[   25.773801]  inet_sendmsg+0x31/0xb0
[   25.778042]  sock_sendmsg+0x38/0x50
[   25.782276]  ___sys_sendmsg+0x25c/0x270
[   25.786904]  ? file_update_time+0x3a/0xf0
[   25.791727]  ? __wake_up_sync_key+0x50/0x60
[   25.796741]  ? pipe_write+0x3cc/0x420
[   25.801175]  ? __vfs_write+0xd0/0x130
[   25.805608]  __sys_sendmsg+0x45/0x80
[   25.809938]  SyS_sendmsg+0x12/0x20
[   25.814077]  entry_SYSCALL_64_fastpath+0x1a/0xa5
[   25.819577] RIP: 0033:0x7fcdb64ac7a0
[   25.823908] RSP: 002b:00007ffe2b98cb98 EFLAGS: 00000246 ORIG_RAX: 
000000000000002e
[   25.832961] RAX: ffffffffffffffda RBX: 00007ffe2b98c630 RCX: 
00007fcdb64ac7a0
[   25.841270] RDX: 0000000000000000 RSI: 00007ffe2b98cc50 RDI: 
000000000000000c
[   25.849583] RBP: 00007fcdb69018f8 R08: 00007ffe2b98cbc3 R09: 
0000000000000004
[   25.857901] R10: 0000000000000019 R11: 0000000000000246 R12: 
0000000000000000
[   25.866213] R13: 0000000000000000 R14: 00007ffe2b98c6c0 R15: 
00007ffe2b98c6e0
[   25.874520] Code: 55 48 89 e5 e8 bd 74 82 ff 5d c3 66 66 2e 0f 1f 84 
00 00 00 00 00 0f 1f 44 00 00 65 81 05 68 78 74 7e 00 02 00 00 b8 00 01 
00 00 <f0> 0f c1 07 8d b0 00 01 00 00 40 84
[   25.896497] RIP: _raw_read_lock_bh+0x15/0x40 RSP: ffffc90007db77a0
[   25.903744] CR2: 000000000000003c
[   25.907808] ---[ end trace 4f824a5c467b1872 ]---
[   25.907811] BUG: unable to handle kernel NULL pointer dereference at 
000000000000003c
[   25.907828] IP: _raw_read_lock_bh+0x15/0x40
[   25.907830] PGD 0 P4D 0
[   25.907834] Oops: 0002 [#2] SMP
[   25.907836] Modules linked in: ebtable_nat(+) ebtables ib_ucm mlx4_en 
mlx4_ib rpcrdma mlx4_core rdma_ucm ib_uverbs ib_iser ib_umad rdma_cm 
ib_ipoib iw_cm ib_cm mlx5_ib bridge stp llc sge
[   25.907895] CPU: 12 PID: 0 Comm: swapper/12 Tainted: G      D 
4.13.0-for-linust-perf-2017-09-10_06-48-01-64 #1
[   25.907896] Hardware name: Dell Inc. PowerEdge R720/0HJK12, BIOS 
2.2.3 05/20/2014
[   25.907898] task: ffff880c0c2f8000 task.stack: ffffc90006318000
[   25.907901] RIP: 0010:_raw_read_lock_bh+0x15/0x40
[   25.907902] RSP: 0018:ffff880c0f9839d0 EFLAGS: 00010286
[   25.907904] RAX: 0000000000000100 RBX: 0000000000000003 RCX: 
0000000000000000
[   25.907905] RDX: 0000000000000000 RSI: ffff880c0f983ac8 RDI: 
000000000000003c
[   25.907906] RBP: ffff880c0f983a70 R08: 0000000000000001 R09: 
0000000000000000
[   25.907907] R10: 0000000000000000 R11: 0000000000000000 R12: 
0000000000000000
[   25.907909] R13: ffff880c0f983ac8 R14: ffff880bfcfdda00 R15: 
ffff880bfcfdda00
[   25.907911] FS:  0000000000000000(0000) GS:ffff880c0f980000(0000) 
knlGS:0000000000000000
[   25.907912] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   25.907913] CR2: 000000000000003c CR3: 0000001809a5e001 CR4: 
00000000001606e0
[   25.907915] Call Trace:
[   25.907918]  <IRQ>
[   25.907925]  ? ebt_do_table+0x3d/0x6e8 [ebtables]
[   25.907929]  ? lock_timer_base+0x7d/0xa0
[   25.907932]  ? mod_timer+0xa9/0x2c0
[   25.907937]  ebt_nat_out+0x1f/0x30 [ebtable_nat]
[   25.907946]  nf_hook_slow+0x3c/0xb0
[   25.907958]  __br_forward+0xb1/0x1b0 [bridge]
[   25.907966]  ? br_dev_queue_push_xmit+0x170/0x170 [bridge]
[   25.907972]  br_flood+0x130/0x1b0 [bridge]
[   25.907979]  br_dev_xmit+0x1e5/0x2a0 [bridge]
[   25.907987]  dev_hard_start_xmit+0xa1/0x210
[   25.907990]  __dev_queue_xmit+0x4f6/0x610
[   25.907993]  ? _raw_read_unlock_bh+0x20/0x30
[   25.907996]  dev_queue_xmit+0x10/0x20
[   25.908001]  ip6_finish_output2+0x3b5/0x4c0
[   25.908005]  ip6_finish_output+0xa5/0x100
[   25.908007]  ip6_output+0x5b/0xf0
[   25.908012]  NF_HOOK.constprop.43+0x30/0x90
[   25.908015]  ? icmp6_dst_alloc+0xd2/0x110
[   25.908018]  mld_sendpack+0x168/0x220
[   25.908021]  mld_ifc_timer_expire+0x17f/0x290
[   25.908024]  ? mld_dad_timer_expire+0x60/0x60
[   25.908026]  call_timer_fn+0x35/0x140
[   25.908028]  run_timer_softirq+0x1ce/0x410
[   25.908031]  ? timerqueue_add+0x59/0x90
[   25.908036]  ? sched_clock+0x9/0x10
[   25.908039]  ? sched_clock_cpu+0x11/0xb0
[   25.908042]  __do_softirq+0xd1/0x27f
[   25.908046]  irq_exit+0xb5/0xc0
[   25.908048]  smp_apic_timer_interrupt+0x69/0x130
[   25.908050]  apic_timer_interrupt+0x93/0xa0
[   25.908052]  </IRQ>
[   25.908056] RIP: 0010:cpuidle_enter_state+0xe9/0x280
[   25.908057] RSP: 0018:ffffc9000631be88 EFLAGS: 00000246 ORIG_RAX: 
ffffffffffffff10
[   25.908059] RAX: ffff880c0f99bdc0 RBX: ffffe8f400180270 RCX: 
000000000000001f
[   25.908060] RDX: 0000000000000000 RSI: ffff7761f8923d16 RDI: 
0000000000000000
[   25.908061] RBP: ffffc9000631bec0 R08: 00000000000002a1 R09: 
0000000000000390
[   25.908062] R10: 000000000000037e R11: 0000000000000018 R12: 
0000000000000004
[   25.908063] R13: 000000000000000c R14: ffffe8f400180270 R15: 
00000005f7b4d9b4
[   25.908068]  ? cpuidle_enter_state+0xc5/0x280
[   25.908071]  cpuidle_enter+0x17/0x20
[   25.908074]  call_cpuidle+0x23/0x40
[   25.908077]  do_idle+0x172/0x1e0
[   25.908079]  cpu_startup_entry+0x1d/0x30
[   25.908084]  start_secondary+0x103/0x130
[   25.908087]  secondary_startup_64+0xa5/0xa5
[   25.908089] Code: 55 48 89 e5 e8 bd 74 82 ff 5d c3 66 66 2e 0f 1f 84 
00 00 00 00 00 0f 1f 44 00 00 65 81 05 68 78 74 7e 00 02 00 00 b8 00 01 
00 00 <f0> 0f c1 07 8d b0 00 01 00 00 40 84
[   25.908124] RIP: _raw_read_lock_bh+0x15/0x40 RSP: ffff880c0f9839d0
[   25.908124] CR2: 000000000000003c
[   25.908154] ---[ end trace 4f824a5c467b1873 ]---
[   25.913089] Kernel panic - not syncing: Fatal exception in interrupt
[   26.964216] Shutting down cpus with NMI
[   26.968841] Kernel Offset: disabled
[   26.975644] ---[ end Kernel panic - not syncing: Fatal exception in 
interrupt

^ permalink raw reply

* Re: [PATCH net-next RFC 8/9] net: dsa: mv88e6xxx: add rx/tx timestamping support
From: Richard Cochran @ 2017-10-08 15:12 UTC (permalink / raw)
  To: Brandon Streiff
  Cc: netdev, linux-kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Erik Hons
In-Reply-To: <1506612341-18061-9-git-send-email-brandon.streiff@ni.com>

On Thu, Sep 28, 2017 at 10:25:40AM -0500, Brandon Streiff wrote:
> We also utilize a feature of the "generation 3" PTP hardware that lets
> us to embed the timestamp value into one of the reserved fields in the
> PTP header. This lets us extract the timestamp out of the header and
> avoid an SMI access in the RX codepath. (This implementation does not
> presently support the older generations.)

That is fine for the later models, but we really need the code to read
over MDIO as well.  You added .ptp_support = true for those older
switches, and so the present series won't work.

If it helps, maybe I can adapt the relevant code from my driver to
your work.

Thanks,
Richard

^ permalink raw reply

* [BUG] stmmac: A possible sleep-in-atomic bug in stmmac_suspend
From: Jia-Ju Bai @ 2017-10-08 15:13 UTC (permalink / raw)
  To: peppe.cavallaro, alexandre.torgue, davem
  Cc: netdev, Linux Kernel Mailing List

According to stmmac_main.c, the driver may sleep under a spinlock,
and the function call path is:
stmmac_suspend (acquire the spinlock)
   stmmac_disable_all_queues
     napi_disable
       might_sleep --> may sleep
       msleep --> may sleep

This bug is found by my static analysis tool and my code review.


Thanks,
Jia-Ju Bai

^ permalink raw reply

* [PATCH net-next] ipv6: Do not use this_cpu_ptr() in preemptible context
From: Ido Schimmel @ 2017-10-08 15:18 UTC (permalink / raw)
  To: netdev; +Cc: davem, weiwan, mlxsw, Ido Schimmel

Without the rwlock and with PREEMPT_RCU we're no longer guaranteed to be
in non-preemptible context when performing a route lookup, so use
raw_cpu_ptr() instead.

Takes care of the following splat:
[  122.221814] BUG: using smp_processor_id() in preemptible [00000000] code: sshd/2672
[  122.221845] caller is debug_smp_processor_id+0x17/0x20
[  122.221866] CPU: 0 PID: 2672 Comm: sshd Not tainted 4.14.0-rc3-idosch-next-custom #639
[  122.221880] Hardware name: Mellanox Technologies Ltd. MSN2100-CB2FO/SA001017, BIOS 5.6.5 06/07/2016
[  122.221893] Call Trace:
[  122.221919]  dump_stack+0xb1/0x10c
[  122.221946]  ? _atomic_dec_and_lock+0x124/0x124
[  122.221974]  ? ___ratelimit+0xfe/0x240
[  122.222020]  check_preemption_disabled+0x173/0x1b0
[  122.222060]  debug_smp_processor_id+0x17/0x20
[  122.222083]  ip6_pol_route+0x1482/0x24a0
...

Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 net/ipv6/route.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 399d1bceec4a..579d4b73beb1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1112,7 +1112,7 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
 {
 	struct rt6_info *pcpu_rt, **p;
 
-	p = this_cpu_ptr(rt->rt6i_pcpu);
+	p = raw_cpu_ptr(rt->rt6i_pcpu);
 	pcpu_rt = *p;
 
 	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
@@ -1134,7 +1134,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
 	}
 
 	dst_hold(&pcpu_rt->dst);
-	p = this_cpu_ptr(rt->rt6i_pcpu);
+	p = raw_cpu_ptr(rt->rt6i_pcpu);
 	prev = cmpxchg(p, NULL, pcpu_rt);
 	if (prev) {
 		/* If someone did it before us, return prev instead */
-- 
2.13.6

^ permalink raw reply related

* Re: [PATCH net-next RFC 8/9] net: dsa: mv88e6xxx: add rx/tx timestamping support
From: Richard Cochran @ 2017-10-08 15:29 UTC (permalink / raw)
  To: Brandon Streiff
  Cc: netdev, linux-kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Erik Hons
In-Reply-To: <1506612341-18061-9-git-send-email-brandon.streiff@ni.com>

On Thu, Sep 28, 2017 at 10:25:40AM -0500, Brandon Streiff wrote:
> +void mv88e6xxx_port_txtstamp(struct dsa_switch *ds, int port,
> +			     struct sk_buff *clone, unsigned int type)
> +{
> +	struct mv88e6xxx_chip *chip = ds->priv;
> +	struct mv88e6xxx_port_hwtstamp *ps = &chip->port_hwtstamp[port];
> +
> +	if (!chip->info->ptp_support)
> +		return;
> +
> +	if (port < 0 || port >= mv88e6xxx_num_ports(chip))
> +		goto out;
> +
> +	if (unlikely(skb_shinfo(clone)->tx_flags & SKBTX_HW_TSTAMP) &&
> +	    mv88e6xxx_should_tstamp(chip, port, clone, type)) {
> +		__be16 *seq_ptr = (__be16 *)(_get_ptp_header(clone, type) +
> +					     OFF_PTP_SEQUENCE_ID);
> +
> +		if (!test_and_set_bit_lock(MV88E6XXX_HWTSTAMP_TX_IN_PROGRESS,
> +					   &ps->state)) {
> +			ps->tx_skb = clone;
> +			ps->tx_tstamp_start = jiffies;
> +			ps->tx_seq_id = be16_to_cpup(seq_ptr);
> +
> +			/* Fetching the timestamp is high-priority work because
> +			 * 802.1AS bounds the time for a response.

Can you please use this?

commit d9535cb7b7603aeb549c697ecdf92024e4d0a650
Author: Grygorii Strashko <grygorii.strashko@ti.com>
Date:   Fri Jul 28 17:30:02 2017 -0500

    ptp: introduce ptp auxiliary worker
    
    Many PTP drivers required to perform some asynchronous or periodic work,
    like periodically handling PHC counter overflow or handle delayed timestamp
    for RX/TX network packets. In most of the cases, such work is implemented
    using workqueues. Unfortunately, Kernel workqueues might introduce
    significant delay in work scheduling under high system load and on -RT,
    which could cause misbehavior of PTP drivers due to internal counter
    overflow, for example, and there is no way to tune its execution policy and
    priority manuallly.
    
    Hence, The kthread_worker can be used insted of workqueues, as it create
    separte named kthread for each worker and its its execution policy and
    priority can be configured using chrt tool.

> +			 * No need to check result of queue_work(). ps->tx_skb
> +			 * check ensures work item is not pending (it may be
> +			 * waiting to exit)
> +			 */
> +			queue_work(system_highpri_wq, &ps->tx_tstamp_work);
> +			return;
> +		}
> +
> +		/* Otherwise we're already in progress... */
> +		dev_dbg(chip->dev,
> +			"p%d: tx timestamp already in progress, discarding",
> +			port);
> +	}
> +
> +out:
> +	/* We don't need it after all. */
> +	kfree_skb(clone);
> +}

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH net-next RFC 0/9] net: dsa: PTP timestamping for mv88e6xxx
From: Richard Cochran @ 2017-10-08 15:38 UTC (permalink / raw)
  To: Brandon Streiff
  Cc: netdev, linux-kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Erik Hons
In-Reply-To: <20170929094323.cwh2ubv4odknlyot@localhost>

On Fri, Sep 29, 2017 at 05:43:23AM -0400, Richard Cochran wrote:
> I happy to see this series.  I just finished porting an out-of-tree
> PHC driver for the Marvell mv88e635x, and I want to mainline it, but I
> also have a few uglies.

This series looks really good.  I won't even post my mine, as that
would now be too embarrassing.

I will try to get my hands on some HW, perhaps by the end of October,
in order to test and complete your driver...

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH net-next] ipv6: Do not use this_cpu_ptr() in preemptible context
From: Eric Dumazet @ 2017-10-08 16:03 UTC (permalink / raw)
  To: Ido Schimmel; +Cc: netdev, davem, weiwan, mlxsw
In-Reply-To: <20171008151839.24519-1-idosch@mellanox.com>

On Sun, 2017-10-08 at 18:18 +0300, Ido Schimmel wrote:
> Without the rwlock and with PREEMPT_RCU we're no longer guaranteed to be
> in non-preemptible context when performing a route lookup, so use
> raw_cpu_ptr() instead.
> 
> Takes care of the following splat:
> [  122.221814] BUG: using smp_processor_id() in preemptible [00000000] code: sshd/2672
> [  122.221845] caller is debug_smp_processor_id+0x17/0x20
> [  122.221866] CPU: 0 PID: 2672 Comm: sshd Not tainted 4.14.0-rc3-idosch-next-custom #639
> [  122.221880] Hardware name: Mellanox Technologies Ltd. MSN2100-CB2FO/SA001017, BIOS 5.6.5 06/07/2016
> [  122.221893] Call Trace:
> [  122.221919]  dump_stack+0xb1/0x10c
> [  122.221946]  ? _atomic_dec_and_lock+0x124/0x124
> [  122.221974]  ? ___ratelimit+0xfe/0x240
> [  122.222020]  check_preemption_disabled+0x173/0x1b0
> [  122.222060]  debug_smp_processor_id+0x17/0x20
> [  122.222083]  ip6_pol_route+0x1482/0x24a0
> ...
> 
> Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> ---


Thanks Ido for this patch.

IMO, we no longer play this read_lock() -> write_lock() game since
ip6_dst_gc() could be called from rt6_make_pcpu_route()


So we might simplify things quite a bit, by blocking BH (and thus
preventing preemption)

Something like :

 net/ipv6/route.c |   26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 399d1bceec4a6e6736c367e706dd2acbd4093d58..606e80325b21c0e10a02e9c7d5b3fcfbfc26a003 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1136,15 +1136,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
 	dst_hold(&pcpu_rt->dst);
 	p = this_cpu_ptr(rt->rt6i_pcpu);
 	prev = cmpxchg(p, NULL, pcpu_rt);
-	if (prev) {
-		/* If someone did it before us, return prev instead */
-		/* release refcnt taken by ip6_rt_pcpu_alloc() */
-		dst_release_immediate(&pcpu_rt->dst);
-		/* release refcnt taken by above dst_hold() */
-		dst_release_immediate(&pcpu_rt->dst);
-		dst_hold(&prev->dst);
-		pcpu_rt = prev;
-	}
+	BUG_ON(prev);
 
 	rt6_dst_from_metrics_check(pcpu_rt);
 	return pcpu_rt;
@@ -1739,31 +1731,25 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		struct rt6_info *pcpu_rt;
 
 		dst_use_noref(&rt->dst, jiffies);
+		local_bh_disable();
 		pcpu_rt = rt6_get_pcpu_route(rt);
 
-		if (pcpu_rt) {
-			rcu_read_unlock();
-		} else {
+		if (!pcpu_rt) {
 			/* atomic_inc_not_zero() is needed when using rcu */
 			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
-				/* We have to do the read_unlock first
-				 * because rt6_make_pcpu_route() may trigger
-				 * ip6_dst_gc() which will take the write_lock.
-				 *
-				 * No dst_hold() on rt is needed because grabbing
+				/* No dst_hold() on rt is needed because grabbing
 				 * rt->rt6i_ref makes sure rt can't be released.
 				 */
-				rcu_read_unlock();
 				pcpu_rt = rt6_make_pcpu_route(rt);
 				rt6_release(rt);
 			} else {
 				/* rt is already removed from tree */
-				rcu_read_unlock();
 				pcpu_rt = net->ipv6.ip6_null_entry;
 				dst_hold(&pcpu_rt->dst);
 			}
 		}
-
+		local_bh_enable();
+		rcu_read_unlock();
 		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
 		return pcpu_rt;
 	}

^ permalink raw reply related

* [PATCH 00/12] radix-tree: split out struct radix_tree_root out to <linux/radix-tree-root.h>
From: Masahiro Yamada @ 2017-10-08 16:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thomas Gleixner, Andrew Morton, Ian Abbott, Ingo Molnar,
	Linus Torvalds, Masahiro Yamada, linux-cachefs, linux-sh,
	Rodrigo Vivi, dri-devel, David Airlie, linux-rdma, Yoshinori Sato,
	Tariq Toukan, Rich Felker, Leon Romanovsky, Jani Nikula,
	J. Bruce Fields, David Howells, intel-gfx, Yishai Hadas


The motivation of this series is to cut down unnecessary header
dependency in terms of radix tree.

Sub-systems or drivers that use radix-tree for data management
typically embed struct radix_tree_root in their data structures,
like this:

struct foo {
       ...

       struct radix_tree_root   foo_tree;
       ...
};

So, <linux/foo.h> needs to include <linux/radix-tree.h>,
therefore, users of <linux/foo.h> include a lot of bloat
from <linux/radix-tree.h>.

If you see the definition of radix_tree_root,

   struct radix_tree_root {
           gfp_t			gfp_mask;
	   struct radix_tree_node	__rcu *rnode;
   };

it is a very simple structure.
It only depends on <linux/types.h> for gfp_t and
<linux/compiler.h> for __rcu.

By splitting out the radix_tree_root definition,
we can reduce the header file dependency.

Reducing the header dependency will help for speeding the kernel
build, suppressing unnecessary recompile of objects during
git-bisect'ing, etc.

The patch 1 is a trivial clean-up; it is just here
to avoid conflict.

The patch 2 is the main part of this series;
split out struct radix_tree_root.

The rest of the series replace <linux/radix-tree.h>
with <linux/radix-tree-root.h> where appropriate.

Please review if the idea is OK.

If it is OK, I'd like to know how to apply the series.

Perhaps, the first two for v4.15.  Then, rest of series
will be sent per-subsystem for v4.16?

Or, can somebody take care of the whole series?

I checked allmodconfig for x86 and arm64.
I am expecting 0 day testing will check it too.



Masahiro Yamada (12):
  radix-tree: replace <linux/spinlock.h> with <linux/spinlock_types.h>
  radix-tree: split struct radix_tree_root to <linux/radix-tree-root.h>
  irqdomain: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
  writeback: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
  iocontext.h: replace <linux/radix-tree.h> with
    <linux/radix-tree-root.h>
  fs: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
  blkcg: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
  fscache: include <linux-radix-tree.h>
  sh: intc: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
  net/mlx4: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
  net/mlx5: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
  drm/i915: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>

 drivers/gpu/drm/i915/i915_gem.c            |  1 +
 drivers/gpu/drm/i915/i915_gem_context.c    |  1 +
 drivers/gpu/drm/i915/i915_gem_context.h    |  2 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  1 +
 drivers/gpu/drm/i915/i915_gem_object.h     |  1 +
 drivers/net/ethernet/mellanox/mlx4/cq.c    |  1 +
 drivers/net/ethernet/mellanox/mlx4/mlx4.h  |  2 +-
 drivers/net/ethernet/mellanox/mlx4/qp.c    |  1 +
 drivers/net/ethernet/mellanox/mlx4/srq.c   |  1 +
 drivers/sh/intc/internals.h                |  2 +-
 include/linux/backing-dev-defs.h           |  2 +-
 include/linux/blk-cgroup.h                 |  2 +-
 include/linux/fs.h                         |  2 +-
 include/linux/fscache.h                    |  1 +
 include/linux/iocontext.h                  |  2 +-
 include/linux/irqdomain.h                  |  2 +-
 include/linux/mlx4/device.h                |  2 +-
 include/linux/mlx4/qp.h                    |  1 +
 include/linux/mlx5/driver.h                |  2 +-
 include/linux/mlx5/qp.h                    |  1 +
 include/linux/radix-tree-root.h            | 24 ++++++++++++++++++++++++
 include/linux/radix-tree.h                 |  8 ++------
 22 files changed, 46 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/radix-tree-root.h

-- 
2.7.4

^ permalink raw reply

* [PATCH 10/12] net/mlx4: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
From: Masahiro Yamada @ 2017-10-08 16:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thomas Gleixner, Andrew Morton, Ian Abbott, Ingo Molnar,
	Linus Torvalds, Masahiro Yamada, linux-rdma, Yishai Hadas,
	Tariq Toukan, netdev
In-Reply-To: <1507479013-5207-1-git-send-email-yamada.masahiro@socionext.com>

The headers
 - include/linux/mlx4/device.h
 - drivers/net/ethernet/mellanox/mlx4/mlx4.h
require the definition of struct radix_tree_root, but do not need to
know anything about other radix tree stuff.

Include <linux/radix-tree-root.h> instead of <linux/radix-tree.h> to
reduce the header dependency.

While we are here, let's add missing <linux/radix-tree.h> where
radix tree accessors are used.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---

 drivers/net/ethernet/mellanox/mlx4/cq.c   | 1 +
 drivers/net/ethernet/mellanox/mlx4/mlx4.h | 2 +-
 drivers/net/ethernet/mellanox/mlx4/qp.c   | 1 +
 drivers/net/ethernet/mellanox/mlx4/srq.c  | 1 +
 include/linux/mlx4/device.h               | 2 +-
 include/linux/mlx4/qp.h                   | 1 +
 6 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c
index 72eb50c..4cbe65c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cq.c
@@ -36,6 +36,7 @@
 
 #include <linux/hardirq.h>
 #include <linux/export.h>
+#include <linux/radix-tree.h>
 
 #include <linux/mlx4/cmd.h>
 #include <linux/mlx4/cq.h>
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index c68da19..975ef70 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -38,7 +38,7 @@
 #define MLX4_H
 
 #include <linux/mutex.h>
-#include <linux/radix-tree.h>
+#include <linux/radix-tree-root.h>
 #include <linux/rbtree.h>
 #include <linux/timer.h>
 #include <linux/semaphore.h>
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index 728a2fb..50cbc62 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -35,6 +35,7 @@
 
 #include <linux/gfp.h>
 #include <linux/export.h>
+#include <linux/radix-tree.h>
 
 #include <linux/mlx4/cmd.h>
 #include <linux/mlx4/qp.h>
diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c
index bedf521..4201a46 100644
--- a/drivers/net/ethernet/mellanox/mlx4/srq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/srq.c
@@ -36,6 +36,7 @@
 #include <linux/mlx4/srq.h>
 #include <linux/export.h>
 #include <linux/gfp.h>
+#include <linux/radix-tree.h>
 
 #include "mlx4.h"
 #include "icm.h"
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index b0a57e0..75eac23 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -36,7 +36,7 @@
 #include <linux/if_ether.h>
 #include <linux/pci.h>
 #include <linux/completion.h>
-#include <linux/radix-tree.h>
+#include <linux/radix-tree-root.h>
 #include <linux/cpu_rmap.h>
 #include <linux/crash_dump.h>
 
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 8e2828d..dfa7d8e 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -35,6 +35,7 @@
 
 #include <linux/types.h>
 #include <linux/if_ether.h>
+#include <linux/radix-tree.h>
 
 #include <linux/mlx4/device.h>
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 11/12] net/mlx5: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
From: Masahiro Yamada @ 2017-10-08 16:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thomas Gleixner, Andrew Morton, Ian Abbott, Ingo Molnar,
	Linus Torvalds, Masahiro Yamada, Matan Barak, linux-rdma, netdev,
	Saeed Mahameed, Leon Romanovsky
In-Reply-To: <1507479013-5207-1-git-send-email-yamada.masahiro@socionext.com>

The header include/linux/mlx5/driver.h requires the definition of
struct radix_tree_root, but does not need to know anything about
other radix tree stuff.

Include <linux/radix-tree-root.h> instead of <linux/radix-tree.h> to
reduce the number of included header files.

Also, add <linux/radix-tree.h> to include/linux/mlx5/gp.h where radix
tree accessors are used.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---

 include/linux/mlx5/driver.h | 2 +-
 include/linux/mlx5/qp.h     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 401c897..0aea568 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -40,7 +40,7 @@
 #include <linux/semaphore.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/radix-tree.h>
+#include <linux/radix-tree-root.h>
 #include <linux/workqueue.h>
 #include <linux/mempool.h>
 #include <linux/interrupt.h>
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 66d19b6..a90996f 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -35,6 +35,7 @@
 
 #include <linux/mlx5/device.h>
 #include <linux/mlx5/driver.h>
+#include <linux/radix-tree.h>
 
 #define MLX5_INVALID_LKEY	0x100
 #define MLX5_SIG_WQE_SIZE	(MLX5_SEND_WQE_BB * 5)
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next] ipv6: Do not use this_cpu_ptr() in preemptible context
From: Ido Schimmel @ 2017-10-08 16:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Ido Schimmel, netdev, davem, weiwan, mlxsw
In-Reply-To: <1507478633.14419.30.camel@edumazet-glaptop3.roam.corp.google.com>

Hi Eric,

On Sun, Oct 08, 2017 at 09:03:53AM -0700, Eric Dumazet wrote:
> Thanks Ido for this patch.
> 
> IMO, we no longer play this read_lock() -> write_lock() game since
> ip6_dst_gc() could be called from rt6_make_pcpu_route()

Right, cause we can't deadlock anymore as with the rwlock.

> 
> So we might simplify things quite a bit, by blocking BH (and thus
> preventing preemption)
> 
> Something like :
> 
>  net/ipv6/route.c |   26 ++++++--------------------
>  1 file changed, 6 insertions(+), 20 deletions(-)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 399d1bceec4a6e6736c367e706dd2acbd4093d58..606e80325b21c0e10a02e9c7d5b3fcfbfc26a003 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -1136,15 +1136,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
>  	dst_hold(&pcpu_rt->dst);
>  	p = this_cpu_ptr(rt->rt6i_pcpu);
>  	prev = cmpxchg(p, NULL, pcpu_rt);
> -	if (prev) {
> -		/* If someone did it before us, return prev instead */
> -		/* release refcnt taken by ip6_rt_pcpu_alloc() */
> -		dst_release_immediate(&pcpu_rt->dst);
> -		/* release refcnt taken by above dst_hold() */
> -		dst_release_immediate(&pcpu_rt->dst);
> -		dst_hold(&prev->dst);
> -		pcpu_rt = prev;
> -	}
> +	BUG_ON(prev);

Is this BUG_ON() now valid because of the local_bh_disable() in
ip6_pol_route()?

>  
>  	rt6_dst_from_metrics_check(pcpu_rt);
>  	return pcpu_rt;
> @@ -1739,31 +1731,25 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
>  		struct rt6_info *pcpu_rt;
>  
>  		dst_use_noref(&rt->dst, jiffies);
> +		local_bh_disable();
>  		pcpu_rt = rt6_get_pcpu_route(rt);
>  
> -		if (pcpu_rt) {
> -			rcu_read_unlock();
> -		} else {
> +		if (!pcpu_rt) {
>  			/* atomic_inc_not_zero() is needed when using rcu */
>  			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
> -				/* We have to do the read_unlock first
> -				 * because rt6_make_pcpu_route() may trigger
> -				 * ip6_dst_gc() which will take the write_lock.
> -				 *
> -				 * No dst_hold() on rt is needed because grabbing
> +				/* No dst_hold() on rt is needed because grabbing
>  				 * rt->rt6i_ref makes sure rt can't be released.
>  				 */
> -				rcu_read_unlock();
>  				pcpu_rt = rt6_make_pcpu_route(rt);
>  				rt6_release(rt);
>  			} else {
>  				/* rt is already removed from tree */
> -				rcu_read_unlock();
>  				pcpu_rt = net->ipv6.ip6_null_entry;
>  				dst_hold(&pcpu_rt->dst);
>  			}
>  		}
> -
> +		local_bh_enable();
> +		rcu_read_unlock();
>  		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
>  		return pcpu_rt;
>  	}

I replaced my patch with yours and I don't trigger the bug anymore. Feel
free to add my tag:

Tested-by: Ido Schimmel <idosch@mellanox.com>

Thanks!

^ permalink raw reply

* Re: [PATCH 10/12] net/mlx4: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
From: David Miller @ 2017-10-08 17:00 UTC (permalink / raw)
  To: yamada.masahiro
  Cc: linux-kernel, tglx, akpm, abbotti, mingo, torvalds, linux-rdma,
	yishaih, tariqt, netdev
In-Reply-To: <1507479013-5207-11-git-send-email-yamada.masahiro@socionext.com>

From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon,  9 Oct 2017 01:10:11 +0900

> The headers
>  - include/linux/mlx4/device.h
>  - drivers/net/ethernet/mellanox/mlx4/mlx4.h
> require the definition of struct radix_tree_root, but do not need to
> know anything about other radix tree stuff.
> 
> Include <linux/radix-tree-root.h> instead of <linux/radix-tree.h> to
> reduce the header dependency.
> 
> While we are here, let's add missing <linux/radix-tree.h> where
> radix tree accessors are used.
> 
> Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>

Honestly this makes things more complicated.

The driver was trying to consolidate all of the header needs
by including them all in one place, the main driver header.

Now you're including headers in several different files.

I really don't like the results of this change and would
ask you to reconsider.

Just add both radix-tree-root.h _and_ radix-tree.h to mlx4.h
and leave the rest of the driver alone.

^ permalink raw reply

* Re: [PATCH 11/12] net/mlx5: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
From: David Miller @ 2017-10-08 17:00 UTC (permalink / raw)
  To: yamada.masahiro
  Cc: linux-kernel, tglx, akpm, abbotti, mingo, torvalds, matanb,
	linux-rdma, netdev, saeedm, leonro
In-Reply-To: <1507479013-5207-12-git-send-email-yamada.masahiro@socionext.com>

From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon,  9 Oct 2017 01:10:12 +0900

> The header include/linux/mlx5/driver.h requires the definition of
> struct radix_tree_root, but does not need to know anything about
> other radix tree stuff.
> 
> Include <linux/radix-tree-root.h> instead of <linux/radix-tree.h> to
> reduce the number of included header files.
> 
> Also, add <linux/radix-tree.h> to include/linux/mlx5/gp.h where radix
> tree accessors are used.
> 
> Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>

Same objections as the mlx4 changes, just include both headers in
driver.h

^ permalink raw reply

* Re: [PATCH 3/3] batman-adv: Add missing kerneldoc for extack
From: David Miller @ 2017-10-08 17:04 UTC (permalink / raw)
  To: sven; +Cc: b.a.t.m.a.n, netdev, dsahern
In-Reply-To: <1655154.zJsG0lsWS3@sven-edge>

From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 08 Oct 2017 09:29:17 +0200

> Are you expected to apply this change:
> =====================================
> 
> I was hoping that Simon is picking the patch up and forwards it do you in a 
> proper pull request. But I would doubt that he has a big problem with you 
> applying this single line kernel-doc change. At least it is less work for him 
> and less extra noise on both mailing lists...
> 
> But feel free to tell us your preferred solution.

Ok I'll wait to get this from the next batman-adv pull request then.

Thanks.

^ permalink raw reply

* Re: [patch net] mlxsw: spectrum_router: Avoid expensive lookup during route removal
From: David Miller @ 2017-10-08 17:06 UTC (permalink / raw)
  To: jiri; +Cc: netdev, idosch, mlxsw, dsa
In-Reply-To: <20171008095326.1238-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@resnulli.us>
Date: Sun,  8 Oct 2017 11:53:26 +0200

> From: Ido Schimmel <idosch@mellanox.com>
> 
> In commit fc922bb0dd94 ("mlxsw: spectrum_router: Use one LPM tree for
> all virtual routers") I increased the scale of supported VRFs by having
> all of them share the same LPM tree.
> 
> In order to avoid look-ups for prefix lengths that don't exist, each
> route removal would trigger an aggregation across all the active virtual
> routers to see which prefix lengths are in use and which aren't and
> structure the tree accordingly.
> 
> With the way the data structures are currently laid out, this is a very
> expensive operation. When preformed repeatedly - due to the invocation
> of the abort mechanism - and with enough VRFs, this can result in a hung
> task.
> 
> For now, avoid this optimization until it can be properly re-added in
> net-next.
> 
> Fixes: fc922bb0dd94 ("mlxsw: spectrum_router: Use one LPM tree for all virtual routers")
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> Reported-by: David Ahern <dsa@cumulusnetworks.com>
> Tested-by: David Ahern <dsa@cumulusnetworks.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>

Applied.

^ permalink raw reply

* Re: [patch net-next repost 0/2] mlxsw: Add more extack error reporting
From: David Miller @ 2017-10-08 17:07 UTC (permalink / raw)
  To: jiri; +Cc: netdev, idosch, mlxsw, dsahern
In-Reply-To: <20171008095756.2139-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@resnulli.us>
Date: Sun,  8 Oct 2017 11:57:54 +0200

> From: Jiri Pirko <jiri@mellanox.com>
> 
> Ido says:
> 
> Add error messages to VLAN and bridge enslavements to help users
> understand why the enslavement failed.

Series applied, thanks for the repost.

^ permalink raw reply

* Re: [PATCH net-next,0/3] hv_netvsc: support changing TCP hash level
From: David Miller @ 2017-10-08 17:11 UTC (permalink / raw)
  To: haiyangz, haiyangz; +Cc: netdev, kys, sthemmin, olaf, vkuznets, linux-kernel
In-Reply-To: <20171006153359.8400-1-haiyangz@exchange.microsoft.com>

From: Haiyang Zhang <haiyangz@exchange.microsoft.com>
Date: Fri,  6 Oct 2017 08:33:56 -0700

> From: Haiyang Zhang <haiyangz@microsoft.com>
> 
> The patch set simplifies the existing hash level switching code for
> UDP. It also adds the support for changing TCP hash level. So users
> can switch between L3 an L4 hash levels for TCP and UDP.

Series applied.

^ permalink raw reply

* Re: [PATCH v2] gso: fix payload length when gso_size is zero
From: David Miller @ 2017-10-08 17:13 UTC (permalink / raw)
  To: alexey.kodanev; +Cc: netdev, steffen.klassert, alexander.h.duyck
In-Reply-To: <1507305755-14393-1-git-send-email-alexey.kodanev@oracle.com>

From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Fri,  6 Oct 2017 19:02:35 +0300

> When gso_size reset to zero for the tail segment in skb_segment(), later
> in ipv6_gso_segment(), __skb_udp_tunnel_segment() and gre_gso_segment()
> we will get incorrect results (payload length, pcsum) for that segment.
> inet_gso_segment() already has a check for gso_size before calculating
> payload.
> 
> The issue was found with LTP vxlan & gre tests over ixgbe NIC.
> 
> Fixes: 07b26c9454a2 ("gso: Support partial splitting at the frag_list pointer")
> Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
> ---
> v2: also added skb_is_gso to gre_gso_segment() and __skb_udp_tunnel_segment()
> 

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH 10/12] net/mlx4: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
From: Masahiro Yamada @ 2017-10-08 17:29 UTC (permalink / raw)
  To: David Miller
  Cc: Linux Kernel Mailing List, Thomas Gleixner, Andrew Morton,
	abbotti, Ingo Molnar, Linus Torvalds, linux-rdma, yishaih, tariqt,
	netdev
In-Reply-To: <20171008.100013.2081721941420989813.davem@davemloft.net>

2017-10-09 2:00 GMT+09:00 David Miller <davem@davemloft.net>:
> From: Masahiro Yamada <yamada.masahiro@socionext.com>
> Date: Mon,  9 Oct 2017 01:10:11 +0900
>
>> The headers
>>  - include/linux/mlx4/device.h
>>  - drivers/net/ethernet/mellanox/mlx4/mlx4.h
>> require the definition of struct radix_tree_root, but do not need to
>> know anything about other radix tree stuff.
>>
>> Include <linux/radix-tree-root.h> instead of <linux/radix-tree.h> to
>> reduce the header dependency.
>>
>> While we are here, let's add missing <linux/radix-tree.h> where
>> radix tree accessors are used.
>>
>> Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
>
> Honestly this makes things more complicated.


The idea is simple; include necessary headers explicitly.

Putting everything into one common header
means most of C files are forced to parse unnecessary headers.



> The driver was trying to consolidate all of the header needs
> by including them all in one place, the main driver header.
>
> Now you're including headers in several different files.
>
> I really don't like the results of this change and would
> ask you to reconsider.
>
> Just add both radix-tree-root.h _and_ radix-tree.h to mlx4.h
> and leave the rest of the driver alone.


If you do not like this, you can just throw it away.

<linux/radix-tree.h> includes <linux/radix-tree-root.h>.
You do not need to include both.




-- 
Best Regards
Masahiro Yamada

^ permalink raw reply

* Re: [PATCH 10/12] net/mlx4: replace <linux/radix-tree.h> with <linux/radix-tree-root.h>
From: Joe Perches @ 2017-10-08 17:32 UTC (permalink / raw)
  To: Masahiro Yamada, David Miller
  Cc: Linux Kernel Mailing List, Thomas Gleixner, Andrew Morton,
	abbotti, Ingo Molnar, Linus Torvalds, linux-rdma, yishaih, tariqt,
	netdev
In-Reply-To: <CAK7LNAQ1BeSb14poieRU4uMsnsEHPNL2zSNtWi1TGN=YrK_8jA@mail.gmail.com>

On Mon, 2017-10-09 at 02:29 +0900, Masahiro Yamada wrote:
> The idea is simple; include necessary headers explicitly.

Try that for kernel.h

There's a reason aggregation of #includes is useful.

^ permalink raw reply

* Re: [PATCH net-next] ipv6: Do not use this_cpu_ptr() in preemptible context
From: Eric Dumazet @ 2017-10-08 18:25 UTC (permalink / raw)
  To: Ido Schimmel; +Cc: Ido Schimmel, netdev, davem, weiwan, mlxsw
In-Reply-To: <20171008165407.GA26635@shredder.mtl.com>

On Sun, 2017-10-08 at 19:54 +0300, Ido Schimmel wrote:
> Hi Eric,

> >  	prev = cmpxchg(p, NULL, pcpu_rt);
> > -	if (prev) {
> > -		/* If someone did it before us, return prev instead */
> > -		/* release refcnt taken by ip6_rt_pcpu_alloc() */
> > -		dst_release_immediate(&pcpu_rt->dst);
> > -		/* release refcnt taken by above dst_hold() */
> > -		dst_release_immediate(&pcpu_rt->dst);
> > -		dst_hold(&prev->dst);
> > -		pcpu_rt = prev;
> > -	}
> > +	BUG_ON(prev);
> 
> Is this BUG_ON() now valid because of the local_bh_disable() in
> ip6_pol_route()?

Yes, this bug to trigger would need this code be re-entered from a hard
IRQ, and that would be wrong of course.

^ permalink raw reply

* Re: [PATCH v6 05/11] dt-bindings: net: dwmac-sun8i: update documentation about integrated PHY
From: Corentin Labbe @ 2017-10-08 18:33 UTC (permalink / raw)
  To: Andrew Lunn, robh+dt, maxime.ripard, wens, f.fainelli
  Cc: mark.rutland, linux, catalin.marinas, will.deacon,
	peppe.cavallaro, alexandre.torgue, frowand.list, netdev,
	devicetree, linux-arm-kernel, linux-kernel, linux-sunxi
In-Reply-To: <20170928073708.GB32676@Red>

On Thu, Sep 28, 2017 at 09:37:08AM +0200, Corentin Labbe wrote:
> On Wed, Sep 27, 2017 at 04:02:10PM +0200, Andrew Lunn wrote:
> > Hi Corentin
> > 
> > > +Required properties for the mdio-mux node:
> > > +  - compatible = "mdio-mux"
> > 
> > This is too generic. Please add a more specific compatible for this
> > particular mux. You can keep "mdio-mux", since that is what the MDIO
> > subsystem will look for.
> > 
> 
> I will add allwinner,sun8i-h3-mdio-mux
> 
> > > +Required properties of the integrated phy node:
> > >  - clocks: a phandle to the reference clock for the EPHY
> > >  - resets: a phandle to the reset control for the EPHY
> > > +- phy-is-integrated
> > 
> > So the last thing you said is that the mux is not the problem
> > here. Something else is locking up. Did you discover what?
> > 
> > I really would like phy-is-integrated to go away.
> > 
> 
> I have found the problem: by enabling ephy clk/reset the timeout does not occur anymore.
> So we could remove phy-is-integrated by:
> Moving internal phy clk/reset handling in mdio_mux_syscon_switch_fn()
> But this means:
> - getting internalphy node always by manually get internal_mdio/internal_phy (and not by the given phyhandle)
> - doing some unnecessary tasks (enable/scan/disable) when external_phy is needed
> 
> Regards

Hello all

Below is the current patch, as you can read, it does not use anymore the phy-is-integrated property.
So now, the mdio-mux must always enable the internal mdio when switch_fn ask for it and so reset MAC and so need to enable ephy clk/reset.
But for this I need a reference to thoses clock and reset. (this is done in get_ephy_nodes)
The current version set those clock in mdio-mux node, and as you can see it is already ugly (lots of get next node),
if the clk/rst nodes were as it should be, in phy nodes, it will be more bad.

So, since the MAC have a dependency on thoses clk/rst nodes for doing reset(), I seek a proper way to get references on it.
OR do you agree that putting ephy clk/rst in emac is acceptable ?

thanks
regards

--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -17,6 +17,7 @@
 #include <linux/clk.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
+#include <linux/mdio-mux.h>
 #include <linux/mfd/syscon.h>
 #include <linux/module.h>
 #include <linux/of_device.h>
@@ -41,14 +42,14 @@
  *				This value is used for disabling properly EMAC
  *				and used as a good starting value in case of the
  *				boot process(uboot) leave some stuff.
- * @internal_phy:		Does the MAC embed an internal PHY
+ * @soc_has_internal_phy:	Does the MAC embed an internal PHY
  * @support_mii:		Does the MAC handle MII
  * @support_rmii:		Does the MAC handle RMII
  * @support_rgmii:		Does the MAC handle RGMII
  */
 struct emac_variant {
 	u32 default_syscon_value;
-	int internal_phy;
+	bool soc_has_internal_phy;
 	bool support_mii;
 	bool support_rmii;
 	bool support_rgmii;
@@ -61,7 +62,7 @@ struct emac_variant {
  * @rst_ephy:	reference to the optional EPHY reset for the internal PHY
  * @variant:	reference to the current board variant
  * @regmap:	regmap for using the syscon
- * @use_internal_phy: Does the current PHY choice imply using the internal PHY
+ * @internal_phy_powered: Does the internal PHY is enabled
  */
 struct sunxi_priv_data {
 	struct clk *tx_clk;
@@ -70,12 +71,13 @@ struct sunxi_priv_data {
 	struct reset_control *rst_ephy;
 	const struct emac_variant *variant;
 	struct regmap *regmap;
-	bool use_internal_phy;
+	bool internal_phy_powered;
+	void *mux_handle;
 };
 
 static const struct emac_variant emac_variant_h3 = {
 	.default_syscon_value = 0x58000,
-	.internal_phy = PHY_INTERFACE_MODE_MII,
+	.soc_has_internal_phy = true,
 	.support_mii = true,
 	.support_rmii = true,
 	.support_rgmii = true
@@ -83,20 +85,20 @@ static const struct emac_variant emac_variant_h3 = {
 
 static const struct emac_variant emac_variant_v3s = {
 	.default_syscon_value = 0x38000,
-	.internal_phy = PHY_INTERFACE_MODE_MII,
+	.soc_has_internal_phy = true,
 	.support_mii = true
 };
 
 static const struct emac_variant emac_variant_a83t = {
 	.default_syscon_value = 0,
-	.internal_phy = 0,
+	.soc_has_internal_phy = false,
 	.support_mii = true,
 	.support_rgmii = true
 };
 
 static const struct emac_variant emac_variant_a64 = {
 	.default_syscon_value = 0,
-	.internal_phy = 0,
+	.soc_has_internal_phy = false,
 	.support_mii = true,
 	.support_rmii = true,
 	.support_rgmii = true
@@ -195,6 +197,9 @@ static const struct emac_variant emac_variant_a64 = {
 #define H3_EPHY_LED_POL		BIT(17) /* 1: active low, 0: active high */
 #define H3_EPHY_SHUTDOWN	BIT(16) /* 1: shutdown, 0: power up */
 #define H3_EPHY_SELECT		BIT(15) /* 1: internal PHY, 0: external PHY */
+#define H3_EPHY_MUX_MASK	(H3_EPHY_SHUTDOWN | H3_EPHY_SELECT)
+#define DWMAC_SUN8I_MDIO_MUX_INTERNAL_ID	1
+#define DWMAC_SUN8I_MDIO_MUX_EXTERNAL_ID	2
 
 /* H3/A64 specific bits */
 #define SYSCON_RMII_EN		BIT(13) /* 1: enable RMII (overrides EPIT) */
@@ -634,6 +639,164 @@ static int sun8i_dwmac_reset(struct stmmac_priv *priv)
 	return 0;
 }
 
+static int get_ephy_nodes(struct stmmac_priv *priv)
+{
+	struct sunxi_priv_data *gmac = priv->plat->bsp_priv;
+	struct device_node *mdio_mux;
+	struct device_node *mdio_internal;
+	int ret;
+
+	mdio_mux = of_get_child_by_name(priv->plat->mdio_node, "mdio-mux");
+	if (!mdio_mux) {
+		dev_err(priv->device, "Cannot get mdio-mux node\n");
+		return -ENODEV;
+	}
+
+	mdio_internal = of_find_compatible_node(mdio_mux, NULL,
+						"allwinner,sun8i-h3-mdio-internal");
+	if (!mdio_internal) {
+		dev_err(priv->device, "Cannot get internal_mdio node\n");
+		return -ENODEV;
+	}
+
+	/* insert here code to get child phynode */
+	gmac->ephy_clk = of_clk_get(mdio_internal, 0);
+	if (IS_ERR(gmac->ephy_clk)) {
+		ret = PTR_ERR(gmac->ephy_clk);
+		dev_err(priv->device, "Cannot get EPHY clock: %d\n", ret);
+		return -EINVAL;
+	}
+
+	/* Note that we cannot use devm_reset_control_get, since
+	 * the reset is not in the device node.
+	 */
+	gmac->rst_ephy = of_reset_control_get_exclusive(mdio_internal, NULL);
+	if (IS_ERR(gmac->rst_ephy)) {
+		ret = PTR_ERR(gmac->rst_ephy);
+		if (ret == -EPROBE_DEFER)
+			return ret;
+		dev_err(priv->device, "No EPHY reset control found %d\n", ret);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int sun8i_dwmac_power_internal_phy(struct stmmac_priv *priv)
+{
+	struct sunxi_priv_data *gmac = priv->plat->bsp_priv;
+	int ret;
+
+	if (gmac->internal_phy_powered) {
+		dev_warn(priv->device, "Internal PHY already powered\n");
+		return 0;
+	}
+
+	dev_info(priv->device, "Powering internal PHY\n");
+	ret = clk_prepare_enable(gmac->ephy_clk);
+	if (ret) {
+		dev_err(priv->device, "Cannot enable internal PHY\n");
+		return ret;
+	}
+
+	/* Make sure the EPHY is properly reseted, as U-Boot may leave
+	 * it at deasserted state, and thus it may fail to reset EMAC.
+	 */
+	reset_control_assert(gmac->rst_ephy);
+
+	ret = reset_control_deassert(gmac->rst_ephy);
+	if (ret) {
+		dev_err(priv->device, "Cannot deassert internal phy\n");
+		clk_disable_unprepare(gmac->ephy_clk);
+		return ret;
+	}
+
+	gmac->internal_phy_powered = true;
+
+	return 0;
+}
+
+static int sun8i_dwmac_unpower_internal_phy(struct sunxi_priv_data *gmac)
+{
+	if (!gmac->internal_phy_powered)
+		return 0;
+
+	clk_disable_unprepare(gmac->ephy_clk);
+	reset_control_assert(gmac->rst_ephy);
+	gmac->internal_phy_powered = false;
+	return 0;
+}
+
+/* MDIO multiplexing switch function
+ * This function is called by the mdio-mux layer when it thinks the mdio bus
+ * multiplexer needs to switch.
+ * 'current_child' is the current value of the mux register
+ * 'desired_child' is the value of the 'reg' property of the target child MDIO
+ * node.
+ * The first time this function is called, current_child == -1.
+ * If current_child == desired_child, then the mux is already set to the
+ * correct bus.
+ *
+ * Note that we do not use reg/mask like mdio-mux-mmioreg because we need to
+ * know easily which bus is used (reset must be done only for desired bus).
+ */
+static int mdio_mux_syscon_switch_fn(int current_child, int desired_child,
+				     void *data)
+{
+	struct stmmac_priv *priv = data;
+	struct sunxi_priv_data *gmac = priv->plat->bsp_priv;
+	u32 reg, val;
+	int ret = 0;
+	bool need_power_ephy = false;
+
+	if (current_child ^ desired_child) {
+		regmap_read(gmac->regmap, SYSCON_EMAC_REG, &reg);
+		switch (desired_child) {
+		case DWMAC_SUN8I_MDIO_MUX_INTERNAL_ID:
+			dev_info(priv->device, "Switch mux to internal PHY");
+			val = (reg & ~H3_EPHY_MUX_MASK) | H3_EPHY_SELECT;
+
+			need_power_ephy = true;
+			break;
+		case DWMAC_SUN8I_MDIO_MUX_EXTERNAL_ID:
+			dev_info(priv->device, "Switch mux to external PHY");
+			val = (reg & ~H3_EPHY_MUX_MASK) | H3_EPHY_SHUTDOWN;
+			need_power_ephy = false;
+			break;
+		default:
+			dev_err(priv->device, "Invalid child ID %x\n",
+				desired_child);
+			return -EINVAL;
+		}
+		regmap_write(gmac->regmap, SYSCON_EMAC_REG, val);
+		if (need_power_ephy) {
+			ret = sun8i_dwmac_power_internal_phy(priv);
+			if (ret)
+				return ret;
+		} else {
+			sun8i_dwmac_unpower_internal_phy(gmac);
+		}
+		/* After changing syscon value, the MAC need reset or it will
+		 * use the last value (and so the last PHY set).
+		 */
+		ret = sun8i_dwmac_reset(priv);
+	}
+	return ret;
+}
+
+static int sun8i_dwmac_register_mdio_mux(struct stmmac_priv *priv)
+{
+	int ret;
+	struct device_node *mdio_mux;
+	struct sunxi_priv_data *gmac = priv->plat->bsp_priv;
+
+	mdio_mux = of_get_child_by_name(priv->plat->mdio_node, "mdio-mux");
+	if (!mdio_mux)
+		return -ENODEV;
+
+	ret = mdio_mux_init(priv->device, mdio_mux, mdio_mux_syscon_switch_fn,
+			    &gmac->mux_handle, priv, priv->mii);
+	return ret;
+}
+
 static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv)
 {
 	struct sunxi_priv_data *gmac = priv->plat->bsp_priv;
@@ -648,35 +811,25 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv)
 			 "Current syscon value is not the default %x (expect %x)\n",
 			 val, reg);
 
-	if (gmac->variant->internal_phy) {
-		if (!gmac->use_internal_phy) {
-			/* switch to external PHY interface */
-			reg &= ~H3_EPHY_SELECT;
-		} else {
-			reg |= H3_EPHY_SELECT;
-			reg &= ~H3_EPHY_SHUTDOWN;
-			dev_dbg(priv->device, "Select internal_phy %x\n", reg);
-
-			if (of_property_read_bool(priv->plat->phy_node,
-						  "allwinner,leds-active-low"))
-				reg |= H3_EPHY_LED_POL;
-			else
-				reg &= ~H3_EPHY_LED_POL;
-
-			/* Force EPHY xtal frequency to 24MHz. */
-			reg |= H3_EPHY_CLK_SEL;
-
-			ret = of_mdio_parse_addr(priv->device,
-						 priv->plat->phy_node);
-			if (ret < 0) {
-				dev_err(priv->device, "Could not parse MDIO addr\n");
-				return ret;
-			}
-			/* of_mdio_parse_addr returns a valid (0 ~ 31) PHY
-			 * address. No need to mask it again.
-			 */
-			reg |= ret << H3_EPHY_ADDR_SHIFT;
+	if (gmac->variant->soc_has_internal_phy) {
+		if (of_property_read_bool(priv->plat->phy_node,
+					  "allwinner,leds-active-low"))
+			reg |= H3_EPHY_LED_POL;
+		else
+			reg &= ~H3_EPHY_LED_POL;
+
+		/* Force EPHY xtal frequency to 24MHz. */
+		reg |= H3_EPHY_CLK_SEL;
+
+		ret = of_mdio_parse_addr(priv->device, priv->plat->phy_node);
+		if (ret < 0) {
+			dev_err(priv->device, "Could not parse MDIO addr\n");
+			return ret;
 		}
+		/* of_mdio_parse_addr returns a valid (0 ~ 31) PHY
+		 * address. No need to mask it again.
+		 */
+		reg |= 1 << H3_EPHY_ADDR_SHIFT;
 	}
 
 	if (!of_property_read_u32(node, "allwinner,tx-delay-ps", &val)) {
@@ -746,81 +899,21 @@ static void sun8i_dwmac_unset_syscon(struct sunxi_priv_data *gmac)
 	regmap_write(gmac->regmap, SYSCON_EMAC_REG, reg);
 }
 
-static int sun8i_dwmac_power_internal_phy(struct stmmac_priv *priv)
+static void sun8i_dwmac_exit(struct platform_device *pdev, void *priv)
 {
-	struct sunxi_priv_data *gmac = priv->plat->bsp_priv;
-	int ret;
-
-	if (!gmac->use_internal_phy)
-		return 0;
-
-	ret = clk_prepare_enable(gmac->ephy_clk);
-	if (ret) {
-		dev_err(priv->device, "Cannot enable ephy\n");
-		return ret;
-	}
-
-	/* Make sure the EPHY is properly reseted, as U-Boot may leave
-	 * it at deasserted state, and thus it may fail to reset EMAC.
-	 */
-	reset_control_assert(gmac->rst_ephy);
+	struct sunxi_priv_data *gmac = priv;
 
-	ret = reset_control_deassert(gmac->rst_ephy);
-	if (ret) {
-		dev_err(priv->device, "Cannot deassert ephy\n");
-		clk_disable_unprepare(gmac->ephy_clk);
-		return ret;
+	if (gmac->variant->soc_has_internal_phy) {
+		/* sun8i_dwmac_exit could be called with mdiomux uninit */
+		if (gmac->mux_handle)
+			mdio_mux_uninit(gmac->mux_handle);
+		if (gmac->internal_phy_powered)
+			sun8i_dwmac_unpower_internal_phy(gmac);
 	}
 
-	return 0;
-}
-
-static int sun8i_dwmac_unpower_internal_phy(struct sunxi_priv_data *gmac)
-{
-	if (!gmac->use_internal_phy)
-		return 0;
-
-	clk_disable_unprepare(gmac->ephy_clk);
-	reset_control_assert(gmac->rst_ephy);
-	return 0;
-}
-
-/* sun8i_power_phy() - Activate the PHY:
- * In case of error, no need to call sun8i_unpower_phy(),
- * it will be called anyway by sun8i_dwmac_exit()
- */
-static int sun8i_power_phy(struct stmmac_priv *priv)
-{
-	int ret;
-
-	ret = sun8i_dwmac_power_internal_phy(priv);
-	if (ret)
-		return ret;
-
-	ret = sun8i_dwmac_set_syscon(priv);
-	if (ret)
-		return ret;
-
-	/* After changing syscon value, the MAC need reset or it will use
-	 * the last value (and so the last PHY set.
-	 */
-	ret = sun8i_dwmac_reset(priv);
-	if (ret)
-		return ret;
-	return 0;
-}
-
-static void sun8i_unpower_phy(struct sunxi_priv_data *gmac)
-{
 	sun8i_dwmac_unset_syscon(gmac);
-	sun8i_dwmac_unpower_internal_phy(gmac);
-}
 
-static void sun8i_dwmac_exit(struct platform_device *pdev, void *priv)
-{
-	struct sunxi_priv_data *gmac = priv;
-
-	sun8i_unpower_phy(gmac);
+	reset_control_put(gmac->rst_ephy);
 
 	clk_disable_unprepare(gmac->tx_clk);
 
@@ -849,7 +942,7 @@ static struct mac_device_info *sun8i_dwmac_setup(void *ppriv)
 	if (!mac)
 		return NULL;
 
-	ret = sun8i_power_phy(priv);
+	ret = sun8i_dwmac_set_syscon(priv);
 	if (ret)
 		return NULL;
 
@@ -889,6 +982,8 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
 	struct sunxi_priv_data *gmac;
 	struct device *dev = &pdev->dev;
 	int ret;
+	struct stmmac_priv *priv;
+	struct net_device *ndev;
 
 	ret = stmmac_get_platform_resources(pdev, &stmmac_res);
 	if (ret)
@@ -932,29 +1027,6 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
 	}
 
 	plat_dat->interface = of_get_phy_mode(dev->of_node);
-	if (plat_dat->interface == gmac->variant->internal_phy) {
-		dev_info(&pdev->dev, "Will use internal PHY\n");
-		gmac->use_internal_phy = true;
-		gmac->ephy_clk = of_clk_get(plat_dat->phy_node, 0);
-		if (IS_ERR(gmac->ephy_clk)) {
-			ret = PTR_ERR(gmac->ephy_clk);
-			dev_err(&pdev->dev, "Cannot get EPHY clock: %d\n", ret);
-			return -EINVAL;
-		}
-
-		gmac->rst_ephy = of_reset_control_get(plat_dat->phy_node, NULL);
-		if (IS_ERR(gmac->rst_ephy)) {
-			ret = PTR_ERR(gmac->rst_ephy);
-			if (ret == -EPROBE_DEFER)
-				return ret;
-			dev_err(&pdev->dev, "No EPHY reset control found %d\n",
-				ret);
-			return -EINVAL;
-		}
-	} else {
-		dev_info(&pdev->dev, "Will use external PHY\n");
-		gmac->use_internal_phy = false;
-	}
 
 	/* platform data specifying hardware features and callbacks.
 	 * hardware features were copied from Allwinner drivers.
@@ -973,9 +1045,34 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
 
 	ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
 	if (ret)
-		sun8i_dwmac_exit(pdev, plat_dat->bsp_priv);
+		goto dwmac_exit;
+
+	ndev = dev_get_drvdata(&pdev->dev);
+	priv = netdev_priv(ndev);
+	/* The mux must be registered after parent MDIO
+	 * so after stmmac_dvr_probe()
+	 */
+	if (gmac->variant->soc_has_internal_phy) {
+		ret = get_ephy_nodes(priv);
+		if (ret)
+			return ret;
+		ret = sun8i_dwmac_register_mdio_mux(priv);
+		if (ret) {
+			dev_err(&pdev->dev, "Failed to register mux\n");
+			goto dwmac_mux;
+		}
+	} else {
+		ret = sun8i_dwmac_reset(priv);
+		if (ret)
+			goto dwmac_exit;
+	}
 
 	return ret;
+dwmac_mux:
+	sun8i_dwmac_unset_syscon(gmac);
+dwmac_exit:
+	sun8i_dwmac_exit(pdev, plat_dat->bsp_priv);
+return ret;
 }
 
 static const struct of_device_id sun8i_dwmac_match[] = {
-- 
2.13.6

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox