Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v4 5/6] net: mscc: remove the frame_info cpuq member
From: Antoine Tenart @ 2019-07-25 14:27 UTC (permalink / raw)
  To: davem, richardcochran, alexandre.belloni, UNGLinuxDriver
  Cc: Antoine Tenart, netdev, thomas.petazzoni, allan.nielsen
In-Reply-To: <20190725142707.9313-1-antoine.tenart@bootlin.com>

In struct frame_info, the cpuq member is never used. This cosmetic patch
removes it from the structure, and from the parsing of the frame header
as it's only set but never used.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 drivers/net/ethernet/mscc/ocelot.h       | 1 -
 drivers/net/ethernet/mscc/ocelot_board.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index e0da8b4eddf2..515dee6fa8a6 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -45,7 +45,6 @@ struct frame_info {
 	u32 len;
 	u16 port;
 	u16 vid;
-	u8 cpuq;
 	u8 tag_type;
 };
 
diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c
index 5e4f1718dd99..df8d15994a89 100644
--- a/drivers/net/ethernet/mscc/ocelot_board.c
+++ b/drivers/net/ethernet/mscc/ocelot_board.c
@@ -33,7 +33,6 @@ static int ocelot_parse_ifh(u32 *_ifh, struct frame_info *info)
 
 	info->port = IFH_EXTRACT_BITFIELD64(ifh[1], 43, 4);
 
-	info->cpuq = IFH_EXTRACT_BITFIELD64(ifh[1], 20, 8);
 	info->tag_type = IFH_EXTRACT_BITFIELD64(ifh[1], 16,  1);
 	info->vid = IFH_EXTRACT_BITFIELD64(ifh[1], 0,  12);
 
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v4 6/6] net: mscc: PTP Hardware Clock (PHC) support
From: Antoine Tenart @ 2019-07-25 14:27 UTC (permalink / raw)
  To: davem, richardcochran, alexandre.belloni, UNGLinuxDriver
  Cc: Antoine Tenart, netdev, thomas.petazzoni, allan.nielsen
In-Reply-To: <20190725142707.9313-1-antoine.tenart@bootlin.com>

This patch adds support for PTP Hardware Clock (PHC) to the Ocelot
switch for both PTP 1-step and 2-step modes.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 drivers/net/ethernet/mscc/ocelot.c       | 394 ++++++++++++++++++++++-
 drivers/net/ethernet/mscc/ocelot.h       |  39 +++
 drivers/net/ethernet/mscc/ocelot_board.c | 111 ++++++-
 3 files changed, 536 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index b71e4ecbe469..b08fcec73a30 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/netdevice.h>
 #include <linux/phy.h>
+#include <linux/ptp_clock_kernel.h>
 #include <linux/skbuff.h>
 #include <linux/iopoll.h>
 #include <net/arp.h>
@@ -538,7 +539,7 @@ static int ocelot_port_stop(struct net_device *dev)
  */
 static int ocelot_gen_ifh(u32 *ifh, struct frame_info *info)
 {
-	ifh[0] = IFH_INJ_BYPASS;
+	ifh[0] = IFH_INJ_BYPASS | ((0x1ff & info->rew_op) << 21);
 	ifh[1] = (0xf00 & info->port) >> 8;
 	ifh[2] = (0xff & info->port) << 24;
 	ifh[3] = (info->tag_type << 16) | info->vid;
@@ -548,6 +549,7 @@ static int ocelot_gen_ifh(u32 *ifh, struct frame_info *info)
 
 static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	struct ocelot_port *port = netdev_priv(dev);
 	struct ocelot *ocelot = port->ocelot;
 	u32 val, ifh[IFH_LEN];
@@ -566,6 +568,14 @@ static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
 	info.port = BIT(port->chip_port);
 	info.tag_type = IFH_TAG_TYPE_C;
 	info.vid = skb_vlan_tag_get(skb);
+
+	/* Check if timestamping is needed */
+	if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP) {
+		info.rew_op = port->ptp_cmd;
+		if (port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP)
+			info.rew_op |= (port->ts_id  % 4) << 3;
+	}
+
 	ocelot_gen_ifh(ifh, &info);
 
 	for (i = 0; i < IFH_LEN; i++)
@@ -596,11 +606,51 @@ static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	dev->stats.tx_packets++;
 	dev->stats.tx_bytes += skb->len;
-	dev_kfree_skb_any(skb);
+
+	if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP &&
+	    port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) {
+		struct ocelot_skb *oskb =
+			kzalloc(sizeof(struct ocelot_skb), GFP_ATOMIC);
+
+		oskb->skb = skb;
+		oskb->id = port->ts_id % 4;
+		port->ts_id++;
+
+		list_add_tail(&oskb->head, &port->skbs);
+	} else {
+		dev_kfree_skb_any(skb);
+	}
 
 	return NETDEV_TX_OK;
 }
 
+void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts)
+{
+	unsigned long flags;
+	u32 val;
+
+	spin_lock_irqsave(&ocelot->ptp_clock_lock, flags);
+
+	/* Read current PTP time to get seconds */
+	val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN);
+
+	val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM);
+	val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_SAVE);
+	ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN);
+	ts->tv_sec = ocelot_read_rix(ocelot, PTP_PIN_TOD_SEC_LSB, TOD_ACC_PIN);
+
+	/* Read packet HW timestamp from FIFO */
+	val = ocelot_read(ocelot, SYS_PTP_TXSTAMP);
+	ts->tv_nsec = SYS_PTP_TXSTAMP_PTP_TXSTAMP(val);
+
+	/* Sec has incremented since the ts was registered */
+	if ((ts->tv_sec & 0x1) != !!(val & SYS_PTP_TXSTAMP_PTP_TXSTAMP_SEC))
+		ts->tv_sec--;
+
+	spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags);
+}
+EXPORT_SYMBOL(ocelot_get_hwtimestamp);
+
 static int ocelot_mc_unsync(struct net_device *dev, const unsigned char *addr)
 {
 	struct ocelot_port *port = netdev_priv(dev);
@@ -917,6 +967,97 @@ static int ocelot_get_port_parent_id(struct net_device *dev,
 	return 0;
 }
 
+static int ocelot_hwstamp_get(struct ocelot_port *port, struct ifreq *ifr)
+{
+	struct ocelot *ocelot = port->ocelot;
+
+	return copy_to_user(ifr->ifr_data, &ocelot->hwtstamp_config,
+			    sizeof(ocelot->hwtstamp_config)) ? -EFAULT : 0;
+}
+
+static int ocelot_hwstamp_set(struct ocelot_port *port, struct ifreq *ifr)
+{
+	struct ocelot *ocelot = port->ocelot;
+	struct hwtstamp_config cfg;
+
+	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+		return -EFAULT;
+
+	/* reserved for future extensions */
+	if (cfg.flags)
+		return -EINVAL;
+
+	/* Tx type sanity check */
+	switch (cfg.tx_type) {
+	case HWTSTAMP_TX_ON:
+		port->ptp_cmd = IFH_REW_OP_TWO_STEP_PTP;
+		break;
+	case HWTSTAMP_TX_ONESTEP_SYNC:
+		/* IFH_REW_OP_ONE_STEP_PTP updates the correctional field, we
+		 * need to update the origin time.
+		 */
+		port->ptp_cmd = IFH_REW_OP_ORIGIN_PTP;
+		break;
+	case HWTSTAMP_TX_OFF:
+		port->ptp_cmd = 0;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	mutex_lock(&ocelot->ptp_lock);
+
+	switch (cfg.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		break;
+	case HWTSTAMP_FILTER_ALL:
+	case HWTSTAMP_FILTER_SOME:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		cfg.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+		break;
+	default:
+		mutex_unlock(&ocelot->ptp_lock);
+		return -ERANGE;
+	}
+
+	/* Commit back the result & save it */
+	memcpy(&ocelot->hwtstamp_config, &cfg, sizeof(cfg));
+	mutex_unlock(&ocelot->ptp_lock);
+
+	return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
+}
+
+static int ocelot_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+
+	/* The function is only used for PTP operations for now */
+	if (!ocelot->ptp)
+		return -EOPNOTSUPP;
+
+	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		return ocelot_hwstamp_set(port, ifr);
+	case SIOCGHWTSTAMP:
+		return ocelot_hwstamp_get(port, ifr);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static const struct net_device_ops ocelot_port_netdev_ops = {
 	.ndo_open			= ocelot_port_open,
 	.ndo_stop			= ocelot_port_stop,
@@ -933,6 +1074,7 @@ static const struct net_device_ops ocelot_port_netdev_ops = {
 	.ndo_set_features		= ocelot_set_features,
 	.ndo_get_port_parent_id		= ocelot_get_port_parent_id,
 	.ndo_setup_tc			= ocelot_setup_tc,
+	.ndo_do_ioctl			= ocelot_ioctl,
 };
 
 static void ocelot_get_strings(struct net_device *netdev, u32 sset, u8 *data)
@@ -1014,12 +1156,37 @@ static int ocelot_get_sset_count(struct net_device *dev, int sset)
 	return ocelot->num_stats;
 }
 
+static int ocelot_get_ts_info(struct net_device *dev,
+			      struct ethtool_ts_info *info)
+{
+	struct ocelot_port *ocelot_port = netdev_priv(dev);
+	struct ocelot *ocelot = ocelot_port->ocelot;
+
+	if (!ocelot->ptp)
+		return ethtool_op_get_ts_info(dev, info);
+
+	info->phc_index = ocelot->ptp_clock ?
+			  ptp_clock_index(ocelot->ptp_clock) : -1;
+	info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE |
+				 SOF_TIMESTAMPING_RX_SOFTWARE |
+				 SOF_TIMESTAMPING_SOFTWARE |
+				 SOF_TIMESTAMPING_TX_HARDWARE |
+				 SOF_TIMESTAMPING_RX_HARDWARE |
+				 SOF_TIMESTAMPING_RAW_HARDWARE;
+	info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON) |
+			 BIT(HWTSTAMP_TX_ONESTEP_SYNC);
+	info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | BIT(HWTSTAMP_FILTER_ALL);
+
+	return 0;
+}
+
 static const struct ethtool_ops ocelot_ethtool_ops = {
 	.get_strings		= ocelot_get_strings,
 	.get_ethtool_stats	= ocelot_get_ethtool_stats,
 	.get_sset_count		= ocelot_get_sset_count,
 	.get_link_ksettings	= phy_ethtool_get_link_ksettings,
 	.set_link_ksettings	= phy_ethtool_set_link_ksettings,
+	.get_ts_info		= ocelot_get_ts_info,
 };
 
 static int ocelot_port_attr_stp_state_set(struct ocelot_port *ocelot_port,
@@ -1629,6 +1796,196 @@ struct notifier_block ocelot_switchdev_blocking_nb __read_mostly = {
 };
 EXPORT_SYMBOL(ocelot_switchdev_blocking_nb);
 
+int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts)
+{
+	struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info);
+	unsigned long flags;
+	time64_t s;
+	u32 val;
+	s64 ns;
+
+	spin_lock_irqsave(&ocelot->ptp_clock_lock, flags);
+
+	val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN);
+	val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM);
+	val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_SAVE);
+	ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN);
+
+	s = ocelot_read_rix(ocelot, PTP_PIN_TOD_SEC_MSB, TOD_ACC_PIN) & 0xffff;
+	s <<= 32;
+	s += ocelot_read_rix(ocelot, PTP_PIN_TOD_SEC_LSB, TOD_ACC_PIN);
+	ns = ocelot_read_rix(ocelot, PTP_PIN_TOD_NSEC, TOD_ACC_PIN);
+
+	spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags);
+
+	/* Deal with negative values */
+	if (ns >= 0x3ffffff0 && ns <= 0x3fffffff) {
+		s--;
+		ns &= 0xf;
+		ns += 999999984;
+	}
+
+	set_normalized_timespec64(ts, s, ns);
+	return 0;
+}
+EXPORT_SYMBOL(ocelot_ptp_gettime64);
+
+static int ocelot_ptp_settime64(struct ptp_clock_info *ptp,
+				const struct timespec64 *ts)
+{
+	struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info);
+	unsigned long flags;
+	u32 val;
+
+	spin_lock_irqsave(&ocelot->ptp_clock_lock, flags);
+
+	val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN);
+	val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM);
+	val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_IDLE);
+
+	ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN);
+
+	ocelot_write_rix(ocelot, lower_32_bits(ts->tv_sec), PTP_PIN_TOD_SEC_LSB,
+			 TOD_ACC_PIN);
+	ocelot_write_rix(ocelot, upper_32_bits(ts->tv_sec), PTP_PIN_TOD_SEC_MSB,
+			 TOD_ACC_PIN);
+	ocelot_write_rix(ocelot, ts->tv_nsec, PTP_PIN_TOD_NSEC, TOD_ACC_PIN);
+
+	val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN);
+	val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM);
+	val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_LOAD);
+
+	ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN);
+
+	spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags);
+	return 0;
+}
+
+static int ocelot_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	if (delta > -(NSEC_PER_SEC / 2) && delta < (NSEC_PER_SEC / 2)) {
+		struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info);
+		unsigned long flags;
+		u32 val;
+
+		spin_lock_irqsave(&ocelot->ptp_clock_lock, flags);
+
+		val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN);
+		val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM);
+		val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_IDLE);
+
+		ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN);
+
+		ocelot_write_rix(ocelot, 0, PTP_PIN_TOD_SEC_LSB, TOD_ACC_PIN);
+		ocelot_write_rix(ocelot, 0, PTP_PIN_TOD_SEC_MSB, TOD_ACC_PIN);
+		ocelot_write_rix(ocelot, delta, PTP_PIN_TOD_NSEC, TOD_ACC_PIN);
+
+		val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN);
+		val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM);
+		val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_DELTA);
+
+		ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN);
+
+		spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags);
+	} else {
+		/* Fall back using ocelot_ptp_settime64 which is not exact. */
+		struct timespec64 ts;
+		u64 now;
+
+		ocelot_ptp_gettime64(ptp, &ts);
+
+		now = ktime_to_ns(timespec64_to_ktime(ts));
+		ts = ns_to_timespec64(now + delta);
+
+		ocelot_ptp_settime64(ptp, &ts);
+	}
+	return 0;
+}
+
+static int ocelot_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
+{
+	struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info);
+	u32 unit = 0, direction = 0;
+	unsigned long flags;
+	u64 adj = 0;
+
+	spin_lock_irqsave(&ocelot->ptp_clock_lock, flags);
+
+	if (!scaled_ppm)
+		goto disable_adj;
+
+	if (scaled_ppm < 0) {
+		direction = PTP_CFG_CLK_ADJ_CFG_DIR;
+		scaled_ppm = -scaled_ppm;
+	}
+
+	adj = PSEC_PER_SEC << 16;
+	do_div(adj, scaled_ppm);
+	do_div(adj, 1000);
+
+	/* If the adjustment value is too large, use ns instead */
+	if (adj >= (1L << 30)) {
+		unit = PTP_CFG_CLK_ADJ_FREQ_NS;
+		do_div(adj, 1000);
+	}
+
+	/* Still too big */
+	if (adj >= (1L << 30))
+		goto disable_adj;
+
+	ocelot_write(ocelot, unit | adj, PTP_CLK_CFG_ADJ_FREQ);
+	ocelot_write(ocelot, PTP_CFG_CLK_ADJ_CFG_ENA | direction,
+		     PTP_CLK_CFG_ADJ_CFG);
+
+	spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags);
+	return 0;
+
+disable_adj:
+	ocelot_write(ocelot, 0, PTP_CLK_CFG_ADJ_CFG);
+
+	spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags);
+	return 0;
+}
+
+static struct ptp_clock_info ocelot_ptp_clock_info = {
+	.owner		= THIS_MODULE,
+	.name		= "ocelot ptp",
+	.max_adj	= 0x7fffffff,
+	.n_alarm	= 0,
+	.n_ext_ts	= 0,
+	.n_per_out	= 0,
+	.n_pins		= 0,
+	.pps		= 0,
+	.gettime64	= ocelot_ptp_gettime64,
+	.settime64	= ocelot_ptp_settime64,
+	.adjtime	= ocelot_ptp_adjtime,
+	.adjfine	= ocelot_ptp_adjfine,
+};
+
+static int ocelot_init_timestamp(struct ocelot *ocelot)
+{
+	ocelot->ptp_info = ocelot_ptp_clock_info;
+	ocelot->ptp_clock = ptp_clock_register(&ocelot->ptp_info, ocelot->dev);
+	if (IS_ERR(ocelot->ptp_clock))
+		return PTR_ERR(ocelot->ptp_clock);
+	/* Check if PHC support is missing at the configuration level */
+	if (!ocelot->ptp_clock)
+		return 0;
+
+	ocelot_write(ocelot, SYS_PTP_CFG_PTP_STAMP_WID(30), SYS_PTP_CFG);
+	ocelot_write(ocelot, 0xffffffff, ANA_TABLES_PTP_ID_LOW);
+	ocelot_write(ocelot, 0xffffffff, ANA_TABLES_PTP_ID_HIGH);
+
+	ocelot_write(ocelot, PTP_CFG_MISC_PTP_EN, PTP_CFG_MISC);
+
+	/* There is no device reconfiguration, PTP Rx stamping is always
+	 * enabled.
+	 */
+	ocelot->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+
+	return 0;
+}
+
 int ocelot_probe_port(struct ocelot *ocelot, u8 port,
 		      void __iomem *regs,
 		      struct phy_device *phy)
@@ -1661,6 +2018,8 @@ int ocelot_probe_port(struct ocelot *ocelot, u8 port,
 	ocelot_mact_learn(ocelot, PGID_CPU, dev->dev_addr, ocelot_port->pvid,
 			  ENTRYTYPE_LOCKED);
 
+	INIT_LIST_HEAD(&ocelot_port->skbs);
+
 	err = register_netdev(dev);
 	if (err) {
 		dev_err(ocelot->dev, "register_netdev failed\n");
@@ -1684,7 +2043,7 @@ EXPORT_SYMBOL(ocelot_probe_port);
 int ocelot_init(struct ocelot *ocelot)
 {
 	u32 port;
-	int i, cpu = ocelot->num_phys_ports;
+	int i, ret, cpu = ocelot->num_phys_ports;
 	char queue_name[32];
 
 	ocelot->lags = devm_kcalloc(ocelot->dev, ocelot->num_phys_ports,
@@ -1699,6 +2058,8 @@ int ocelot_init(struct ocelot *ocelot)
 		return -ENOMEM;
 
 	mutex_init(&ocelot->stats_lock);
+	mutex_init(&ocelot->ptp_lock);
+	spin_lock_init(&ocelot->ptp_clock_lock);
 	snprintf(queue_name, sizeof(queue_name), "%s-stats",
 		 dev_name(ocelot->dev));
 	ocelot->stats_queue = create_singlethread_workqueue(queue_name);
@@ -1812,15 +2173,42 @@ int ocelot_init(struct ocelot *ocelot)
 	INIT_DELAYED_WORK(&ocelot->stats_work, ocelot_check_stats_work);
 	queue_delayed_work(ocelot->stats_queue, &ocelot->stats_work,
 			   OCELOT_STATS_CHECK_DELAY);
+
+	if (ocelot->ptp) {
+		ret = ocelot_init_timestamp(ocelot);
+		if (ret) {
+			dev_err(ocelot->dev,
+				"Timestamp initialization failed\n");
+			return ret;
+		}
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(ocelot_init);
 
 void ocelot_deinit(struct ocelot *ocelot)
 {
+	struct list_head *pos, *tmp;
+	struct ocelot_port *port;
+	struct ocelot_skb *entry;
+	int i;
+
 	destroy_workqueue(ocelot->stats_queue);
 	mutex_destroy(&ocelot->stats_lock);
 	ocelot_ace_deinit();
+
+	for (i = 0; i < ocelot->num_phys_ports; i++) {
+		port = ocelot->ports[i];
+
+		list_for_each_safe(pos, tmp, &port->skbs) {
+			entry = list_entry(pos, struct ocelot_skb, head);
+
+			list_del(pos);
+			dev_kfree_skb_any(entry->skb);
+			kfree(entry);
+		}
+	}
 }
 EXPORT_SYMBOL(ocelot_deinit);
 
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index 515dee6fa8a6..e40773c01a44 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -11,9 +11,11 @@
 #include <linux/bitops.h>
 #include <linux/etherdevice.h>
 #include <linux/if_vlan.h>
+#include <linux/net_tstamp.h>
 #include <linux/phy.h>
 #include <linux/phy/phy.h>
 #include <linux/platform_device.h>
+#include <linux/ptp_clock_kernel.h>
 #include <linux/regmap.h>
 
 #include "ocelot_ana.h"
@@ -39,6 +41,8 @@
 
 #define OCELOT_STATS_CHECK_DELAY (2 * HZ)
 
+#define OCELOT_PTP_QUEUE_SZ	128
+
 #define IFH_LEN 4
 
 struct frame_info {
@@ -46,6 +50,8 @@ struct frame_info {
 	u16 port;
 	u16 vid;
 	u8 tag_type;
+	u16 rew_op;
+	u32 timestamp;	/* rew_val */
 };
 
 #define IFH_INJ_BYPASS	BIT(31)
@@ -54,6 +60,12 @@ struct frame_info {
 #define IFH_TAG_TYPE_C 0
 #define IFH_TAG_TYPE_S 1
 
+#define IFH_REW_OP_NOOP			0x0
+#define IFH_REW_OP_DSCP			0x1
+#define IFH_REW_OP_ONE_STEP_PTP		0x2
+#define IFH_REW_OP_TWO_STEP_PTP		0x3
+#define IFH_REW_OP_ORIGIN_PTP		0x5
+
 #define OCELOT_SPEED_2500 0
 #define OCELOT_SPEED_1000 1
 #define OCELOT_SPEED_100  2
@@ -401,6 +413,13 @@ enum ocelot_regfield {
 	REGFIELD_MAX
 };
 
+enum ocelot_clk_pins {
+	ALT_PPS_PIN	= 1,
+	EXT_CLK_PIN,
+	ALT_LDST_PIN,
+	TOD_ACC_PIN
+};
+
 struct ocelot_multicast {
 	struct list_head list;
 	unsigned char addr[ETH_ALEN];
@@ -450,6 +469,13 @@ struct ocelot {
 	u64 *stats;
 	struct delayed_work stats_work;
 	struct workqueue_struct *stats_queue;
+
+	u8 ptp:1;
+	struct ptp_clock *ptp_clock;
+	struct ptp_clock_info ptp_info;
+	struct hwtstamp_config hwtstamp_config;
+	struct mutex ptp_lock; /* Protects the PTP interface state */
+	spinlock_t ptp_clock_lock; /* Protects the PTP clock */
 };
 
 struct ocelot_port {
@@ -473,6 +499,16 @@ struct ocelot_port {
 	struct phy *serdes;
 
 	struct ocelot_port_tc tc;
+
+	u8 ptp_cmd;
+	struct list_head skbs;
+	u8 ts_id;
+};
+
+struct ocelot_skb {
+	struct list_head head;
+	struct sk_buff *skb;
+	u8 id;
 };
 
 u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset);
@@ -517,4 +553,7 @@ extern struct notifier_block ocelot_netdevice_nb;
 extern struct notifier_block ocelot_switchdev_nb;
 extern struct notifier_block ocelot_switchdev_blocking_nb;
 
+int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts);
+void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts);
+
 #endif
diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c
index df8d15994a89..0b14e7110e7f 100644
--- a/drivers/net/ethernet/mscc/ocelot_board.c
+++ b/drivers/net/ethernet/mscc/ocelot_board.c
@@ -31,6 +31,8 @@ static int ocelot_parse_ifh(u32 *_ifh, struct frame_info *info)
 
 	info->len = OCELOT_BUFFER_CELL_SZ * wlen + llen - 80;
 
+	info->timestamp = IFH_EXTRACT_BITFIELD64(ifh[0], 21, 32);
+
 	info->port = IFH_EXTRACT_BITFIELD64(ifh[1], 43, 4);
 
 	info->tag_type = IFH_EXTRACT_BITFIELD64(ifh[1], 16,  1);
@@ -98,7 +100,11 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
 		int sz, len, buf_len;
 		u32 ifh[4];
 		u32 val;
-		struct frame_info info;
+		struct frame_info info = {};
+		struct timespec64 ts;
+		struct skb_shared_hwtstamps *shhwtstamps;
+		u64 tod_in_ns;
+		u64 full_ts_in_ns;
 
 		for (i = 0; i < IFH_LEN; i++) {
 			err = ocelot_rx_frame_word(ocelot, grp, true, &ifh[i]);
@@ -145,6 +151,22 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
 			break;
 		}
 
+		if (ocelot->ptp) {
+			ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
+
+			tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
+			if ((tod_in_ns & 0xffffffff) < info.timestamp)
+				full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
+						info.timestamp;
+			else
+				full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
+						info.timestamp;
+
+			shhwtstamps = skb_hwtstamps(skb);
+			memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
+			shhwtstamps->hwtstamp = full_ts_in_ns;
+		}
+
 		/* Everything we see on an interface that is in the HW bridge
 		 * has already been forwarded.
 		 */
@@ -164,6 +186,70 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
+static irqreturn_t ocelot_ptp_rdy_irq_handler(int irq, void *arg)
+{
+	int budget = OCELOT_PTP_QUEUE_SZ;
+	struct ocelot *ocelot = arg;
+
+	do {
+		struct skb_shared_hwtstamps shhwtstamps;
+		struct list_head *pos, *tmp;
+		struct sk_buff *skb = NULL;
+		struct ocelot_skb *entry;
+		struct ocelot_port *port;
+		struct timespec64 ts;
+		u32 val, id, txport;
+
+		/* Prevent from infinite loop */
+		if (unlikely(!--budget))
+			break;
+
+		val = ocelot_read(ocelot, SYS_PTP_STATUS);
+
+		/* Check if a timestamp can be retrieved */
+		if (!(val & SYS_PTP_STATUS_PTP_MESS_VLD))
+			break;
+
+		WARN_ON(val & SYS_PTP_STATUS_PTP_OVFL);
+
+		/* Retrieve the ts ID and Tx port */
+		id = SYS_PTP_STATUS_PTP_MESS_ID_X(val);
+		txport = SYS_PTP_STATUS_PTP_MESS_TXPORT_X(val);
+
+		/* Retrieve its associated skb */
+		port = ocelot->ports[txport];
+
+		list_for_each_safe(pos, tmp, &port->skbs) {
+			entry = list_entry(pos, struct ocelot_skb, head);
+			if (entry->id != id)
+				continue;
+
+			skb = entry->skb;
+
+			list_del(pos);
+			kfree(entry);
+		}
+
+		/* Next ts */
+		ocelot_write(ocelot, SYS_PTP_NXT_PTP_NXT, SYS_PTP_NXT);
+
+		if (unlikely(!skb))
+			continue;
+
+		/* Get the h/w timestamp */
+		ocelot_get_hwtimestamp(ocelot, &ts);
+
+		/* Set the timestamp into the skb */
+		memset(&shhwtstamps, 0, sizeof(shhwtstamps));
+		shhwtstamps.hwtstamp = ktime_set(ts.tv_sec, ts.tv_nsec);
+		skb_tstamp_tx(skb, &shhwtstamps);
+
+		dev_kfree_skb_any(skb);
+	} while (true);
+
+	return IRQ_HANDLED;
+}
+
 static const struct of_device_id mscc_ocelot_match[] = {
 	{ .compatible = "mscc,vsc7514-switch" },
 	{ }
@@ -172,8 +258,8 @@ MODULE_DEVICE_TABLE(of, mscc_ocelot_match);
 
 static int mscc_ocelot_probe(struct platform_device *pdev)
 {
-	int err, irq;
 	unsigned int i;
+	int err, irq_xtr, irq_ptp_rdy;
 	struct device_node *np = pdev->dev.of_node;
 	struct device_node *ports, *portnp;
 	struct ocelot *ocelot;
@@ -232,16 +318,31 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
 	if (err)
 		return err;
 
-	irq = platform_get_irq_byname(pdev, "xtr");
-	if (irq < 0)
+	irq_xtr = platform_get_irq_byname(pdev, "xtr");
+	if (irq_xtr < 0)
 		return -ENODEV;
 
-	err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+	err = devm_request_threaded_irq(&pdev->dev, irq_xtr, NULL,
 					ocelot_xtr_irq_handler, IRQF_ONESHOT,
 					"frame extraction", ocelot);
 	if (err)
 		return err;
 
+
+	irq_ptp_rdy = platform_get_irq_byname(pdev, "ptp_rdy");
+	if (irq_ptp_rdy > 0) {
+		err = devm_request_threaded_irq(&pdev->dev, irq_ptp_rdy, NULL,
+						ocelot_ptp_rdy_irq_handler,
+						IRQF_ONESHOT, "ptp ready",
+						ocelot);
+		if (err)
+			return err;
+
+		/* Check if we can support PTP */
+		if (ocelot->targets[PTP])
+			ocelot->ptp = 1;
+	}
+
 	regmap_field_write(ocelot->regfields[SYS_RESET_CFG_MEM_INIT], 1);
 	regmap_field_write(ocelot->regfields[SYS_RESET_CFG_MEM_ENA], 1);
 
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v4 4/6] net: mscc: improve the frame header parsing readability
From: Antoine Tenart @ 2019-07-25 14:27 UTC (permalink / raw)
  To: davem, richardcochran, alexandre.belloni, UNGLinuxDriver
  Cc: Antoine Tenart, netdev, thomas.petazzoni, allan.nielsen
In-Reply-To: <20190725142707.9313-1-antoine.tenart@bootlin.com>

This cosmetic patch improves the frame header parsing readability by
introducing a new macro to access and mask its fields.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 drivers/net/ethernet/mscc/ocelot_board.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c
index 990027f04d1b..5e4f1718dd99 100644
--- a/drivers/net/ethernet/mscc/ocelot_board.c
+++ b/drivers/net/ethernet/mscc/ocelot_board.c
@@ -16,24 +16,26 @@
 
 #include "ocelot.h"
 
-static int ocelot_parse_ifh(u32 *ifh, struct frame_info *info)
+#define IFH_EXTRACT_BITFIELD64(x, o, w) (((x) >> (o)) & GENMASK_ULL((w) - 1, 0))
+
+static int ocelot_parse_ifh(u32 *_ifh, struct frame_info *info)
 {
-	int i;
 	u8 llen, wlen;
+	u64 ifh[2];
+
+	ifh[0] = be64_to_cpu(((__force __be64 *)_ifh)[0]);
+	ifh[1] = be64_to_cpu(((__force __be64 *)_ifh)[1]);
 
-	/* The IFH is in network order, switch to CPU order */
-	for (i = 0; i < IFH_LEN; i++)
-		ifh[i] = ntohl((__force __be32)ifh[i]);
+	wlen = IFH_EXTRACT_BITFIELD64(ifh[0], 7,  8);
+	llen = IFH_EXTRACT_BITFIELD64(ifh[0], 15,  6);
 
-	wlen = (ifh[1] >> 7) & 0xff;
-	llen = (ifh[1] >> 15) & 0x3f;
 	info->len = OCELOT_BUFFER_CELL_SZ * wlen + llen - 80;
 
-	info->port = (ifh[2] & GENMASK(14, 11)) >> 11;
+	info->port = IFH_EXTRACT_BITFIELD64(ifh[1], 43, 4);
 
-	info->cpuq = (ifh[3] & GENMASK(27, 20)) >> 20;
-	info->tag_type = (ifh[3] & BIT(16)) >> 16;
-	info->vid = ifh[3] & GENMASK(11, 0);
+	info->cpuq = IFH_EXTRACT_BITFIELD64(ifh[1], 20, 8);
+	info->tag_type = IFH_EXTRACT_BITFIELD64(ifh[1], 16,  1);
+	info->vid = IFH_EXTRACT_BITFIELD64(ifh[1], 0,  12);
 
 	return 0;
 }
-- 
2.21.0


^ permalink raw reply related

* Re: BUG: spinlock recursion in release_sock
From: John Fastabend @ 2019-07-25 14:32 UTC (permalink / raw)
  To: syzbot, arvid.brodin, aviadye, borisp, daniel, davejwatson, davem,
	jakub.kicinski, john.fastabend, john.hurley, linux-kernel, netdev,
	simon.horman, syzkaller-bugs, willemb, xiyou.wangcong
In-Reply-To: <000000000000e8c654058e7576ef@google.com>

syzbot wrote:
> syzbot has bisected this bug to:
> 
> commit 8822e270d697010e6a4fd42a319dbefc33db91e1
> Author: John Hurley <john.hurley@netronome.com>
> Date:   Sun Jul 7 14:01:54 2019 +0000
> 
>      net: core: move push MPLS functionality from OvS to core helper
> 
> bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=13ca5a5c600000
> start commit:   9e6dfe80 Add linux-next specific files for 20190724
> git tree:       linux-next
> final crash:    https://syzkaller.appspot.com/x/report.txt?x=102a5a5c600000
> console output: https://syzkaller.appspot.com/x/log.txt?x=17ca5a5c600000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=6cbb8fc2cf2842d7
> dashboard link: https://syzkaller.appspot.com/bug?extid=e67cf584b5e6b35a8ffa
> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=13680594600000
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=15b34144600000
> 
> Reported-by: syzbot+e67cf584b5e6b35a8ffa@syzkaller.appspotmail.com
> Fixes: 8822e270d697 ("net: core: move push MPLS functionality from OvS to  
> core helper")
> 
> For information about bisection process see: https://goo.gl/tpsmEJ#bisection

This commit is wrong, it appears to be introduced by some other fixes we
pushed last couple days for tls/bpf. I'll look into it. Thanks.

^ permalink raw reply

* Re: [RFC PATCH] rxrpc: Fix -Wframe-larger-than= warnings from on-stack crypto
From: Arnd Bergmann @ 2019-07-25 14:43 UTC (permalink / raw)
  To: David Howells
  Cc: linux-afs, Herbert Xu,
	open list:HARDWARE RANDOM NUMBER GENERATOR CORE, Networking
In-Reply-To: <156406148519.15479.13870345028835442313.stgit@warthog.procyon.org.uk>

On Thu, Jul 25, 2019 at 3:31 PM David Howells <dhowells@redhat.com> wrote:
>
> rxkad sometimes triggers a warning about oversized stack frames when
> building with clang for a 32-bit architecture:
>
> net/rxrpc/rxkad.c:243:12: error: stack frame size of 1088 bytes in function 'rxkad_secure_packet' [-Werror,-Wframe-larger-than=]
> net/rxrpc/rxkad.c:501:12: error: stack frame size of 1088 bytes in function 'rxkad_verify_packet' [-Werror,-Wframe-larger-than=]
>
> The problem is the combination of SYNC_SKCIPHER_REQUEST_ON_STACK() in
> rxkad_verify_packet()/rxkad_secure_packet() with the relatively large
> scatterlist in rxkad_verify_packet_1()/rxkad_secure_packet_encrypt().
>
> The warning does not show up when using gcc, which does not inline the
> functions as aggressively, but the problem is still the same.
>
> Allocate the cipher buffers from the slab instead, caching the allocated
> packet crypto request memory used for DATA packet crypto in the rxrpc_call
> struct.
>
> Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both")
> Reported-by: Arnd Bergmann <arnd@arndb.de>
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Herbert Xu <herbert@gondor.apana.org.au>

Acked-by: Arnd Bergmann <arnd@arndb.de>

^ permalink raw reply

* Re: [PATCH net-next v2 3/4] dt-bindings: net: fsl: enetc: Add bindings for the central MDIO PCIe endpoint
From: Sergei Shtylyov @ 2019-07-25 14:49 UTC (permalink / raw)
  To: Claudiu Manoil, David S . Miller
  Cc: andrew, Rob Herring, Li Yang, alexandru.marginean, netdev,
	devicetree, linux-arm-kernel, linux-kernel
In-Reply-To: <1564053568-20522-4-git-send-email-claudiu.manoil@nxp.com>

Hello!

On 07/25/2019 02:19 PM, Claudiu Manoil wrote:

> The on-chip PCIe root complex that integrates the ENETC ethernet
> controllers also integrates a PCIe enpoint for the MDIO controller
> provinding for cetralized control of the ENETC mdio bus.

   Providing, centralized.

> Add bindings for this "central" MDIO Integrated PCIe Endpoit.
> 
> Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
> ---
> v1 - none
> v2 - none
> 
>  .../devicetree/bindings/net/fsl-enetc.txt     | 42 +++++++++++++++++--
>  1 file changed, 39 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/devicetree/bindings/net/fsl-enetc.txt b/Documentation/devicetree/bindings/net/fsl-enetc.txt
> index 25fc687419db..c090f6df7a39 100644
> --- a/Documentation/devicetree/bindings/net/fsl-enetc.txt
> +++ b/Documentation/devicetree/bindings/net/fsl-enetc.txt
[...]
> @@ -47,8 +49,42 @@ Example:
>  		};
>  	};
>  
> -2) The ENETC port is an internal port or has a fixed-link external
> -connection:
> +1.2. Using the central MDIO PCIe enpoint device

   Endpoint. -ETOOMANYTYPOS. :-)

[...]

MBR, Sergei

^ permalink raw reply

* Re: [RFC PATCH] rxrpc: Fix -Wframe-larger-than= warnings from on-stack crypto
From: David Howells @ 2019-07-25 14:52 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: dhowells, linux-afs, Herbert Xu,
	open list:HARDWARE RANDOM NUMBER GENERATOR CORE, Networking
In-Reply-To: <CAK8P3a23gnvxA3PcvFy5wadNGoCPRH7PUEY_dqJ+bk3uH5=t+g@mail.gmail.com>

Would you rather this went through net or net-next?

David

^ permalink raw reply

* Re: [PATCH 3/3] net: dsa: ksz: Add Microchip KSZ8795 DSA driver
From: Marek Vasut @ 2019-07-25 14:56 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: netdev, Tristram Ha, David S . Miller, Florian Fainelli,
	Vivien Didelot, Woojung Huh
In-Reply-To: <20190725140351.GG21952@lunn.ch>

On 7/25/19 4:03 PM, Andrew Lunn wrote:
> On Wed, Jul 24, 2019 at 03:40:48PM +0200, Marek Vasut wrote:
>> From: Tristram Ha <Tristram.Ha@microchip.com>
>> +static void ksz8795_phy_setup(struct ksz_device *dev, int port,
>> +			      struct phy_device *phy)
>> +{
>> +	if (port < dev->phy_port_cnt) {
>> +		/*
>> +		 * SUPPORTED_Asym_Pause and SUPPORTED_Pause can be removed to
>> +		 * disable flow control when rate limiting is used.
>> +		 */
>> +		linkmode_copy(phy->advertising, phy->supported);
>> +	}
>> +}
> 
> Hi Marek
> 
> Do you know why this is needed?

Unfortunately, no.

It seems it copies supported features of the PHY to advertised features
of the PHY for ports which are downstream (i.e. not the CPU port).

^ permalink raw reply

* Re: [PATCH 3/3] net: dsa: ksz: Add Microchip KSZ8795 DSA driver
From: Andrew Lunn @ 2019-07-25 14:59 UTC (permalink / raw)
  To: Marek Vasut
  Cc: netdev, Tristram Ha, David S . Miller, Florian Fainelli,
	Vivien Didelot, Woojung Huh
In-Reply-To: <ea4f2da3-a91f-f1fa-b70d-e9bd46708454@denx.de>

On Thu, Jul 25, 2019 at 04:56:37PM +0200, Marek Vasut wrote:
> On 7/25/19 4:03 PM, Andrew Lunn wrote:
> > On Wed, Jul 24, 2019 at 03:40:48PM +0200, Marek Vasut wrote:
> >> From: Tristram Ha <Tristram.Ha@microchip.com>
> >> +static void ksz8795_phy_setup(struct ksz_device *dev, int port,
> >> +			      struct phy_device *phy)
> >> +{
> >> +	if (port < dev->phy_port_cnt) {
> >> +		/*
> >> +		 * SUPPORTED_Asym_Pause and SUPPORTED_Pause can be removed to
> >> +		 * disable flow control when rate limiting is used.
> >> +		 */
> >> +		linkmode_copy(phy->advertising, phy->supported);
> >> +	}
> >> +}
> > 
> > Hi Marek
> > 
> > Do you know why this is needed?
> 
> Unfortunately, no.
> 
> It seems it copies supported features of the PHY to advertised features
> of the PHY for ports which are downstream (i.e. not the CPU port).

Hi Marek

Could you test it without this copy? Do you get sensible values from
ethtool? Does the pause configuration look sensible?

Thanks
	Andrew

^ permalink raw reply

* Re: [PATCH bpf-next v4 3/6] xdp: Add devmap_hash map type for looking up devices by hashed index
From: Toke Høiland-Jørgensen @ 2019-07-25 15:05 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Daniel Borkmann, Alexei Starovoitov, netdev, David Miller,
	Jakub Kicinski, Björn Töpel, Yonghong Song, brouer
In-Reply-To: <20190725133730.3750c66c@carbon>

Jesper Dangaard Brouer <brouer@redhat.com> writes:

> On Thu, 25 Jul 2019 12:32:19 +0200
> Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
>> Jesper Dangaard Brouer <brouer@redhat.com> writes:
>> 
>> > On Mon, 22 Jul 2019 13:52:48 +0200
>> > Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>> >  
>> >> +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
>> >> +						    int idx)
>> >> +{
>> >> +	return &dtab->dev_index_head[idx & (NETDEV_HASHENTRIES - 1)];
>> >> +}  
>> >
>> > It is good for performance that our "hash" function is simply an AND
>> > operation on the idx.  We want to keep it this way.
>> >
>> > I don't like that you are using NETDEV_HASHENTRIES, because the BPF map
>> > infrastructure already have a way to specify the map size (struct
>> > bpf_map_def .max_entries).  BUT for performance reasons, to keep the
>> > AND operation, we would need to round up the hash-array size to nearest
>> > power of 2 (or reject if user didn't specify a power of 2, if we want
>> > to "expose" this limit to users).  
>> 
>> But do we really want the number of hash buckets to be equal to the max
>> number of entries? The values are not likely to be evenly distributed,
>> so we'll end up with big buckets if the number is small, meaning we'll
>> blow performance on walking long lists in each bucket.
>
> The requested change makes it user-configurable, instead of fixed 256
> entries.  I've seen production use-case with >5000 net_devices, thus
> they need a knob to increase this (to avoid the list walking as you
> mention).

Ah, I see. That makes sense; I thought you wanted to make it smaller
(cf. the previous discussion about it being too big). Still, it seems
counter-intuitive to overload max_entries in this way.

I do see that this is what the existing hash map is also doing, though,
so I guess there is some precedence. I do wonder if we'll end up getting
bad performance from the hash being too simplistic, but I guess we can
always fix that later.

>> Also, if the size is dynamic the size needs to be loaded from memory
>> instead of being a compile-time constant, which will presumably hurt
>> performance (though not sure by how much)?
>
> To counter this, the mask value which need to be loaded from memory,
> needs to be placed next to some other struct member which is already in
> use (at least on same cacheline, Intel have some 16 bytes access micro
> optimizations, which I've never been able to measure, as its in 0.5
> nanosec scale).

In the fast path (i.e., in __xdp_map_lookup_elem) we will have already
loaded map->max_entries since it's on the same cacheline as map_type
which we use to disambiguate which function to call. So it should be
fine to just use that directly.

I'll send a new version with this change :)

-Toke

^ permalink raw reply

* [PATCH V2 1/3] dt-bindings: net: dsa: ksz: document Microchip KSZ87xx family switches
From: Marek Vasut @ 2019-07-25 15:05 UTC (permalink / raw)
  To: netdev
  Cc: Marek Vasut, Andrew Lunn, David S . Miller, Florian Fainelli,
	Rob Herring, Tristram Ha, Vivien Didelot, Woojung Huh, devicetree
In-Reply-To: <20190725150552.6901-1-marex@denx.de>

Document Microchip KSZ87xx family switches. These include
KSZ8765 - 5 port switch
KSZ8794 - 4 port switch
KSZ8795 - 5 port switch

Signed-off-by: Marek Vasut <marex@denx.de>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: David S. Miller <davem@davemloft.net>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Tristram Ha <Tristram.Ha@microchip.com>
Cc: Vivien Didelot <vivien.didelot@gmail.com>
Cc: Woojung Huh <woojung.huh@microchip.com>
Cc: devicetree@vger.kernel.org
---
V2: No change
---
 Documentation/devicetree/bindings/net/dsa/ksz.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/dsa/ksz.txt b/Documentation/devicetree/bindings/net/dsa/ksz.txt
index 4ac21cef370e..5e8429b6f9ca 100644
--- a/Documentation/devicetree/bindings/net/dsa/ksz.txt
+++ b/Documentation/devicetree/bindings/net/dsa/ksz.txt
@@ -5,6 +5,9 @@ Required properties:
 
 - compatible: For external switch chips, compatible string must be exactly one
   of the following:
+  - "microchip,ksz8765"
+  - "microchip,ksz8794"
+  - "microchip,ksz8795"
   - "microchip,ksz9477"
   - "microchip,ksz9897"
   - "microchip,ksz9896"
-- 
2.20.1


^ permalink raw reply related

* [PATCH V2 0/3] net: dsa: ksz: Add Microchip KSZ87xx support
From: Marek Vasut @ 2019-07-25 15:05 UTC (permalink / raw)
  To: netdev
  Cc: Marek Vasut, Andrew Lunn, David S . Miller, Florian Fainelli,
	Tristram Ha, Vivien Didelot, Woojung Huh

This series adds support for Microchip KSZ87xx switches, which are
slightly simpler compared to KSZ9xxx .

Signed-off-by: Marek Vasut <marex@denx.de>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: David S. Miller <davem@davemloft.net>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Tristram Ha <Tristram.Ha@microchip.com>
Cc: Vivien Didelot <vivien.didelot@gmail.com>
Cc: Woojung Huh <woojung.huh@microchip.com>

Marek Vasut (1):
  dt-bindings: net: dsa: ksz: document Microchip KSZ87xx family switches

Tristram Ha (2):
  net: dsa: ksz: Add KSZ8795 tag code
  net: dsa: ksz: Add Microchip KSZ8795 DSA driver

 .../devicetree/bindings/net/dsa/ksz.txt       |    3 +
 drivers/net/dsa/microchip/Kconfig             |   18 +
 drivers/net/dsa/microchip/Makefile            |    2 +
 drivers/net/dsa/microchip/ksz8795.c           | 1324 +++++++++++++++++
 drivers/net/dsa/microchip/ksz8795_reg.h       | 1004 +++++++++++++
 drivers/net/dsa/microchip/ksz8795_spi.c       |  104 ++
 drivers/net/dsa/microchip/ksz_common.h        |   28 +
 drivers/net/dsa/microchip/ksz_priv.h          |    1 +
 include/net/dsa.h                             |    2 +
 net/dsa/Kconfig                               |    7 +
 net/dsa/tag_ksz.c                             |   62 +
 11 files changed, 2555 insertions(+)
 create mode 100644 drivers/net/dsa/microchip/ksz8795.c
 create mode 100644 drivers/net/dsa/microchip/ksz8795_reg.h
 create mode 100644 drivers/net/dsa/microchip/ksz8795_spi.c

-- 
2.20.1


^ permalink raw reply

* [PATCH V2 2/3] net: dsa: ksz: Add KSZ8795 tag code
From: Marek Vasut @ 2019-07-25 15:05 UTC (permalink / raw)
  To: netdev
  Cc: Tristram Ha, Marek Vasut, Andrew Lunn, David S . Miller,
	Florian Fainelli, Vivien Didelot, Woojung Huh
In-Reply-To: <20190725150552.6901-1-marex@denx.de>

From: Tristram Ha <Tristram.Ha@microchip.com>

Add DSA tag code for Microchip KSZ8795 switch. The switch is simpler
and the tag is only 1 byte, instead of 2 as is the case with KSZ9477.

Signed-off-by: Tristram Ha <Tristram.Ha@microchip.com>
Signed-off-by: Marek Vasut <marex@denx.de>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: David S. Miller <davem@davemloft.net>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Tristram Ha <Tristram.Ha@microchip.com>
Cc: Vivien Didelot <vivien.didelot@gmail.com>
Cc: Woojung Huh <woojung.huh@microchip.com>
---
V2: No change
---
 include/net/dsa.h |  2 ++
 net/dsa/Kconfig   |  7 ++++++
 net/dsa/tag_ksz.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 1e8650fa8acc..147b757ef8ea 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -41,6 +41,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_TRAILER_VALUE		11
 #define DSA_TAG_PROTO_8021Q_VALUE		12
 #define DSA_TAG_PROTO_SJA1105_VALUE		13
+#define DSA_TAG_PROTO_KSZ8795_VALUE		14
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -57,6 +58,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_TRAILER		= DSA_TAG_PROTO_TRAILER_VALUE,
 	DSA_TAG_PROTO_8021Q		= DSA_TAG_PROTO_8021Q_VALUE,
 	DSA_TAG_PROTO_SJA1105		= DSA_TAG_PROTO_SJA1105_VALUE,
+	DSA_TAG_PROTO_KSZ8795		= DSA_TAG_PROTO_KSZ8795_VALUE,
 };
 
 struct packet_type;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 6e942dda1bcd..423fa4370608 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -77,6 +77,13 @@ config NET_DSA_TAG_KSZ_COMMON
 	tristate
 	default n
 
+config NET_DSA_TAG_KSZ8795
+	tristate "Tag driver for Microchip 8795 family of switches"
+	select NET_DSA_TAG_KSZ_COMMON
+	help
+	  Say Y if you want to enable support for tagging frames for the
+	  Microchip 8795 family of switches.
+
 config NET_DSA_TAG_KSZ
 	tristate "Tag driver for Microchip 9893 family of switches"
 	select NET_DSA_TAG_KSZ_COMMON
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index adc6c1e03a4c..7af71db91c54 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -69,6 +69,67 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
 	return skb;
 }
 
+/*
+ * For Ingress (Host -> KSZ8795), 1 byte is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
+ *
+ * For Egress (KSZ8795 -> Host), 1 byte is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag0 : zero-based value represents port
+ *	  (eg, 0x00=port1, 0x02=port3, 0x06=port7)
+ */
+
+#define KSZ8795_INGRESS_TAG_LEN		1
+
+#define KSZ8795_TAIL_TAG_OVERRIDE	BIT(6)
+#define KSZ8795_TAIL_TAG_LOOKUP		BIT(7)
+
+static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct sk_buff *nskb;
+	u8 *tag;
+	u8 *addr;
+
+	nskb = ksz_common_xmit(skb, dev, KSZ8795_INGRESS_TAG_LEN);
+	if (!nskb)
+		return NULL;
+
+	/* Tag encoding */
+	tag = skb_put(nskb, KSZ8795_INGRESS_TAG_LEN);
+	addr = skb_mac_header(nskb);
+
+	*tag = 1 << dp->index;
+	if (is_link_local_ether_addr(addr))
+		*tag |= KSZ8795_TAIL_TAG_OVERRIDE;
+
+	return nskb;
+}
+
+static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev,
+				  struct packet_type *pt)
+{
+	u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
+
+	return ksz_common_rcv(skb, dev, tag[0] & 7, KSZ_EGRESS_TAG_LEN);
+}
+
+static const struct dsa_device_ops ksz8795_netdev_ops = {
+	.name	= "ksz8795",
+	.proto	= DSA_TAG_PROTO_KSZ8795,
+	.xmit	= ksz8795_xmit,
+	.rcv	= ksz8795_rcv,
+	.overhead = KSZ8795_INGRESS_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(ksz8795_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795);
+
 /*
  * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS.
  * ---------------------------------------------------------------------------
@@ -183,6 +244,7 @@ DSA_TAG_DRIVER(ksz9893_netdev_ops);
 MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893);
 
 static struct dsa_tag_driver *dsa_tag_driver_array[] = {
+	&DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops),
 	&DSA_TAG_DRIVER_NAME(ksz9477_netdev_ops),
 	&DSA_TAG_DRIVER_NAME(ksz9893_netdev_ops),
 };
-- 
2.20.1


^ permalink raw reply related

* [PATCH V2 3/3] net: dsa: ksz: Add Microchip KSZ8795 DSA driver
From: Marek Vasut @ 2019-07-25 15:05 UTC (permalink / raw)
  To: netdev
  Cc: Tristram Ha, Marek Vasut, Andrew Lunn, David S . Miller,
	Florian Fainelli, Vivien Didelot, Woojung Huh
In-Reply-To: <20190725150552.6901-1-marex@denx.de>

From: Tristram Ha <Tristram.Ha@microchip.com>

Add Microchip KSZ8795 DSA driver.

Signed-off-by: Tristram Ha <Tristram.Ha@microchip.com>
Signed-off-by: Marek Vasut <marex@denx.de>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: David S. Miller <davem@davemloft.net>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Tristram Ha <Tristram.Ha@microchip.com>
Cc: Vivien Didelot <vivien.didelot@gmail.com>
Cc: Woojung Huh <woojung.huh@microchip.com>
---
V2: - Use reverse xmas tree for variable declaration
    - Use BIT() macro where applicable
    - Use regmap_update_bits() where applicable
    - Replace ad-hoc stp_multicast_addr[] with ether_addr_copy(..., eth_stp_addr)
---
 drivers/net/dsa/microchip/Kconfig       |   18 +
 drivers/net/dsa/microchip/Makefile      |    2 +
 drivers/net/dsa/microchip/ksz8795.c     | 1324 +++++++++++++++++++++++
 drivers/net/dsa/microchip/ksz8795_reg.h | 1004 +++++++++++++++++
 drivers/net/dsa/microchip/ksz8795_spi.c |  104 ++
 drivers/net/dsa/microchip/ksz_common.h  |   28 +
 drivers/net/dsa/microchip/ksz_priv.h    |    1 +
 7 files changed, 2481 insertions(+)
 create mode 100644 drivers/net/dsa/microchip/ksz8795.c
 create mode 100644 drivers/net/dsa/microchip/ksz8795_reg.h
 create mode 100644 drivers/net/dsa/microchip/ksz8795_spi.c

diff --git a/drivers/net/dsa/microchip/Kconfig b/drivers/net/dsa/microchip/Kconfig
index fe0a13b79c4b..d990c7128991 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -16,3 +16,21 @@ config NET_DSA_MICROCHIP_KSZ9477_SPI
 	select REGMAP_SPI
 	help
 	  Select to enable support for registering switches configured through SPI.
+
+menuconfig NET_DSA_MICROCHIP_KSZ8795
+	tristate "Microchip KSZ8795 series switch support"
+	depends on NET_DSA
+	select NET_DSA_TAG_KSZ8795
+	select NET_DSA_MICROCHIP_KSZ_COMMON
+	help
+	  This driver adds support for Microchip KSZ8795 switch chips.
+
+config NET_DSA_MICROCHIP_KSZ8795_SPI
+	tristate "KSZ8795 series SPI connected switch driver"
+	depends on NET_DSA_MICROCHIP_KSZ8795 && SPI
+	select REGMAP_SPI
+	help
+	  This driver accesses KSZ8795 chip through SPI.
+
+	  It is required to use the KSZ8795 switch driver as the only access
+	  is through SPI.
diff --git a/drivers/net/dsa/microchip/Makefile b/drivers/net/dsa/microchip/Makefile
index 68451b02f775..e3d799b95d7d 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -2,3 +2,5 @@
 obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON)	+= ksz_common.o
 obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ9477)		+= ksz9477.o
 obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI)	+= ksz9477_spi.o
+obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ8795)		+= ksz8795.o
+obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI)	+= ksz8795_spi.o
diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c
new file mode 100644
index 000000000000..f2408d5943db
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz8795.c
@@ -0,0 +1,1324 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Microchip KSZ8795 switch driver
+ *
+ * Copyright (C) 2017 Microchip Technology Inc.
+ *	Tristram Ha <Tristram.Ha@microchip.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/gpio.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_data/microchip-ksz.h>
+#include <linux/phy.h>
+#include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
+#include <net/dsa.h>
+#include <net/switchdev.h>
+
+#include "ksz_priv.h"
+#include "ksz_common.h"
+#include "ksz8795_reg.h"
+
+static const struct {
+	char string[ETH_GSTRING_LEN];
+} mib_names[TOTAL_SWITCH_COUNTER_NUM] = {
+	{ "rx_hi" },
+	{ "rx_undersize" },
+	{ "rx_fragments" },
+	{ "rx_oversize" },
+	{ "rx_jabbers" },
+	{ "rx_symbol_err" },
+	{ "rx_crc_err" },
+	{ "rx_align_err" },
+	{ "rx_mac_ctrl" },
+	{ "rx_pause" },
+	{ "rx_bcast" },
+	{ "rx_mcast" },
+	{ "rx_ucast" },
+	{ "rx_64_or_less" },
+	{ "rx_65_127" },
+	{ "rx_128_255" },
+	{ "rx_256_511" },
+	{ "rx_512_1023" },
+	{ "rx_1024_1522" },
+	{ "rx_1523_2000" },
+	{ "rx_2001" },
+	{ "tx_hi" },
+	{ "tx_late_col" },
+	{ "tx_pause" },
+	{ "tx_bcast" },
+	{ "tx_mcast" },
+	{ "tx_ucast" },
+	{ "tx_deferred" },
+	{ "tx_total_col" },
+	{ "tx_exc_col" },
+	{ "tx_single_col" },
+	{ "tx_mult_col" },
+	{ "rx_total" },
+	{ "tx_total" },
+	{ "rx_discards" },
+	{ "tx_discards" },
+};
+
+static void ksz_cfg(struct ksz_device *dev, u32 addr, u8 bits, bool set)
+{
+	regmap_update_bits(dev->regmap[0], addr, bits, set ? bits : 0);
+}
+
+static void ksz_port_cfg(struct ksz_device *dev, int port, int offset, u8 bits,
+			 bool set)
+{
+	regmap_update_bits(dev->regmap[0], PORT_CTRL_ADDR(port, offset),
+			   bits, set ? bits : 0);
+}
+
+static int ksz8795_reset_switch(struct ksz_device *dev)
+{
+	/* reset switch */
+	ksz_write8(dev, REG_POWER_MANAGEMENT_1,
+		   SW_SOFTWARE_POWER_DOWN << SW_POWER_MANAGEMENT_MODE_S);
+	ksz_write8(dev, REG_POWER_MANAGEMENT_1, 0);
+
+	return 0;
+}
+
+static void ksz8795_set_prio_queue(struct ksz_device *dev, int port, int queue)
+{
+	u8 hi, lo;
+
+	/* Number of queues can only be 1, 2, or 4. */
+	switch (queue) {
+	case 4:
+	case 3:
+		queue = PORT_QUEUE_SPLIT_4;
+		break;
+	case 2:
+		queue = PORT_QUEUE_SPLIT_2;
+		break;
+	default:
+		queue = PORT_QUEUE_SPLIT_1;
+	}
+	ksz_pread8(dev, port, REG_PORT_CTRL_0, &lo);
+	ksz_pread8(dev, port, P_DROP_TAG_CTRL, &hi);
+	lo &= ~PORT_QUEUE_SPLIT_L;
+	if (queue & PORT_QUEUE_SPLIT_2)
+		lo |= PORT_QUEUE_SPLIT_L;
+	hi &= ~PORT_QUEUE_SPLIT_H;
+	if (queue & PORT_QUEUE_SPLIT_4)
+		hi |= PORT_QUEUE_SPLIT_H;
+	ksz_pwrite8(dev, port, REG_PORT_CTRL_0, lo);
+	ksz_pwrite8(dev, port, P_DROP_TAG_CTRL, hi);
+
+	/* Default is port based for egress rate limit. */
+	if (queue != PORT_QUEUE_SPLIT_1)
+		ksz_cfg(dev, REG_SW_CTRL_19, SW_OUT_RATE_LIMIT_QUEUE_BASED,
+			true);
+}
+
+static void ksz8795_r_mib_cnt(struct ksz_device *dev, int port, u16 addr,
+			      u64 *cnt)
+{
+	u16 ctrl_addr;
+	u32 data;
+	u8 check;
+	int loop;
+
+	ctrl_addr = addr + SWITCH_COUNTER_NUM * port;
+	ctrl_addr |= IND_ACC_TABLE(TABLE_MIB | TABLE_READ);
+
+	mutex_lock(&dev->alu_mutex);
+	ksz_write16(dev, REG_IND_CTRL_0, ctrl_addr);
+
+	/* It is almost guaranteed to always read the valid bit because of
+	 * slow SPI speed.
+	 */
+	for (loop = 2; loop > 0; loop--) {
+		ksz_read8(dev, REG_IND_MIB_CHECK, &check);
+
+		if (check & MIB_COUNTER_VALID) {
+			ksz_read32(dev, REG_IND_DATA_LO, &data);
+			if (check & MIB_COUNTER_OVERFLOW)
+				*cnt += MIB_COUNTER_VALUE + 1;
+			*cnt += data & MIB_COUNTER_VALUE;
+			break;
+		}
+	}
+	mutex_unlock(&dev->alu_mutex);
+}
+
+static void ksz8795_r_mib_pkt(struct ksz_device *dev, int port, u16 addr,
+			      u64 *dropped, u64 *cnt)
+{
+	u16 ctrl_addr;
+	u32 data;
+	u8 check;
+	int loop;
+
+	addr -= SWITCH_COUNTER_NUM;
+	ctrl_addr = (KS_MIB_TOTAL_RX_1 - KS_MIB_TOTAL_RX_0) * port;
+	ctrl_addr += addr + KS_MIB_TOTAL_RX_0;
+	ctrl_addr |= IND_ACC_TABLE(TABLE_MIB | TABLE_READ);
+
+	mutex_lock(&dev->alu_mutex);
+	ksz_write16(dev, REG_IND_CTRL_0, ctrl_addr);
+
+	/* It is almost guaranteed to always read the valid bit because of
+	 * slow SPI speed.
+	 */
+	for (loop = 2; loop > 0; loop--) {
+		ksz_read8(dev, REG_IND_MIB_CHECK, &check);
+
+		if (check & MIB_COUNTER_VALID) {
+			ksz_read32(dev, REG_IND_DATA_LO, &data);
+			if (addr < 2) {
+				u64 total;
+
+				total = check & MIB_TOTAL_BYTES_H;
+				total <<= 32;
+				*cnt += total;
+				*cnt += data;
+				if (check & MIB_COUNTER_OVERFLOW) {
+					total = MIB_TOTAL_BYTES_H + 1;
+					total <<= 32;
+					*cnt += total;
+				}
+			} else {
+				if (check & MIB_COUNTER_OVERFLOW)
+					*cnt += MIB_PACKET_DROPPED + 1;
+				*cnt += data & MIB_PACKET_DROPPED;
+			}
+			break;
+		}
+	}
+	mutex_unlock(&dev->alu_mutex);
+}
+
+static void ksz8795_freeze_mib(struct ksz_device *dev, int port, bool freeze)
+{
+	/* enable the port for flush/freeze function */
+	if (freeze)
+		ksz_cfg(dev, REG_SW_CTRL_6, BIT(port), true);
+	ksz_cfg(dev, REG_SW_CTRL_6, SW_MIB_COUNTER_FREEZE, freeze);
+
+	/* disable the port after freeze is done */
+	if (!freeze)
+		ksz_cfg(dev, REG_SW_CTRL_6, BIT(port), false);
+}
+
+static void ksz8795_port_init_cnt(struct ksz_device *dev, int port)
+{
+	struct ksz_port_mib *mib = &dev->ports[port].mib;
+
+	/* flush all enabled port MIB counters */
+	ksz_cfg(dev, REG_SW_CTRL_6, BIT(port), true);
+	ksz_cfg(dev, REG_SW_CTRL_6, SW_MIB_COUNTER_FLUSH, true);
+	ksz_cfg(dev, REG_SW_CTRL_6, BIT(port), false);
+
+	mib->cnt_ptr = 0;
+
+	/* Some ports may not have MIB counters before SWITCH_COUNTER_NUM. */
+	while (mib->cnt_ptr < dev->reg_mib_cnt) {
+		dev->dev_ops->r_mib_cnt(dev, port, mib->cnt_ptr,
+					&mib->counters[mib->cnt_ptr]);
+		++mib->cnt_ptr;
+	}
+
+	/* Some ports may not have MIB counters after SWITCH_COUNTER_NUM. */
+	while (mib->cnt_ptr < dev->mib_cnt) {
+		dev->dev_ops->r_mib_pkt(dev, port, mib->cnt_ptr,
+					NULL, &mib->counters[mib->cnt_ptr]);
+		++mib->cnt_ptr;
+	}
+	mib->cnt_ptr = 0;
+	memset(mib->counters, 0, dev->mib_cnt * sizeof(u64));
+}
+
+static void ksz8795_r_table(struct ksz_device *dev, int table, u16 addr,
+			    u64 *data)
+{
+	u16 ctrl_addr;
+
+	ctrl_addr = IND_ACC_TABLE(table | TABLE_READ) | addr;
+
+	mutex_lock(&dev->alu_mutex);
+	ksz_write16(dev, REG_IND_CTRL_0, ctrl_addr);
+	ksz_read64(dev, REG_IND_DATA_HI, data);
+	mutex_unlock(&dev->alu_mutex);
+}
+
+static void ksz8795_w_table(struct ksz_device *dev, int table, u16 addr,
+			    u64 data)
+{
+	u16 ctrl_addr;
+
+	ctrl_addr = IND_ACC_TABLE(table) | addr;
+
+	mutex_lock(&dev->alu_mutex);
+	ksz_write64(dev, REG_IND_DATA_HI, data);
+	ksz_write16(dev, REG_IND_CTRL_0, ctrl_addr);
+	mutex_unlock(&dev->alu_mutex);
+}
+
+static int ksz8795_valid_dyn_entry(struct ksz_device *dev, u8 *data)
+{
+	int timeout = 100;
+
+	do {
+		ksz_read8(dev, REG_IND_DATA_CHECK, data);
+		timeout--;
+	} while ((*data & DYNAMIC_MAC_TABLE_NOT_READY) && timeout);
+
+	/* Entry is not ready for accessing. */
+	if (*data & DYNAMIC_MAC_TABLE_NOT_READY) {
+		return -EAGAIN;
+	/* Entry is ready for accessing. */
+	} else {
+		ksz_read8(dev, REG_IND_DATA_8, data);
+
+		/* There is no valid entry in the table. */
+		if (*data & DYNAMIC_MAC_TABLE_MAC_EMPTY)
+			return -ENXIO;
+	}
+	return 0;
+}
+
+static int ksz8795_r_dyn_mac_table(struct ksz_device *dev, u16 addr,
+				   u8 *mac_addr, u8 *fid, u8 *src_port,
+				   u8 *timestamp, u16 *entries)
+{
+	u32 data_hi, data_lo;
+	u16 ctrl_addr;
+	u8 data;
+	int rc;
+
+	ctrl_addr = IND_ACC_TABLE(TABLE_DYNAMIC_MAC | TABLE_READ) | addr;
+
+	mutex_lock(&dev->alu_mutex);
+	ksz_write16(dev, REG_IND_CTRL_0, ctrl_addr);
+
+	rc = ksz8795_valid_dyn_entry(dev, &data);
+	if (rc == -EAGAIN) {
+		if (addr == 0)
+			*entries = 0;
+	} else if (rc == -ENXIO) {
+		*entries = 0;
+	/* At least one valid entry in the table. */
+	} else {
+		u64 buf = 0;
+		int cnt;
+
+		ksz_read64(dev, REG_IND_DATA_HI, &buf);
+		data_hi = (u32)(buf >> 32);
+		data_lo = (u32)buf;
+
+		/* Check out how many valid entry in the table. */
+		cnt = data & DYNAMIC_MAC_TABLE_ENTRIES_H;
+		cnt <<= DYNAMIC_MAC_ENTRIES_H_S;
+		cnt |= (data_hi & DYNAMIC_MAC_TABLE_ENTRIES) >>
+			DYNAMIC_MAC_ENTRIES_S;
+		*entries = cnt + 1;
+
+		*fid = (data_hi & DYNAMIC_MAC_TABLE_FID) >>
+			DYNAMIC_MAC_FID_S;
+		*src_port = (data_hi & DYNAMIC_MAC_TABLE_SRC_PORT) >>
+			DYNAMIC_MAC_SRC_PORT_S;
+		*timestamp = (data_hi & DYNAMIC_MAC_TABLE_TIMESTAMP) >>
+			DYNAMIC_MAC_TIMESTAMP_S;
+
+		mac_addr[5] = (u8)data_lo;
+		mac_addr[4] = (u8)(data_lo >> 8);
+		mac_addr[3] = (u8)(data_lo >> 16);
+		mac_addr[2] = (u8)(data_lo >> 24);
+
+		mac_addr[1] = (u8)data_hi;
+		mac_addr[0] = (u8)(data_hi >> 8);
+		rc = 0;
+	}
+	mutex_unlock(&dev->alu_mutex);
+
+	return rc;
+}
+
+static int ksz8795_r_sta_mac_table(struct ksz_device *dev, u16 addr,
+				   struct alu_struct *alu)
+{
+	u32 data_hi, data_lo;
+	u64 data;
+
+	ksz8795_r_table(dev, TABLE_STATIC_MAC, addr, &data);
+	data_hi = data >> 32;
+	data_lo = (u32)data;
+	if (data_hi & (STATIC_MAC_TABLE_VALID | STATIC_MAC_TABLE_OVERRIDE)) {
+		alu->mac[5] = (u8)data_lo;
+		alu->mac[4] = (u8)(data_lo >> 8);
+		alu->mac[3] = (u8)(data_lo >> 16);
+		alu->mac[2] = (u8)(data_lo >> 24);
+		alu->mac[1] = (u8)data_hi;
+		alu->mac[0] = (u8)(data_hi >> 8);
+		alu->port_forward = (data_hi & STATIC_MAC_TABLE_FWD_PORTS) >>
+			STATIC_MAC_FWD_PORTS_S;
+		alu->is_override =
+			(data_hi & STATIC_MAC_TABLE_OVERRIDE) ? 1 : 0;
+		data_hi >>= 1;
+		alu->is_use_fid = (data_hi & STATIC_MAC_TABLE_USE_FID) ? 1 : 0;
+		alu->fid = (data_hi & STATIC_MAC_TABLE_FID) >>
+			STATIC_MAC_FID_S;
+		return 0;
+	}
+	return -ENXIO;
+}
+
+static void ksz8795_w_sta_mac_table(struct ksz_device *dev, u16 addr,
+				    struct alu_struct *alu)
+{
+	u32 data_hi, data_lo;
+	u64 data;
+
+	data_lo = ((u32)alu->mac[2] << 24) |
+		((u32)alu->mac[3] << 16) |
+		((u32)alu->mac[4] << 8) | alu->mac[5];
+	data_hi = ((u32)alu->mac[0] << 8) | alu->mac[1];
+	data_hi |= (u32)alu->port_forward << STATIC_MAC_FWD_PORTS_S;
+
+	if (alu->is_override)
+		data_hi |= STATIC_MAC_TABLE_OVERRIDE;
+	if (alu->is_use_fid) {
+		data_hi |= STATIC_MAC_TABLE_USE_FID;
+		data_hi |= (u32)alu->fid << STATIC_MAC_FID_S;
+	}
+	if (alu->is_static)
+		data_hi |= STATIC_MAC_TABLE_VALID;
+	else
+		data_hi &= ~STATIC_MAC_TABLE_OVERRIDE;
+
+	data = (u64)data_hi << 32 | data_lo;
+	ksz8795_w_table(dev, TABLE_STATIC_MAC, addr, data);
+}
+
+static void ksz8795_from_vlan(u16 vlan, u8 *fid, u8 *member, u8 *valid)
+{
+	*fid = vlan & VLAN_TABLE_FID;
+	*member = (vlan & VLAN_TABLE_MEMBERSHIP) >> VLAN_TABLE_MEMBERSHIP_S;
+	*valid = !!(vlan & VLAN_TABLE_VALID);
+}
+
+static void ksz8795_to_vlan(u8 fid, u8 member, u8 valid, u16 *vlan)
+{
+	*vlan = fid;
+	*vlan |= (u16)member << VLAN_TABLE_MEMBERSHIP_S;
+	if (valid)
+		*vlan |= VLAN_TABLE_VALID;
+}
+
+static void ksz8795_r_vlan_entries(struct ksz_device *dev, u16 addr)
+{
+	u64 data;
+	int i;
+
+	ksz8795_r_table(dev, TABLE_VLAN, addr, &data);
+	addr *= 4;
+	for (i = 0; i < 4; i++) {
+		dev->vlan_cache[addr + i].table[0] = (u16)data;
+		data >>= VLAN_TABLE_S;
+	}
+}
+
+static void ksz8795_r_vlan_table(struct ksz_device *dev, u16 vid, u16 *vlan)
+{
+	int index;
+	u16 *data;
+	u16 addr;
+	u64 buf;
+
+	data = (u16 *)&buf;
+	addr = vid / 4;
+	index = vid & 3;
+	ksz8795_r_table(dev, TABLE_VLAN, addr, &buf);
+	*vlan = data[index];
+}
+
+static void ksz8795_w_vlan_table(struct ksz_device *dev, u16 vid, u16 vlan)
+{
+	int index;
+	u16 *data;
+	u16 addr;
+	u64 buf;
+
+	data = (u16 *)&buf;
+	addr = vid / 4;
+	index = vid & 3;
+	ksz8795_r_table(dev, TABLE_VLAN, addr, &buf);
+	data[index] = vlan;
+	dev->vlan_cache[vid].table[0] = vlan;
+	ksz8795_w_table(dev, TABLE_VLAN, addr, buf);
+}
+
+static void ksz8795_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
+{
+	u8 restart, speed, ctrl, link;
+	int processed = true;
+	u16 data = 0;
+	u8 p = phy;
+
+	switch (reg) {
+	case PHY_REG_CTRL:
+		ksz_pread8(dev, p, P_NEG_RESTART_CTRL, &restart);
+		ksz_pread8(dev, p, P_SPEED_STATUS, &speed);
+		ksz_pread8(dev, p, P_FORCE_CTRL, &ctrl);
+		if (restart & PORT_PHY_LOOPBACK)
+			data |= PHY_LOOPBACK;
+		if (ctrl & PORT_FORCE_100_MBIT)
+			data |= PHY_SPEED_100MBIT;
+		if (!(ctrl & PORT_AUTO_NEG_DISABLE))
+			data |= PHY_AUTO_NEG_ENABLE;
+		if (restart & PORT_POWER_DOWN)
+			data |= PHY_POWER_DOWN;
+		if (restart & PORT_AUTO_NEG_RESTART)
+			data |= PHY_AUTO_NEG_RESTART;
+		if (ctrl & PORT_FORCE_FULL_DUPLEX)
+			data |= PHY_FULL_DUPLEX;
+		if (speed & PORT_HP_MDIX)
+			data |= PHY_HP_MDIX;
+		if (restart & PORT_FORCE_MDIX)
+			data |= PHY_FORCE_MDIX;
+		if (restart & PORT_AUTO_MDIX_DISABLE)
+			data |= PHY_AUTO_MDIX_DISABLE;
+		if (restart & PORT_TX_DISABLE)
+			data |= PHY_TRANSMIT_DISABLE;
+		if (restart & PORT_LED_OFF)
+			data |= PHY_LED_DISABLE;
+		break;
+	case PHY_REG_STATUS:
+		ksz_pread8(dev, p, P_LINK_STATUS, &link);
+		data = PHY_100BTX_FD_CAPABLE |
+		       PHY_100BTX_CAPABLE |
+		       PHY_10BT_FD_CAPABLE |
+		       PHY_10BT_CAPABLE |
+		       PHY_AUTO_NEG_CAPABLE;
+		if (link & PORT_AUTO_NEG_COMPLETE)
+			data |= PHY_AUTO_NEG_ACKNOWLEDGE;
+		if (link & PORT_STAT_LINK_GOOD)
+			data |= PHY_LINK_STATUS;
+		break;
+	case PHY_REG_ID_1:
+		data = KSZ8795_ID_HI;
+		break;
+	case PHY_REG_ID_2:
+		data = KSZ8795_ID_LO;
+		break;
+	case PHY_REG_AUTO_NEGOTIATION:
+		ksz_pread8(dev, p, P_LOCAL_CTRL, &ctrl);
+		data = PHY_AUTO_NEG_802_3;
+		if (ctrl & PORT_AUTO_NEG_SYM_PAUSE)
+			data |= PHY_AUTO_NEG_SYM_PAUSE;
+		if (ctrl & PORT_AUTO_NEG_100BTX_FD)
+			data |= PHY_AUTO_NEG_100BTX_FD;
+		if (ctrl & PORT_AUTO_NEG_100BTX)
+			data |= PHY_AUTO_NEG_100BTX;
+		if (ctrl & PORT_AUTO_NEG_10BT_FD)
+			data |= PHY_AUTO_NEG_10BT_FD;
+		if (ctrl & PORT_AUTO_NEG_10BT)
+			data |= PHY_AUTO_NEG_10BT;
+		break;
+	case PHY_REG_REMOTE_CAPABILITY:
+		ksz_pread8(dev, p, P_REMOTE_STATUS, &link);
+		data = PHY_AUTO_NEG_802_3;
+		if (link & PORT_REMOTE_SYM_PAUSE)
+			data |= PHY_AUTO_NEG_SYM_PAUSE;
+		if (link & PORT_REMOTE_100BTX_FD)
+			data |= PHY_AUTO_NEG_100BTX_FD;
+		if (link & PORT_REMOTE_100BTX)
+			data |= PHY_AUTO_NEG_100BTX;
+		if (link & PORT_REMOTE_10BT_FD)
+			data |= PHY_AUTO_NEG_10BT_FD;
+		if (link & PORT_REMOTE_10BT)
+			data |= PHY_AUTO_NEG_10BT;
+		if (data & ~PHY_AUTO_NEG_802_3)
+			data |= PHY_REMOTE_ACKNOWLEDGE_NOT;
+		break;
+	default:
+		processed = false;
+		break;
+	}
+	if (processed)
+		*val = data;
+}
+
+static void ksz8795_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
+{
+	u8 p = phy;
+	u8 restart, speed, ctrl, data;
+
+	switch (reg) {
+	case PHY_REG_CTRL:
+
+		/* Do not support PHY reset function. */
+		if (val & PHY_RESET)
+			break;
+		ksz_pread8(dev, p, P_SPEED_STATUS, &speed);
+		data = speed;
+		if (val & PHY_HP_MDIX)
+			data |= PORT_HP_MDIX;
+		else
+			data &= ~PORT_HP_MDIX;
+		if (data != speed)
+			ksz_pwrite8(dev, p, P_SPEED_STATUS, data);
+		ksz_pread8(dev, p, P_FORCE_CTRL, &ctrl);
+		data = ctrl;
+		if (!(val & PHY_AUTO_NEG_ENABLE))
+			data |= PORT_AUTO_NEG_DISABLE;
+		else
+			data &= ~PORT_AUTO_NEG_DISABLE;
+
+		/* Fiber port does not support auto-negotiation. */
+		if (dev->ports[p].fiber)
+			data |= PORT_AUTO_NEG_DISABLE;
+		if (val & PHY_SPEED_100MBIT)
+			data |= PORT_FORCE_100_MBIT;
+		else
+			data &= ~PORT_FORCE_100_MBIT;
+		if (val & PHY_FULL_DUPLEX)
+			data |= PORT_FORCE_FULL_DUPLEX;
+		else
+			data &= ~PORT_FORCE_FULL_DUPLEX;
+		if (data != ctrl)
+			ksz_pwrite8(dev, p, P_FORCE_CTRL, data);
+		ksz_pread8(dev, p, P_NEG_RESTART_CTRL, &restart);
+		data = restart;
+		if (val & PHY_LED_DISABLE)
+			data |= PORT_LED_OFF;
+		else
+			data &= ~PORT_LED_OFF;
+		if (val & PHY_TRANSMIT_DISABLE)
+			data |= PORT_TX_DISABLE;
+		else
+			data &= ~PORT_TX_DISABLE;
+		if (val & PHY_AUTO_NEG_RESTART)
+			data |= PORT_AUTO_NEG_RESTART;
+		else
+			data &= ~(PORT_AUTO_NEG_RESTART);
+		if (val & PHY_POWER_DOWN)
+			data |= PORT_POWER_DOWN;
+		else
+			data &= ~PORT_POWER_DOWN;
+		if (val & PHY_AUTO_MDIX_DISABLE)
+			data |= PORT_AUTO_MDIX_DISABLE;
+		else
+			data &= ~PORT_AUTO_MDIX_DISABLE;
+		if (val & PHY_FORCE_MDIX)
+			data |= PORT_FORCE_MDIX;
+		else
+			data &= ~PORT_FORCE_MDIX;
+		if (val & PHY_LOOPBACK)
+			data |= PORT_PHY_LOOPBACK;
+		else
+			data &= ~PORT_PHY_LOOPBACK;
+		if (data != restart)
+			ksz_pwrite8(dev, p, P_NEG_RESTART_CTRL, data);
+		break;
+	case PHY_REG_AUTO_NEGOTIATION:
+		ksz_pread8(dev, p, P_LOCAL_CTRL, &ctrl);
+		data = ctrl;
+		data &= ~(PORT_AUTO_NEG_SYM_PAUSE |
+			  PORT_AUTO_NEG_100BTX_FD |
+			  PORT_AUTO_NEG_100BTX |
+			  PORT_AUTO_NEG_10BT_FD |
+			  PORT_AUTO_NEG_10BT);
+		if (val & PHY_AUTO_NEG_SYM_PAUSE)
+			data |= PORT_AUTO_NEG_SYM_PAUSE;
+		if (val & PHY_AUTO_NEG_100BTX_FD)
+			data |= PORT_AUTO_NEG_100BTX_FD;
+		if (val & PHY_AUTO_NEG_100BTX)
+			data |= PORT_AUTO_NEG_100BTX;
+		if (val & PHY_AUTO_NEG_10BT_FD)
+			data |= PORT_AUTO_NEG_10BT_FD;
+		if (val & PHY_AUTO_NEG_10BT)
+			data |= PORT_AUTO_NEG_10BT;
+		if (data != ctrl)
+			ksz_pwrite8(dev, p, P_LOCAL_CTRL, data);
+		break;
+	default:
+		break;
+	}
+}
+
+static enum dsa_tag_protocol ksz8795_get_tag_protocol(struct dsa_switch *ds,
+						      int port)
+{
+	return DSA_TAG_PROTO_KSZ8795;
+}
+
+static void ksz8795_get_strings(struct dsa_switch *ds, int port,
+				u32 stringset, uint8_t *buf)
+{
+	int i;
+
+	for (i = 0; i < TOTAL_SWITCH_COUNTER_NUM; i++) {
+		memcpy(buf + i * ETH_GSTRING_LEN, mib_names[i].string,
+		       ETH_GSTRING_LEN);
+	}
+}
+
+static void ksz8795_cfg_port_member(struct ksz_device *dev, int port,
+				    u8 member)
+{
+	u8 data;
+
+	ksz_pread8(dev, port, P_MIRROR_CTRL, &data);
+	data &= ~PORT_VLAN_MEMBERSHIP;
+	data |= (member & dev->port_mask);
+	ksz_pwrite8(dev, port, P_MIRROR_CTRL, data);
+	dev->ports[port].member = member;
+}
+
+static void ksz8795_port_stp_state_set(struct dsa_switch *ds, int port,
+				       u8 state)
+{
+	struct ksz_device *dev = ds->priv;
+	int forward = dev->member;
+	struct ksz_port *p;
+	int member = -1;
+	u8 data;
+
+	p = &dev->ports[port];
+
+	ksz_pread8(dev, port, P_STP_CTRL, &data);
+	data &= ~(PORT_TX_ENABLE | PORT_RX_ENABLE | PORT_LEARN_DISABLE);
+
+	switch (state) {
+	case BR_STATE_DISABLED:
+		data |= PORT_LEARN_DISABLE;
+		if (port < SWITCH_PORT_NUM)
+			member = 0;
+		break;
+	case BR_STATE_LISTENING:
+		data |= (PORT_RX_ENABLE | PORT_LEARN_DISABLE);
+		if (port < SWITCH_PORT_NUM &&
+		    p->stp_state == BR_STATE_DISABLED)
+			member = dev->host_mask | p->vid_member;
+		break;
+	case BR_STATE_LEARNING:
+		data |= PORT_RX_ENABLE;
+		break;
+	case BR_STATE_FORWARDING:
+		data |= (PORT_TX_ENABLE | PORT_RX_ENABLE);
+
+		/* This function is also used internally. */
+		if (port == dev->cpu_port)
+			break;
+
+		/* Port is a member of a bridge. */
+		if (dev->br_member & BIT(port)) {
+			dev->member |= BIT(port);
+			member = dev->member;
+		} else {
+			member = dev->host_mask | p->vid_member;
+		}
+		break;
+	case BR_STATE_BLOCKING:
+		data |= PORT_LEARN_DISABLE;
+		if (port < SWITCH_PORT_NUM &&
+		    p->stp_state == BR_STATE_DISABLED)
+			member = dev->host_mask | p->vid_member;
+		break;
+	default:
+		dev_err(ds->dev, "invalid STP state: %d\n", state);
+		return;
+	}
+
+	ksz_pwrite8(dev, port, P_STP_CTRL, data);
+	p->stp_state = state;
+	if (data & PORT_RX_ENABLE)
+		dev->rx_ports |= BIT(port);
+	else
+		dev->rx_ports &= ~BIT(port);
+	if (data & PORT_TX_ENABLE)
+		dev->tx_ports |= BIT(port);
+	else
+		dev->tx_ports &= ~BIT(port);
+
+	/* Port membership may share register with STP state. */
+	if (member >= 0 && member != p->member)
+		ksz8795_cfg_port_member(dev, port, (u8)member);
+
+	/* Check if forwarding needs to be updated. */
+	if (state != BR_STATE_FORWARDING) {
+		if (dev->br_member & BIT(port))
+			dev->member &= ~BIT(port);
+	}
+
+	/* When topology has changed the function ksz_update_port_member
+	 * should be called to modify port forwarding behavior.
+	 */
+	if (forward != dev->member)
+		ksz_update_port_member(dev, port);
+}
+
+static void ksz8795_flush_dyn_mac_table(struct ksz_device *dev, int port)
+{
+	u8 learn[TOTAL_PORT_NUM];
+	int first, index, cnt;
+	struct ksz_port *p;
+
+	if ((uint)port < TOTAL_PORT_NUM) {
+		first = port;
+		cnt = port + 1;
+	} else {
+		/* Flush all ports. */
+		first = 0;
+		cnt = dev->mib_port_cnt;
+	}
+	for (index = first; index < cnt; index++) {
+		p = &dev->ports[index];
+		if (!p->on)
+			continue;
+		ksz_pread8(dev, index, P_STP_CTRL, &learn[index]);
+		if (!(learn[index] & PORT_LEARN_DISABLE))
+			ksz_pwrite8(dev, index, P_STP_CTRL,
+				    learn[index] | PORT_LEARN_DISABLE);
+	}
+	ksz_cfg(dev, S_FLUSH_TABLE_CTRL, SW_FLUSH_DYN_MAC_TABLE, true);
+	for (index = first; index < cnt; index++) {
+		p = &dev->ports[index];
+		if (!p->on)
+			continue;
+		if (!(learn[index] & PORT_LEARN_DISABLE))
+			ksz_pwrite8(dev, index, P_STP_CTRL, learn[index]);
+	}
+}
+
+static int ksz8795_port_vlan_filtering(struct dsa_switch *ds, int port,
+				       bool flag)
+{
+	struct ksz_device *dev = ds->priv;
+
+	ksz_cfg(dev, S_MIRROR_CTRL, SW_VLAN_ENABLE, flag);
+
+	return 0;
+}
+
+static void ksz8795_port_vlan_add(struct dsa_switch *ds, int port,
+				  const struct switchdev_obj_port_vlan *vlan)
+{
+	bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
+	struct ksz_device *dev = ds->priv;
+	u16 data, vid, new_pvid = 0;
+	u8 fid, member, valid;
+
+	ksz_port_cfg(dev, port, P_TAG_CTRL, PORT_REMOVE_TAG, untagged);
+
+	for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) {
+		ksz8795_r_vlan_table(dev, vid, &data);
+		ksz8795_from_vlan(data, &fid, &member, &valid);
+
+		/* First time to setup the VLAN entry. */
+		if (!valid) {
+			/* Need to find a way to map VID to FID. */
+			fid = 1;
+			valid = 1;
+		}
+		member |= BIT(port);
+
+		ksz8795_to_vlan(fid, member, valid, &data);
+		ksz8795_w_vlan_table(dev, vid, data);
+
+		/* change PVID */
+		if (vlan->flags & BRIDGE_VLAN_INFO_PVID)
+			new_pvid = vid;
+	}
+
+	if (new_pvid) {
+		ksz_pread16(dev, port, REG_PORT_CTRL_VID, &vid);
+		vid &= 0xfff;
+		vid |= new_pvid;
+		ksz_pwrite16(dev, port, REG_PORT_CTRL_VID, vid);
+	}
+}
+
+static int ksz8795_port_vlan_del(struct dsa_switch *ds, int port,
+				 const struct switchdev_obj_port_vlan *vlan)
+{
+	bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
+	struct ksz_device *dev = ds->priv;
+	u16 data, vid, pvid, new_pvid = 0;
+	u8 fid, member, valid;
+
+	ksz_pread16(dev, port, REG_PORT_CTRL_VID, &pvid);
+	pvid = pvid & 0xFFF;
+
+	ksz_port_cfg(dev, port, P_TAG_CTRL, PORT_REMOVE_TAG, untagged);
+
+	for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) {
+		ksz8795_r_vlan_table(dev, vid, &data);
+		ksz8795_from_vlan(data, &fid, &member, &valid);
+
+		member &= ~BIT(port);
+
+		/* Invalidate the entry if no more member. */
+		if (!member) {
+			fid = 0;
+			valid = 0;
+		}
+
+		if (pvid == vid)
+			new_pvid = 1;
+
+		ksz8795_to_vlan(fid, member, valid, &data);
+		ksz8795_w_vlan_table(dev, vid, data);
+	}
+
+	if (new_pvid != pvid)
+		ksz_pwrite16(dev, port, REG_PORT_CTRL_VID, pvid);
+
+	return 0;
+}
+
+static int ksz8795_port_mirror_add(struct dsa_switch *ds, int port,
+				   struct dsa_mall_mirror_tc_entry *mirror,
+				   bool ingress)
+{
+	struct ksz_device *dev = ds->priv;
+
+	if (ingress) {
+		ksz_port_cfg(dev, port, P_MIRROR_CTRL, PORT_MIRROR_RX, true);
+		dev->mirror_rx |= BIT(port);
+	} else {
+		ksz_port_cfg(dev, port, P_MIRROR_CTRL, PORT_MIRROR_TX, true);
+		dev->mirror_tx |= BIT(port);
+	}
+
+	ksz_port_cfg(dev, port, P_MIRROR_CTRL, PORT_MIRROR_SNIFFER, false);
+
+	/* configure mirror port */
+	if (dev->mirror_rx || dev->mirror_tx)
+		ksz_port_cfg(dev, mirror->to_local_port, P_MIRROR_CTRL,
+			     PORT_MIRROR_SNIFFER, true);
+
+	return 0;
+}
+
+static void ksz8795_port_mirror_del(struct dsa_switch *ds, int port,
+				    struct dsa_mall_mirror_tc_entry *mirror)
+{
+	struct ksz_device *dev = ds->priv;
+	u8 data;
+
+	if (mirror->ingress) {
+		ksz_port_cfg(dev, port, P_MIRROR_CTRL, PORT_MIRROR_RX, false);
+		dev->mirror_rx &= ~BIT(port);
+	} else {
+		ksz_port_cfg(dev, port, P_MIRROR_CTRL, PORT_MIRROR_TX, false);
+		dev->mirror_tx &= ~BIT(port);
+	}
+
+	ksz_pread8(dev, port, P_MIRROR_CTRL, &data);
+
+	if (!dev->mirror_rx && !dev->mirror_tx)
+		ksz_port_cfg(dev, mirror->to_local_port, P_MIRROR_CTRL,
+			     PORT_MIRROR_SNIFFER, false);
+}
+
+static void ksz8795_phy_setup(struct ksz_device *dev, int port,
+			      struct phy_device *phy)
+{
+	if (port < dev->phy_port_cnt) {
+		/*
+		 * SUPPORTED_Asym_Pause and SUPPORTED_Pause can be removed to
+		 * disable flow control when rate limiting is used.
+		 */
+		linkmode_copy(phy->advertising, phy->supported);
+	}
+}
+
+static void ksz8795_port_setup(struct ksz_device *dev, int port, bool cpu_port)
+{
+	struct ksz_port *p = &dev->ports[port];
+	u8 data8, member;
+
+	/* enable broadcast storm limit */
+	ksz_port_cfg(dev, port, P_BCAST_STORM_CTRL, PORT_BROADCAST_STORM, true);
+
+	ksz8795_set_prio_queue(dev, port, 4);
+
+	/* disable DiffServ priority */
+	ksz_port_cfg(dev, port, P_PRIO_CTRL, PORT_DIFFSERV_ENABLE, false);
+
+	/* replace priority */
+	ksz_port_cfg(dev, port, P_802_1P_CTRL, PORT_802_1P_REMAPPING, false);
+
+	/* enable 802.1p priority */
+	ksz_port_cfg(dev, port, P_PRIO_CTRL, PORT_802_1P_ENABLE, true);
+
+	if (cpu_port) {
+		/* Configure MII interface for proper network communication. */
+		ksz_read8(dev, REG_PORT_5_CTRL_6, &data8);
+		data8 &= ~PORT_INTERFACE_TYPE;
+		data8 &= ~PORT_GMII_1GPS_MODE;
+		switch (dev->interface) {
+		case PHY_INTERFACE_MODE_MII:
+			p->phydev.speed = SPEED_100;
+			break;
+		case PHY_INTERFACE_MODE_RMII:
+			data8 |= PORT_INTERFACE_RMII;
+			p->phydev.speed = SPEED_100;
+			break;
+		case PHY_INTERFACE_MODE_GMII:
+			data8 |= PORT_GMII_1GPS_MODE;
+			data8 |= PORT_INTERFACE_GMII;
+			p->phydev.speed = SPEED_1000;
+			break;
+		default:
+			data8 &= ~PORT_RGMII_ID_IN_ENABLE;
+			data8 &= ~PORT_RGMII_ID_OUT_ENABLE;
+			if (dev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
+			    dev->interface == PHY_INTERFACE_MODE_RGMII_RXID)
+				data8 |= PORT_RGMII_ID_IN_ENABLE;
+			if (dev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
+			    dev->interface == PHY_INTERFACE_MODE_RGMII_TXID)
+				data8 |= PORT_RGMII_ID_OUT_ENABLE;
+			data8 |= PORT_GMII_1GPS_MODE;
+			data8 |= PORT_INTERFACE_RGMII;
+			p->phydev.speed = SPEED_1000;
+			break;
+		}
+		ksz_write8(dev, REG_PORT_5_CTRL_6, data8);
+		p->phydev.duplex = 1;
+
+		member = dev->port_mask;
+		dev->on_ports = dev->host_mask;
+		dev->live_ports = dev->host_mask;
+	} else {
+		member = dev->host_mask | p->vid_member;
+		dev->on_ports |= BIT(port);
+
+		/* Link was detected before port is enabled. */
+		if (p->phydev.link)
+			dev->live_ports |= BIT(port);
+	}
+	ksz8795_cfg_port_member(dev, port, member);
+}
+
+static void ksz8795_config_cpu_port(struct dsa_switch *ds)
+{
+	struct ksz_device *dev = ds->priv;
+	struct ksz_port *p;
+	u8 remote;
+	int i;
+
+	ds->num_ports = dev->port_cnt + 1;
+
+	/* Switch marks the maximum frame with extra byte as oversize. */
+	ksz_cfg(dev, REG_SW_CTRL_2, SW_LEGAL_PACKET_DISABLE, true);
+	ksz_cfg(dev, S_TAIL_TAG_CTRL, SW_TAIL_TAG_ENABLE, true);
+
+	p = &dev->ports[dev->cpu_port];
+	p->vid_member = dev->port_mask;
+	p->on = 1;
+
+	ksz8795_port_setup(dev, dev->cpu_port, true);
+	dev->member = dev->host_mask;
+
+	for (i = 0; i < SWITCH_PORT_NUM; i++) {
+		p = &dev->ports[i];
+
+		/* Initialize to non-zero so that ksz_cfg_port_member() will
+		 * be called.
+		 */
+		p->vid_member = BIT(i);
+		p->member = dev->port_mask;
+		ksz8795_port_stp_state_set(ds, i, BR_STATE_DISABLED);
+
+		/* Last port may be disabled. */
+		if (i == dev->port_cnt)
+			break;
+		p->on = 1;
+		p->phy = 1;
+	}
+	for (i = 0; i < dev->phy_port_cnt; i++) {
+		p = &dev->ports[i];
+		if (!p->on)
+			continue;
+		ksz_pread8(dev, i, P_REMOTE_STATUS, &remote);
+		if (remote & PORT_FIBER_MODE)
+			p->fiber = 1;
+		if (p->fiber)
+			ksz_port_cfg(dev, i, P_STP_CTRL, PORT_FORCE_FLOW_CTRL,
+				     true);
+		else
+			ksz_port_cfg(dev, i, P_STP_CTRL, PORT_FORCE_FLOW_CTRL,
+				     false);
+	}
+}
+
+static int ksz8795_setup(struct dsa_switch *ds)
+{
+	struct ksz_device *dev = ds->priv;
+	struct alu_struct alu;
+	int i, ret = 0;
+
+	dev->vlan_cache = devm_kcalloc(dev->dev, sizeof(struct vlan_table),
+				       dev->num_vlans, GFP_KERNEL);
+	if (!dev->vlan_cache)
+		return -ENOMEM;
+
+	ret = ksz8795_reset_switch(dev);
+	if (ret) {
+		dev_err(ds->dev, "failed to reset switch\n");
+		return ret;
+	}
+
+	ksz_cfg(dev, S_REPLACE_VID_CTRL, SW_FLOW_CTRL, true);
+
+	/* Enable automatic fast aging when link changed detected. */
+	ksz_cfg(dev, S_LINK_AGING_CTRL, SW_LINK_AUTO_AGING, true);
+
+	/* Enable aggressive back off algorithm in half duplex mode. */
+	regmap_update_bits(dev->regmap[0], REG_SW_CTRL_1,
+			   SW_AGGR_BACKOFF, SW_AGGR_BACKOFF);
+
+	/*
+	 * Make sure unicast VLAN boundary is set as default and
+	 * enable no excessive collision drop.
+	 */
+	regmap_update_bits(dev->regmap[0], REG_SW_CTRL_2,
+			   UNICAST_VLAN_BOUNDARY | NO_EXC_COLLISION_DROP,
+			   UNICAST_VLAN_BOUNDARY | NO_EXC_COLLISION_DROP);
+
+	ksz8795_config_cpu_port(ds);
+
+	ksz_cfg(dev, REG_SW_CTRL_2, MULTICAST_STORM_DISABLE, true);
+
+	ksz_cfg(dev, S_REPLACE_VID_CTRL, SW_REPLACE_VID, false);
+
+	ksz_cfg(dev, S_MIRROR_CTRL, SW_MIRROR_RX_TX, false);
+
+	/* set broadcast storm protection 10% rate */
+	regmap_update_bits(dev->regmap[1], S_REPLACE_VID_CTRL,
+			   BROADCAST_STORM_RATE,
+			   (BROADCAST_STORM_VALUE *
+			   BROADCAST_STORM_PROT_RATE) / 100);
+
+	for (i = 0; i < VLAN_TABLE_ENTRIES; i++)
+		ksz8795_r_vlan_entries(dev, i);
+
+	/* Setup STP address for STP operation. */
+	memset(&alu, 0, sizeof(alu));
+	ether_addr_copy(alu.mac, eth_stp_addr);
+	alu.is_static = true;
+	alu.is_override = true;
+	alu.port_forward = dev->host_mask;
+
+	ksz8795_w_sta_mac_table(dev, 0, &alu);
+
+	ksz_init_mib_timer(dev);
+
+	return 0;
+}
+
+static const struct dsa_switch_ops ksz8795_switch_ops = {
+	.get_tag_protocol	= ksz8795_get_tag_protocol,
+	.setup			= ksz8795_setup,
+	.phy_read		= ksz_phy_read16,
+	.phy_write		= ksz_phy_write16,
+	.adjust_link		= ksz_adjust_link,
+	.port_enable		= ksz_enable_port,
+	.port_disable		= ksz_disable_port,
+	.get_strings		= ksz8795_get_strings,
+	.get_ethtool_stats	= ksz_get_ethtool_stats,
+	.get_sset_count		= ksz_sset_count,
+	.port_bridge_join	= ksz_port_bridge_join,
+	.port_bridge_leave	= ksz_port_bridge_leave,
+	.port_stp_state_set	= ksz8795_port_stp_state_set,
+	.port_fast_age		= ksz_port_fast_age,
+	.port_vlan_filtering	= ksz8795_port_vlan_filtering,
+	.port_vlan_prepare	= ksz_port_vlan_prepare,
+	.port_vlan_add		= ksz8795_port_vlan_add,
+	.port_vlan_del		= ksz8795_port_vlan_del,
+	.port_fdb_dump		= ksz_port_fdb_dump,
+	.port_mdb_prepare       = ksz_port_mdb_prepare,
+	.port_mdb_add           = ksz_port_mdb_add,
+	.port_mdb_del           = ksz_port_mdb_del,
+	.port_mirror_add	= ksz8795_port_mirror_add,
+	.port_mirror_del	= ksz8795_port_mirror_del,
+};
+
+static u32 ksz8795_get_port_addr(int port, int offset)
+{
+	return PORT_CTRL_ADDR(port, offset);
+}
+
+static int ksz8795_switch_detect(struct ksz_device *dev)
+{
+	u8 id1, id2;
+	u16 id16;
+	int ret;
+
+	/* read chip id */
+	ret = ksz_read16(dev, REG_CHIP_ID0, &id16);
+	if (ret)
+		return ret;
+
+	id1 = id16 >> 8;
+	id2 = id16 & SW_CHIP_ID_M;
+	if (id1 != FAMILY_ID ||
+	    (id2 != CHIP_ID_94 && id2 != CHIP_ID_95))
+		return -ENODEV;
+
+	dev->mib_port_cnt = TOTAL_PORT_NUM;
+	dev->phy_port_cnt = SWITCH_PORT_NUM;
+	dev->port_cnt = SWITCH_PORT_NUM;
+
+	if (id2 == CHIP_ID_95) {
+		u8 val;
+
+		id2 = 0x95;
+		ksz_read8(dev, REG_PORT_1_STATUS_0, &val);
+		if (val & PORT_FIBER_MODE)
+			id2 = 0x65;
+	} else if (id2 == CHIP_ID_94) {
+		dev->port_cnt--;
+		dev->last_port = dev->port_cnt;
+		id2 = 0x94;
+	}
+	id16 &= ~0xff;
+	id16 |= id2;
+	dev->chip_id = id16;
+
+	dev->cpu_port = dev->mib_port_cnt - 1;
+	dev->host_mask = BIT(dev->cpu_port);
+
+	return 0;
+}
+
+struct ksz_chip_data {
+	u16 chip_id;
+	const char *dev_name;
+	int num_vlans;
+	int num_alus;
+	int num_statics;
+	int cpu_ports;
+	int port_cnt;
+};
+
+static const struct ksz_chip_data ksz8795_switch_chips[] = {
+	{
+		.chip_id = 0x8795,
+		.dev_name = "KSZ8795",
+		.num_vlans = 4096,
+		.num_alus = 0,
+		.num_statics = 8,
+		.cpu_ports = 0x10,	/* can be configured as cpu port */
+		.port_cnt = 4,		/* total physical port count */
+	},
+	{
+		.chip_id = 0x8794,
+		.dev_name = "KSZ8794",
+		.num_vlans = 4096,
+		.num_alus = 0,
+		.num_statics = 8,
+		.cpu_ports = 0x10,	/* can be configured as cpu port */
+		.port_cnt = 3,		/* total physical port count */
+	},
+	{
+		.chip_id = 0x8765,
+		.dev_name = "KSZ8765",
+		.num_vlans = 4096,
+		.num_alus = 0,
+		.num_statics = 8,
+		.cpu_ports = 0x10,	/* can be configured as cpu port */
+		.port_cnt = 4,		/* total physical port count */
+	},
+};
+
+static int ksz8795_switch_init(struct ksz_device *dev)
+{
+	int i;
+
+	mutex_init(&dev->stats_mutex);
+	mutex_init(&dev->alu_mutex);
+	mutex_init(&dev->vlan_mutex);
+
+	dev->ds->ops = &ksz8795_switch_ops;
+
+	for (i = 0; i < ARRAY_SIZE(ksz8795_switch_chips); i++) {
+		const struct ksz_chip_data *chip = &ksz8795_switch_chips[i];
+
+		if (dev->chip_id == chip->chip_id) {
+			dev->name = chip->dev_name;
+			dev->num_vlans = chip->num_vlans;
+			dev->num_alus = chip->num_alus;
+			dev->num_statics = chip->num_statics;
+			dev->port_cnt = chip->port_cnt;
+			dev->cpu_ports = chip->cpu_ports;
+
+			break;
+		}
+	}
+
+	/* no switch found */
+	if (!dev->cpu_ports)
+		return -ENODEV;
+
+	dev->port_mask = BIT(dev->port_cnt) - 1;
+	dev->port_mask |= dev->host_mask;
+
+	dev->reg_mib_cnt = SWITCH_COUNTER_NUM;
+	dev->mib_cnt = TOTAL_SWITCH_COUNTER_NUM;
+
+	i = dev->mib_port_cnt;
+	dev->ports = devm_kzalloc(dev->dev, sizeof(struct ksz_port) * i,
+				  GFP_KERNEL);
+	if (!dev->ports)
+		return -ENOMEM;
+	for (i = 0; i < dev->mib_port_cnt; i++) {
+		mutex_init(&dev->ports[i].mib.cnt_mutex);
+		dev->ports[i].mib.counters =
+			devm_kzalloc(dev->dev,
+				     sizeof(u64) *
+				     (TOTAL_SWITCH_COUNTER_NUM + 1),
+				     GFP_KERNEL);
+		if (!dev->ports[i].mib.counters)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void ksz8795_switch_exit(struct ksz_device *dev)
+{
+	ksz8795_reset_switch(dev);
+}
+
+static const struct ksz_dev_ops ksz8795_dev_ops = {
+	.get_port_addr = ksz8795_get_port_addr,
+	.cfg_port_member = ksz8795_cfg_port_member,
+	.flush_dyn_mac_table = ksz8795_flush_dyn_mac_table,
+	.phy_setup = ksz8795_phy_setup,
+	.port_setup = ksz8795_port_setup,
+	.r_phy = ksz8795_r_phy,
+	.w_phy = ksz8795_w_phy,
+	.r_dyn_mac_table = ksz8795_r_dyn_mac_table,
+	.r_sta_mac_table = ksz8795_r_sta_mac_table,
+	.w_sta_mac_table = ksz8795_w_sta_mac_table,
+	.r_mib_cnt = ksz8795_r_mib_cnt,
+	.r_mib_pkt = ksz8795_r_mib_pkt,
+	.freeze_mib = ksz8795_freeze_mib,
+	.port_init_cnt = ksz8795_port_init_cnt,
+	.shutdown = ksz8795_reset_switch,
+	.detect = ksz8795_switch_detect,
+	.init = ksz8795_switch_init,
+	.exit = ksz8795_switch_exit,
+};
+
+int ksz8795_switch_register(struct ksz_device *dev)
+{
+	return ksz_switch_register(dev, &ksz8795_dev_ops);
+}
+EXPORT_SYMBOL(ksz8795_switch_register);
+
+MODULE_AUTHOR("Tristram Ha <Tristram.Ha@microchip.com>");
+MODULE_DESCRIPTION("Microchip KSZ8795 Series Switch DSA Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/dsa/microchip/ksz8795_reg.h b/drivers/net/dsa/microchip/ksz8795_reg.h
new file mode 100644
index 000000000000..3a50462df8fa
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz8795_reg.h
@@ -0,0 +1,1004 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Microchip KSZ8795 register definitions
+ *
+ * Copyright (c) 2017 Microchip Technology Inc.
+ *	Tristram Ha <Tristram.Ha@microchip.com>
+ */
+
+#ifndef __KSZ8795_REG_H
+#define __KSZ8795_REG_H
+
+#define KS_PORT_M			0x1F
+
+#define KS_PRIO_M			0x3
+#define KS_PRIO_S			2
+
+#define REG_CHIP_ID0			0x00
+
+#define FAMILY_ID			0x87
+
+#define REG_CHIP_ID1			0x01
+
+#define SW_CHIP_ID_M			0xF0
+#define SW_CHIP_ID_S			4
+#define SW_REVISION_M			0x0E
+#define SW_REVISION_S			1
+#define SW_START			0x01
+
+#define CHIP_ID_94			0x60
+#define CHIP_ID_95			0x90
+
+#define REG_SW_CTRL_0			0x02
+
+#define SW_NEW_BACKOFF			BIT(7)
+#define SW_GLOBAL_RESET			BIT(6)
+#define SW_FLUSH_DYN_MAC_TABLE		BIT(5)
+#define SW_FLUSH_STA_MAC_TABLE		BIT(4)
+#define SW_LINK_AUTO_AGING		BIT(0)
+
+#define REG_SW_CTRL_1			0x03
+
+#define SW_HUGE_PACKET			BIT(6)
+#define SW_TX_FLOW_CTRL_DISABLE		BIT(5)
+#define SW_RX_FLOW_CTRL_DISABLE		BIT(4)
+#define SW_CHECK_LENGTH			BIT(3)
+#define SW_AGING_ENABLE			BIT(2)
+#define SW_FAST_AGING			BIT(1)
+#define SW_AGGR_BACKOFF			BIT(0)
+
+#define REG_SW_CTRL_2			0x04
+
+#define UNICAST_VLAN_BOUNDARY		BIT(7)
+#define MULTICAST_STORM_DISABLE		BIT(6)
+#define SW_BACK_PRESSURE		BIT(5)
+#define FAIR_FLOW_CTRL			BIT(4)
+#define NO_EXC_COLLISION_DROP		BIT(3)
+#define SW_LEGAL_PACKET_DISABLE		BIT(1)
+
+#define REG_SW_CTRL_3			0x05
+ #define WEIGHTED_FAIR_QUEUE_ENABLE	BIT(3)
+
+#define SW_VLAN_ENABLE			BIT(7)
+#define SW_IGMP_SNOOP			BIT(6)
+#define SW_MIRROR_RX_TX			BIT(0)
+
+#define REG_SW_CTRL_4			0x06
+
+#define SW_HALF_DUPLEX_FLOW_CTRL	BIT(7)
+#define SW_HALF_DUPLEX			BIT(6)
+#define SW_FLOW_CTRL			BIT(5)
+#define SW_10_MBIT			BIT(4)
+#define SW_REPLACE_VID			BIT(3)
+#define BROADCAST_STORM_RATE_HI		0x07
+
+#define REG_SW_CTRL_5			0x07
+
+#define BROADCAST_STORM_RATE_LO		0xFF
+#define BROADCAST_STORM_RATE		0x07FF
+
+#define REG_SW_CTRL_6			0x08
+
+#define SW_MIB_COUNTER_FLUSH		BIT(7)
+#define SW_MIB_COUNTER_FREEZE		BIT(6)
+#define SW_MIB_COUNTER_CTRL_ENABLE	KS_PORT_M
+
+#define REG_SW_CTRL_9			0x0B
+
+#define SPI_CLK_125_MHZ			0x80
+#define SPI_CLK_62_5_MHZ		0x40
+#define SPI_CLK_31_25_MHZ		0x00
+
+#define SW_LED_MODE_M			0x3
+#define SW_LED_MODE_S			4
+#define SW_LED_LINK_ACT_SPEED		0
+#define SW_LED_LINK_ACT			1
+#define SW_LED_LINK_ACT_DUPLEX		2
+#define SW_LED_LINK_DUPLEX		3
+
+#define REG_SW_CTRL_10			0x0C
+
+#define SW_TAIL_TAG_ENABLE		BIT(1)
+#define SW_PASS_PAUSE			BIT(0)
+
+#define REG_SW_CTRL_11			0x0D
+
+#define REG_POWER_MANAGEMENT_1		0x0E
+
+#define SW_PLL_POWER_DOWN		BIT(5)
+#define SW_POWER_MANAGEMENT_MODE_M	0x3
+#define SW_POWER_MANAGEMENT_MODE_S	3
+#define SW_POWER_NORMAL			0
+#define SW_ENERGY_DETECTION		1
+#define SW_SOFTWARE_POWER_DOWN		2
+
+#define REG_POWER_MANAGEMENT_2		0x0F
+
+#define REG_PORT_1_CTRL_0		0x10
+#define REG_PORT_2_CTRL_0		0x20
+#define REG_PORT_3_CTRL_0		0x30
+#define REG_PORT_4_CTRL_0		0x40
+#define REG_PORT_5_CTRL_0		0x50
+
+#define PORT_BROADCAST_STORM		BIT(7)
+#define PORT_DIFFSERV_ENABLE		BIT(6)
+#define PORT_802_1P_ENABLE		BIT(5)
+#define PORT_BASED_PRIO_S		3
+#define PORT_BASED_PRIO_M		KS_PRIO_M
+#define PORT_BASED_PRIO_0		0
+#define PORT_BASED_PRIO_1		1
+#define PORT_BASED_PRIO_2		2
+#define PORT_BASED_PRIO_3		3
+#define PORT_INSERT_TAG			BIT(2)
+#define PORT_REMOVE_TAG			BIT(1)
+#define PORT_QUEUE_SPLIT_L		BIT(0)
+
+#define REG_PORT_1_CTRL_1		0x11
+#define REG_PORT_2_CTRL_1		0x21
+#define REG_PORT_3_CTRL_1		0x31
+#define REG_PORT_4_CTRL_1		0x41
+#define REG_PORT_5_CTRL_1		0x51
+
+#define PORT_MIRROR_SNIFFER		BIT(7)
+#define PORT_MIRROR_RX			BIT(6)
+#define PORT_MIRROR_TX			BIT(5)
+#define PORT_VLAN_MEMBERSHIP		KS_PORT_M
+
+#define REG_PORT_1_CTRL_2		0x12
+#define REG_PORT_2_CTRL_2		0x22
+#define REG_PORT_3_CTRL_2		0x32
+#define REG_PORT_4_CTRL_2		0x42
+#define REG_PORT_5_CTRL_2		0x52
+
+#define PORT_802_1P_REMAPPING		BIT(7)
+#define PORT_INGRESS_FILTER		BIT(6)
+#define PORT_DISCARD_NON_VID		BIT(5)
+#define PORT_FORCE_FLOW_CTRL		BIT(4)
+#define PORT_BACK_PRESSURE		BIT(3)
+#define PORT_TX_ENABLE			BIT(2)
+#define PORT_RX_ENABLE			BIT(1)
+#define PORT_LEARN_DISABLE		BIT(0)
+
+#define REG_PORT_1_CTRL_3		0x13
+#define REG_PORT_2_CTRL_3		0x23
+#define REG_PORT_3_CTRL_3		0x33
+#define REG_PORT_4_CTRL_3		0x43
+#define REG_PORT_5_CTRL_3		0x53
+#define REG_PORT_1_CTRL_4		0x14
+#define REG_PORT_2_CTRL_4		0x24
+#define REG_PORT_3_CTRL_4		0x34
+#define REG_PORT_4_CTRL_4		0x44
+#define REG_PORT_5_CTRL_4		0x54
+
+#define PORT_DEFAULT_VID		0x0001
+
+#define REG_PORT_1_CTRL_5		0x15
+#define REG_PORT_2_CTRL_5		0x25
+#define REG_PORT_3_CTRL_5		0x35
+#define REG_PORT_4_CTRL_5		0x45
+#define REG_PORT_5_CTRL_5		0x55
+
+#define PORT_ACL_ENABLE			BIT(2)
+#define PORT_AUTHEN_MODE		0x3
+#define PORT_AUTHEN_PASS		0
+#define PORT_AUTHEN_BLOCK		1
+#define PORT_AUTHEN_TRAP		2
+
+#define REG_PORT_5_CTRL_6		0x56
+
+#define PORT_MII_INTERNAL_CLOCK		BIT(7)
+#define PORT_GMII_1GPS_MODE		BIT(6)
+#define PORT_RGMII_ID_IN_ENABLE		BIT(4)
+#define PORT_RGMII_ID_OUT_ENABLE	BIT(3)
+#define PORT_GMII_MAC_MODE		BIT(2)
+#define PORT_INTERFACE_TYPE		0x3
+#define PORT_INTERFACE_MII		0
+#define PORT_INTERFACE_RMII		1
+#define PORT_INTERFACE_GMII		2
+#define PORT_INTERFACE_RGMII		3
+
+#define REG_PORT_1_CTRL_7		0x17
+#define REG_PORT_2_CTRL_7		0x27
+#define REG_PORT_3_CTRL_7		0x37
+#define REG_PORT_4_CTRL_7		0x47
+
+#define PORT_AUTO_NEG_ASYM_PAUSE	BIT(5)
+#define PORT_AUTO_NEG_SYM_PAUSE		BIT(4)
+#define PORT_AUTO_NEG_100BTX_FD		BIT(3)
+#define PORT_AUTO_NEG_100BTX		BIT(2)
+#define PORT_AUTO_NEG_10BT_FD		BIT(1)
+#define PORT_AUTO_NEG_10BT		BIT(0)
+
+#define REG_PORT_1_STATUS_0		0x18
+#define REG_PORT_2_STATUS_0		0x28
+#define REG_PORT_3_STATUS_0		0x38
+#define REG_PORT_4_STATUS_0		0x48
+
+/* For KSZ8765. */
+#define PORT_FIBER_MODE			BIT(7)
+
+#define PORT_REMOTE_ASYM_PAUSE		BIT(5)
+#define PORT_REMOTE_SYM_PAUSE		BIT(4)
+#define PORT_REMOTE_100BTX_FD		BIT(3)
+#define PORT_REMOTE_100BTX		BIT(2)
+#define PORT_REMOTE_10BT_FD		BIT(1)
+#define PORT_REMOTE_10BT		BIT(0)
+
+#define REG_PORT_1_STATUS_1		0x19
+#define REG_PORT_2_STATUS_1		0x29
+#define REG_PORT_3_STATUS_1		0x39
+#define REG_PORT_4_STATUS_1		0x49
+
+#define PORT_HP_MDIX			BIT(7)
+#define PORT_REVERSED_POLARITY		BIT(5)
+#define PORT_TX_FLOW_CTRL		BIT(4)
+#define PORT_RX_FLOW_CTRL		BIT(3)
+#define PORT_STAT_SPEED_100MBIT		BIT(2)
+#define PORT_STAT_FULL_DUPLEX		BIT(1)
+
+#define PORT_REMOTE_FAULT		BIT(0)
+
+#define REG_PORT_1_LINK_MD_CTRL		0x1A
+#define REG_PORT_2_LINK_MD_CTRL		0x2A
+#define REG_PORT_3_LINK_MD_CTRL		0x3A
+#define REG_PORT_4_LINK_MD_CTRL		0x4A
+
+#define PORT_CABLE_10M_SHORT		BIT(7)
+#define PORT_CABLE_DIAG_RESULT_M	0x3
+#define PORT_CABLE_DIAG_RESULT_S	5
+#define PORT_CABLE_STAT_NORMAL		0
+#define PORT_CABLE_STAT_OPEN		1
+#define PORT_CABLE_STAT_SHORT		2
+#define PORT_CABLE_STAT_FAILED		3
+#define PORT_START_CABLE_DIAG		BIT(4)
+#define PORT_FORCE_LINK			BIT(3)
+#define PORT_POWER_SAVING		BIT(2)
+#define PORT_PHY_REMOTE_LOOPBACK	BIT(1)
+#define PORT_CABLE_FAULT_COUNTER_H	0x01
+
+#define REG_PORT_1_LINK_MD_RESULT	0x1B
+#define REG_PORT_2_LINK_MD_RESULT	0x2B
+#define REG_PORT_3_LINK_MD_RESULT	0x3B
+#define REG_PORT_4_LINK_MD_RESULT	0x4B
+
+#define PORT_CABLE_FAULT_COUNTER_L	0xFF
+#define PORT_CABLE_FAULT_COUNTER	0x1FF
+
+#define REG_PORT_1_CTRL_9		0x1C
+#define REG_PORT_2_CTRL_9		0x2C
+#define REG_PORT_3_CTRL_9		0x3C
+#define REG_PORT_4_CTRL_9		0x4C
+
+#define PORT_AUTO_NEG_DISABLE		BIT(7)
+#define PORT_FORCE_100_MBIT		BIT(6)
+#define PORT_FORCE_FULL_DUPLEX		BIT(5)
+
+#define REG_PORT_1_CTRL_10		0x1D
+#define REG_PORT_2_CTRL_10		0x2D
+#define REG_PORT_3_CTRL_10		0x3D
+#define REG_PORT_4_CTRL_10		0x4D
+
+#define PORT_LED_OFF			BIT(7)
+#define PORT_TX_DISABLE			BIT(6)
+#define PORT_AUTO_NEG_RESTART		BIT(5)
+#define PORT_POWER_DOWN			BIT(3)
+#define PORT_AUTO_MDIX_DISABLE		BIT(2)
+#define PORT_FORCE_MDIX			BIT(1)
+#define PORT_MAC_LOOPBACK		BIT(0)
+
+#define REG_PORT_1_STATUS_2		0x1E
+#define REG_PORT_2_STATUS_2		0x2E
+#define REG_PORT_3_STATUS_2		0x3E
+#define REG_PORT_4_STATUS_2		0x4E
+
+#define PORT_MDIX_STATUS		BIT(7)
+#define PORT_AUTO_NEG_COMPLETE		BIT(6)
+#define PORT_STAT_LINK_GOOD		BIT(5)
+
+#define REG_PORT_1_STATUS_3		0x1F
+#define REG_PORT_2_STATUS_3		0x2F
+#define REG_PORT_3_STATUS_3		0x3F
+#define REG_PORT_4_STATUS_3		0x4F
+
+#define PORT_PHY_LOOPBACK		BIT(7)
+#define PORT_PHY_ISOLATE		BIT(5)
+#define PORT_PHY_SOFT_RESET		BIT(4)
+#define PORT_PHY_FORCE_LINK		BIT(3)
+#define PORT_PHY_MODE_M			0x7
+#define PHY_MODE_IN_AUTO_NEG		1
+#define PHY_MODE_10BT_HALF		2
+#define PHY_MODE_100BT_HALF		3
+#define PHY_MODE_10BT_FULL		5
+#define PHY_MODE_100BT_FULL		6
+#define PHY_MODE_ISOLDATE		7
+
+#define REG_PORT_CTRL_0			0x00
+#define REG_PORT_CTRL_1			0x01
+#define REG_PORT_CTRL_2			0x02
+#define REG_PORT_CTRL_VID		0x03
+
+#define REG_PORT_CTRL_5			0x05
+
+#define REG_PORT_CTRL_7			0x07
+#define REG_PORT_STATUS_0		0x08
+#define REG_PORT_STATUS_1		0x09
+#define REG_PORT_LINK_MD_CTRL		0x0A
+#define REG_PORT_LINK_MD_RESULT		0x0B
+#define REG_PORT_CTRL_9			0x0C
+#define REG_PORT_CTRL_10		0x0D
+#define REG_PORT_STATUS_2		0x0E
+#define REG_PORT_STATUS_3		0x0F
+
+#define REG_PORT_CTRL_12		0xA0
+#define REG_PORT_CTRL_13		0xA1
+#define REG_PORT_RATE_CTRL_3		0xA2
+#define REG_PORT_RATE_CTRL_2		0xA3
+#define REG_PORT_RATE_CTRL_1		0xA4
+#define REG_PORT_RATE_CTRL_0		0xA5
+#define REG_PORT_RATE_LIMIT		0xA6
+#define REG_PORT_IN_RATE_0		0xA7
+#define REG_PORT_IN_RATE_1		0xA8
+#define REG_PORT_IN_RATE_2		0xA9
+#define REG_PORT_IN_RATE_3		0xAA
+#define REG_PORT_OUT_RATE_0		0xAB
+#define REG_PORT_OUT_RATE_1		0xAC
+#define REG_PORT_OUT_RATE_2		0xAD
+#define REG_PORT_OUT_RATE_3		0xAE
+
+#define PORT_CTRL_ADDR(port, addr)		\
+	((addr) + REG_PORT_1_CTRL_0 + (port) *	\
+		(REG_PORT_2_CTRL_0 - REG_PORT_1_CTRL_0))
+
+#define REG_SW_MAC_ADDR_0		0x68
+#define REG_SW_MAC_ADDR_1		0x69
+#define REG_SW_MAC_ADDR_2		0x6A
+#define REG_SW_MAC_ADDR_3		0x6B
+#define REG_SW_MAC_ADDR_4		0x6C
+#define REG_SW_MAC_ADDR_5		0x6D
+
+#define REG_IND_CTRL_0			0x6E
+
+#define TABLE_EXT_SELECT_S		5
+#define TABLE_EEE_V			1
+#define TABLE_ACL_V			2
+#define TABLE_PME_V			4
+#define TABLE_LINK_MD_V			5
+#define TABLE_EEE			(TABLE_EEE_V << TABLE_EXT_SELECT_S)
+#define TABLE_ACL			(TABLE_ACL_V << TABLE_EXT_SELECT_S)
+#define TABLE_PME			(TABLE_PME_V << TABLE_EXT_SELECT_S)
+#define TABLE_LINK_MD			(TABLE_LINK_MD << TABLE_EXT_SELECT_S)
+#define TABLE_READ			BIT(4)
+#define TABLE_SELECT_S			2
+#define TABLE_STATIC_MAC_V		0
+#define TABLE_VLAN_V			1
+#define TABLE_DYNAMIC_MAC_V		2
+#define TABLE_MIB_V			3
+#define TABLE_STATIC_MAC		(TABLE_STATIC_MAC_V << TABLE_SELECT_S)
+#define TABLE_VLAN			(TABLE_VLAN_V << TABLE_SELECT_S)
+#define TABLE_DYNAMIC_MAC		(TABLE_DYNAMIC_MAC_V << TABLE_SELECT_S)
+#define TABLE_MIB			(TABLE_MIB_V << TABLE_SELECT_S)
+
+#define REG_IND_CTRL_1			0x6F
+
+#define TABLE_ENTRY_MASK		0x03FF
+#define TABLE_EXT_ENTRY_MASK		0x0FFF
+
+#define REG_IND_DATA_8			0x70
+#define REG_IND_DATA_7			0x71
+#define REG_IND_DATA_6			0x72
+#define REG_IND_DATA_5			0x73
+#define REG_IND_DATA_4			0x74
+#define REG_IND_DATA_3			0x75
+#define REG_IND_DATA_2			0x76
+#define REG_IND_DATA_1			0x77
+#define REG_IND_DATA_0			0x78
+
+#define REG_IND_DATA_PME_EEE_ACL	0xA0
+
+#define REG_IND_DATA_CHECK		REG_IND_DATA_6
+#define REG_IND_MIB_CHECK		REG_IND_DATA_4
+#define REG_IND_DATA_HI			REG_IND_DATA_7
+#define REG_IND_DATA_LO			REG_IND_DATA_3
+
+#define REG_INT_STATUS			0x7C
+#define REG_INT_ENABLE			0x7D
+
+#define INT_PME				BIT(4)
+
+#define REG_ACL_INT_STATUS		0x7E
+#define REG_ACL_INT_ENABLE		0x7F
+
+#define INT_PORT_5			BIT(4)
+#define INT_PORT_4			BIT(3)
+#define INT_PORT_3			BIT(2)
+#define INT_PORT_2			BIT(1)
+#define INT_PORT_1			BIT(0)
+
+#define INT_PORT_ALL			\
+	(INT_PORT_5 | INT_PORT_4 | INT_PORT_3 | INT_PORT_2 | INT_PORT_1)
+
+#define REG_SW_CTRL_12			0x80
+#define REG_SW_CTRL_13			0x81
+
+#define SWITCH_802_1P_MASK		3
+#define SWITCH_802_1P_BASE		3
+#define SWITCH_802_1P_SHIFT		2
+
+#define SW_802_1P_MAP_M			KS_PRIO_M
+#define SW_802_1P_MAP_S			KS_PRIO_S
+
+#define REG_SWITCH_CTRL_14		0x82
+
+#define SW_PRIO_MAPPING_M		KS_PRIO_M
+#define SW_PRIO_MAPPING_S		6
+#define SW_PRIO_MAP_3_HI		0
+#define SW_PRIO_MAP_2_HI		2
+#define SW_PRIO_MAP_0_LO		3
+
+#define REG_SW_CTRL_15			0x83
+#define REG_SW_CTRL_16			0x84
+#define REG_SW_CTRL_17			0x85
+#define REG_SW_CTRL_18			0x86
+
+#define SW_SELF_ADDR_FILTER_ENABLE	BIT(6)
+
+#define REG_SW_UNK_UCAST_CTRL		0x83
+#define REG_SW_UNK_MCAST_CTRL		0x84
+#define REG_SW_UNK_VID_CTRL		0x85
+#define REG_SW_UNK_IP_MCAST_CTRL	0x86
+
+#define SW_UNK_FWD_ENABLE		BIT(5)
+#define SW_UNK_FWD_MAP			KS_PORT_M
+
+#define REG_SW_CTRL_19			0x87
+
+#define SW_IN_RATE_LIMIT_PERIOD_M	0x3
+#define SW_IN_RATE_LIMIT_PERIOD_S	4
+#define SW_IN_RATE_LIMIT_16_MS		0
+#define SW_IN_RATE_LIMIT_64_MS		1
+#define SW_IN_RATE_LIMIT_256_MS		2
+#define SW_OUT_RATE_LIMIT_QUEUE_BASED	BIT(3)
+#define SW_INS_TAG_ENABLE		BIT(2)
+
+#define REG_TOS_PRIO_CTRL_0		0x90
+#define REG_TOS_PRIO_CTRL_1		0x91
+#define REG_TOS_PRIO_CTRL_2		0x92
+#define REG_TOS_PRIO_CTRL_3		0x93
+#define REG_TOS_PRIO_CTRL_4		0x94
+#define REG_TOS_PRIO_CTRL_5		0x95
+#define REG_TOS_PRIO_CTRL_6		0x96
+#define REG_TOS_PRIO_CTRL_7		0x97
+#define REG_TOS_PRIO_CTRL_8		0x98
+#define REG_TOS_PRIO_CTRL_9		0x99
+#define REG_TOS_PRIO_CTRL_10		0x9A
+#define REG_TOS_PRIO_CTRL_11		0x9B
+#define REG_TOS_PRIO_CTRL_12		0x9C
+#define REG_TOS_PRIO_CTRL_13		0x9D
+#define REG_TOS_PRIO_CTRL_14		0x9E
+#define REG_TOS_PRIO_CTRL_15		0x9F
+
+#define TOS_PRIO_M			KS_PRIO_M
+#define TOS_PRIO_S			KS_PRIO_S
+
+#define REG_SW_CTRL_20			0xA3
+
+#define SW_GMII_DRIVE_STRENGTH_S	4
+#define SW_DRIVE_STRENGTH_M		0x7
+#define SW_DRIVE_STRENGTH_2MA		0
+#define SW_DRIVE_STRENGTH_4MA		1
+#define SW_DRIVE_STRENGTH_8MA		2
+#define SW_DRIVE_STRENGTH_12MA		3
+#define SW_DRIVE_STRENGTH_16MA		4
+#define SW_DRIVE_STRENGTH_20MA		5
+#define SW_DRIVE_STRENGTH_24MA		6
+#define SW_DRIVE_STRENGTH_28MA		7
+#define SW_MII_DRIVE_STRENGTH_S		0
+
+#define REG_SW_CTRL_21			0xA4
+
+#define SW_IPV6_MLD_OPTION		BIT(3)
+#define SW_IPV6_MLD_SNOOP		BIT(2)
+
+#define REG_PORT_1_CTRL_12		0xB0
+#define REG_PORT_2_CTRL_12		0xC0
+#define REG_PORT_3_CTRL_12		0xD0
+#define REG_PORT_4_CTRL_12		0xE0
+#define REG_PORT_5_CTRL_12		0xF0
+
+#define PORT_PASS_ALL			BIT(6)
+#define PORT_INS_TAG_FOR_PORT_5_S	3
+#define PORT_INS_TAG_FOR_PORT_5		BIT(3)
+#define PORT_INS_TAG_FOR_PORT_4		BIT(2)
+#define PORT_INS_TAG_FOR_PORT_3		BIT(1)
+#define PORT_INS_TAG_FOR_PORT_2		BIT(0)
+
+#define REG_PORT_1_CTRL_13		0xB1
+#define REG_PORT_2_CTRL_13		0xC1
+#define REG_PORT_3_CTRL_13		0xD1
+#define REG_PORT_4_CTRL_13		0xE1
+#define REG_PORT_5_CTRL_13		0xF1
+
+#define PORT_QUEUE_SPLIT_H		BIT(1)
+#define PORT_QUEUE_SPLIT_1		0
+#define PORT_QUEUE_SPLIT_2		1
+#define PORT_QUEUE_SPLIT_4		2
+#define PORT_DROP_TAG			BIT(0)
+
+#define REG_PORT_1_CTRL_14		0xB2
+#define REG_PORT_2_CTRL_14		0xC2
+#define REG_PORT_3_CTRL_14		0xD2
+#define REG_PORT_4_CTRL_14		0xE2
+#define REG_PORT_5_CTRL_14		0xF2
+#define REG_PORT_1_CTRL_15		0xB3
+#define REG_PORT_2_CTRL_15		0xC3
+#define REG_PORT_3_CTRL_15		0xD3
+#define REG_PORT_4_CTRL_15		0xE3
+#define REG_PORT_5_CTRL_15		0xF3
+#define REG_PORT_1_CTRL_16		0xB4
+#define REG_PORT_2_CTRL_16		0xC4
+#define REG_PORT_3_CTRL_16		0xD4
+#define REG_PORT_4_CTRL_16		0xE4
+#define REG_PORT_5_CTRL_16		0xF4
+#define REG_PORT_1_CTRL_17		0xB5
+#define REG_PORT_2_CTRL_17		0xC5
+#define REG_PORT_3_CTRL_17		0xD5
+#define REG_PORT_4_CTRL_17		0xE5
+#define REG_PORT_5_CTRL_17		0xF5
+
+#define REG_PORT_1_RATE_CTRL_3		0xB2
+#define REG_PORT_1_RATE_CTRL_2		0xB3
+#define REG_PORT_1_RATE_CTRL_1		0xB4
+#define REG_PORT_1_RATE_CTRL_0		0xB5
+#define REG_PORT_2_RATE_CTRL_3		0xC2
+#define REG_PORT_2_RATE_CTRL_2		0xC3
+#define REG_PORT_2_RATE_CTRL_1		0xC4
+#define REG_PORT_2_RATE_CTRL_0		0xC5
+#define REG_PORT_3_RATE_CTRL_3		0xD2
+#define REG_PORT_3_RATE_CTRL_2		0xD3
+#define REG_PORT_3_RATE_CTRL_1		0xD4
+#define REG_PORT_3_RATE_CTRL_0		0xD5
+#define REG_PORT_4_RATE_CTRL_3		0xE2
+#define REG_PORT_4_RATE_CTRL_2		0xE3
+#define REG_PORT_4_RATE_CTRL_1		0xE4
+#define REG_PORT_4_RATE_CTRL_0		0xE5
+#define REG_PORT_5_RATE_CTRL_3		0xF2
+#define REG_PORT_5_RATE_CTRL_2		0xF3
+#define REG_PORT_5_RATE_CTRL_1		0xF4
+#define REG_PORT_5_RATE_CTRL_0		0xF5
+
+#define RATE_CTRL_ENABLE		BIT(7)
+#define RATE_RATIO_M			(BIT(7) - 1)
+
+#define PORT_OUT_RATE_ENABLE		BIT(7)
+
+#define REG_PORT_1_RATE_LIMIT		0xB6
+#define REG_PORT_2_RATE_LIMIT		0xC6
+#define REG_PORT_3_RATE_LIMIT		0xD6
+#define REG_PORT_4_RATE_LIMIT		0xE6
+#define REG_PORT_5_RATE_LIMIT		0xF6
+
+#define PORT_IN_PORT_BASED_S		6
+#define PORT_RATE_PACKET_BASED_S	5
+#define PORT_IN_FLOW_CTRL_S		4
+#define PORT_IN_LIMIT_MODE_M		0x3
+#define PORT_IN_LIMIT_MODE_S		2
+#define PORT_COUNT_IFG_S		1
+#define PORT_COUNT_PREAMBLE_S		0
+#define PORT_IN_PORT_BASED		BIT(PORT_IN_PORT_BASED_S)
+#define PORT_RATE_PACKET_BASED		BIT(PORT_RATE_PACKET_BASED_S)
+#define PORT_IN_FLOW_CTRL		BIT(PORT_IN_FLOW_CTRL_S)
+#define PORT_IN_ALL			0
+#define PORT_IN_UNICAST			1
+#define PORT_IN_MULTICAST		2
+#define PORT_IN_BROADCAST		3
+#define PORT_COUNT_IFG			BIT(PORT_COUNT_IFG_S)
+#define PORT_COUNT_PREAMBLE		BIT(PORT_COUNT_PREAMBLE_S)
+
+#define REG_PORT_1_IN_RATE_0		0xB7
+#define REG_PORT_2_IN_RATE_0		0xC7
+#define REG_PORT_3_IN_RATE_0		0xD7
+#define REG_PORT_4_IN_RATE_0		0xE7
+#define REG_PORT_5_IN_RATE_0		0xF7
+#define REG_PORT_1_IN_RATE_1		0xB8
+#define REG_PORT_2_IN_RATE_1		0xC8
+#define REG_PORT_3_IN_RATE_1		0xD8
+#define REG_PORT_4_IN_RATE_1		0xE8
+#define REG_PORT_5_IN_RATE_1		0xF8
+#define REG_PORT_1_IN_RATE_2		0xB9
+#define REG_PORT_2_IN_RATE_2		0xC9
+#define REG_PORT_3_IN_RATE_2		0xD9
+#define REG_PORT_4_IN_RATE_2		0xE9
+#define REG_PORT_5_IN_RATE_2		0xF9
+#define REG_PORT_1_IN_RATE_3		0xBA
+#define REG_PORT_2_IN_RATE_3		0xCA
+#define REG_PORT_3_IN_RATE_3		0xDA
+#define REG_PORT_4_IN_RATE_3		0xEA
+#define REG_PORT_5_IN_RATE_3		0xFA
+
+#define PORT_IN_RATE_ENABLE		BIT(7)
+#define PORT_RATE_LIMIT_M		(BIT(7) - 1)
+
+#define REG_PORT_1_OUT_RATE_0		0xBB
+#define REG_PORT_2_OUT_RATE_0		0xCB
+#define REG_PORT_3_OUT_RATE_0		0xDB
+#define REG_PORT_4_OUT_RATE_0		0xEB
+#define REG_PORT_5_OUT_RATE_0		0xFB
+#define REG_PORT_1_OUT_RATE_1		0xBC
+#define REG_PORT_2_OUT_RATE_1		0xCC
+#define REG_PORT_3_OUT_RATE_1		0xDC
+#define REG_PORT_4_OUT_RATE_1		0xEC
+#define REG_PORT_5_OUT_RATE_1		0xFC
+#define REG_PORT_1_OUT_RATE_2		0xBD
+#define REG_PORT_2_OUT_RATE_2		0xCD
+#define REG_PORT_3_OUT_RATE_2		0xDD
+#define REG_PORT_4_OUT_RATE_2		0xED
+#define REG_PORT_5_OUT_RATE_2		0xFD
+#define REG_PORT_1_OUT_RATE_3		0xBE
+#define REG_PORT_2_OUT_RATE_3		0xCE
+#define REG_PORT_3_OUT_RATE_3		0xDE
+#define REG_PORT_4_OUT_RATE_3		0xEE
+#define REG_PORT_5_OUT_RATE_3		0xFE
+
+/* PME */
+
+#define SW_PME_OUTPUT_ENABLE		BIT(1)
+#define SW_PME_ACTIVE_HIGH		BIT(0)
+
+#define PORT_MAGIC_PACKET_DETECT	BIT(2)
+#define PORT_LINK_UP_DETECT		BIT(1)
+#define PORT_ENERGY_DETECT		BIT(0)
+
+/* ACL */
+
+#define ACL_FIRST_RULE_M		0xF
+
+#define ACL_MODE_M			0x3
+#define ACL_MODE_S			4
+#define ACL_MODE_DISABLE		0
+#define ACL_MODE_LAYER_2		1
+#define ACL_MODE_LAYER_3		2
+#define ACL_MODE_LAYER_4		3
+#define ACL_ENABLE_M			0x3
+#define ACL_ENABLE_S			2
+#define ACL_ENABLE_2_COUNT		0
+#define ACL_ENABLE_2_TYPE		1
+#define ACL_ENABLE_2_MAC		2
+#define ACL_ENABLE_2_BOTH		3
+#define ACL_ENABLE_3_IP			1
+#define ACL_ENABLE_3_SRC_DST_COMP	2
+#define ACL_ENABLE_4_PROTOCOL		0
+#define ACL_ENABLE_4_TCP_PORT_COMP	1
+#define ACL_ENABLE_4_UDP_PORT_COMP	2
+#define ACL_ENABLE_4_TCP_SEQN_COMP	3
+#define ACL_SRC				BIT(1)
+#define ACL_EQUAL			BIT(0)
+
+#define ACL_MAX_PORT			0xFFFF
+
+#define ACL_MIN_PORT			0xFFFF
+#define ACL_IP_ADDR			0xFFFFFFFF
+#define ACL_TCP_SEQNUM			0xFFFFFFFF
+
+#define ACL_RESERVED			0xF8
+#define ACL_PORT_MODE_M			0x3
+#define ACL_PORT_MODE_S			1
+#define ACL_PORT_MODE_DISABLE		0
+#define ACL_PORT_MODE_EITHER		1
+#define ACL_PORT_MODE_IN_RANGE		2
+#define ACL_PORT_MODE_OUT_OF_RANGE	3
+
+#define ACL_TCP_FLAG_ENABLE		BIT(0)
+
+#define ACL_TCP_FLAG_M			0xFF
+
+#define ACL_TCP_FLAG			0xFF
+#define ACL_ETH_TYPE			0xFFFF
+#define ACL_IP_M			0xFFFFFFFF
+
+#define ACL_PRIO_MODE_M			0x3
+#define ACL_PRIO_MODE_S			6
+#define ACL_PRIO_MODE_DISABLE		0
+#define ACL_PRIO_MODE_HIGHER		1
+#define ACL_PRIO_MODE_LOWER		2
+#define ACL_PRIO_MODE_REPLACE		3
+#define ACL_PRIO_M			0x7
+#define ACL_PRIO_S			3
+#define ACL_VLAN_PRIO_REPLACE		BIT(2)
+#define ACL_VLAN_PRIO_M			0x7
+#define ACL_VLAN_PRIO_HI_M		0x3
+
+#define ACL_VLAN_PRIO_LO_M		0x8
+#define ACL_VLAN_PRIO_S			7
+#define ACL_MAP_MODE_M			0x3
+#define ACL_MAP_MODE_S			5
+#define ACL_MAP_MODE_DISABLE		0
+#define ACL_MAP_MODE_OR			1
+#define ACL_MAP_MODE_AND		2
+#define ACL_MAP_MODE_REPLACE		3
+#define ACL_MAP_PORT_M			0x1F
+
+#define ACL_CNT_M			(BIT(11) - 1)
+#define ACL_CNT_S			5
+#define ACL_MSEC_UNIT			BIT(4)
+#define ACL_INTR_MODE			BIT(3)
+
+#define REG_PORT_ACL_BYTE_EN_MSB	0x10
+
+#define ACL_BYTE_EN_MSB_M		0x3F
+
+#define REG_PORT_ACL_BYTE_EN_LSB	0x11
+
+#define ACL_ACTION_START		0xA
+#define ACL_ACTION_LEN			2
+#define ACL_INTR_CNT_START		0xB
+#define ACL_RULESET_START		0xC
+#define ACL_RULESET_LEN			2
+#define ACL_TABLE_LEN			14
+
+#define ACL_ACTION_ENABLE		0x000C
+#define ACL_MATCH_ENABLE		0x1FF0
+#define ACL_RULESET_ENABLE		0x2003
+#define ACL_BYTE_ENABLE			((ACL_BYTE_EN_MSB_M << 8) | 0xFF)
+#define ACL_MODE_ENABLE			(0x10 << 8)
+
+#define REG_PORT_ACL_CTRL_0		0x12
+
+#define PORT_ACL_WRITE_DONE		BIT(6)
+#define PORT_ACL_READ_DONE		BIT(5)
+#define PORT_ACL_WRITE			BIT(4)
+#define PORT_ACL_INDEX_M		0xF
+
+#define REG_PORT_ACL_CTRL_1		0x13
+
+#define PORT_ACL_FORCE_DLR_MISS		BIT(0)
+
+#ifndef PHY_REG_CTRL
+#define PHY_REG_CTRL			0
+
+#define PHY_RESET			BIT(15)
+#define PHY_LOOPBACK			BIT(14)
+#define PHY_SPEED_100MBIT		BIT(13)
+#define PHY_AUTO_NEG_ENABLE		BIT(12)
+#define PHY_POWER_DOWN			BIT(11)
+#define PHY_MII_DISABLE			BIT(10)
+#define PHY_AUTO_NEG_RESTART		BIT(9)
+#define PHY_FULL_DUPLEX			BIT(8)
+#define PHY_COLLISION_TEST_NOT		BIT(7)
+#define PHY_HP_MDIX			BIT(5)
+#define PHY_FORCE_MDIX			BIT(4)
+#define PHY_AUTO_MDIX_DISABLE		BIT(3)
+#define PHY_REMOTE_FAULT_DISABLE	BIT(2)
+#define PHY_TRANSMIT_DISABLE		BIT(1)
+#define PHY_LED_DISABLE			BIT(0)
+
+#define PHY_REG_STATUS			1
+
+#define PHY_100BT4_CAPABLE		BIT(15)
+#define PHY_100BTX_FD_CAPABLE		BIT(14)
+#define PHY_100BTX_CAPABLE		BIT(13)
+#define PHY_10BT_FD_CAPABLE		BIT(12)
+#define PHY_10BT_CAPABLE		BIT(11)
+#define PHY_MII_SUPPRESS_CAPABLE_NOT	BIT(6)
+#define PHY_AUTO_NEG_ACKNOWLEDGE	BIT(5)
+#define PHY_REMOTE_FAULT		BIT(4)
+#define PHY_AUTO_NEG_CAPABLE		BIT(3)
+#define PHY_LINK_STATUS			BIT(2)
+#define PHY_JABBER_DETECT_NOT		BIT(1)
+#define PHY_EXTENDED_CAPABILITY		BIT(0)
+
+#define PHY_REG_ID_1			2
+#define PHY_REG_ID_2			3
+
+#define PHY_REG_AUTO_NEGOTIATION	4
+
+#define PHY_AUTO_NEG_NEXT_PAGE_NOT	BIT(15)
+#define PHY_AUTO_NEG_REMOTE_FAULT_NOT	BIT(13)
+#define PHY_AUTO_NEG_SYM_PAUSE		BIT(10)
+#define PHY_AUTO_NEG_100BT4		BIT(9)
+#define PHY_AUTO_NEG_100BTX_FD		BIT(8)
+#define PHY_AUTO_NEG_100BTX		BIT(7)
+#define PHY_AUTO_NEG_10BT_FD		BIT(6)
+#define PHY_AUTO_NEG_10BT		BIT(5)
+#define PHY_AUTO_NEG_SELECTOR		0x001F
+#define PHY_AUTO_NEG_802_3		0x0001
+
+#define PHY_REG_REMOTE_CAPABILITY	5
+
+#define PHY_REMOTE_NEXT_PAGE_NOT	BIT(15)
+#define PHY_REMOTE_ACKNOWLEDGE_NOT	BIT(14)
+#define PHY_REMOTE_REMOTE_FAULT_NOT	BIT(13)
+#define PHY_REMOTE_SYM_PAUSE		BIT(10)
+#define PHY_REMOTE_100BTX_FD		BIT(8)
+#define PHY_REMOTE_100BTX		BIT(7)
+#define PHY_REMOTE_10BT_FD		BIT(6)
+#define PHY_REMOTE_10BT			BIT(5)
+#endif
+
+#define KSZ8795_ID_HI			0x0022
+#define KSZ8795_ID_LO			0x1550
+
+#define KSZ8795_SW_ID			0x8795
+
+#define PHY_REG_LINK_MD			0x1D
+
+#define PHY_START_CABLE_DIAG		BIT(15)
+#define PHY_CABLE_DIAG_RESULT		0x6000
+#define PHY_CABLE_STAT_NORMAL		0x0000
+#define PHY_CABLE_STAT_OPEN		0x2000
+#define PHY_CABLE_STAT_SHORT		0x4000
+#define PHY_CABLE_STAT_FAILED		0x6000
+#define PHY_CABLE_10M_SHORT		BIT(12)
+#define PHY_CABLE_FAULT_COUNTER		0x01FF
+
+#define PHY_REG_PHY_CTRL		0x1F
+
+#define PHY_MODE_M			0x7
+#define PHY_MODE_S			8
+#define PHY_STAT_REVERSED_POLARITY	BIT(5)
+#define PHY_STAT_MDIX			BIT(4)
+#define PHY_FORCE_LINK			BIT(3)
+#define PHY_POWER_SAVING_ENABLE		BIT(2)
+#define PHY_REMOTE_LOOPBACK		BIT(1)
+
+/* Chip resource */
+
+#define PRIO_QUEUES			4
+
+#define KS_PRIO_IN_REG			4
+
+#define TOTAL_PORT_NUM			5
+
+/* Host port can only be last of them. */
+#define SWITCH_PORT_NUM			(TOTAL_PORT_NUM - 1)
+
+#define KSZ8795_COUNTER_NUM		0x20
+#define TOTAL_KSZ8795_COUNTER_NUM	(KSZ8795_COUNTER_NUM + 4)
+
+#define SWITCH_COUNTER_NUM		KSZ8795_COUNTER_NUM
+#define TOTAL_SWITCH_COUNTER_NUM	TOTAL_KSZ8795_COUNTER_NUM
+
+/* Common names used by other drivers */
+
+#define P_BCAST_STORM_CTRL		REG_PORT_CTRL_0
+#define P_PRIO_CTRL			REG_PORT_CTRL_0
+#define P_TAG_CTRL			REG_PORT_CTRL_0
+#define P_MIRROR_CTRL			REG_PORT_CTRL_1
+#define P_802_1P_CTRL			REG_PORT_CTRL_2
+#define P_STP_CTRL			REG_PORT_CTRL_2
+#define P_LOCAL_CTRL			REG_PORT_CTRL_7
+#define P_REMOTE_STATUS			REG_PORT_STATUS_0
+#define P_FORCE_CTRL			REG_PORT_CTRL_9
+#define P_NEG_RESTART_CTRL		REG_PORT_CTRL_10
+#define P_SPEED_STATUS			REG_PORT_STATUS_1
+#define P_LINK_STATUS			REG_PORT_STATUS_2
+#define P_PASS_ALL_CTRL			REG_PORT_CTRL_12
+#define P_INS_SRC_PVID_CTRL		REG_PORT_CTRL_12
+#define P_DROP_TAG_CTRL			REG_PORT_CTRL_13
+#define P_RATE_LIMIT_CTRL		REG_PORT_RATE_LIMIT
+
+#define S_UNKNOWN_DA_CTRL		REG_SWITCH_CTRL_12
+#define S_FORWARD_INVALID_VID_CTRL	REG_FORWARD_INVALID_VID
+
+#define S_FLUSH_TABLE_CTRL		REG_SW_CTRL_0
+#define S_LINK_AGING_CTRL		REG_SW_CTRL_0
+#define S_HUGE_PACKET_CTRL		REG_SW_CTRL_1
+#define S_MIRROR_CTRL			REG_SW_CTRL_3
+#define S_REPLACE_VID_CTRL		REG_SW_CTRL_4
+#define S_PASS_PAUSE_CTRL		REG_SW_CTRL_10
+#define S_TAIL_TAG_CTRL			REG_SW_CTRL_10
+#define S_802_1P_PRIO_CTRL		REG_SW_CTRL_12
+#define S_TOS_PRIO_CTRL			REG_TOS_PRIO_CTRL_0
+#define S_IPV6_MLD_CTRL			REG_SW_CTRL_21
+
+#define IND_ACC_TABLE(table)		((table) << 8)
+
+/* Driver set switch broadcast storm protection at 10% rate. */
+#define BROADCAST_STORM_PROT_RATE	10
+
+/* 148,800 frames * 67 ms / 100 */
+#define BROADCAST_STORM_VALUE		9969
+
+/**
+ * STATIC_MAC_TABLE_ADDR		00-0000FFFF-FFFFFFFF
+ * STATIC_MAC_TABLE_FWD_PORTS		00-001F0000-00000000
+ * STATIC_MAC_TABLE_VALID		00-00200000-00000000
+ * STATIC_MAC_TABLE_OVERRIDE		00-00400000-00000000
+ * STATIC_MAC_TABLE_USE_FID		00-00800000-00000000
+ * STATIC_MAC_TABLE_FID			00-7F000000-00000000
+ */
+
+#define STATIC_MAC_TABLE_ADDR		0x0000FFFF
+#define STATIC_MAC_TABLE_FWD_PORTS	0x001F0000
+#define STATIC_MAC_TABLE_VALID		0x00200000
+#define STATIC_MAC_TABLE_OVERRIDE	0x00400000
+#define STATIC_MAC_TABLE_USE_FID	0x00800000
+#define STATIC_MAC_TABLE_FID		0x7F000000
+
+#define STATIC_MAC_FWD_PORTS_S		16
+#define STATIC_MAC_FID_S		24
+
+/**
+ * VLAN_TABLE_FID			00-007F007F-007F007F
+ * VLAN_TABLE_MEMBERSHIP		00-0F800F80-0F800F80
+ * VLAN_TABLE_VALID			00-10001000-10001000
+ */
+
+#define VLAN_TABLE_FID			0x007F
+#define VLAN_TABLE_MEMBERSHIP		0x0F80
+#define VLAN_TABLE_VALID		0x1000
+
+#define VLAN_TABLE_MEMBERSHIP_S		7
+#define VLAN_TABLE_S			16
+
+/**
+ * DYNAMIC_MAC_TABLE_ADDR		00-0000FFFF-FFFFFFFF
+ * DYNAMIC_MAC_TABLE_FID		00-007F0000-00000000
+ * DYNAMIC_MAC_TABLE_NOT_READY		00-00800000-00000000
+ * DYNAMIC_MAC_TABLE_SRC_PORT		00-07000000-00000000
+ * DYNAMIC_MAC_TABLE_TIMESTAMP		00-18000000-00000000
+ * DYNAMIC_MAC_TABLE_ENTRIES		7F-E0000000-00000000
+ * DYNAMIC_MAC_TABLE_MAC_EMPTY		80-00000000-00000000
+ */
+
+#define DYNAMIC_MAC_TABLE_ADDR		0x0000FFFF
+#define DYNAMIC_MAC_TABLE_FID		0x007F0000
+#define DYNAMIC_MAC_TABLE_SRC_PORT	0x07000000
+#define DYNAMIC_MAC_TABLE_TIMESTAMP	0x18000000
+#define DYNAMIC_MAC_TABLE_ENTRIES	0xE0000000
+
+#define DYNAMIC_MAC_TABLE_NOT_READY	0x80
+
+#define DYNAMIC_MAC_TABLE_ENTRIES_H	0x7F
+#define DYNAMIC_MAC_TABLE_MAC_EMPTY	0x80
+
+#define DYNAMIC_MAC_FID_S		16
+#define DYNAMIC_MAC_SRC_PORT_S		24
+#define DYNAMIC_MAC_TIMESTAMP_S		27
+#define DYNAMIC_MAC_ENTRIES_S		29
+#define DYNAMIC_MAC_ENTRIES_H_S		3
+
+/**
+ * MIB_COUNTER_VALUE			00-00000000-3FFFFFFF
+ * MIB_TOTAL_BYTES			00-0000000F-FFFFFFFF
+ * MIB_PACKET_DROPPED			00-00000000-0000FFFF
+ * MIB_COUNTER_VALID			00-00000020-00000000
+ * MIB_COUNTER_OVERFLOW			00-00000040-00000000
+ */
+
+#define MIB_COUNTER_OVERFLOW		BIT(6)
+#define MIB_COUNTER_VALID		BIT(5)
+
+#define MIB_COUNTER_VALUE		0x3FFFFFFF
+
+#define KS_MIB_TOTAL_RX_0		0x100
+#define KS_MIB_TOTAL_TX_0		0x101
+#define KS_MIB_PACKET_DROPPED_RX_0	0x102
+#define KS_MIB_PACKET_DROPPED_TX_0	0x103
+#define KS_MIB_TOTAL_RX_1		0x104
+#define KS_MIB_TOTAL_TX_1		0x105
+#define KS_MIB_PACKET_DROPPED_TX_1	0x106
+#define KS_MIB_PACKET_DROPPED_RX_1	0x107
+#define KS_MIB_TOTAL_RX_2		0x108
+#define KS_MIB_TOTAL_TX_2		0x109
+#define KS_MIB_PACKET_DROPPED_TX_2	0x10A
+#define KS_MIB_PACKET_DROPPED_RX_2	0x10B
+#define KS_MIB_TOTAL_RX_3		0x10C
+#define KS_MIB_TOTAL_TX_3		0x10D
+#define KS_MIB_PACKET_DROPPED_TX_3	0x10E
+#define KS_MIB_PACKET_DROPPED_RX_3	0x10F
+#define KS_MIB_TOTAL_RX_4		0x110
+#define KS_MIB_TOTAL_TX_4		0x111
+#define KS_MIB_PACKET_DROPPED_TX_4	0x112
+#define KS_MIB_PACKET_DROPPED_RX_4	0x113
+
+#define MIB_PACKET_DROPPED		0x0000FFFF
+
+#define MIB_TOTAL_BYTES_H		0x0000000F
+
+#define TAIL_TAG_OVERRIDE		BIT(6)
+#define TAIL_TAG_LOOKUP			BIT(7)
+
+#define VLAN_TABLE_ENTRIES		(4096 / 4)
+#define FID_ENTRIES			128
+
+#endif
diff --git a/drivers/net/dsa/microchip/ksz8795_spi.c b/drivers/net/dsa/microchip/ksz8795_spi.c
new file mode 100644
index 000000000000..50aa0d24effb
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz8795_spi.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Microchip KSZ8795 series register access through SPI
+ *
+ * Copyright (C) 2017 Microchip Technology Inc.
+ *	Tristram Ha <Tristram.Ha@microchip.com>
+ */
+
+#include <asm/unaligned.h>
+
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+#include <linux/spi/spi.h>
+
+#include "ksz_priv.h"
+#include "ksz_common.h"
+
+#define SPI_ADDR_SHIFT			12
+#define SPI_ADDR_ALIGN			3
+#define SPI_TURNAROUND_SHIFT		1
+
+KSZ_REGMAP_TABLE(ksz8795, 16, SPI_ADDR_SHIFT,
+		 SPI_TURNAROUND_SHIFT, SPI_ADDR_ALIGN);
+
+static int ksz8795_spi_probe(struct spi_device *spi)
+{
+	struct ksz_device *dev;
+	int i, ret;
+
+	dev = ksz_switch_alloc(&spi->dev, spi);
+	if (!dev)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(ksz8795_regmap_config); i++) {
+		dev->regmap[i] = devm_regmap_init_spi(spi,
+						      &ksz8795_regmap_config
+						      [i]);
+		if (IS_ERR(dev->regmap[i])) {
+			ret = PTR_ERR(dev->regmap[i]);
+			dev_err(&spi->dev,
+				"Failed to initialize regmap%i: %d\n",
+				ksz8795_regmap_config[i].val_bits, ret);
+			return ret;
+		}
+	}
+
+	if (spi->dev.platform_data)
+		dev->pdata = spi->dev.platform_data;
+
+	ret = ksz8795_switch_register(dev);
+
+	/* Main DSA driver may not be started yet. */
+	if (ret)
+		return ret;
+
+	spi_set_drvdata(spi, dev);
+
+	return 0;
+}
+
+static int ksz8795_spi_remove(struct spi_device *spi)
+{
+	struct ksz_device *dev = spi_get_drvdata(spi);
+
+	if (dev)
+		ksz_switch_remove(dev);
+
+	return 0;
+}
+
+static void ksz8795_spi_shutdown(struct spi_device *spi)
+{
+	struct ksz_device *dev = spi_get_drvdata(spi);
+
+	if (dev && dev->dev_ops->shutdown)
+		dev->dev_ops->shutdown(dev);
+}
+
+static const struct of_device_id ksz8795_dt_ids[] = {
+	{ .compatible = "microchip,ksz8765" },
+	{ .compatible = "microchip,ksz8794" },
+	{ .compatible = "microchip,ksz8795" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, ksz8795_dt_ids);
+
+static struct spi_driver ksz8795_spi_driver = {
+	.driver = {
+		.name	= "ksz8795-switch",
+		.owner	= THIS_MODULE,
+		.of_match_table = of_match_ptr(ksz8795_dt_ids),
+	},
+	.probe	= ksz8795_spi_probe,
+	.remove	= ksz8795_spi_remove,
+	.shutdown = ksz8795_spi_shutdown,
+};
+
+module_spi_driver(ksz8795_spi_driver);
+
+MODULE_AUTHOR("Tristram Ha <Tristram.Ha@microchip.com>");
+MODULE_DESCRIPTION("Microchip KSZ8795 Series Switch SPI Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index ee7096d8af07..84fed4a2578b 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -68,6 +68,22 @@ static inline int ksz_read32(struct ksz_device *dev, u32 reg, u32 *val)
 	return ret;
 }
 
+static inline int ksz_read64(struct ksz_device *dev, u32 reg, u64 *val)
+{
+	u32 value[2];
+	int ret;
+
+	ret = regmap_bulk_read(dev->regmap[2], reg, value, 2);
+	if (!ret) {
+		/* Ick! ToDo: Add 64bit R/W to regmap on 32bit systems */
+		value[0] = swab32(value[0]);
+		value[1] = swab32(value[1]);
+		*val = swab64((u64)*value);
+	}
+
+	return ret;
+}
+
 static inline int ksz_write8(struct ksz_device *dev, u32 reg, u8 value)
 {
 	return regmap_write(dev->regmap[0], reg, value);
@@ -83,6 +99,18 @@ static inline int ksz_write32(struct ksz_device *dev, u32 reg, u32 value)
 	return regmap_write(dev->regmap[2], reg, value);
 }
 
+static inline int ksz_write64(struct ksz_device *dev, u32 reg, u64 value)
+{
+	u32 val[2];
+
+	/* Ick! ToDo: Add 64bit R/W to regmap on 32bit systems */
+	value = swab64(value);
+	val[0] = swab32(value & 0xffffffffULL);
+	val[1] = swab32(value >> 32ULL);
+
+	return regmap_bulk_write(dev->regmap[2], reg, val, 2);
+}
+
 static inline void ksz_pread8(struct ksz_device *dev, int port, int offset,
 			      u8 *data)
 {
diff --git a/drivers/net/dsa/microchip/ksz_priv.h b/drivers/net/dsa/microchip/ksz_priv.h
index beacf0e40f42..44c16aaf775c 100644
--- a/drivers/net/dsa/microchip/ksz_priv.h
+++ b/drivers/net/dsa/microchip/ksz_priv.h
@@ -150,6 +150,7 @@ int ksz_switch_register(struct ksz_device *dev,
 			const struct ksz_dev_ops *ops);
 void ksz_switch_remove(struct ksz_device *dev);
 
+int ksz8795_switch_register(struct ksz_device *dev);
 int ksz9477_switch_register(struct ksz_device *dev);
 
 #endif
-- 
2.20.1


^ permalink raw reply related

* RE: [PATCH net-next 3/3] net: stmmac: Introducing support for Page Pool
From: Jose Abreu @ 2019-07-25 15:12 UTC (permalink / raw)
  To: Jon Hunter, Jose Abreu, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, linux-stm32@st-md-mailman.stormreply.com,
	linux-arm-kernel@lists.infradead.org
  Cc: Joao Pinto, David S . Miller, Giuseppe Cavallaro,
	Alexandre Torgue, Maxime Coquelin, Maxime Ripard, Chen-Yu Tsai,
	Robin Murphy, linux-tegra
In-Reply-To: <9e695f33-fd9f-a910-0891-2b63bd75e082@nvidia.com>

From: Jon Hunter <jonathanh@nvidia.com>
Date: Jul/25/2019, 15:25:59 (UTC+00:00)

> 
> On 25/07/2019 14:26, Jose Abreu wrote:
> 
> ...
> 
> > Well, I wasn't expecting that :/
> > 
> > Per documentation of barriers I think we should set descriptor fields 
> > and then barrier and finally ownership to HW so that remaining fields 
> > are coherent before owner is set.
> > 
> > Anyway, can you also add a dma_rmb() after the call to 
> > stmmac_rx_status() ?
> 
> Yes. I removed the debug print added the barrier, but that did not help.

So, I was finally able to setup NFS using your replicated setup and I 
can't see the issue :(

The only difference I have from yours is that I'm using TCP in NFS 
whilst you (I believe from the logs), use UDP.

You do have flow control active right ? And your HW FIFO size is >= 4k ?

---
Thanks,
Jose Miguel Abreu

^ permalink raw reply

* [PATCH bpf-next v2 0/7] bpf/flow_dissector: support input flags
From: Stanislav Fomichev @ 2019-07-25 15:33 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Song Liu,
	Willem de Bruijn, Petar Penkov

C flow dissector supports input flags that tell it to customize parsing
by either stopping early or trying to parse as deep as possible.
BPF flow dissector always parses as deep as possible which is sub-optimal.
Pass input flags to the BPF flow dissector as well so it can make the same
decisions.

Series outline:
* remove unused FLOW_DISSECTOR_F_STOP_AT_L3 flag
* export FLOW_DISSECTOR_F_XXX flags as uapi and pass them to BPF
  flow dissector
* add documentation for the export flags
* support input flags in BPF_PROG_TEST_RUN via ctx_{in,out}
* sync uapi to tools
* support FLOW_DISSECTOR_F_PARSE_1ST_FRAG in selftest
* support FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL in kernel and selftest
* support FLOW_DISSECTOR_F_STOP_AT_ENCAP in selftest

Pros:
* makes BPF flow dissector faster by avoiding burning extra cycles
* existing BPF progs continue to work by ignoring the flags and always
  parsing as deep as possible

Cons:
* new UAPI which we need to support (OTOH, if we need to deprecate some
  flags, we can just stop setting them upon calling BPF programs)

Some numbers (with .repeat = 4000000 in test_flow_dissector):
        test_flow_dissector:PASS:ipv4-frag 35 nsec
        test_flow_dissector:PASS:ipv4-frag 35 nsec
        test_flow_dissector:PASS:ipv4-no-frag 32 nsec
        test_flow_dissector:PASS:ipv4-no-frag 32 nsec

        test_flow_dissector:PASS:ipv6-frag 39 nsec
        test_flow_dissector:PASS:ipv6-frag 39 nsec
        test_flow_dissector:PASS:ipv6-no-frag 36 nsec
        test_flow_dissector:PASS:ipv6-no-frag 36 nsec

        test_flow_dissector:PASS:ipv6-flow-label 36 nsec
        test_flow_dissector:PASS:ipv6-flow-label 36 nsec
        test_flow_dissector:PASS:ipv6-no-flow-label 33 nsec
        test_flow_dissector:PASS:ipv6-no-flow-label 33 nsec

        test_flow_dissector:PASS:ipip-encap 38 nsec
        test_flow_dissector:PASS:ipip-encap 38 nsec
        test_flow_dissector:PASS:ipip-no-encap 32 nsec
        test_flow_dissector:PASS:ipip-no-encap 32 nsec

The improvement is around 10%, but it's in a tight cache-hot
BPF_PROG_TEST_RUN loop.

Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>

Stanislav Fomichev (7):
  bpf/flow_dissector: pass input flags to BPF flow dissector program
  bpf/flow_dissector: document flags
  bpf/flow_dissector: support flags in BPF_PROG_TEST_RUN
  tools/bpf: sync bpf_flow_keys flags
  selftests/bpf: support FLOW_DISSECTOR_F_PARSE_1ST_FRAG
  bpf/flow_dissector: support ipv6 flow_label and
    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL
  selftests/bpf: support FLOW_DISSECTOR_F_STOP_AT_ENCAP

 Documentation/bpf/prog_flow_dissector.rst     |  18 ++
 include/linux/skbuff.h                        |   2 +-
 include/net/flow_dissector.h                  |   4 -
 include/uapi/linux/bpf.h                      |   6 +
 net/bpf/test_run.c                            |  39 ++-
 net/core/flow_dissector.c                     |  14 +-
 tools/include/uapi/linux/bpf.h                |   6 +
 .../selftests/bpf/prog_tests/flow_dissector.c | 242 +++++++++++++++++-
 tools/testing/selftests/bpf/progs/bpf_flow.c  |  46 +++-
 9 files changed, 359 insertions(+), 18 deletions(-)

-- 
2.22.0.657.g960e92d24f-goog

^ permalink raw reply

* [PATCH bpf-next v2 1/7] bpf/flow_dissector: pass input flags to BPF flow dissector program
From: Stanislav Fomichev @ 2019-07-25 15:33 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Willem de Bruijn,
	Song Liu, Petar Penkov
In-Reply-To: <20190725153342.3571-1-sdf@google.com>

C flow dissector supports input flags that tell it to customize parsing
by either stopping early or trying to parse as deep as possible. Pass
those flags to the BPF flow dissector so it can make the same
decisions. In the next commits I'll add support for those flags to
our reference bpf_flow.c

Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 include/linux/skbuff.h       | 2 +-
 include/net/flow_dissector.h | 4 ----
 include/uapi/linux/bpf.h     | 5 +++++
 net/bpf/test_run.c           | 2 +-
 net/core/flow_dissector.c    | 5 +++--
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 718742b1c505..9b7a8038beec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1271,7 +1271,7 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
 
 struct bpf_flow_dissector;
 bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
-		      __be16 proto, int nhoff, int hlen);
+		      __be16 proto, int nhoff, int hlen, unsigned int flags);
 
 bool __skb_flow_dissect(const struct net *net,
 			const struct sk_buff *skb,
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 90bd210be060..3e2642587b76 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -253,10 +253,6 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_MAX,
 };
 
-#define FLOW_DISSECTOR_F_PARSE_1ST_FRAG		BIT(0)
-#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	BIT(1)
-#define FLOW_DISSECTOR_F_STOP_AT_ENCAP		BIT(2)
-
 struct flow_dissector_key {
 	enum flow_dissector_key_id key_id;
 	size_t offset; /* offset of struct flow_dissector_key_*
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fa1c753dcdbc..b4ad19bd6aa8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3507,6 +3507,10 @@ enum bpf_task_fd_type {
 	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
 };
 
+#define FLOW_DISSECTOR_F_PARSE_1ST_FRAG		(1U << 0)
+#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	(1U << 1)
+#define FLOW_DISSECTOR_F_STOP_AT_ENCAP		(1U << 2)
+
 struct bpf_flow_keys {
 	__u16	nhoff;
 	__u16	thoff;
@@ -3528,6 +3532,7 @@ struct bpf_flow_keys {
 			__u32	ipv6_dst[4];	/* in6_addr; network order */
 		};
 	};
+	__u32	flags;
 };
 
 struct bpf_func_info {
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 80e6f3a6864d..4e41d15a1098 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -419,7 +419,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	time_start = ktime_get_ns();
 	for (i = 0; i < repeat; i++) {
 		retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
-					  size);
+					  size, 0);
 
 		if (signal_pending(current)) {
 			preempt_enable();
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3e6fedb57bc1..a74c4ed1b30d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -784,7 +784,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
 }
 
 bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
-		      __be16 proto, int nhoff, int hlen)
+		      __be16 proto, int nhoff, int hlen, unsigned int flags)
 {
 	struct bpf_flow_keys *flow_keys = ctx->flow_keys;
 	u32 result;
@@ -794,6 +794,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
 	flow_keys->n_proto = proto;
 	flow_keys->nhoff = nhoff;
 	flow_keys->thoff = flow_keys->nhoff;
+	flow_keys->flags = flags;
 
 	preempt_disable();
 	result = BPF_PROG_RUN(prog, ctx);
@@ -914,7 +915,7 @@ bool __skb_flow_dissect(const struct net *net,
 			}
 
 			ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff,
-					       hlen);
+					       hlen, flags);
 			__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
 						 target_container);
 			rcu_read_unlock();
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related

* [PATCH bpf-next v2 2/7] bpf/flow_dissector: document flags
From: Stanislav Fomichev @ 2019-07-25 15:33 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Willem de Bruijn,
	Song Liu, Petar Penkov
In-Reply-To: <20190725153342.3571-1-sdf@google.com>

Describe what each input flag does and who uses it.

Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 Documentation/bpf/prog_flow_dissector.rst | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Documentation/bpf/prog_flow_dissector.rst b/Documentation/bpf/prog_flow_dissector.rst
index ed343abe541e..0f3f380b2ce4 100644
--- a/Documentation/bpf/prog_flow_dissector.rst
+++ b/Documentation/bpf/prog_flow_dissector.rst
@@ -26,6 +26,7 @@ and output arguments.
   * ``nhoff`` - initial offset of the networking header
   * ``thoff`` - initial offset of the transport header, initialized to nhoff
   * ``n_proto`` - L3 protocol type, parsed out of L2 header
+  * ``flags`` - optional flags
 
 Flow dissector BPF program should fill out the rest of the ``struct
 bpf_flow_keys`` fields. Input arguments ``nhoff/thoff/n_proto`` should be
@@ -101,6 +102,23 @@ can be called for both cases and would have to be written carefully to
 handle both cases.
 
 
+Flags
+=====
+
+``flow_keys->flags`` might contain optional input flags that work as follows:
+
+* ``FLOW_DISSECTOR_F_PARSE_1ST_FRAG`` - tells BPF flow dissector to continue
+  parsing first fragment; the default expected behavior is that flow dissector
+  returns as soon as it finds out that the packet is fragmented;
+  used by ``eth_get_headlen`` to estimate length of all headers for GRO.
+* ``FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL`` - tells BPF flow dissector to stop
+  parsing as soon as it reaches IPv6 flow label; used by ``___skb_get_hash``
+  and ``__skb_get_hash_symmetric`` to get flow hash.
+* ``FLOW_DISSECTOR_F_STOP_AT_ENCAP`` - tells BPF flow dissector to stop
+  parsing as soon as it reaches encapsulated headers; used by routing
+  infrastructure.
+
+
 Reference Implementation
 ========================
 
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related

* [PATCH bpf-next v2 3/7] bpf/flow_dissector: support flags in BPF_PROG_TEST_RUN
From: Stanislav Fomichev @ 2019-07-25 15:33 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Willem de Bruijn,
	Song Liu, Petar Penkov
In-Reply-To: <20190725153342.3571-1-sdf@google.com>

This will allow us to write tests for those flags.

v2:
* Swap kfree(data) and kfree(user_ctx) (Song Liu)

Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 net/bpf/test_run.c | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 4e41d15a1098..1153bbcdff72 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -377,6 +377,22 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 	return ret;
 }
 
+static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx)
+{
+	/* make sure the fields we don't use are zeroed */
+	if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags)))
+		return -EINVAL;
+
+	/* flags is allowed */
+
+	if (!range_is_zero(ctx, offsetof(struct bpf_flow_keys, flags) +
+			   FIELD_SIZEOF(struct bpf_flow_keys, flags),
+			   sizeof(struct bpf_flow_keys)))
+		return -EINVAL;
+
+	return 0;
+}
+
 int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 				     const union bpf_attr *kattr,
 				     union bpf_attr __user *uattr)
@@ -384,9 +400,11 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	u32 size = kattr->test.data_size_in;
 	struct bpf_flow_dissector ctx = {};
 	u32 repeat = kattr->test.repeat;
+	struct bpf_flow_keys *user_ctx;
 	struct bpf_flow_keys flow_keys;
 	u64 time_start, time_spent = 0;
 	const struct ethhdr *eth;
+	unsigned int flags = 0;
 	u32 retval, duration;
 	void *data;
 	int ret;
@@ -395,9 +413,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR)
 		return -EINVAL;
 
-	if (kattr->test.ctx_in || kattr->test.ctx_out)
-		return -EINVAL;
-
 	if (size < ETH_HLEN)
 		return -EINVAL;
 
@@ -410,6 +425,18 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	if (!repeat)
 		repeat = 1;
 
+	user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys));
+	if (IS_ERR(user_ctx)) {
+		kfree(data);
+		return PTR_ERR(user_ctx);
+	}
+	if (user_ctx) {
+		ret = verify_user_bpf_flow_keys(user_ctx);
+		if (ret)
+			goto out;
+		flags = user_ctx->flags;
+	}
+
 	ctx.flow_keys = &flow_keys;
 	ctx.data = data;
 	ctx.data_end = (__u8 *)data + size;
@@ -419,7 +446,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	time_start = ktime_get_ns();
 	for (i = 0; i < repeat; i++) {
 		retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
-					  size, 0);
+					  size, flags);
 
 		if (signal_pending(current)) {
 			preempt_enable();
@@ -450,8 +477,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 
 	ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
 			      retval, duration);
+	if (!ret)
+		ret = bpf_ctx_finish(kattr, uattr, user_ctx,
+				     sizeof(struct bpf_flow_keys));
 
 out:
+	kfree(user_ctx);
 	kfree(data);
 	return ret;
 }
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related

* [PATCH bpf-next v2 4/7] tools/bpf: sync bpf_flow_keys flags
From: Stanislav Fomichev @ 2019-07-25 15:33 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Willem de Bruijn,
	Song Liu, Petar Penkov
In-Reply-To: <20190725153342.3571-1-sdf@google.com>

Export bpf_flow_keys flags to tools/libbpf/selftests.

Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/include/uapi/linux/bpf.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4e455018da65..a0e1c891b56f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3504,6 +3504,10 @@ enum bpf_task_fd_type {
 	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
 };
 
+#define FLOW_DISSECTOR_F_PARSE_1ST_FRAG		(1U << 0)
+#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	(1U << 1)
+#define FLOW_DISSECTOR_F_STOP_AT_ENCAP		(1U << 2)
+
 struct bpf_flow_keys {
 	__u16	nhoff;
 	__u16	thoff;
@@ -3525,6 +3529,7 @@ struct bpf_flow_keys {
 			__u32	ipv6_dst[4];	/* in6_addr; network order */
 		};
 	};
+	__u32	flags;
 };
 
 struct bpf_func_info {
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related

* [PATCH bpf-next v2 5/7] selftests/bpf: support FLOW_DISSECTOR_F_PARSE_1ST_FRAG
From: Stanislav Fomichev @ 2019-07-25 15:33 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Willem de Bruijn,
	Song Liu, Petar Penkov
In-Reply-To: <20190725153342.3571-1-sdf@google.com>

bpf_flow.c: exit early unless FLOW_DISSECTOR_F_PARSE_1ST_FRAG is passed
in flags. Also, set ip_proto earlier, this makes sure we have correct
value with fragmented packets.

Add selftest cases to test ipv4/ipv6 fragments and skip eth_get_headlen
tests that don't have FLOW_DISSECTOR_F_PARSE_1ST_FRAG flag.

eth_get_headlen calls flow dissector with
FLOW_DISSECTOR_F_PARSE_1ST_FRAG flag so we can't run tests that
have different set of input flags against it.

v2:
 * sefltests -> selftests (Willem de Bruijn)
 * Reword a comment about eth_get_headlen flags (Song Liu)

Acked-by: Willem de Bruijn <willemb@google.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 .../selftests/bpf/prog_tests/flow_dissector.c | 132 +++++++++++++++++-
 tools/testing/selftests/bpf/progs/bpf_flow.c  |  28 +++-
 2 files changed, 153 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
index c938283ac232..f93a115db650 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
@@ -5,6 +5,10 @@
 #include <linux/if_tun.h>
 #include <sys/uio.h>
 
+#ifndef IP_MF
+#define IP_MF 0x2000
+#endif
+
 #define CHECK_FLOW_KEYS(desc, got, expected)				\
 	CHECK_ATTR(memcmp(&got, &expected, sizeof(got)) != 0,		\
 	      desc,							\
@@ -49,6 +53,18 @@ struct ipv6_pkt {
 	struct tcphdr tcp;
 } __packed;
 
+struct ipv6_frag_pkt {
+	struct ethhdr eth;
+	struct ipv6hdr iph;
+	struct frag_hdr {
+		__u8 nexthdr;
+		__u8 reserved;
+		__be16 frag_off;
+		__be32 identification;
+	} ipf;
+	struct tcphdr tcp;
+} __packed;
+
 struct dvlan_ipv6_pkt {
 	struct ethhdr eth;
 	__u16 vlan_tci;
@@ -65,9 +81,11 @@ struct test {
 		struct ipv4_pkt ipv4;
 		struct svlan_ipv4_pkt svlan_ipv4;
 		struct ipv6_pkt ipv6;
+		struct ipv6_frag_pkt ipv6_frag;
 		struct dvlan_ipv6_pkt dvlan_ipv6;
 	} pkt;
 	struct bpf_flow_keys keys;
+	__u32 flags;
 };
 
 #define VLAN_HLEN	4
@@ -143,6 +161,102 @@ struct test tests[] = {
 			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
 		},
 	},
+	{
+		.name = "ipv4-frag",
+		.pkt.ipv4 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_TCP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.frag_off = __bpf_constant_htons(IP_MF),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_frag = true,
+			.is_first_frag = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+		.flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+	},
+	{
+		.name = "ipv4-no-frag",
+		.pkt.ipv4 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_TCP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.frag_off = __bpf_constant_htons(IP_MF),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_frag = true,
+			.is_first_frag = true,
+		},
+	},
+	{
+		.name = "ipv6-frag",
+		.pkt.ipv6_frag = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_FRAGMENT,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.ipf.nexthdr = IPPROTO_TCP,
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr) +
+				sizeof(struct frag_hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.is_frag = true,
+			.is_first_frag = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+		.flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+	},
+	{
+		.name = "ipv6-no-frag",
+		.pkt.ipv6_frag = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_FRAGMENT,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.ipf.nexthdr = IPPROTO_TCP,
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr) +
+				sizeof(struct frag_hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.is_frag = true,
+			.is_first_frag = true,
+		},
+	},
 };
 
 static int create_tap(const char *ifname)
@@ -225,6 +339,13 @@ void test_flow_dissector(void)
 			.data_size_in = sizeof(tests[i].pkt),
 			.data_out = &flow_keys,
 		};
+		static struct bpf_flow_keys ctx = {};
+
+		if (tests[i].flags) {
+			tattr.ctx_in = &ctx;
+			tattr.ctx_size_in = sizeof(ctx);
+			ctx.flags = tests[i].flags;
+		}
 
 		err = bpf_prog_test_run_xattr(&tattr);
 		CHECK_ATTR(tattr.data_size_out != sizeof(flow_keys) ||
@@ -251,10 +372,19 @@ void test_flow_dissector(void)
 	CHECK(err, "ifup", "err %d errno %d\n", err, errno);
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
-		struct bpf_flow_keys flow_keys = {};
+		/* Keep in sync with 'flags' from eth_get_headlen. */
+		__u32 eth_get_headlen_flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
 		struct bpf_prog_test_run_attr tattr = {};
+		struct bpf_flow_keys flow_keys = {};
 		__u32 key = 0;
 
+		/* For skb-less case we can't pass input flags; run
+		 * only the tests that have a matching set of flags.
+		 */
+
+		if (tests[i].flags != eth_get_headlen_flags)
+			continue;
+
 		err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt));
 		CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno);
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c
index 5ae485a6af3f..0eabe5e57944 100644
--- a/tools/testing/selftests/bpf/progs/bpf_flow.c
+++ b/tools/testing/selftests/bpf/progs/bpf_flow.c
@@ -153,7 +153,6 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 	struct tcphdr *tcp, _tcp;
 	struct udphdr *udp, _udp;
 
-	keys->ip_proto = proto;
 	switch (proto) {
 	case IPPROTO_ICMP:
 		icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp);
@@ -231,7 +230,6 @@ static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
 {
 	struct bpf_flow_keys *keys = skb->flow_keys;
 
-	keys->ip_proto = nexthdr;
 	switch (nexthdr) {
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_DSTOPTS:
@@ -266,6 +264,7 @@ PROG(IP)(struct __sk_buff *skb)
 	keys->addr_proto = ETH_P_IP;
 	keys->ipv4_src = iph->saddr;
 	keys->ipv4_dst = iph->daddr;
+	keys->ip_proto = iph->protocol;
 
 	keys->thoff += iph->ihl << 2;
 	if (data + keys->thoff > data_end)
@@ -273,13 +272,19 @@ PROG(IP)(struct __sk_buff *skb)
 
 	if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
 		keys->is_frag = true;
-		if (iph->frag_off & bpf_htons(IP_OFFSET))
+		if (iph->frag_off & bpf_htons(IP_OFFSET)) {
 			/* From second fragment on, packets do not have headers
 			 * we can parse.
 			 */
 			done = true;
-		else
+		} else {
 			keys->is_first_frag = true;
+			/* No need to parse fragmented packet unless
+			 * explicitly asked for.
+			 */
+			if (!(keys->flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
+				done = true;
+		}
 	}
 
 	if (done)
@@ -301,6 +306,7 @@ PROG(IPV6)(struct __sk_buff *skb)
 	memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr));
 
 	keys->thoff += sizeof(struct ipv6hdr);
+	keys->ip_proto = ip6h->nexthdr;
 
 	return parse_ipv6_proto(skb, ip6h->nexthdr);
 }
@@ -317,7 +323,8 @@ PROG(IPV6OP)(struct __sk_buff *skb)
 	/* hlen is in 8-octets and does not include the first 8 bytes
 	 * of the header
 	 */
-	skb->flow_keys->thoff += (1 + ip6h->hdrlen) << 3;
+	keys->thoff += (1 + ip6h->hdrlen) << 3;
+	keys->ip_proto = ip6h->nexthdr;
 
 	return parse_ipv6_proto(skb, ip6h->nexthdr);
 }
@@ -333,9 +340,18 @@ PROG(IPV6FR)(struct __sk_buff *skb)
 
 	keys->thoff += sizeof(*fragh);
 	keys->is_frag = true;
-	if (!(fragh->frag_off & bpf_htons(IP6_OFFSET)))
+	keys->ip_proto = fragh->nexthdr;
+
+	if (!(fragh->frag_off & bpf_htons(IP6_OFFSET))) {
 		keys->is_first_frag = true;
 
+		/* No need to parse fragmented packet unless
+		 * explicitly asked for.
+		 */
+		if (!(keys->flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
+			return export_flow_keys(keys, BPF_OK);
+	}
+
 	return parse_ipv6_proto(skb, fragh->nexthdr);
 }
 
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related

* [PATCH bpf-next v2 6/7] bpf/flow_dissector: support ipv6 flow_label and FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL
From: Stanislav Fomichev @ 2019-07-25 15:33 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Willem de Bruijn,
	Song Liu, Petar Penkov
In-Reply-To: <20190725153342.3571-1-sdf@google.com>

Add support for exporting ipv6 flow label via bpf_flow_keys.
Export flow label from bpf_flow.c and also return early when
FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL is passed.

Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 include/uapi/linux/bpf.h                      |  1 +
 net/core/flow_dissector.c                     |  9 ++++
 tools/include/uapi/linux/bpf.h                |  1 +
 .../selftests/bpf/prog_tests/flow_dissector.c | 46 +++++++++++++++++++
 tools/testing/selftests/bpf/progs/bpf_flow.c  | 10 ++++
 5 files changed, 67 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b4ad19bd6aa8..83b4150466af 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3533,6 +3533,7 @@ struct bpf_flow_keys {
 		};
 	};
 	__u32	flags;
+	__be32	flow_label;
 };
 
 struct bpf_func_info {
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a74c4ed1b30d..bcdb863cad28 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -737,6 +737,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
 	struct flow_dissector_key_basic *key_basic;
 	struct flow_dissector_key_addrs *key_addrs;
 	struct flow_dissector_key_ports *key_ports;
+	struct flow_dissector_key_tags *key_tags;
 
 	key_control = skb_flow_dissector_target(flow_dissector,
 						FLOW_DISSECTOR_KEY_CONTROL,
@@ -781,6 +782,14 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
 		key_ports->src = flow_keys->sport;
 		key_ports->dst = flow_keys->dport;
 	}
+
+	if (dissector_uses_key(flow_dissector,
+			       FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+		key_tags = skb_flow_dissector_target(flow_dissector,
+						     FLOW_DISSECTOR_KEY_FLOW_LABEL,
+						     target_container);
+		key_tags->flow_label = ntohl(flow_keys->flow_label);
+	}
 }
 
 bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a0e1c891b56f..c26ca432b1b3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3530,6 +3530,7 @@ struct bpf_flow_keys {
 		};
 	};
 	__u32	flags;
+	__be32	flow_label;
 };
 
 struct bpf_func_info {
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
index f93a115db650..ada032be6199 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
@@ -20,6 +20,7 @@
 	      "is_encap=%u/%u "						\
 	      "ip_proto=0x%x/0x%x "					\
 	      "n_proto=0x%x/0x%x "					\
+	      "flow_label=0x%x/0x%x "					\
 	      "sport=%u/%u "						\
 	      "dport=%u/%u\n",						\
 	      got.nhoff, expected.nhoff,				\
@@ -30,6 +31,7 @@
 	      got.is_encap, expected.is_encap,				\
 	      got.ip_proto, expected.ip_proto,				\
 	      got.n_proto, expected.n_proto,				\
+	      got.flow_label, expected.flow_label,			\
 	      got.sport, expected.sport,				\
 	      got.dport, expected.dport)
 
@@ -257,6 +259,50 @@ struct test tests[] = {
 			.is_first_frag = true,
 		},
 	},
+	{
+		.name = "ipv6-flow-label",
+		.pkt.ipv6 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_TCP,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.flow_lbl = { 0xb, 0xee, 0xef },
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.sport = 80,
+			.dport = 8080,
+			.flow_label = __bpf_constant_htonl(0xbeeef),
+		},
+	},
+	{
+		.name = "ipv6-no-flow-label",
+		.pkt.ipv6 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_TCP,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.flow_lbl = { 0xb, 0xee, 0xef },
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.flow_label = __bpf_constant_htonl(0xbeeef),
+		},
+		.flags = FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL,
+	},
 };
 
 static int create_tap(const char *ifname)
diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c
index 0eabe5e57944..7d73b7bfe609 100644
--- a/tools/testing/selftests/bpf/progs/bpf_flow.c
+++ b/tools/testing/selftests/bpf/progs/bpf_flow.c
@@ -83,6 +83,12 @@ static __always_inline int export_flow_keys(struct bpf_flow_keys *keys,
 	return ret;
 }
 
+#define IPV6_FLOWLABEL_MASK		__bpf_constant_htonl(0x000FFFFF)
+static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr)
+{
+	return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK;
+}
+
 static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb,
 							 __u16 hdr_size,
 							 void *buffer)
@@ -307,6 +313,10 @@ PROG(IPV6)(struct __sk_buff *skb)
 
 	keys->thoff += sizeof(struct ipv6hdr);
 	keys->ip_proto = ip6h->nexthdr;
+	keys->flow_label = ip6_flowlabel(ip6h);
+
+	if (keys->flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)
+		return export_flow_keys(keys, BPF_OK);
 
 	return parse_ipv6_proto(skb, ip6h->nexthdr);
 }
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related

* [PATCH bpf-next v2 7/7] selftests/bpf: support FLOW_DISSECTOR_F_STOP_AT_ENCAP
From: Stanislav Fomichev @ 2019-07-25 15:33 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Willem de Bruijn,
	Song Liu, Petar Penkov
In-Reply-To: <20190725153342.3571-1-sdf@google.com>

Exit as soon as we found that packet is encapped when
FLOW_DISSECTOR_F_STOP_AT_ENCAP is passed.
Add appropriate selftest cases.

v2:
* Subtract sizeof(struct iphdr) from .iph_inner.tot_len (Willem de Bruijn)

Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 .../selftests/bpf/prog_tests/flow_dissector.c | 64 +++++++++++++++++++
 tools/testing/selftests/bpf/progs/bpf_flow.c  |  8 +++
 2 files changed, 72 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
index ada032be6199..15265c7a90a3 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
@@ -41,6 +41,13 @@ struct ipv4_pkt {
 	struct tcphdr tcp;
 } __packed;
 
+struct ipip_pkt {
+	struct ethhdr eth;
+	struct iphdr iph;
+	struct iphdr iph_inner;
+	struct tcphdr tcp;
+} __packed;
+
 struct svlan_ipv4_pkt {
 	struct ethhdr eth;
 	__u16 vlan_tci;
@@ -82,6 +89,7 @@ struct test {
 	union {
 		struct ipv4_pkt ipv4;
 		struct svlan_ipv4_pkt svlan_ipv4;
+		struct ipip_pkt ipip;
 		struct ipv6_pkt ipv6;
 		struct ipv6_frag_pkt ipv6_frag;
 		struct dvlan_ipv6_pkt dvlan_ipv6;
@@ -303,6 +311,62 @@ struct test tests[] = {
 		},
 		.flags = FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL,
 	},
+	{
+		.name = "ipip-encap",
+		.pkt.ipip = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_IPIP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph_inner.ihl = 5,
+			.iph_inner.protocol = IPPROTO_TCP,
+			.iph_inner.tot_len =
+				__bpf_constant_htons(MAGIC_BYTES) -
+				sizeof(struct iphdr),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = 0,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr) +
+				sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_encap = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+	},
+	{
+		.name = "ipip-no-encap",
+		.pkt.ipip = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_IPIP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph_inner.ihl = 5,
+			.iph_inner.protocol = IPPROTO_TCP,
+			.iph_inner.tot_len =
+				__bpf_constant_htons(MAGIC_BYTES) -
+				sizeof(struct iphdr),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = FLOW_DISSECTOR_F_STOP_AT_ENCAP,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_IPIP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_encap = true,
+		},
+		.flags = FLOW_DISSECTOR_F_STOP_AT_ENCAP,
+	},
 };
 
 static int create_tap(const char *ifname)
diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c
index 7d73b7bfe609..b6236cdf8564 100644
--- a/tools/testing/selftests/bpf/progs/bpf_flow.c
+++ b/tools/testing/selftests/bpf/progs/bpf_flow.c
@@ -167,9 +167,15 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 		return export_flow_keys(keys, BPF_OK);
 	case IPPROTO_IPIP:
 		keys->is_encap = true;
+		if (keys->flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+			return export_flow_keys(keys, BPF_OK);
+
 		return parse_eth_proto(skb, bpf_htons(ETH_P_IP));
 	case IPPROTO_IPV6:
 		keys->is_encap = true;
+		if (keys->flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+			return export_flow_keys(keys, BPF_OK);
+
 		return parse_eth_proto(skb, bpf_htons(ETH_P_IPV6));
 	case IPPROTO_GRE:
 		gre = bpf_flow_dissect_get_header(skb, sizeof(*gre), &_gre);
@@ -189,6 +195,8 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 			keys->thoff += 4; /* Step over sequence number */
 
 		keys->is_encap = true;
+		if (keys->flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+			return export_flow_keys(keys, BPF_OK);
 
 		if (gre->proto == bpf_htons(ETH_P_TEB)) {
 			eth = bpf_flow_dissect_get_header(skb, sizeof(*eth),
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related

* Re: [PATCH bpf-next v3 03/11] xsk: add support to allow unaligned chunk placement
From: Jonathan Lemon @ 2019-07-25 15:39 UTC (permalink / raw)
  To: Kevin Laatz
  Cc: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	saeedm, maximmi, stephen, bruce.richardson, ciara.loftus, bpf,
	intel-wired-lan
In-Reply-To: <20190724051043.14348-4-kevin.laatz@intel.com>



On 23 Jul 2019, at 22:10, Kevin Laatz wrote:

> Currently, addresses are chunk size aligned. This means, we are very
> restricted in terms of where we can place chunk within the umem. For
> example, if we have a chunk size of 2k, then our chunks can only be 
> placed
> at 0,2k,4k,6k,8k... and so on (ie. every 2k starting from 0).
>
> This patch introduces the ability to use unaligned chunks. With these
> changes, we are no longer bound to having to place chunks at a 2k (or
> whatever your chunk size is) interval. Since we are no longer dealing 
> with
> aligned chunks, they can now cross page boundaries. Checks for page
> contiguity have been added in order to keep track of which pages are
> followed by a physically contiguous page.
>
> Signed-off-by: Kevin Laatz <kevin.laatz@intel.com>
> Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
>
> ---
> v2:
>   - Add checks for the flags coming from userspace
>   - Fix how we get chunk_size in xsk_diag.c
>   - Add defines for masking the new descriptor format
>   - Modified the rx functions to use new descriptor format
>   - Modified the tx functions to use new descriptor format
>
> v3:
>   - Add helper function to do address/offset masking/addition
> ---
>  include/net/xdp_sock.h      | 17 ++++++++
>  include/uapi/linux/if_xdp.h |  9 ++++
>  net/xdp/xdp_umem.c          | 18 +++++---
>  net/xdp/xsk.c               | 86 
> ++++++++++++++++++++++++++++++-------
>  net/xdp/xsk_diag.c          |  2 +-
>  net/xdp/xsk_queue.h         | 68 +++++++++++++++++++++++++----
>  6 files changed, 170 insertions(+), 30 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index 69796d264f06..738996c0f995 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -19,6 +19,7 @@ struct xsk_queue;
>  struct xdp_umem_page {
>  	void *addr;
>  	dma_addr_t dma;
> +	bool next_pg_contig;
>  };
>
>  struct xdp_umem_fq_reuse {
> @@ -48,6 +49,7 @@ struct xdp_umem {
>  	bool zc;
>  	spinlock_t xsk_list_lock;
>  	struct list_head xsk_list;
> +	u32 flags;
>  };
>
>  struct xdp_sock {
> @@ -144,6 +146,15 @@ static inline void xsk_umem_fq_reuse(struct 
> xdp_umem *umem, u64 addr)
>
>  	rq->handles[rq->length++] = addr;
>  }
> +
> +static inline u64 xsk_umem_handle_offset(struct xdp_umem *umem, u64 
> handle,
> +					 u64 offset)
> +{
> +	if (umem->flags & XDP_UMEM_UNALIGNED_CHUNKS)
> +		return handle |= (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
> +	else
> +		return handle += offset;
> +}

This should be something like 'xsk_umem_adjust_offset()', and use "+=" 
for both cases.


>  #else
>  static inline int xsk_generic_rcv(struct xdp_sock *xs, struct 
> xdp_buff *xdp)
>  {
> @@ -241,6 +252,12 @@ static inline void xsk_umem_fq_reuse(struct 
> xdp_umem *umem, u64 addr)
>  {
>  }
>
> +static inline u64 xsk_umem_handle_offset(struct xdp_umem *umem, u64 
> handle,
> +					 u64 offset)
> +{
> +	return NULL;
> +}
> +
>  #endif /* CONFIG_XDP_SOCKETS */
>
>  #endif /* _LINUX_XDP_SOCK_H */
> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
> index faaa5ca2a117..f8dc68fcdf78 100644
> --- a/include/uapi/linux/if_xdp.h
> +++ b/include/uapi/linux/if_xdp.h
> @@ -17,6 +17,9 @@
>  #define XDP_COPY	(1 << 1) /* Force copy-mode */
>  #define XDP_ZEROCOPY	(1 << 2) /* Force zero-copy mode */
>
> +/* Flags for xsk_umem_config flags */
> +#define XDP_UMEM_UNALIGNED_CHUNKS (1 << 0)
> +
>  struct sockaddr_xdp {
>  	__u16 sxdp_family;
>  	__u16 sxdp_flags;
> @@ -53,6 +56,7 @@ struct xdp_umem_reg {
>  	__u64 len; /* Length of packet data area */
>  	__u32 chunk_size;
>  	__u32 headroom;
> +	__u32 flags;
>  };
>
>  struct xdp_statistics {
> @@ -74,6 +78,11 @@ struct xdp_options {
>  #define XDP_UMEM_PGOFF_FILL_RING	0x100000000ULL
>  #define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000ULL
>
> +/* Masks for unaligned chunks mode */
> +#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
> +#define XSK_UNALIGNED_BUF_ADDR_MASK \
> +	((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
> +
>  /* Rx/Tx descriptor */
>  struct xdp_desc {
>  	__u64 addr;
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 83de74ca729a..952ca22103e9 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -299,6 +299,7 @@ static int xdp_umem_account_pages(struct xdp_umem 
> *umem)
>
>  static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg 
> *mr)
>  {
> +	bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNKS;
>  	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
>  	unsigned int chunks, chunks_per_page;
>  	u64 addr = mr->addr, size = mr->len;
> @@ -314,7 +315,10 @@ static int xdp_umem_reg(struct xdp_umem *umem, 
> struct xdp_umem_reg *mr)
>  		return -EINVAL;
>  	}
>
> -	if (!is_power_of_2(chunk_size))
> +	if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNKS))
> +		return -EINVAL;
> +
> +	if (!unaligned_chunks && !is_power_of_2(chunk_size))
>  		return -EINVAL;
>
>  	if (!PAGE_ALIGNED(addr)) {
> @@ -331,9 +335,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, 
> struct xdp_umem_reg *mr)
>  	if (chunks == 0)
>  		return -EINVAL;
>
> -	chunks_per_page = PAGE_SIZE / chunk_size;
> -	if (chunks < chunks_per_page || chunks % chunks_per_page)
> -		return -EINVAL;
> +	if (!unaligned_chunks) {
> +		chunks_per_page = PAGE_SIZE / chunk_size;
> +		if (chunks < chunks_per_page || chunks % chunks_per_page)
> +			return -EINVAL;
> +	}
>
>  	headroom = ALIGN(headroom, 64);
>
> @@ -342,13 +348,15 @@ static int xdp_umem_reg(struct xdp_umem *umem, 
> struct xdp_umem_reg *mr)
>  		return -EINVAL;
>
>  	umem->address = (unsigned long)addr;
> -	umem->chunk_mask = ~((u64)chunk_size - 1);
> +	umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
> +					    : ~((u64)chunk_size - 1);

The handle needs to be cleaned (reset to base address) when removed
from the fill queue or recycle stack.  This will not provide the correct
semantics for unaligned mode.


>  	umem->size = size;
>  	umem->headroom = headroom;
>  	umem->chunk_size_nohr = chunk_size - headroom;
>  	umem->npgs = size / PAGE_SIZE;
>  	umem->pgs = NULL;
>  	umem->user = NULL;
> +	umem->flags = mr->flags;
>  	INIT_LIST_HEAD(&umem->xsk_list);
>  	spin_lock_init(&umem->xsk_list_lock);
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 59b57d708697..b3ab653091c4 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -45,7 +45,7 @@ EXPORT_SYMBOL(xsk_umem_has_addrs);
>
>  u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
>  {
> -	return xskq_peek_addr(umem->fq, addr);
> +	return xskq_peek_addr(umem->fq, addr, umem);
>  }
>  EXPORT_SYMBOL(xsk_umem_peek_addr);
>
> @@ -55,21 +55,42 @@ void xsk_umem_discard_addr(struct xdp_umem *umem)
>  }
>  EXPORT_SYMBOL(xsk_umem_discard_addr);
>
> +/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one 
> for
> + * each page. This is only required in copy mode.
> + */
> +static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void 
> *from_buf,
> +			     u32 len, u32 metalen)
> +{
> +	void *to_buf = xdp_umem_get_data(umem, addr);
> +
> +	if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
> +		void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
> +		u64 page_start = addr & (PAGE_SIZE - 1);
> +		u64 first_len = PAGE_SIZE - (addr - page_start);
> +
> +		memcpy(to_buf, from_buf, first_len + metalen);
> +		memcpy(next_pg_addr, from_buf + first_len, len - first_len);
> +
> +		return;
> +	}
> +
> +	memcpy(to_buf, from_buf, len + metalen);
> +}
> +
>  static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 
> len)
>  {
> -	void *to_buf, *from_buf;
> +	u64 offset = xs->umem->headroom;
> +	void *from_buf;
>  	u32 metalen;
>  	u64 addr;
>  	int err;
>
> -	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
> +	if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
>  	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
>  		xs->rx_dropped++;
>  		return -ENOSPC;
>  	}
>
> -	addr += xs->umem->headroom;
> -
>  	if (unlikely(xdp_data_meta_unsupported(xdp))) {
>  		from_buf = xdp->data;
>  		metalen = 0;
> @@ -78,9 +99,13 @@ static int __xsk_rcv(struct xdp_sock *xs, struct 
> xdp_buff *xdp, u32 len)
>  		metalen = xdp->data - xdp->data_meta;
>  	}
>
> -	to_buf = xdp_umem_get_data(xs->umem, addr);
> -	memcpy(to_buf, from_buf, len + metalen);
> -	addr += metalen;
> +	__xsk_rcv_memcpy(xs->umem, addr + offset, from_buf, len, metalen);
> +
> +	offset += metalen;
> +	if (xs->umem->flags & XDP_UMEM_UNALIGNED_CHUNKS)
> +		addr |= offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
> +	else
> +		addr += offset;
>  	err = xskq_produce_batch_desc(xs->rx, addr, len);
>  	if (!err) {
>  		xskq_discard_addr(xs->umem->fq);
> @@ -127,6 +152,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct 
> xdp_buff *xdp)
>  	u32 len = xdp->data_end - xdp->data;
>  	void *buffer;
>  	u64 addr;
> +	u64 offset = xs->umem->headroom;
>  	int err;
>
>  	spin_lock_bh(&xs->rx_lock);
> @@ -136,17 +162,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct 
> xdp_buff *xdp)
>  		goto out_unlock;
>  	}
>
> -	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
> +	if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
>  	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
>  		err = -ENOSPC;
>  		goto out_drop;
>  	}
>
> -	addr += xs->umem->headroom;
> -
> -	buffer = xdp_umem_get_data(xs->umem, addr);
> +	buffer = xdp_umem_get_data(xs->umem, addr + offset);
>  	memcpy(buffer, xdp->data_meta, len + metalen);
> -	addr += metalen;
> +	offset += metalen;
> +
> +	addr = xsk_umem_handle_offset(xs->umem, addr, offset);
>  	err = xskq_produce_batch_desc(xs->rx, addr, len);
>  	if (err)
>  		goto out_drop;
> @@ -190,7 +216,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, 
> struct xdp_desc *desc)
>
>  	rcu_read_lock();
>  	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
> -		if (!xskq_peek_desc(xs->tx, desc))
> +		if (!xskq_peek_desc(xs->tx, desc, umem))
>  			continue;
>
>  		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
> @@ -243,7 +269,7 @@ static int xsk_generic_xmit(struct sock *sk, 
> struct msghdr *m,
>  	if (xs->queue_id >= xs->dev->real_num_tx_queues)
>  		goto out;
>
> -	while (xskq_peek_desc(xs->tx, &desc)) {
> +	while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
>  		char *buffer;
>  		u64 addr;
>  		u32 len;
> @@ -262,6 +288,10 @@ static int xsk_generic_xmit(struct sock *sk, 
> struct msghdr *m,
>
>  		skb_put(skb, len);
>  		addr = desc.addr;
> +		if (xs->umem->flags & XDP_UMEM_UNALIGNED_CHUNKS)
> +			addr = (addr & XSK_UNALIGNED_BUF_ADDR_MASK) |
> +				(addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT);

This doesn't look right to me.  Shouldn't it be "(addr & mask) + (addr 
 >> shift)"?
I'd also prefer to see this type of logic in an inline/macro

> +
>  		buffer = xdp_umem_get_data(xs->umem, addr);
>  		err = skb_store_bits(skb, 0, buffer, len);
>  		if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
> @@ -272,7 +302,7 @@ static int xsk_generic_xmit(struct sock *sk, 
> struct msghdr *m,
>  		skb->dev = xs->dev;
>  		skb->priority = sk->sk_priority;
>  		skb->mark = sk->sk_mark;
> -		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
> +		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
>  		skb->destructor = xsk_destruct_skb;
>
>  		err = dev_direct_xmit(skb, xs->queue_id);
> @@ -412,6 +442,28 @@ static struct socket *xsk_lookup_xsk_from_fd(int 
> fd)
>  	return sock;
>  }
>
> +/* Check if umem pages are contiguous.
> + * If zero-copy mode, use the DMA address to do the page contiguity 
> check
> + * For all other modes we use addr (kernel virtual address)
> + */
> +static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 
> flags)
> +{
> +	int i;
> +
> +	if (flags & XDP_ZEROCOPY) {
> +		for (i = 0; i < umem->npgs - 1; i++)
> +			umem->pages[i].next_pg_contig =
> +					(umem->pages[i].dma + PAGE_SIZE ==
> +						umem->pages[i + 1].dma);
> +		return;
> +	}
> +
> +	for (i = 0; i < umem->npgs - 1; i++)
> +		umem->pages[i].next_pg_contig =
> +				(umem->pages[i].addr + PAGE_SIZE ==
> +					umem->pages[i + 1].addr);
> +}
> +
>  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int 
> addr_len)
>  {
>  	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
> @@ -500,6 +552,8 @@ static int xsk_bind(struct socket *sock, struct 
> sockaddr *addr, int addr_len)
>  		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
>  		if (err)
>  			goto out_unlock;
> +
> +		xsk_check_page_contiguity(xs->umem, flags);
>  	}
>
>  	xs->dev = dev;
> diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
> index d5e06c8e0cbf..9986a759fe06 100644
> --- a/net/xdp/xsk_diag.c
> +++ b/net/xdp/xsk_diag.c
> @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock 
> *xs, struct sk_buff *nlskb)
>  	du.id = umem->id;
>  	du.size = umem->size;
>  	du.num_pages = umem->npgs;
> -	du.chunk_size = (__u32)(~umem->chunk_mask + 1);
> +	du.chunk_size = umem->chunk_size_nohr + umem->headroom;
>  	du.headroom = umem->headroom;
>  	du.ifindex = umem->dev ? umem->dev->ifindex : 0;
>  	du.queue_id = umem->queue_id;
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index 909c5168ed0f..0d77212367f0 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -133,6 +133,16 @@ static inline bool xskq_has_addrs(struct 
> xsk_queue *q, u32 cnt)
>
>  /* UMEM queue */
>
> +static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, 
> u64 addr,
> +					      u64 length)
> +{
> +	bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
> +	bool next_pg_contig =
> +		umem->pages[(addr >> PAGE_SHIFT) + 1].next_pg_contig;
> +
> +	return cross_pg && !next_pg_contig;
> +}
> +
>  static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
>  {
>  	if (addr >= q->size) {
> @@ -143,23 +153,50 @@ static inline bool xskq_is_valid_addr(struct 
> xsk_queue *q, u64 addr)
>  	return true;
>  }
>
> -static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
> +static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, 
> u64 addr,
> +						u64 length,
> +						struct xdp_umem *umem)
> +{
> +	addr += addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
> +	addr &= XSK_UNALIGNED_BUF_ADDR_MASK;
> +	if (addr >= q->size ||
> +	    xskq_crosses_non_contig_pg(umem, addr, length)) {
> +		q->invalid_descs++;
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr,
> +				      struct xdp_umem *umem)
>  {
>  	while (q->cons_tail != q->cons_head) {
>  		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
>  		unsigned int idx = q->cons_tail & q->ring_mask;
>
>  		*addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask> +
> +		if (umem->flags & XDP_UMEM_UNALIGNED_CHUNKS) {
> +			if (xskq_is_valid_addr_unaligned(q, *addr,
> +							 umem->chunk_size_nohr,
> +							 umem))
> +				return addr;
> +			goto out;
> +		}
> +
>  		if (xskq_is_valid_addr(q, *addr))
>  			return addr;
>
> +out:
>  		q->cons_tail++;
>  	}
>
>  	return NULL;
>  }
>
> -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
> +static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr,
> +				  struct xdp_umem *umem)
>  {
>  	if (q->cons_tail == q->cons_head) {
>  		smp_mb(); /* D, matches A */
> @@ -170,7 +207,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue 
> *q, u64 *addr)
>  		smp_rmb();
>  	}
>
> -	return xskq_validate_addr(q, addr);
> +	return xskq_validate_addr(q, addr, umem);
>  }
>
>  static inline void xskq_discard_addr(struct xsk_queue *q)
> @@ -229,8 +266,21 @@ static inline int xskq_reserve_addr(struct 
> xsk_queue *q)
>
>  /* Rx/Tx queue */
>
> -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct 
> xdp_desc *d)
> +static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct 
> xdp_desc *d,
> +				      struct xdp_umem *umem)
>  {
> +	if (umem->flags & XDP_UMEM_UNALIGNED_CHUNKS) {
> +		if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem))
> +			return false;
> +
> +		if (d->len > umem->chunk_size_nohr || d->options) {
> +			q->invalid_descs++;
> +			return false;
> +		}
> +
> +		return true;
> +	}
> +
>  	if (!xskq_is_valid_addr(q, d->addr))
>  		return false;
>
> @@ -244,14 +294,15 @@ static inline bool xskq_is_valid_desc(struct 
> xsk_queue *q, struct xdp_desc *d)
>  }
>
>  static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue 
> *q,
> -						  struct xdp_desc *desc)
> +						  struct xdp_desc *desc,
> +						  struct xdp_umem *umem)
>  {
>  	while (q->cons_tail != q->cons_head) {
>  		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
>  		unsigned int idx = q->cons_tail & q->ring_mask;
>
>  		*desc = READ_ONCE(ring->desc[idx]);
> -		if (xskq_is_valid_desc(q, desc))
> +		if (xskq_is_valid_desc(q, desc, umem))
>  			return desc;
>
>  		q->cons_tail++;
> @@ -261,7 +312,8 @@ static inline struct xdp_desc 
> *xskq_validate_desc(struct xsk_queue *q,
>  }
>
>  static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
> -					      struct xdp_desc *desc)
> +					      struct xdp_desc *desc,
> +					      struct xdp_umem *umem)
>  {
>  	if (q->cons_tail == q->cons_head) {
>  		smp_mb(); /* D, matches A */
> @@ -272,7 +324,7 @@ static inline struct xdp_desc 
> *xskq_peek_desc(struct xsk_queue *q,
>  		smp_rmb(); /* C, matches B */
>  	}
>
> -	return xskq_validate_desc(q, desc);
> +	return xskq_validate_desc(q, desc, umem);
>  }
>
>  static inline void xskq_discard_desc(struct xsk_queue *q)
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH bpf-next v3 00/11] XDP unaligned chunk placement support
From: Jonathan Lemon @ 2019-07-25 15:39 UTC (permalink / raw)
  To: Kevin Laatz
  Cc: netdev, ast, daniel, bjorn.topel, magnus.karlsson, jakub.kicinski,
	saeedm, maximmi, stephen, bruce.richardson, ciara.loftus, bpf,
	intel-wired-lan
In-Reply-To: <20190724051043.14348-1-kevin.laatz@intel.com>



On 23 Jul 2019, at 22:10, Kevin Laatz wrote:

> This patch set adds the ability to use unaligned chunks in the XDP umem.
>
> Currently, all chunk addresses passed to the umem are masked to be chunk
> size aligned (max is PAGE_SIZE). This limits where we can place chunks
> within the umem as well as limiting the packet sizes that are supported.
>
> The changes in this patch set removes these restrictions, allowing XDP to
> be more flexible in where it can place a chunk within a umem. By relaxing
> where the chunks can be placed, it allows us to use an arbitrary buffer
> size and place that wherever we have a free address in the umem. These
> changes add the ability to support arbitrary frame sizes up to 4k
> (PAGE_SIZE) and make it easy to integrate with other existing frameworks
> that have their own memory management systems, such as DPDK.
> In DPDK, for example, there is already support for AF_XDP with zero-copy.
> However, with this patch set the integration will be much more seamless.
> You can find the DPDK AF_XDP driver at:
> https://git.dpdk.org/dpdk/tree/drivers/net/af_xdp
>
> Since we are now dealing with arbitrary frame sizes, we need also need to
> update how we pass around addresses. Currently, the addresses can simply be
> masked to 2k to get back to the original address. This becomes less trivial
> when using frame sizes that are not a 'power of 2' size. This patch set
> modifies the Rx/Tx descriptor format to use the upper 16-bits of the addr
> field for an offset value, leaving the lower 48-bits for the address (this
> leaves us with 256 Terabytes, which should be enough!). We only need to use
> the upper 16-bits to store the offset when running in unaligned mode.
> Rather than adding the offset (headroom etc) to the address, we will store
> it in the upper 16-bits of the address field. This way, we can easily add
> the offset to the address where we need it, using some bit manipulation and
> addition, and we can also easily get the original address wherever we need
> it (for example in i40e_zca_fr-- ee) by simply masking to get the lower
> 48-bits of the address field.

I wonder if it would be better to break backwards compatibility here and
say that a handle is going to change from [addr] to [base | offset], or
even [index | offset], where address = (index * chunk size) + offset, and
then use accessor macros to manipulate the queue entries.

This way, the XDP hotpath can adjust the handle with simple arithmetic,
bypassing the "if (unaligned)", check, as it changes the offset directly.

Using a chunk index instead of a base address is safer, otherwise it is
too easy to corrupt things.
-- 
Jonathan

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox