* [PATCH] DSA support for Micrel KSZ8895
From: Pavel Machek @ 2017-08-27 12:36 UTC (permalink / raw)
To: Woojung.Huh, nathan.leigh.conrad
Cc: vivien.didelot, f.fainelli, netdev, linux-kernel, Tristram.Ha,
andrew, pavel
In-Reply-To: <9235D6609DB808459E95D78E17F2E43D40AFF8C1@CHN-SV-EXMX02.mchp-main.com>
[-- Attachment #1: Type: text/plain, Size: 62101 bytes --]
Hi!
So I fought with the driver a bit more, and now I have something that
kind-of-works.
The "great great hack" below worries me.
Yeah, disabled code needs to be removed before merge.
No, the tag_ksz part is probably not acceptable. Do you see a better
solution than just copying it into a tag_ksz1 file?
Any more comments, etc?
Help would be welcome.
Best regards,
Pavel
Signed-off-by: Pavel Machek <pavel@denx.de>
diff --git a/drivers/net/dsa/microchip/Kconfig b/drivers/net/dsa/microchip/Kconfig
index a8b8f59099ce..7b7d7ddb3488 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -1,12 +1,25 @@
menuconfig MICROCHIP_KSZ
- tristate "Microchip KSZ series switch support"
+ tristate "Microchip KSZ 9477 series switch support"
+ depends on NET_DSA
+ select NET_DSA_TAG_KSZ
+ help
+ This driver adds support for Microchip KSZ switch chips.
+
+menuconfig MICROCHIP_KSZ_8895
+ tristate "Microchip KSZ 8895 series switch support"
depends on NET_DSA
select NET_DSA_TAG_KSZ
help
This driver adds support for Microchip KSZ switch chips.
config MICROCHIP_KSZ_SPI_DRIVER
- tristate "KSZ series SPI connected switch driver"
+ tristate "KSZ 9477 series SPI connected switch driver"
depends on MICROCHIP_KSZ && SPI
help
Select to enable support for registering switches configured through SPI.
+
+config MICROCHIP_KSZ_8895_SPI_DRIVER
+ tristate "KSZ 8895 series SPI connected switch driver"
+ depends on MICROCHIP_KSZ_8895 && SPI
+ help
+ Select to enable support for registering switches configured through SPI.
diff --git a/drivers/net/dsa/microchip/Makefile b/drivers/net/dsa/microchip/Makefile
index ed335e29fae8..b6a17f79d2d9 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -1,2 +1,4 @@
obj-$(CONFIG_MICROCHIP_KSZ) += ksz_common.o
+obj-$(CONFIG_MICROCHIP_KSZ_8895) += ksz_8895.o
obj-$(CONFIG_MICROCHIP_KSZ_SPI_DRIVER) += ksz_spi.o
+obj-$(CONFIG_MICROCHIP_KSZ_8895_SPI_DRIVER) += ksz_8895_spi.o
diff --git a/drivers/net/dsa/microchip/ksz_8895.c b/drivers/net/dsa/microchip/ksz_8895.c
new file mode 100644
index 000000000000..d546e08b1281
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz_8895.c
@@ -0,0 +1,721 @@
+/*
+ * Microchip switch driver main logic
+ *
+ * Copyright (C) 2017
+ * Copyright (C) 2017 Pavel Machek <pavel@denx.de>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/gpio.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_data/microchip-ksz.h>
+#include <linux/phy.h>
+#include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
+#include <net/dsa.h>
+#include <net/switchdev.h>
+
+#include "ksz_8895_reg.h"
+#include "ksz_priv.h"
+
+static const struct {	/* MIB counter names; a single placeholder entry for now */
+	int index;
+	char string[ETH_GSTRING_LEN];
+} mib_names[TOTAL_SWITCH_COUNTER_NUM] = {
+	{ .index = 0x00, .string = "???" },
+};
+
+/* Read-modify-write an 8-bit switch register: OR @bits into the value read
+ * from @addr when @set is true, mask them out otherwise, then write it back.
+ */
+static void ksz_cfg(struct ksz_device *dev, u32 addr, u8 bits, bool set)
+{
+	u8 val;
+
+	ksz_read8(dev, addr, &val);
+	val = set ? (val | bits) : (val & ~bits);
+	ksz_write8(dev, addr, val);
+}
+
+#if 0 /* compiled out: 32-bit variant of ksz_cfg(); no caller is visible in this file */
+static void ksz_cfg32(struct ksz_device *dev, u32 addr, u32 bits, bool set)
+{
+ u32 data;
+
+ ksz_read32(dev, addr, &data);
+ if (set)
+ data |= bits;
+ else
+ data &= ~bits;
+ ksz_write32(dev, addr, data);
+}
+#endif
+
+/* Per-port 8-bit read-modify-write: the register is PORT_CTRL_ADDR(@port,
+ * @offset); @bits are set when @set is true and cleared otherwise.
+ */
+static void ksz_port_cfg(struct ksz_device *dev, int port, int offset, u8 bits,
+			 bool set)
+{
+	const u32 reg = PORT_CTRL_ADDR(port, offset);
+	u8 val;
+
+	ksz_read8(dev, reg, &val);
+	if (!set)
+		val &= ~bits;
+	else
+		val |= bits;
+	ksz_write8(dev, reg, val);
+}
+
+#if 0 /* compiled out: 32-bit variant of ksz_port_cfg(); only referenced from the #ifdef FIXME part of port_setup() */
+static void ksz_port_cfg32(struct ksz_device *dev, int port, int offset,
+ u32 bits, bool set)
+{
+ u32 addr;
+ u32 data;
+
+ addr = PORT_CTRL_ADDR(port, offset);
+ ksz_read32(dev, addr, &data);
+
+ if (set)
+ data |= bits;
+ else
+ data &= ~bits;
+
+ ksz_write32(dev, addr, data);
+}
+#endif
+
+#define NOTIMPL() do { NOTIMPLV(); return -EOPNOTSUPP; } while (0) /* log the call, then fail with the usual "not implemented" errno */
+#define NOTIMPLV() do { printk("Not implemented -- %s\n", __func__); } while (0)
+
+/* Reset hook: currently a no-op that always returns 0 (reset code is disabled). */
+static int ksz_reset_switch(struct dsa_switch *ds)
+{
+#if 0	/* This seems to break the code. */
+	struct ksz_device *dev = ds->priv;	/* declared here: unused otherwise */
+	ksz_write8(dev, REG_POWER_MANAGEMENT_1, SW_SOFTWARE_POWER_DOWN << SW_POWER_MANAGEMENT_MODE_S);
+	ksz_write8(dev, REG_POWER_MANAGEMENT_1, 0);
+#endif
+	return 0;
+}
+
+#define PORT_MAC_LOOPBACK_my 0x80 /* same bit as PORT_PHY_LOOPBACK (1 << 7) in ksz_8895_reg.h */
+#define REG_PORT_CTRL_LOOPBACK 0x0f /* same offset as REG_PORT_STATUS_2; NOTE(review): confirm MAC vs PHY loopback */
+/* Per-port setup: enables tail tagging when @cpu_port is true and clears the loopback bit that ksz_disable_port() sets. */
+static void port_setup(struct ksz_device *dev, int port, bool cpu_port)
+{
+ printk("Port setup %d, %d\n", port, cpu_port);
+
+ if (cpu_port && port != 4) /* index 4 is "port 5"; this only warns, setup continues */
+ printk("!!! tail tagging only works on port 5\n");
+ if (cpu_port) {
+ printk("enable tail tagging\n");
+ ksz_cfg(dev, S_TAIL_TAG_CTRL, SW_TAIL_TAG_ENABLE, true);
+ }
+
+ ksz_port_cfg(dev, port, REG_PORT_CTRL_LOOPBACK, PORT_MAC_LOOPBACK_my, false);
+#ifdef FIXME /* not built unless FIXME is defined; uses data8/data16, which are not declared in this function */
+ /* set back pressure */
+ ksz_port_cfg(dev, port, REG_PORT_MAC_CTRL_1, PORT_BACK_PRESSURE, true);
+
+ /* set flow control */
+ ksz_port_cfg(dev, port, REG_PORT_CTRL_0,
+ PORT_FORCE_TX_FLOW_CTRL | PORT_FORCE_RX_FLOW_CTRL, true);
+
+ /* enable broadcast storm limit */
+ ksz_port_cfg(dev, port, P_BCAST_STORM_CTRL, PORT_BROADCAST_STORM, true);
+
+ /* disable DiffServ priority */
+ ksz_port_cfg(dev, port, P_PRIO_CTRL, PORT_DIFFSERV_PRIO_ENABLE, false);
+
+ /* replace priority */
+ ksz_port_cfg(dev, port, REG_PORT_MRI_MAC_CTRL, PORT_USER_PRIO_CEILING,
+ false);
+ ksz_port_cfg32(dev, port, REG_PORT_MTI_QUEUE_CTRL_0__4,
+ MTI_PVID_REPLACE, false);
+
+ /* enable 802.1p priority */
+ ksz_port_cfg(dev, port, P_PRIO_CTRL, PORT_802_1P_PRIO_ENABLE, true);
+
+ /* configure MAC to 1G & RGMII mode */
+ ksz_pread8(dev, port, REG_PORT_XMII_CTRL_1, &data8);
+ data8 |= PORT_RGMII_ID_EG_ENABLE;
+ data8 &= ~PORT_MII_NOT_1GBIT;
+ data8 &= ~PORT_MII_SEL_M;
+ data8 |= PORT_RGMII_SEL;
+ ksz_pwrite8(dev, port, REG_PORT_XMII_CTRL_1, data8);
+
+ /* clear pending interrupts */
+ ksz_pread16(dev, port, REG_PORT_PHY_INT_ENABLE, &data16);
+#endif
+}
+
+/* Set ds->num_ports, then record and set up each port that DSA marks as a CPU
+ * port and that the chip's cpu_ports mask allows.
+ */
+static void ksz_config_cpu_port(struct dsa_switch *ds)
+{
+	struct ksz_device *ksz = ds->priv;
+	int port;
+
+	ds->num_ports = ksz->port_cnt;
+	for (port = 0; port < ds->num_ports; port++) {
+		if (!dsa_is_cpu_port(ds, port) || !(ksz->cpu_ports & (1 << port)))
+			continue;
+		ksz->cpu_port = port;
+		port_setup(ksz, port, true);	/* enable cpu port */
+	}
+}
+
+#if 0 /* compiled out; the only visible call (in ksz_setup()) is commented out */
+/*
+ * sw_init_vlan - initialize switch VLAN
+ *
+ * Everyone can communicate with CPU, ports do not communicate with each other
+ */
+static void sw_init_vlan(struct ksz_device *dev)
+{
+ int port;
+
+ for (port = 0; port < dev->port_cnt; port++) {
+ //port_get_def_vid(sw, port, &info->port_cfg[port].vid);
+ //port_r(sw, port, P_MIRROR_CTRL, &data);
+
+ ksz_port_cfg(dev, port, P_MIRROR_CTRL, PORT_VLAN_MEMBERSHIP, false);
+ ksz_port_cfg(dev, port, P_MIRROR_CTRL, 1<<port, true);
+#if 0
+ port_cfg(sw, port, P_INS_SRC_PVID_CTRL,
+ (PORT_INS_TAG_FOR_PORT_5 | PORT_INS_TAG_FOR_PORT_4 |
+ PORT_INS_TAG_FOR_PORT_3 | PORT_INS_TAG_FOR_PORT_2),
+ true);
+#endif
+ }
+ ksz_port_cfg(dev, dev->port_cnt-1, P_MIRROR_CTRL, PORT_VLAN_MEMBERSHIP, true);
+}
+/* sw_init_bridge - set every PORT_VLAN_MEMBERSHIP bit on every port */
+static void sw_init_bridge(struct ksz_device *dev)
+{
+ int port;
+
+ for (port = 0; port < dev->port_cnt; port++) {
+ ksz_port_cfg(dev, port, P_MIRROR_CTRL, PORT_VLAN_MEMBERSHIP, true);
+ }
+}
+#endif
+
+/* Rebuild each port's PORT_VLAN_MEMBERSHIP bits in P_MIRROR_CTRL from the
+ * current bridge membership; mask[] has 5 slots, so other port counts bail out.
+ */
+static void br_update(struct dsa_switch *ds)
+{
+	struct ksz_device *dev = ds->priv;
+	int mask[5] = { 0 };
+	int i, j;
+	u8 val;
+
+	if (WARN_ON(dev->port_cnt < 1 || dev->port_cnt > (int)ARRAY_SIZE(mask)))
+		return;
+
+	for (i = 0; i < dev->port_cnt; i++) {
+		if (!ds->ports[i].bridge_dev)
+			continue;
+		for (j = 0; j < dev->port_cnt; j++)
+			if (ds->ports[j].bridge_dev == ds->ports[i].bridge_dev)
+				mask[i] |= 1 << j;
+	}
+
+	for (i = 0; i < dev->port_cnt - 1; i++) {
+		ksz_pread8(dev, i, P_MIRROR_CTRL, &val);
+		val &= ~PORT_VLAN_MEMBERSHIP;
+		val |= mask[i] | 0x10 | (1 << i);	/* bridge peers, bit 4, self */
+		ksz_pwrite8(dev, i, P_MIRROR_CTRL, val);
+	}
+
+	/* i is now the last port (port_cnt - 1): it gets all five member bits */
+	ksz_pread8(dev, i, P_MIRROR_CTRL, &val);
+	val &= ~PORT_VLAN_MEMBERSHIP;
+	val |= mask[i] | 0x1f;
+	ksz_pwrite8(dev, i, P_MIRROR_CTRL, val);
+}
+
+
+/* bridge_change -- need to get it here */
+
+/* DSA port_bridge_join hook: recompute every port's membership, return 0. */
+/* NOTE(review): not static, yet no user outside this file is visible here. */
+int ksz_br_join(struct dsa_switch *ds, int port, struct net_device *br)
+{
+	br_update(ds);
+	return 0;
+}
+
+/* DSA port_bridge_leave hook: recompute every port's membership through
+ * br_update(). @port and @br are not used directly.
+ * NOTE(review): not static, yet no user outside this file is visible here.
+ */
+void ksz_br_leave(struct dsa_switch *ds, int port, struct net_device *br)
+{
+	br_update(ds);
+}
+
+
+
+/* DSA setup hook: allocate the VLAN cache, reset the switch, configure the CPU
+ * port, set MULTICAST_STORM_DISABLE in REG_SW_CTRL_2 and start the switch.
+ * Returns 0 on success, -ENOMEM or the error from ksz_reset_switch().
+ */
+static int ksz_setup(struct dsa_switch *ds)
+{
+	struct ksz_device *dev = ds->priv;
+	int ret;
+
+	/* devm_kcalloc() takes the element count before the element size */
+	dev->vlan_cache = devm_kcalloc(dev->dev, dev->num_vlans,
+				       sizeof(struct vlan_table), GFP_KERNEL);
+	if (!dev->vlan_cache)
+		return -ENOMEM;
+
+	ret = ksz_reset_switch(ds);
+	if (ret) {
+		dev_err(ds->dev, "failed to reset switch\n");
+		return ret;
+	}
+
+	ksz_config_cpu_port(ds);
+	ksz_cfg(dev, REG_SW_CTRL_2, MULTICAST_STORM_DISABLE, true);
+
+	/* TODO, still disabled in this version: accept packets up to
+	 * 2000 bytes (REG_SW_MAC_CTRL_1 / SW_LEGAL_PACKET_DISABLE), queue
+	 * based egress rate limit (REG_SW_MAC_CTRL_5 /
+	 * SW_OUT_RATE_LIMIT_QUEUE_BASED), sw_init_broad_storm(),
+	 * sw_init_prio(), sw_init_prio_rate() and sw_init_vlan() (FIXME!!!).
+	 */
+
+	/* start switch */
+	ksz_cfg(dev, REG_CHIP_ID1, SW_START, true);
+
+	return 0;
+}
+
+static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds)
+{
+ return DSA_TAG_PROTO_KSZ; /* the only tag protocol this driver reports */
+}
+
+#include "ksz_mdio_emulation.c"
+
+/* DSA port_enable hook: run the common per-port setup with cpu_port == false.
+ * @phy is unused. Always returns 0.
+ */
+static int ksz_enable_port(struct dsa_switch *ds, int port,
+			   struct phy_device *phy)
+{
+	/* setup slave port */
+	port_setup(ds->priv, port, false);
+	return 0;
+}
+
+
+/* DSA port_disable hook; @phy is unused. */
+static void ksz_disable_port(struct dsa_switch *ds, int port,
+			     struct phy_device *phy)
+{
+	/* there is no port disable: set the bit that port_setup() clears */
+	ksz_port_cfg(ds->priv, port, REG_PORT_CTRL_LOOPBACK,
+		     PORT_MAC_LOOPBACK_my, true);
+}
+
+static int ksz_sset_count(struct dsa_switch *ds)
+{
+ return TOTAL_SWITCH_COUNTER_NUM; /* number of entries in mib_names[] */
+}
+
+/* Copy the ETH_GSTRING_LEN-sized name of every mib_names[] entry into @buf. */
+static void ksz_get_strings(struct dsa_switch *ds, int port, uint8_t *buf)
+{
+	uint8_t *dst = buf;
+	int idx;
+
+	for (idx = 0; idx < TOTAL_SWITCH_COUNTER_NUM; idx++, dst += ETH_GSTRING_LEN)
+		memcpy(dst, mib_names[idx].string, ETH_GSTRING_LEN);
+}
+
+static void ksz_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *buf)
+{
+	NOTIMPLV();	/* no counters are read yet: report zeros, not stale memory */
+	memset(buf, 0, TOTAL_SWITCH_COUNTER_NUM * sizeof(*buf));
+}
+
+#if 0 /* compiled out debug helper: prints registers 0x00-0xff, 16 per row */
+static void ksz_dump(struct ksz_device *dev)
+{
+ int i;
+ u8 v;
+
+ printk("ksz: dumping:\n");
+ for (i = 0; i < 0x100; i++) {
+ if (!(i % 0x10))
+ printk("\n %x: ", i);
+ ksz_read8(dev, i, &v);
+ printk("%02x ", v);
+ }
+ printk("\nksz: dump done\n");
+}
+#endif
+/* DSA port_stp_state_set hook: rewrites the TX-enable, RX-enable and learning-disable bits of P_STP_CTRL for @port. */
+static void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
+{
+ struct ksz_device *dev = ds->priv;
+ u8 data;
+
+ printk("port %d state %d\n", port, state);
+ ksz_pread8(dev, port, P_STP_CTRL, &data);
+ data &= ~(PORT_TX_ENABLE | PORT_RX_ENABLE | PORT_LEARN_DISABLE);
+
+ switch (state) {
+ case BR_STATE_DISABLED:
+ printk("port %d state %d disable\n", port, state);
+ data |= PORT_LEARN_DISABLE;
+ break;
+ case BR_STATE_LISTENING:
+ printk("port %d state %d listen\n", port, state);
+ data |= (PORT_RX_ENABLE | PORT_LEARN_DISABLE);
+ break;
+ case BR_STATE_LEARNING:
+ printk("port %d state %d learn\n", port, state);
+ data |= PORT_RX_ENABLE;
+ break;
+ case BR_STATE_FORWARDING:
+ printk("port %d state %d forwarding\n", port, state);
+ data |= (PORT_TX_ENABLE | PORT_RX_ENABLE);
+ break;
+ case BR_STATE_BLOCKING:
+ printk("port %d state %d blocking\n", port, state);
+ data |= PORT_LEARN_DISABLE;
+ break;
+ default:
+ dev_err(ds->dev, "invalid STP state: %d\n", state);
+ return; /* the register is not written for an unknown state */
+ }
+ /* FIXME: great great hack -- RX and TX are forced on for every state, so only PORT_LEARN_DISABLE still depends on @state */
+ data |= (PORT_RX_ENABLE | PORT_TX_ENABLE);
+
+
+ ksz_pwrite8(dev, port, P_STP_CTRL, data);
+ //ksz_dump(dev);
+}
+
+/* Stub: fast ageing is not implemented; the call is only logged. */
+static void ksz_port_fast_age(struct dsa_switch *ds, int port)
+{
+	NOTIMPLV();
+}
+
+static int ksz_port_vlan_filtering(struct dsa_switch *ds, int port, bool flag)
+{
+ NOTIMPL();
+ return 0;
+}
+
+/* Prepare phase of a VLAN add: no checks are made, always returns 0. */
+static int ksz_port_vlan_prepare(struct dsa_switch *ds, int port,
+				 const struct switchdev_obj_port_vlan *vlan,
+				 struct switchdev_trans *trans)
+{
+	/* nothing needed */
+	return 0;
+}
+
+static void ksz_port_vlan_add(struct dsa_switch *ds, int port,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct switchdev_trans *trans)
+{
+ NOTIMPLV(); /* stub: VLAN add is not implemented, the call is only logged */
+}
+
+/* Stub: VLAN removal is not implemented yet. */
+static int ksz_port_vlan_del(struct dsa_switch *ds, int port,
+			     const struct switchdev_obj_port_vlan *vlan)
+{
+	/* logs the call and returns an error; nothing after it can run */
+	NOTIMPL();
+}
+
+static int ksz_port_vlan_dump(struct dsa_switch *ds, int port,
+ struct switchdev_obj_port_vlan *vlan,
+ switchdev_obj_dump_cb_t *cb)
+{
+ NOTIMPL(); /* stub: logs the call and returns the NOTIMPL() error; @cb is never invoked */
+}
+
+/* Prepare phase of an FDB add: no checks are made, always returns 0. */
+static int ksz_port_fdb_prepare(struct dsa_switch *ds, int port,
+				const struct switchdev_obj_port_fdb *fdb,
+				struct switchdev_trans *trans)
+{
+	/* nothing needed */
+	return 0;
+}
+
+struct alu_struct { /* NOTE(review): not referenced by any code visible in this file */
+ /* entry 1 */
+ u8 is_static:1;
+ u8 is_src_filter:1;
+ u8 is_dst_filter:1;
+ u8 prio_age:3;
+ u32 _reserv_0_1:23;
+ u8 mstp:3;
+ /* entry 2 */
+ u8 is_override:1;
+ u8 is_use_fid:1;
+ u32 _reserv_1_1:23;
+ u8 port_forward:7;
+ /* entry 3 & 4*/
+ u32 _reserv_2_1:9;
+ u8 fid:7;
+ u8 mac[ETH_ALEN];
+};
+
+static void ksz_port_fdb_add(struct dsa_switch *ds, int port,
+ const struct switchdev_obj_port_fdb *fdb,
+ struct switchdev_trans *trans)
+{
+ NOTIMPLV(); /* stub: FDB add is not implemented, the call is only logged */
+}
+
+static int ksz_port_fdb_del(struct dsa_switch *ds, int port,
+ const struct switchdev_obj_port_fdb *fdb)
+{
+ NOTIMPL(); /* stub: logs the call and returns the NOTIMPL() error */
+}
+
+static int ksz_port_fdb_dump(struct dsa_switch *ds, int port,
+ struct switchdev_obj_port_fdb *fdb,
+ switchdev_obj_dump_cb_t *cb)
+{
+ NOTIMPL(); /* stub: logs the call and returns the NOTIMPL() error; @cb is never invoked */
+}
+
+static int ksz_port_mdb_prepare(struct dsa_switch *ds, int port,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct switchdev_trans *trans)
+{
+ /* nothing to do in the prepare phase: always reports success */
+ return 0;
+}
+
+static void ksz_port_mdb_add(struct dsa_switch *ds, int port,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct switchdev_trans *trans)
+{
+ NOTIMPLV(); /* stub: MDB add is not implemented, the call is only logged */
+}
+
+static int ksz_port_mdb_del(struct dsa_switch *ds, int port,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ NOTIMPL(); /* stub: logs the call and returns the NOTIMPL() error */
+}
+
+static int ksz_port_mdb_dump(struct dsa_switch *ds, int port,
+ struct switchdev_obj_port_mdb *mdb,
+ switchdev_obj_dump_cb_t *cb)
+{
+ /* this is not called by switch layer; returns 0 without invoking @cb */
+ return 0;
+}
+
+static int ksz_port_mirror_add(struct dsa_switch *ds, int port,
+ struct dsa_mall_mirror_tc_entry *mirror,
+ bool ingress)
+{
+ NOTIMPL();
+
+}
+
+static void ksz_port_mirror_del(struct dsa_switch *ds, int port,
+ struct dsa_mall_mirror_tc_entry *mirror)
+{
+ NOTIMPLV(); /* stub: mirror removal is not implemented, the call is only logged */
+}
+
+static const struct dsa_switch_ops ksz_switch_ops = { /* DSA callbacks, installed by ksz_switch_init() */
+ .get_tag_protocol = ksz_get_tag_protocol,
+ .setup = ksz_setup,
+ .phy_read = ksz_phy_read16, /* not defined in this file; presumably from the included ksz_mdio_emulation.c */
+ .phy_write = ksz_phy_write16, /* likewise */
+ .port_enable = ksz_enable_port,
+ .port_disable = ksz_disable_port,
+ .get_strings = ksz_get_strings,
+ .get_ethtool_stats = ksz_get_ethtool_stats,
+ .get_sset_count = ksz_sset_count,
+ .port_bridge_join = ksz_br_join,
+ .port_bridge_leave = ksz_br_leave,
+ .port_stp_state_set = ksz_port_stp_state_set,
+ .port_fast_age = ksz_port_fast_age,
+ .port_vlan_filtering = ksz_port_vlan_filtering,
+ .port_vlan_prepare = ksz_port_vlan_prepare,
+ .port_vlan_add = ksz_port_vlan_add,
+ .port_vlan_del = ksz_port_vlan_del,
+ .port_vlan_dump = ksz_port_vlan_dump,
+ .port_fdb_prepare = ksz_port_fdb_prepare,
+ .port_fdb_dump = ksz_port_fdb_dump,
+ .port_fdb_add = ksz_port_fdb_add,
+ .port_fdb_del = ksz_port_fdb_del,
+ .port_mdb_prepare = ksz_port_mdb_prepare,
+ .port_mdb_add = ksz_port_mdb_add,
+ .port_mdb_del = ksz_port_mdb_del,
+ .port_mdb_dump = ksz_port_mdb_dump,
+ .port_mirror_add = ksz_port_mirror_add,
+ .port_mirror_del = ksz_port_mirror_del,
+};
+
+struct ksz_chip_data { /* per-chip parameters, copied into struct ksz_device by ksz_switch_init() */
+ u32 chip_id; /* compared with dev->chip_id */
+ const char *dev_name; /* copied to dev->name */
+ int num_vlans; /* number of vlan_cache entries allocated in ksz_setup() */
+ int num_alus;
+ int num_statics;
+ int cpu_ports; /* bit mask of ports that may be the CPU port */
+ int port_cnt; /* total port count; 0 is treated as "no switch found" */
+};
+
+static const struct ksz_chip_data ksz_switch_chips[] = { /* searched by ksz_switch_init() */
+ {
+ .chip_id = 0x95600c04, /* matched against the 32-bit value read at REG_CHIP_ID0__1 */
+ .dev_name = "KSZ8895",
+ .num_vlans = 4096, /* FIXME ? */
+ .num_alus = 4096,
+ .num_statics = 16,
+ .cpu_ports = 0x10, /* can be configured as cpu port */
+ .port_cnt = 5, /* total physical port count */
+ },
+};
+
+/* Set up the mutexes, install the DSA ops and copy the parameters of the
+ * ksz_switch_chips[] entry matching dev->chip_id; -ENODEV if none matches.
+ */
+static int ksz_switch_init(struct ksz_device *dev)
+{
+	/* Compare family ID and chip ID nibble only: the rest holds revision,
+	 * SW_START and two control registers (assumes register 0 is the MSB).
+	 */
+	const u32 id_mask = 0xfff00000;
+	int i;
+
+	mutex_init(&dev->reg_mutex);
+	mutex_init(&dev->stats_mutex);
+	mutex_init(&dev->alu_mutex);
+	mutex_init(&dev->vlan_mutex);
+	dev->ds->ops = &ksz_switch_ops;
+
+	for (i = 0; i < ARRAY_SIZE(ksz_switch_chips); i++) {
+		const struct ksz_chip_data *chip = &ksz_switch_chips[i];
+
+		if ((dev->chip_id ^ chip->chip_id) & id_mask)
+			continue;
+		dev->name = chip->dev_name;
+		dev->num_vlans = chip->num_vlans;
+		dev->num_alus = chip->num_alus;
+		dev->num_statics = chip->num_statics;
+		dev->port_cnt = chip->port_cnt;
+		dev->cpu_ports = chip->cpu_ports;
+		printk("ksz_switch_init: detected %s\n", dev->name);
+		return 0;
+	}
+
+	return -ENODEV;	/* no switch found */
+}
+
+/* Allocate a DSA switch (DSA_MAX_PORTS ports) and a zeroed ksz_device on @base,
+ * link the two and store @ops/@priv. Returns NULL if an allocation fails.
+ * NOTE(review): ksz_common.c may export the same symbol names - check clashes.
+ */
+struct ksz_device *ksz_switch_alloc(struct device *base,
+				    const struct ksz_io_ops *ops, void *priv)
+{
+	struct dsa_switch *sw = dsa_switch_alloc(base, DSA_MAX_PORTS);
+	struct ksz_device *ksz;
+
+	if (!sw)
+		return NULL;
+
+	ksz = devm_kzalloc(base, sizeof(*ksz), GFP_KERNEL);
+	if (!ksz)
+		return NULL;
+
+	ksz->dev = base;
+	ksz->ds = sw;
+	ksz->priv = priv;
+	ksz->ops = ops;
+	sw->priv = ksz;
+	return ksz;
+}
+EXPORT_SYMBOL(ksz_switch_alloc);
+
+/* Read the 32-bit chip identification at REG_CHIP_ID0__1 into dev->chip_id.
+ * Returns 0 on success or the error reported by ksz_read32().
+ *
+ * No byte-wise re-reads of registers 0-3 are done: their results were
+ * never used, only the 32-bit read matters.
+ */
+int ksz_switch_detect(struct ksz_device *dev)
+{
+	u32 id32;
+	int ret;
+
+	/* read chip id */
+	ret = ksz_read32(dev, REG_CHIP_ID0__1, &id32);
+	if (ret)
+		return ret;
+
+	dev->chip_id = id32;
+
+	return 0;
+}
+EXPORT_SYMBOL(ksz_switch_detect);
+
+/* Detect the chip, initialise the driver state and register with DSA.
+ * Returns -EINVAL if detection fails, otherwise the result of
+ * ksz_switch_init() or dsa_register_switch().
+ * NOTE(review): a pdata chip_id is overwritten by ksz_switch_detect().
+ */
+int ksz_switch_register(struct ksz_device *dev)
+{
+	int err;
+
+	if (dev->pdata)
+		dev->chip_id = dev->pdata->chip_id;
+	if (ksz_switch_detect(dev))
+		return -EINVAL;
+
+	err = ksz_switch_init(dev);
+	return err ? err : dsa_register_switch(dev->ds);
+}
+EXPORT_SYMBOL(ksz_switch_register);
+
+void ksz_switch_remove(struct ksz_device *dev)
+{
+ dsa_unregister_switch(dev->ds); /* undoes the dsa_register_switch() done in ksz_switch_register() */
+}
+EXPORT_SYMBOL(ksz_switch_remove);
+
+MODULE_AUTHOR("Woojung Huh <Woojung.Huh@microchip.com>");
+MODULE_DESCRIPTION("Microchip KSZ Series Switch DSA Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/dsa/microchip/ksz_8895_reg.h b/drivers/net/dsa/microchip/ksz_8895_reg.h
new file mode 100644
index 000000000000..b6490c42448e
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz_8895_reg.h
@@ -0,0 +1,769 @@
+/*
+ * Microchip KSZ8895 register definitions
+ *
+ * Copyright (C) 2017
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __KSZ8895_REGS_H	/* guard must be unique to this header (ksz_8895_reg.h) */
+#define __KSZ8895_REGS_H
+
+#define KS_PRIO_M 0x3
+#define KS_PRIO_S 2
+
+/* 0 - Operation */
+#define REG_CHIP_ID0__1 0x0000
+
+#define REG_CHIP_ID1__1 0x0001
+
+#define SW_START 1
+
+#define FAMILY_ID 0x95
+#define FAMILY_ID_94 0x94
+#define FAMILY_ID_95 0x95
+#define FAMILY_ID_85 0x85
+#define FAMILY_ID_98 0x98
+#define FAMILY_ID_88 0x88
+
+#define TOTAL_SWITCH_COUNTER_NUM 1
+#define PORT_CTRL_ADDR(port, addr) ((addr) + (((port) + 1) << 4)) /* add, not OR: 0xA0-based offsets must reach 0xB0..0xF0 (REG_PORT_n_CTRL_8) */
+
+#define ADDR_SHIFT 14
+#define ADDR_8 1
+#define ADDR_16 2
+#define ADDR_24 3
+#define ADDR_32 4
+
+#define BANK_SHIFT 12
+
+#define PHY_REG(addr, reg) \
+ (((addr) << ADDR_SHIFT) | (reg))
+
+#define PHY_BANK_REG(addr, bank, reg) \
+ (((addr) << ADDR_SHIFT) | ((bank) << BANK_SHIFT) | (reg))
+
+/* Use PHY access if no direct access. NOTE(review): SW_R32 issues a single phy_read() while SW_W32 issues two phy_write() calls - confirm. */
+#ifndef SW_R8
+#define SW_R8(s, r) phy_read(s->phydev, PHY_REG(ADDR_8, r))
+#define SW_W8(s, r, v) phy_write(s->phydev, PHY_REG(ADDR_8, r), v)
+#define SW_R16(s, r) phy_read(s->phydev, PHY_REG(ADDR_16, r))
+#define SW_W16(s, r, v) phy_write(s->phydev, PHY_REG(ADDR_16, r), v)
+#define SW_R32(s, r) phy_read(s->phydev, PHY_REG(ADDR_32, r))
+#define SW_W32(s, r, v) \
+ do { \
+ phy_write(s->phydev, PHY_REG(ADDR_32, (r) + 2), (v) >> 16); \
+ phy_write(s->phydev, PHY_REG(ADDR_32, r), v); \
+ } while (0)
+#define SW_LOCK(s) \
+ do { \
+ mutex_lock(s->hwlock); \
+ } while (0)
+#define SW_UNLOCK(s) \
+ do { \
+ mutex_unlock(s->hwlock); \
+ } while (0)
+#endif
+
+
+#define KS_PORT_M 0x1F
+
+#define REG_CHIP_ID0 0x00
+
+#define FAMILY_ID 0x95
+
+#define REG_CHIP_ID1 0x01
+
+#define SW_CHIP_ID_M 0xF0
+#define SW_CHIP_ID_S 4
+#define SW_REVISION_M 0x0E
+#define SW_REVISION_S 1
+
+#define CHIP_ID_95 0x40
+#define CHIP_ID_95R 0x60
+
+#define REG_SW_CTRL_0 0x02
+
+#define SW_NEW_BACKOFF (1 << 7)
+#define SW_FLUSH_DYN_MAC_TABLE (1 << 5)
+#define SW_FLUSH_STA_MAC_TABLE (1 << 4)
+#define SW_UNH_MODE (1 << 1)
+#define SW_LINK_AUTO_AGING (1 << 0)
+
+#define REG_SW_CTRL_1 0x03
+
+#define SW_PASS_ALL (1 << 7)
+#define SW_2K_PACKET (1 << 6)
+#define SW_TX_FLOW_CTRL_DISABLE (1 << 5)
+#define SW_RX_FLOW_CTRL_DISABLE (1 << 4)
+#define SW_CHECK_LENGTH (1 << 3)
+#define SW_AGING_ENABLE (1 << 2)
+#define SW_FAST_AGING (1 << 1)
+#define SW_AGGR_BACKOFF (1 << 0)
+
+#define REG_SW_CTRL_2 0x04
+
+#define UNICAST_VLAN_BOUNDARY (1 << 7)
+#define MULTICAST_STORM_DISABLE (1 << 6)
+#define SW_BACK_PRESSURE (1 << 5)
+#define FAIR_FLOW_CTRL (1 << 4)
+#define NO_EXC_COLLISION_DROP (1 << 3)
+#define SW_HUGE_PACKET (1 << 2)
+#define SW_LEGAL_PACKET (1 << 1)
+
+#define REG_SW_CTRL_3 0x05
+#define SW_VLAN_ENABLE (1 << 7)
+#define SW_IGMP_SNOOP (1 << 6)
+#define SW_DIRECT (1 << 5)
+#define SW_PRE_TAG (1 << 4)
+#define SW_VLAN_TAG (1 << 1)
+#define SW_MIRROR_RX_TX (1 << 0)
+
+#define REG_SW_CTRL_4 0x06
+
+#define SW_HALF_DUPLEX_FLOW_CTRL (1 << 7)
+#define SW_HALF_DUPLEX (1 << 6)
+#define SW_FLOW_CTRL (1 << 5)
+#define SW_10_MBIT (1 << 4)
+#define SW_REPLACE_VID (1 << 3)
+#define BROADCAST_STORM_RATE_HI 0x07
+
+#define REG_SW_CTRL_5 0x07
+
+#define BROADCAST_STORM_RATE_LO 0xFF
+#define BROADCAST_STORM_RATE 0x07FF
+
+#define REG_SW_CTRL_9 0x0B
+
+#define SW_DATA_SAMPLING_NEG (1 << 6)
+#define SW_PHY_POWER_SAVE_DISABLE (1 << 3)
+#define SW_LED_MODE_1 (1 << 1)
+#define SW_SPI_SAMPLING_RISING (1 << 0)
+
+#define REG_SW_CTRL_10 0x0C
+
+#define SPI_CLK_125_MHZ 0x20
+#define SPI_CLK_83_33_MHZ 0x10
+#define SPI_CLK_41_67_MHZ 0x00
+#define SW_TAIL_TAG_ENABLE (1 << 1)
+#define SW_PASS_PAUSE (1 << 0)
+
+#define REG_SW_CTRL_11 0x0D
+
+#define REG_POWER_MANAGEMENT_1 0x0E
+
+#define SW_PLL_POWER_DOWN (1 << 5)
+#define SW_POWER_MANAGEMENT_MODE_M 0x3
+#define SW_POWER_MANAGEMENT_MODE_S 3
+#define SW_POWER_NORMAL 0
+#define SW_ENERGY_DETECTION 1
+#define SW_SOFTWARE_POWER_DOWN 2
+#define SW_POWER_SAVING 3
+
+#define REG_POWER_MANAGEMENT_2 0x0F
+
+
+#define REG_PORT_1_CTRL_0 0x10
+#define REG_PORT_2_CTRL_0 0x20
+#define REG_PORT_3_CTRL_0 0x30
+#define REG_PORT_4_CTRL_0 0x40
+#define REG_PORT_5_CTRL_0 0x50
+
+#define PORT_BROADCAST_STORM (1 << 7)
+#define PORT_DIFFSERV_ENABLE (1 << 6)
+#define PORT_802_1P_ENABLE (1 << 5)
+#define PORT_BASED_PRIO_S 3
+#define PORT_BASED_PRIO_M (KS_PRIO_M << PORT_BASED_PRIO_S)
+#define PORT_PORT_PRIO_0 0
+#define PORT_PORT_PRIO_1 1
+#define PORT_PORT_PRIO_2 2
+#define PORT_PORT_PRIO_3 3
+#define PORT_INSERT_TAG (1 << 2)
+#define PORT_REMOVE_TAG (1 << 1)
+#define PORT_QUEUE_SPLIT_L (1 << 0)
+
+#define REG_PORT_1_CTRL_1 0x11
+#define REG_PORT_2_CTRL_1 0x21
+#define REG_PORT_3_CTRL_1 0x31
+#define REG_PORT_4_CTRL_1 0x41
+#define REG_PORT_5_CTRL_1 0x51
+
+#define PORT_MIRROR_SNIFFER (1 << 7)
+#define PORT_MIRROR_RX (1 << 6)
+#define PORT_MIRROR_TX (1 << 5)
+#define PORT_VLAN_MEMBERSHIP KS_PORT_M
+
+#define REG_PORT_1_CTRL_2 0x12
+#define REG_PORT_2_CTRL_2 0x22
+#define REG_PORT_3_CTRL_2 0x32
+#define REG_PORT_4_CTRL_2 0x42
+#define REG_PORT_5_CTRL_2 0x52
+
+#define PORT_802_1P_REMAPPING (1 << 7)
+#define PORT_INGRESS_FILTER (1 << 6)
+#define PORT_DISCARD_NON_VID (1 << 5)
+#define PORT_FORCE_FLOW_CTRL (1 << 4)
+#define PORT_BACK_PRESSURE (1 << 3)
+#define PORT_TX_ENABLE (1 << 2)
+#define PORT_RX_ENABLE (1 << 1)
+#define PORT_LEARN_DISABLE (1 << 0)
+
+#define REG_PORT_1_CTRL_3 0x13
+#define REG_PORT_2_CTRL_3 0x23
+#define REG_PORT_3_CTRL_3 0x33
+#define REG_PORT_4_CTRL_3 0x43
+#define REG_PORT_5_CTRL_3 0x53
+#define REG_PORT_1_CTRL_4 0x14
+#define REG_PORT_2_CTRL_4 0x24
+#define REG_PORT_3_CTRL_4 0x34
+#define REG_PORT_4_CTRL_4 0x44
+#define REG_PORT_5_CTRL_4 0x54
+
+#define PORT_DEFAULT_VID 0x0001
+
+#define REG_PORT_1_STATUS_0 0x19
+#define REG_PORT_2_STATUS_0 0x29
+#define REG_PORT_3_STATUS_0 0x39
+#define REG_PORT_4_STATUS_0 0x49
+#define REG_PORT_5_STATUS_0 0x59
+
+#define PORT_HP_MDIX (1 << 7)
+#define PORT_REVERSED_POLARITY (1 << 5)
+#define PORT_TX_FLOW_CTRL (1 << 4)
+#define PORT_RX_FLOW_CTRL (1 << 3)
+#define PORT_STAT_SPEED_100MBIT (1 << 2)
+#define PORT_STAT_FULL_DUPLEX (1 << 1)
+
+#define REG_PORT_1_LINK_MD_CTRL 0x1A
+#define REG_PORT_2_LINK_MD_CTRL 0x2A
+#define REG_PORT_3_LINK_MD_CTRL 0x3A
+#define REG_PORT_4_LINK_MD_CTRL 0x4A
+#define REG_PORT_5_LINK_MD_CTRL 0x5A
+
+#define PORT_CABLE_10M_SHORT (1 << 7)
+#define PORT_CABLE_DIAG_RESULT_M 0x3
+#define PORT_CABLE_DIAG_RESULT_S 5
+#define PORT_CABLE_STAT_NORMAL 0
+#define PORT_CABLE_STAT_OPEN 1
+#define PORT_CABLE_STAT_SHORT 2
+#define PORT_CABLE_STAT_FAILED 3
+#define PORT_START_CABLE_DIAG (1 << 4)
+#define PORT_FORCE_LINK (1 << 3)
+#define PORT_POWER_SAVING (1 << 2)
+#define PORT_PHY_REMOTE_LOOPBACK (1 << 1)
+#define PORT_CABLE_FAULT_COUNTER_H 0x01
+
+#define REG_PORT_1_LINK_MD_RESULT 0x1B
+#define REG_PORT_2_LINK_MD_RESULT 0x2B
+#define REG_PORT_3_LINK_MD_RESULT 0x3B
+#define REG_PORT_4_LINK_MD_RESULT 0x4B
+#define REG_PORT_5_LINK_MD_RESULT 0x5B
+
+#define PORT_CABLE_FAULT_COUNTER_L 0xFF
+#define PORT_CABLE_FAULT_COUNTER 0x1FF
+
+#define REG_PORT_1_CTRL_5 0x1C
+#define REG_PORT_2_CTRL_5 0x2C
+#define REG_PORT_3_CTRL_5 0x3C
+#define REG_PORT_4_CTRL_5 0x4C
+#define REG_PORT_5_CTRL_5 0x5C
+
+#define PORT_AUTO_NEG_DISABLE (1 << 7)
+#define PORT_FORCE_100_MBIT (1 << 6)
+#define PORT_FORCE_FULL_DUPLEX (1 << 5)
+#define PORT_AUTO_NEG_SYM_PAUSE (1 << 4)
+#define PORT_AUTO_NEG_100BTX_FD (1 << 3)
+#define PORT_AUTO_NEG_100BTX (1 << 2)
+#define PORT_AUTO_NEG_10BT_FD (1 << 1)
+#define PORT_AUTO_NEG_10BT (1 << 0)
+
+#define REG_PORT_1_CTRL_6 0x1D
+#define REG_PORT_2_CTRL_6 0x2D
+#define REG_PORT_3_CTRL_6 0x3D
+#define REG_PORT_4_CTRL_6 0x4D
+#define REG_PORT_5_CTRL_6 0x5D
+
+#define PORT_LED_OFF (1 << 7)
+#define PORT_TX_DISABLE (1 << 6)
+#define PORT_AUTO_NEG_RESTART (1 << 5)
+#define PORT_POWER_DOWN (1 << 3)
+#define PORT_AUTO_MDIX_DISABLE (1 << 2)
+#define PORT_FORCE_MDIX (1 << 1)
+#define PORT_MAC_LOOPBACK (1 << 0)
+
+#define REG_PORT_1_STATUS_1 0x1E
+#define REG_PORT_2_STATUS_1 0x2E
+#define REG_PORT_3_STATUS_1 0x3E
+#define REG_PORT_4_STATUS_1 0x4E
+#define REG_PORT_5_STATUS_1 0x5E
+
+#define PORT_MDIX_STATUS (1 << 7)
+#define PORT_AUTO_NEG_COMPLETE (1 << 6)
+#define PORT_STAT_LINK_GOOD (1 << 5)
+#define PORT_REMOTE_SYM_PAUSE (1 << 4)
+#define PORT_REMOTE_100BTX_FD (1 << 3)
+#define PORT_REMOTE_100BTX (1 << 2)
+#define PORT_REMOTE_10BT_FD (1 << 1)
+#define PORT_REMOTE_10BT (1 << 0)
+
+#define REG_PORT_1_STATUS_2 0x1F
+#define REG_PORT_2_STATUS_2 0x2F
+#define REG_PORT_3_STATUS_2 0x3F
+#define REG_PORT_4_STATUS_2 0x4F
+#define REG_PORT_5_STATUS_2 0x5F
+
+#define PORT_PHY_LOOPBACK (1 << 7)
+#define PORT_PHY_ISOLATE (1 << 5)
+#define PORT_PHY_SOFT_RESET (1 << 4)
+#define PORT_PHY_FORCE_LINK (1 << 3)
+#define PORT_PHY_MODE_M 0x7
+#define PHY_MODE_IN_AUTO_NEG 1
+#define PHY_MODE_10BT_HALF 2
+#define PHY_MODE_100BT_HALF 3
+#define PHY_MODE_10BT_FULL 5
+#define PHY_MODE_100BT_FULL 6
+#define PHY_MODE_ISOLDATE 7
+
+#define REG_PORT_CTRL_0 0x00
+#define REG_PORT_CTRL_1 0x01
+#define REG_PORT_CTRL_2 0x02
+#define REG_PORT_CTRL_VID 0x03
+
+#define REG_PORT_STATUS_0 0x09
+#define REG_PORT_LINK_MD_CTRL 0x0A
+#define REG_PORT_LINK_MD_RESULT 0x0B
+#define REG_PORT_CTRL_5 0x0C
+#define REG_PORT_CTRL_6 0x0D
+#define REG_PORT_STATUS_1 0x0E
+#define REG_PORT_STATUS_2 0x0F
+
+#define REG_PORT_CTRL_8 0xA0
+#define REG_PORT_CTRL_9 0xA1
+#define REG_PORT_RATE_CTRL_3 0xA2
+#define REG_PORT_RATE_CTRL_2 0xA3
+#define REG_PORT_RATE_CTRL_1 0xA4
+#define REG_PORT_RATE_CTRL_0 0xA5
+#define REG_PORT_RATE_LIMIT 0xA6
+#define REG_PORT_IN_RATE_0 0xA7
+#define REG_PORT_IN_RATE_1 0xA8
+#define REG_PORT_IN_RATE_2 0xA9
+#define REG_PORT_IN_RATE_3 0xAA
+#define REG_PORT_OUT_RATE_0 0xAB
+#define REG_PORT_OUT_RATE_1 0xAC
+#define REG_PORT_OUT_RATE_2 0xAD
+#define REG_PORT_OUT_RATE_3 0xAE
+
+#define REG_SW_MAC_ADDR_0 0x68
+#define REG_SW_MAC_ADDR_1 0x69
+#define REG_SW_MAC_ADDR_2 0x6A
+#define REG_SW_MAC_ADDR_3 0x6B
+#define REG_SW_MAC_ADDR_4 0x6C
+#define REG_SW_MAC_ADDR_5 0x6D
+
+#define REG_IND_CTRL_0 0x6E
+
+#define TABLE_READ (1 << 4)
+#define TABLE_SELECT_S 2
+#define TABLE_STATIC_MAC (0 << TABLE_SELECT_S)
+#define TABLE_VLAN (1 << TABLE_SELECT_S)
+#define TABLE_DYNAMIC_MAC (2 << TABLE_SELECT_S)
+#define TABLE_MIB (3 << TABLE_SELECT_S)
+
+#define REG_IND_CTRL_1 0x6F
+
+#define TABLE_ENTRY_MASK 0x03FF
+
+#define REG_IND_DATA_8 0x70
+#define REG_IND_DATA_7 0x71
+#define REG_IND_DATA_6 0x72
+#define REG_IND_DATA_5 0x73
+#define REG_IND_DATA_4 0x74
+#define REG_IND_DATA_3 0x75
+#define REG_IND_DATA_2 0x76
+#define REG_IND_DATA_1 0x77
+#define REG_IND_DATA_0 0x78
+
+#define REG_IND_DATA_CHECK REG_IND_DATA_6
+#define REG_IND_MIB_CHECK REG_IND_DATA_3
+#define REG_IND_DATA_HI REG_IND_DATA_7
+#define REG_IND_DATA_LO REG_IND_DATA_3
+
+#define REG_INT_STATUS 0x7C
+#define REG_INT_ENABLE 0x7D
+
+#define INT_PORT_5 (1 << 4)
+#define INT_PORT_4 (1 << 3)
+#define INT_PORT_3 (1 << 2)
+#define INT_PORT_2 (1 << 1)
+#define INT_PORT_1 (1 << 0)
+
+#define REG_SW_CTRL_12 0x80
+#define REG_SW_CTRL_13 0x81
+
+#define SWITCH_802_1P_MASK 3
+#define SWITCH_802_1P_BASE 3
+#define SWITCH_802_1P_SHIFT 2
+
+#define SW_802_1P_MAP_M KS_PRIO_M
+#define SW_802_1P_MAP_S KS_PRIO_S
+
+#define REG_SWITCH_CTRL_14 0x82
+
+#define SW_PRIO_MAPPING_M KS_PRIO_M
+#define SW_PRIO_MAPPING_S 6
+#define SW_PRIO_MAP_3_HI 0
+#define SW_PRIO_MAP_2_HI 2
+#define SW_PRIO_MAP_0_LO 3
+
+#define REG_SW_CTRL_15 0x83
+#define REG_SW_CTRL_16 0x84
+
+#define SW_DRIVE_STRENGTH_M 0x3
+#define SW_DRIVE_STRENGTH_4MA 0
+#define SW_DRIVE_STRENGTH_8MA 1
+#define SW_DRIVE_STRENGTH_10MA 2
+#define SW_DRIVE_STRENGTH_14MA 3
+#define SW_MII_DRIVE_STRENGTH_S 6
+
+#define REG_SW_CTRL_17 0x85
+#define REG_SW_CTRL_18 0x86
+
+#define SW_SELF_ADDR_FILTER_ENABLE (1 << 6)
+
+#define REG_SW_UNK_UCAST_CTRL 0x83
+#define REG_SW_UNK_MCAST_CTRL 0x84
+#define REG_SW_UNK_VID_CTRL 0x85
+#define REG_SW_UNK_IP_MCAST_CTRL 0x86
+
+#define SW_UNK_FWD_ENABLE (1 << 5)
+#define SW_UNK_FWD_MAP KS_PORT_M
+
+#define REG_SW_CTRL_19 0x87
+
+#define SW_IN_RATE_LIMIT_PERIOD_M 0x3
+#define SW_IN_RATE_LIMIT_PERIOD_S 4
+#define SW_IN_RATE_LIMIT_16_MS 0
+#define SW_IN_RATE_LIMIT_64_MS 1
+#define SW_IN_RATE_LIMIT_256_MS 2
+#define SW_QUEUE_BASED_OUT_RATE_LIMIT (1 << 3)
+#define SW_INS_TAG_ENABLE (1 << 2)
+
+#define REG_TOS_PRIO_CTRL_0 0x90
+#define REG_TOS_PRIO_CTRL_1 0x91
+#define REG_TOS_PRIO_CTRL_2 0x92
+#define REG_TOS_PRIO_CTRL_3 0x93
+#define REG_TOS_PRIO_CTRL_4 0x94
+#define REG_TOS_PRIO_CTRL_5 0x95
+#define REG_TOS_PRIO_CTRL_6 0x96
+#define REG_TOS_PRIO_CTRL_7 0x97
+#define REG_TOS_PRIO_CTRL_8 0x98
+#define REG_TOS_PRIO_CTRL_9 0x99
+#define REG_TOS_PRIO_CTRL_10 0x9A
+#define REG_TOS_PRIO_CTRL_11 0x9B
+#define REG_TOS_PRIO_CTRL_12 0x9C
+#define REG_TOS_PRIO_CTRL_13 0x9D
+#define REG_TOS_PRIO_CTRL_14 0x9E
+#define REG_TOS_PRIO_CTRL_15 0x9F
+
+#define TOS_PRIO_M KS_PRIO_M
+#define TOS_PRIO_S KS_PRIO_S
+
+
+#define REG_PORT_1_CTRL_8 0xB0
+#define REG_PORT_2_CTRL_8 0xC0
+#define REG_PORT_3_CTRL_8 0xD0
+#define REG_PORT_4_CTRL_8 0xE0
+#define REG_PORT_5_CTRL_8 0xF0
+
+#define PORT_INS_TAG_FOR_PORT_5_S 3
+#define PORT_INS_TAG_FOR_PORT_5 (1 << 3)
+#define PORT_INS_TAG_FOR_PORT_4 (1 << 2)
+#define PORT_INS_TAG_FOR_PORT_3 (1 << 1)
+#define PORT_INS_TAG_FOR_PORT_2 (1 << 0)
+
+#define REG_PORT_1_CTRL_9 0xB1
+#define REG_PORT_2_CTRL_9 0xC1
+#define REG_PORT_3_CTRL_9 0xD1
+#define REG_PORT_4_CTRL_9 0xE1
+#define REG_PORT_5_CTRL_9 0xF1
+
+#define PORT_QUEUE_SPLIT_H (1 << 1)
+#define PORT_QUEUE_SPLIT_1 0
+#define PORT_QUEUE_SPLIT_2 1
+#define PORT_QUEUE_SPLIT_4 2
+#define PORT_DROP_TAG (1 << 0)
+
+#define REG_PORT_1_CTRL_10 0xB2
+#define REG_PORT_2_CTRL_10 0xC2
+#define REG_PORT_3_CTRL_10 0xD2
+#define REG_PORT_4_CTRL_10 0xE2
+#define REG_PORT_5_CTRL_10 0xF2
+#define REG_PORT_1_CTRL_11 0xB3
+#define REG_PORT_2_CTRL_11 0xC3
+#define REG_PORT_3_CTRL_11 0xD3
+#define REG_PORT_4_CTRL_11 0xE3
+#define REG_PORT_5_CTRL_11 0xF3
+#define REG_PORT_1_CTRL_12 0xB4
+#define REG_PORT_2_CTRL_12 0xC4
+#define REG_PORT_3_CTRL_12 0xD4
+#define REG_PORT_4_CTRL_12 0xE4
+#define REG_PORT_5_CTRL_12 0xF4
+#define REG_PORT_1_CTRL_13 0xB5
+#define REG_PORT_2_CTRL_13 0xC5
+#define REG_PORT_3_CTRL_13 0xD5
+#define REG_PORT_4_CTRL_13 0xE5
+#define REG_PORT_5_CTRL_13 0xF5
+
+#define REG_PORT_1_RATE_CTRL_3 0xB2
+#define REG_PORT_1_RATE_CTRL_2 0xB3
+#define REG_PORT_1_RATE_CTRL_1 0xB4
+#define REG_PORT_1_RATE_CTRL_0 0xB5
+#define REG_PORT_2_RATE_CTRL_3 0xC2
+#define REG_PORT_2_RATE_CTRL_2 0xC3
+#define REG_PORT_2_RATE_CTRL_1 0xC4
+#define REG_PORT_2_RATE_CTRL_0 0xC5
+#define REG_PORT_3_RATE_CTRL_3 0xD2
+#define REG_PORT_3_RATE_CTRL_2 0xD3
+#define REG_PORT_3_RATE_CTRL_1 0xD4
+#define REG_PORT_3_RATE_CTRL_0 0xD5
+#define REG_PORT_4_RATE_CTRL_3 0xE2
+#define REG_PORT_4_RATE_CTRL_2 0xE3
+#define REG_PORT_4_RATE_CTRL_1 0xE4
+#define REG_PORT_4_RATE_CTRL_0 0xE5
+#define REG_PORT_5_RATE_CTRL_3 0xF2
+#define REG_PORT_5_RATE_CTRL_2 0xF3
+#define REG_PORT_5_RATE_CTRL_1 0xF4
+#define REG_PORT_5_RATE_CTRL_0 0xF5
+
+#define RATE_CTRL_ENABLE (1 << 7)
+#define RATE_RATIO_M ((1 << 7) - 1)
+
+#define REG_PORT_1_RATE_LIMIT 0xB6
+#define REG_PORT_2_RATE_LIMIT 0xC6
+#define REG_PORT_3_RATE_LIMIT 0xD6
+#define REG_PORT_4_RATE_LIMIT 0xE6
+#define REG_PORT_5_RATE_LIMIT 0xF6
+
+#define PORT_IN_FLOW_CTRL_S 4
+#define PORT_IN_LIMIT_MODE_M 0x3
+#define PORT_IN_LIMIT_MODE_S 2
+#define PORT_COUNT_IFG_S 1
+#define PORT_COUNT_PREAMBLE_S 0
+#define PORT_IN_FLOW_CTRL (1 << PORT_IN_FLOW_CTRL_S)
+#define PORT_IN_ALL 0
+#define PORT_IN_UNICAST 1
+#define PORT_IN_MULTICAST 2
+#define PORT_IN_BROADCAST 3
+#define PORT_COUNT_IFG (1 << PORT_COUNT_IFG_S)
+#define PORT_COUNT_PREAMBLE (1 << PORT_COUNT_PREAMBLE_S)
+
+#define REG_PORT_1_IN_RATE_0 0xB7
+#define REG_PORT_2_IN_RATE_0 0xC7
+#define REG_PORT_3_IN_RATE_0 0xD7
+#define REG_PORT_4_IN_RATE_0 0xE7
+#define REG_PORT_5_IN_RATE_0 0xF7
+#define REG_PORT_1_IN_RATE_1 0xB8
+#define REG_PORT_2_IN_RATE_1 0xC8
+#define REG_PORT_3_IN_RATE_1 0xD8
+#define REG_PORT_4_IN_RATE_1 0xE8
+#define REG_PORT_5_IN_RATE_1 0xF8
+#define REG_PORT_1_IN_RATE_2 0xB9
+#define REG_PORT_2_IN_RATE_2 0xC9
+#define REG_PORT_3_IN_RATE_2 0xD9
+#define REG_PORT_4_IN_RATE_2 0xE9
+#define REG_PORT_5_IN_RATE_2 0xF9
+#define REG_PORT_1_IN_RATE_3 0xBA
+#define REG_PORT_2_IN_RATE_3 0xCA
+#define REG_PORT_3_IN_RATE_3 0xDA
+#define REG_PORT_4_IN_RATE_3 0xEA
+#define REG_PORT_5_IN_RATE_3 0xFA
+
+#define PORT_RATE_LIMIT_M ((1 << 7) - 1)
+
+#define REG_PORT_1_OUT_RATE_0 0xBB
+#define REG_PORT_2_OUT_RATE_0 0xCB
+#define REG_PORT_3_OUT_RATE_0 0xDB
+#define REG_PORT_4_OUT_RATE_0 0xEB
+#define REG_PORT_5_OUT_RATE_0 0xFB
+#define REG_PORT_1_OUT_RATE_1 0xBC
+#define REG_PORT_2_OUT_RATE_1 0xCC
+#define REG_PORT_3_OUT_RATE_1 0xDC
+#define REG_PORT_4_OUT_RATE_1 0xEC
+#define REG_PORT_5_OUT_RATE_1 0xFC
+#define REG_PORT_1_OUT_RATE_2 0xBD
+#define REG_PORT_2_OUT_RATE_2 0xCD
+#define REG_PORT_3_OUT_RATE_2 0xDD
+#define REG_PORT_4_OUT_RATE_2 0xED
+#define REG_PORT_5_OUT_RATE_2 0xFD
+#define REG_PORT_1_OUT_RATE_3 0xBE
+#define REG_PORT_2_OUT_RATE_3 0xCE
+#define REG_PORT_3_OUT_RATE_3 0xDE
+#define REG_PORT_4_OUT_RATE_3 0xEE
+#define REG_PORT_5_OUT_RATE_3 0xFE
+
+
+#define REG_SW_CFG 0xEF
+
+#define SW_PORT_3_FIBER (1 << 7)
+
+/* KSZ8864 */
+
+#define REG_PHY_PORT_CTRL_1 0xCF
+
+#define PORT_HALF_DUPLEX (1 << 7)
+#define PORT_FLOW_CTRL (1 << 6)
+#define PORT_10_MBIT (1 << 5)
+
+#define REG_PHY_PORT_CTRL_2 0xDF
+
+#define PORT_MII_MAC_MODE (1 << 6)
+
+#define REG_KSZ8864_CHIP_ID 0xFE
+
+#define SW_KSZ8864 (1 << 7)
+
+
+#ifndef PHY_REG_CTRL
+#define PHY_REG_CTRL 0
+
+#define PHY_RESET (1 << 15)
+#define PHY_LOOPBACK (1 << 14)
+#define PHY_SPEED_100MBIT (1 << 13)
+#define PHY_AUTO_NEG_ENABLE (1 << 12)
+#define PHY_POWER_DOWN (1 << 11)
+#define PHY_MII_DISABLE (1 << 10)
+#define PHY_AUTO_NEG_RESTART (1 << 9)
+#define PHY_FULL_DUPLEX (1 << 8)
+#define PHY_COLLISION_TEST_NOT (1 << 7)
+#define PHY_HP_MDIX (1 << 5)
+#define PHY_FORCE_MDIX (1 << 4)
+#define PHY_AUTO_MDIX_DISABLE (1 << 3)
+#define PHY_REMOTE_FAULT_DISABLE (1 << 2)
+#define PHY_TRANSMIT_DISABLE (1 << 1)
+#define PHY_LED_DISABLE (1 << 0)
+
+#define PHY_REG_STATUS 1
+
+#define PHY_100BT4_CAPABLE (1 << 15)
+#define PHY_100BTX_FD_CAPABLE (1 << 14)
+#define PHY_100BTX_CAPABLE (1 << 13)
+#define PHY_10BT_FD_CAPABLE (1 << 12)
+#define PHY_10BT_CAPABLE (1 << 11)
+#define PHY_MII_SUPPRESS_CAPABLE_NOT (1 << 6)
+#define PHY_AUTO_NEG_ACKNOWLEDGE (1 << 5)
+#define PHY_REMOTE_FAULT (1 << 4)
+#define PHY_AUTO_NEG_CAPABLE (1 << 3)
+#define PHY_LINK_STATUS (1 << 2)
+#define PHY_JABBER_DETECT_NOT (1 << 1)
+#define PHY_EXTENDED_CAPABILITY (1 << 0)
+
+#define PHY_REG_ID_1 2
+#define PHY_REG_ID_2 3
+
+#define KSZ8895_ID_HI 0x0022
+#define KSZ8895_ID_LO 0x1450
+
+#define PHY_REG_AUTO_NEGOTIATION 4
+
+#define PHY_AUTO_NEG_NEXT_PAGE_NOT (1 << 15)
+#define PHY_AUTO_NEG_REMOTE_FAULT_NOT (1 << 13)
+#define PHY_AUTO_NEG_SYM_PAUSE (1 << 10)
+#define PHY_AUTO_NEG_100BT4 (1 << 9)
+#define PHY_AUTO_NEG_100BTX_FD (1 << 8)
+#define PHY_AUTO_NEG_100BTX (1 << 7)
+#define PHY_AUTO_NEG_10BT_FD (1 << 6)
+#define PHY_AUTO_NEG_10BT (1 << 5)
+#define PHY_AUTO_NEG_SELECTOR 0x001F
+#define PHY_AUTO_NEG_802_3 0x0001
+
+#define PHY_REG_REMOTE_CAPABILITY 5
+
+#define PHY_REMOTE_NEXT_PAGE_NOT (1 << 15)
+#define PHY_REMOTE_ACKNOWLEDGE_NOT (1 << 14)
+#define PHY_REMOTE_REMOTE_FAULT_NOT (1 << 13)
+#define PHY_REMOTE_SYM_PAUSE (1 << 10)
+#define PHY_REMOTE_100BTX_FD (1 << 8)
+#define PHY_REMOTE_100BTX (1 << 7)
+#define PHY_REMOTE_10BT_FD (1 << 6)
+#define PHY_REMOTE_10BT (1 << 5)
+
+#define PHY_REG_LINK_MD 0x1D
+
+#define PHY_START_CABLE_DIAG (1 << 15)
+#define PHY_CABLE_DIAG_RESULT 0x6000
+#define PHY_CABLE_STAT_NORMAL 0x0000
+#define PHY_CABLE_STAT_OPEN 0x2000
+#define PHY_CABLE_STAT_SHORT 0x4000
+#define PHY_CABLE_STAT_FAILED 0x6000
+#define PHY_CABLE_10M_SHORT (1 << 12)
+#define PHY_CABLE_FAULT_COUNTER 0x01FF
+
+#define PHY_REG_PHY_CTRL 0x1F
+
+#define PHY_MODE_M 0x7
+#define PHY_MODE_S 8
+#define PHY_STAT_REVERSED_POLARITY (1 << 5)
+#define PHY_STAT_MDIX (1 << 4)
+#define PHY_FORCE_LINK (1 << 3)
+#define PHY_POWER_SAVING_ENABLE (1 << 2)
+#define PHY_REMOTE_LOOPBACK (1 << 1)
+#endif
+
+
+/* Default values are used in ksz_sw.h if these are not defined. */
+#define PRIO_QUEUES 4
+
+#define KS_PRIO_IN_REG 4
+
+#define SWITCH_PORT_NUM 4
+
+#define SW_D u8
+#define SW_R(sw, addr) (sw)->reg->r8(sw, addr)
+#define SW_W(sw, addr, val) (sw)->reg->w8(sw, addr, val)
+#define SW_SIZE (1)
+#define SW_SIZE_STR "%02x"
+
+
+#define P_BCAST_STORM_CTRL REG_PORT_CTRL_0
+#define P_PRIO_CTRL REG_PORT_CTRL_0
+#define P_TAG_CTRL REG_PORT_CTRL_0
+#define P_MIRROR_CTRL REG_PORT_CTRL_1
+#define P_802_1P_CTRL REG_PORT_CTRL_2
+#define P_STP_CTRL REG_PORT_CTRL_2
+#define P_LOCAL_CTRL REG_PORT_CTRL_5
+#define P_REMOTE_STATUS REG_PORT_STATUS_1
+#define P_FORCE_CTRL REG_PORT_CTRL_5
+#define P_NEG_RESTART_CTRL REG_PORT_CTRL_6
+#define P_SPEED_STATUS REG_PORT_STATUS_0
+#define P_LINK_STATUS REG_PORT_STATUS_1
+#define P_INS_SRC_PVID_CTRL REG_PORT_CTRL_8
+#define P_DROP_TAG_CTRL REG_PORT_CTRL_9
+#define P_RATE_LIMIT_CTRL REG_PORT_RATE_LIMIT
+
+#define S_FLUSH_TABLE_CTRL REG_SW_CTRL_0
+#define S_LINK_AGING_CTRL REG_SW_CTRL_0
+#define S_HUGE_PACKET_CTRL REG_SW_CTRL_2
+#define S_MIRROR_CTRL REG_SW_CTRL_3
+#define S_REPLACE_VID_CTRL REG_SW_CTRL_4
+#define S_PASS_PAUSE_CTRL REG_SW_CTRL_10
+#define S_TAIL_TAG_CTRL REG_SW_CTRL_10
+#define S_802_1P_PRIO_CTRL REG_SW_CTRL_12
+#define S_TOS_PRIO_CTRL REG_TOS_PRIO_CTRL_0
+#define S_IPV6_MLD_CTRL REG_SW_CTRL_21
+
+#define IND_ACC_TABLE(table) ((table) << 8)
+
+#define TAIL_TAG_OVERRIDE (1 << 6)
+#define TAIL_TAG_LOOKUP (1 << 7)
+
+#endif /* KSZ9477_REGS_H */
diff --git a/drivers/net/dsa/microchip/ksz_8895_spi.c b/drivers/net/dsa/microchip/ksz_8895_spi.c
new file mode 100644
index 000000000000..b1c9571c8e99
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz_8895_spi.c
@@ -0,0 +1,275 @@
+/*
+ * Microchip KSZ series register access through SPI
+ *
+ * Copyright (C) 2017 Woojung Huh <Woojung.Huh@microchip.com>
+ * Copyright (C) 2017 Pavel Machek <pavel@denx.de>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <asm/unaligned.h>
+
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spi/spi.h>
+
+#include "ksz_8895_reg.h"
+#include "ksz_priv.h"
+
+/* SPI frame opcodes */
+#define KS_SPIOP_RD 3
+#define KS_SPIOP_WR 2
+
+/*
+ * Read @len consecutive bytes starting at switch register @reg.
+ *
+ * The request is a two-byte header: the read opcode followed by the
+ * register address (only the low eight bits of @reg fit into the header).
+ * The reply is stored in @val. Returns whatever spi_write_then_read()
+ * returns.
+ */
+static int ksz_spi_read_reg(struct spi_device *spi, u32 reg, u8 *val,
+			    unsigned int len)
+{
+	u8 cmd[2] = { KS_SPIOP_RD, reg };
+
+	return spi_write_then_read(spi, cmd, sizeof(cmd), val, len);
+}
+
+/*
+ * ksz_device flavour of ksz_spi_read_reg(): the SPI device handed to
+ * ksz_switch_alloc() at probe time is kept in dev->priv.
+ */
+static int ksz_spi_read(struct ksz_device *dev, u32 reg, u8 *data,
+			unsigned int len)
+{
+	return ksz_spi_read_reg(dev->priv, reg, data, len);
+}
+
+/* Read the single byte at register @reg into @val. */
+static int ksz_spi_read8(struct ksz_device *dev, u32 reg, u8 *val)
+{
+	const unsigned int nbytes = sizeof(*val);
+
+	return ksz_spi_read(dev, reg, val, nbytes);
+}
+
+/*
+ * Read a 16-bit value starting at register @reg.
+ *
+ * The two bytes are read straight into @val and, on success, converted in
+ * place with be16_to_cpu(). On failure the error from ksz_spi_read() is
+ * returned and no conversion is done.
+ */
+static int ksz_spi_read16(struct ksz_device *dev, u32 reg, u16 *val)
+{
+	int err;
+
+	err = ksz_spi_read(dev, reg, (u8 *)val, 2);
+	if (err)
+		return err;
+
+	*val = be16_to_cpu(*val);
+	return 0;
+}
+
+/*
+ * Read a 24-bit value starting at register @reg into the low 24 bits of
+ * @val. Returns 0 on success or the error from ksz_spi_read().
+ */
+static int ksz_spi_read24(struct ksz_device *dev, u32 reg, u32 *val)
+{
+	int err;
+
+	/* Only three of the four bytes get filled in, so clear all of them. */
+	*val = 0;
+
+	err = ksz_spi_read(dev, reg, (u8 *)val, 3);
+	if (err)
+		return err;
+
+	/* After the swap the three bytes sit in bits 31..8; shift them down. */
+	*val = be32_to_cpu(*val) >> 8;
+	return 0;
+}
+
+/*
+ * Read a 32-bit value starting at register @reg.
+ *
+ * The four bytes are read straight into @val and, on success, converted in
+ * place with be32_to_cpu().
+ */
+static int ksz_spi_read32(struct ksz_device *dev, u32 reg, u32 *val)
+{
+	int err;
+
+	err = ksz_spi_read(dev, reg, (u8 *)val, 4);
+	if (err)
+		return err;
+
+	*val = be32_to_cpu(*val);
+	return 0;
+}
+
+/*
+ * Write @len bytes from @val to consecutive switch registers starting at
+ * @reg.
+ *
+ * The frame sent on the bus is the write opcode, the register address (low
+ * eight bits of @reg) and then the payload.
+ *
+ * Returns the result of spi_write(), or -EINVAL when the payload does not
+ * fit into the on-stack frame buffer.
+ */
+static int ksz_spi_write_reg(struct spi_device *spi, u32 reg, u8 *val,
+			     unsigned int len)
+{
+	u8 data[12];
+
+	/* Two header bytes precede the payload; never write past data[]. */
+	if (len > sizeof(data) - 2)
+		return -EINVAL;
+
+	data[0] = KS_SPIOP_WR;
+	data[1] = reg;
+	memcpy(&data[2], val, len);
+
+	return spi_write(spi, data, 2 + len);
+}
+
+/* Write the single byte @value to register @reg. */
+static int ksz_spi_write8(struct ksz_device *dev, u32 reg, u8 value)
+{
+	return ksz_spi_write_reg(dev->priv, reg, &value, sizeof(value));
+}
+
+/* Write the 16-bit @value, most significant byte first, starting at @reg. */
+static int ksz_spi_write16(struct ksz_device *dev, u32 reg, u16 value)
+{
+	__be16 be = cpu_to_be16(value);
+
+	return ksz_spi_write_reg(dev->priv, reg, (u8 *)&be, sizeof(be));
+}
+
+/*
+ * Write the low 24 bits of @value, most significant byte first, starting
+ * at @reg.
+ */
+static int ksz_spi_write24(struct ksz_device *dev, u32 reg, u32 value)
+{
+	/*
+	 * Shift the 24 bits up to bits 31..8 so that, once converted to big
+	 * endian, they are the first three bytes in memory.
+	 */
+	__be32 be = cpu_to_be32(value << 8);
+
+	return ksz_spi_write_reg(dev->priv, reg, (u8 *)&be, 3);
+}
+
+/* Write the 32-bit @value, most significant byte first, starting at @reg. */
+static int ksz_spi_write32(struct ksz_device *dev, u32 reg, u32 value)
+{
+	__be32 be = cpu_to_be32(value);
+
+	return ksz_spi_write_reg(dev->priv, reg, (u8 *)&be, sizeof(be));
+}
+
+/*
+ * Register accessors backed by the SPI helpers above; passed to
+ * ksz_switch_alloc() in ksz_spi_probe().
+ */
+static const struct ksz_io_ops ksz_spi_ops = {
+ .read8 = ksz_spi_read8,
+ .read16 = ksz_spi_read16,
+ .read24 = ksz_spi_read24,
+ .read32 = ksz_spi_read32,
+ .write8 = ksz_spi_write8,
+ .write16 = ksz_spi_write16,
+ .write24 = ksz_spi_write24,
+ .write32 = ksz_spi_write32,
+};
+
+/*
+ * Copy @count switch registers, starting at register @offset, into @buf.
+ *
+ * @offset selects the first register only; the bytes are stored from the
+ * start of @buf, which is the buffer for this one read request.
+ *
+ * Returns the number of bytes copied, or the error of the first register
+ * read that fails.
+ */
+static int ksz_spi_sysfs_read(struct ksz_device *dev, char *buf,
+			      unsigned offset, size_t count)
+{
+	size_t i;
+	int err;
+
+	for (i = 0; i < count; i++) {
+		err = ksz_read8(dev, offset + i, (u8 *)&buf[i]);
+		if (err)
+			return err;
+	}
+
+	return count;
+}
+
+/*
+ * Register writes through sysfs are not implemented: every call fails with
+ * -EINVAL and none of the arguments is looked at.
+ */
+static int ksz_spi_sysfs_write(struct ksz_device *dev, char *buf,
+			       unsigned offset, size_t count)
+{
+	return -EINVAL;
+}
+
+/*
+ * sysfs read handler of the "registers" file: look up the ksz_device stored
+ * as driver data of the device owning @kobj and let ksz_spi_sysfs_read() do
+ * the work.
+ */
+static ssize_t ksz_spi_registers_read(struct file *filp, struct kobject *kobj,
+	struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count)
+{
+	struct ksz_device *ks;
+
+	ks = dev_get_drvdata(container_of(kobj, struct device, kobj));
+
+	return ksz_spi_sysfs_read(ks, buf, off, count);
+}
+
+/*
+ * sysfs write handler of the "registers" file: look up the ksz_device stored
+ * as driver data of the device owning @kobj and let ksz_spi_sysfs_write() do
+ * the work.
+ */
+static ssize_t ksz_spi_registers_write(struct file *filp, struct kobject *kobj,
+	struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count)
+{
+	struct ksz_device *ks;
+
+	ks = dev_get_drvdata(container_of(kobj, struct device, kobj));
+
+	return ksz_spi_sysfs_write(ks, buf, off, count);
+}
+
+/*
+ * Template for the binary sysfs file "registers" (0x100 bytes, readable and
+ * writable by the owner only). ksz_spi_probe() copies it into the per-device
+ * regs_attr before registering it.
+ * NOTE(review): the file is marked writable, but ksz_spi_sysfs_write()
+ * currently rejects every write with -EINVAL.
+ */
+static const struct bin_attribute ksz_spi_registers_attr = {
+ .attr = {
+ .name = "registers",
+ .mode = S_IRUSR | S_IWUSR,
+ },
+ .size = 0x100,
+ .read = ksz_spi_registers_read,
+ .write = ksz_spi_registers_write,
+};
+
+/*
+ * Bind to a KSZ8895 on the SPI bus: allocate and register the switch, then
+ * expose its registers through the binary sysfs file "registers".
+ *
+ * Returns 0 on success or a negative errno. The switch is unregistered
+ * again when the sysfs file cannot be created.
+ */
+static int ksz_spi_probe(struct spi_device *spi)
+{
+	struct ksz_device *dev;
+	int ret;
+
+	dev = ksz_switch_alloc(&spi->dev, &ksz_spi_ops, spi);
+	if (!dev)
+		return -ENOMEM;
+
+	if (spi->dev.platform_data)
+		dev->pdata = spi->dev.platform_data;
+
+	ret = ksz_switch_register(dev);
+	if (ret)
+		return ret;
+
+	/*
+	 * The sysfs handlers fetch the ksz_device through dev_get_drvdata(),
+	 * so the driver data must be in place before the file becomes
+	 * visible to user space.
+	 */
+	spi_set_drvdata(spi, dev);
+
+	/* The template already carries the 0x100 size. */
+	dev->regs_attr = ksz_spi_registers_attr;
+	sysfs_bin_attr_init(&dev->regs_attr);
+
+	ret = sysfs_create_bin_file(&spi->dev.kobj, &dev->regs_attr);
+	if (ret) {
+		dev_err(&spi->dev, "unable to create sysfs file, err=%d\n",
+			ret);
+		/* Do not leave a registered switch behind a failed probe. */
+		spi_set_drvdata(spi, NULL);
+		ksz_switch_remove(dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Undo ksz_spi_probe(): remove the "registers" sysfs file and unregister
+ * the switch. Does nothing when no ksz_device is attached to @spi.
+ */
+static int ksz_spi_remove(struct spi_device *spi)
+{
+	struct ksz_device *dev = spi_get_drvdata(spi);
+
+	/* regs_attr lives inside *dev, so there is nothing to remove without it. */
+	if (!dev)
+		return 0;
+
+	/* Take the file away first so no handler runs on a removed switch. */
+	sysfs_remove_bin_file(&spi->dev.kobj, &dev->regs_attr);
+	ksz_switch_remove(dev);
+
+	return 0;
+}
+
+/* Device-tree match table: binds to nodes compatible with "microchip,ksz8895". */
+static const struct of_device_id ksz_dt_ids[] = {
+ { .compatible = "microchip,ksz8895" },
+ {},
+};
+MODULE_DEVICE_TABLE(of, ksz_dt_ids);
+
+/*
+ * SPI driver glue. No .owner is set here: the SPI core's driver
+ * registration macro, used by module_spi_driver(), supplies THIS_MODULE.
+ */
+static struct spi_driver ksz_spi_driver = {
+	.driver = {
+		.name	= "ksz8895-switch",
+		.of_match_table = of_match_ptr(ksz_dt_ids),
+	},
+	.probe	= ksz_spi_probe,
+	.remove	= ksz_spi_remove,
+};
+
+module_spi_driver(ksz_spi_driver);
+
+MODULE_AUTHOR("Pavel Machek <pavel@denx.de>");
+MODULE_DESCRIPTION("Microchip KSZ Series Switch SPI access Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/dsa/microchip/ksz_9477_reg.h b/drivers/net/dsa/microchip/ksz_9477_reg.h
index 6aa6752035a1..af4d29c2ba4f 100644
--- a/drivers/net/dsa/microchip/ksz_9477_reg.h
+++ b/drivers/net/dsa/microchip/ksz_9477_reg.h
@@ -16,6 +16,7 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
+#error This is not switch we have
#ifndef __KSZ9477_REGS_H
#define __KSZ9477_REGS_H
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index b313ecdf2919..6741d05d0ac4 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -28,6 +28,7 @@
#include <net/dsa.h>
#include <net/switchdev.h>
+#include "ksz_9477_reg.h"
#include "ksz_priv.h"
static const struct {
diff --git a/drivers/net/dsa/microchip/ksz_mdio_emulation.c b/drivers/net/dsa/microchip/ksz_mdio_emulation.c
new file mode 100644
index 000000000000..a4e24506bed8
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz_mdio_emulation.c
@@ -0,0 +1,286 @@
+/**
+ * Micrel KSZ8895 SPI driver
+ *
+ * Copyright (c) 2015 Micrel, Inc.
+ *
+ * GPLv2
+ */
+
+#define PHY_ID_KSZ8895 ((KSZ8895_ID_HI << 16) | KSZ8895_ID_LO)
+
+/**
+ * sw_r_phy - read an emulated PHY register
+ * @sw: The switch instance.
+ * @phy: PHY address to read; used as the port number for the port registers.
+ * @reg: PHY register to read.
+ * @val: Buffer to store the read data.
+ *
+ * This routine builds the value of PHY register @reg from the per-port
+ * control and status registers of the switch. Registers that are not
+ * handled below read as 0.
+ */
+static void sw_r_phy(struct ksz_device *sw, u16 phy, u16 reg, u16 *val)
+{
+ u8 ctrl;
+ u8 restart;
+ u8 link;
+ u8 speed;
+ u8 force;
+ u8 p = phy;
+ u16 data = 0;
+
+ switch (reg) {
+ case PHY_REG_CTRL:
+ /*
+ * P_LOCAL_CTRL and P_FORCE_CTRL both name REG_PORT_CTRL_5, so the
+ * same register is read twice; only "force" is used in this case.
+ */
+ ksz_pread8(sw, p, P_LOCAL_CTRL, &ctrl);
+ ksz_pread8(sw, p, P_NEG_RESTART_CTRL, &restart);
+ ksz_pread8(sw, p, P_SPEED_STATUS, &speed);
+ ksz_pread8(sw, p, P_FORCE_CTRL, &force);
+ if (restart & PORT_PHY_LOOPBACK)
+ data |= PHY_LOOPBACK;
+ if (force & PORT_FORCE_100_MBIT)
+ data |= PHY_SPEED_100MBIT;
+ /* The port bit disables auto-negotiation, the PHY bit enables it. */
+ if (!(force & PORT_AUTO_NEG_DISABLE))
+ data |= PHY_AUTO_NEG_ENABLE;
+ if (restart & PORT_POWER_DOWN)
+ data |= PHY_POWER_DOWN;
+ if (restart & PORT_AUTO_NEG_RESTART)
+ data |= PHY_AUTO_NEG_RESTART;
+ if (force & PORT_FORCE_FULL_DUPLEX)
+ data |= PHY_FULL_DUPLEX;
+ if (speed & PORT_HP_MDIX)
+ data |= PHY_HP_MDIX;
+ if (restart & PORT_FORCE_MDIX)
+ data |= PHY_FORCE_MDIX;
+ if (restart & PORT_AUTO_MDIX_DISABLE)
+ data |= PHY_AUTO_MDIX_DISABLE;
+ if (restart & PORT_TX_DISABLE)
+ data |= PHY_TRANSMIT_DISABLE;
+ if (restart & PORT_LED_OFF)
+ data |= PHY_LED_DISABLE;
+ break;
+ case PHY_REG_STATUS:
+ ksz_pread8(sw, p, P_LINK_STATUS, &link);
+ /* "speed" is read here but not used for the status word. */
+ ksz_pread8(sw, p, P_SPEED_STATUS, &speed);
+ /* The capability bits are reported unconditionally. */
+ data = PHY_100BTX_FD_CAPABLE |
+ PHY_100BTX_CAPABLE |
+ PHY_10BT_FD_CAPABLE |
+ PHY_10BT_CAPABLE |
+ PHY_AUTO_NEG_CAPABLE;
+ if (link & PORT_AUTO_NEG_COMPLETE)
+ data |= PHY_AUTO_NEG_ACKNOWLEDGE;
+ if (link & PORT_STAT_LINK_GOOD)
+ data |= PHY_LINK_STATUS;
+ break;
+ case PHY_REG_ID_1:
+ data = KSZ8895_ID_HI;
+ break;
+ case PHY_REG_ID_2:
+ data = KSZ8895_ID_LO;
+ break;
+ case PHY_REG_AUTO_NEGOTIATION:
+ ksz_pread8(sw, p, P_LOCAL_CTRL, &ctrl);
+ data = PHY_AUTO_NEG_802_3;
+ if (ctrl & PORT_AUTO_NEG_SYM_PAUSE)
+ data |= PHY_AUTO_NEG_SYM_PAUSE;
+ if (ctrl & PORT_AUTO_NEG_100BTX_FD)
+ data |= PHY_AUTO_NEG_100BTX_FD;
+ if (ctrl & PORT_AUTO_NEG_100BTX)
+ data |= PHY_AUTO_NEG_100BTX;
+ if (ctrl & PORT_AUTO_NEG_10BT_FD)
+ data |= PHY_AUTO_NEG_10BT_FD;
+ if (ctrl & PORT_AUTO_NEG_10BT)
+ data |= PHY_AUTO_NEG_10BT;
+ break;
+ case PHY_REG_REMOTE_CAPABILITY:
+ ksz_pread8(sw, p, P_REMOTE_STATUS, &link);
+ /*
+ * The PHY_AUTO_NEG_* constants are used for the partner's abilities
+ * as well; the PHY_REMOTE_* ones have the same bit positions.
+ */
+ data = PHY_AUTO_NEG_802_3;
+ if (link & PORT_REMOTE_SYM_PAUSE)
+ data |= PHY_AUTO_NEG_SYM_PAUSE;
+ if (link & PORT_REMOTE_100BTX_FD)
+ data |= PHY_AUTO_NEG_100BTX_FD;
+ if (link & PORT_REMOTE_100BTX)
+ data |= PHY_AUTO_NEG_100BTX;
+ if (link & PORT_REMOTE_10BT_FD)
+ data |= PHY_AUTO_NEG_10BT_FD;
+ if (link & PORT_REMOTE_10BT)
+ data |= PHY_AUTO_NEG_10BT;
+ break;
+ default:
+ break;
+ }
+ *val = data;
+} /* sw_r_phy */
+
+/**
+ * sw_w_phy - write an emulated PHY register
+ * @sw: The switch instance.
+ * @phy: PHY address to write; used as the port number for the port registers.
+ * @reg: PHY register to write.
+ * @val: Word data to write.
+ *
+ * This routine translates a write to PHY register @reg into updates of the
+ * per-port control registers of the switch. Each port register is read
+ * first and written back only when its value changes. Only the control and
+ * auto-negotiation registers are handled; other writes are ignored.
+ */
+static void sw_w_phy(struct ksz_device *sw, u16 phy, u16 reg, u16 val)
+{
+ u8 ctrl;
+ u8 restart;
+ u8 speed;
+ u8 data;
+ u8 p = phy;
+
+ switch (reg) {
+ case PHY_REG_CTRL:
+ /* The control word is spread over three port registers. */
+ ksz_pread8(sw, p, P_SPEED_STATUS, &speed);
+ data = speed;
+ if (val & PHY_HP_MDIX)
+ data |= PORT_HP_MDIX;
+ else
+ data &= ~PORT_HP_MDIX;
+ if (data != speed)
+ ksz_pwrite8(sw, p, P_SPEED_STATUS, data);
+ ksz_pread8(sw, p, P_FORCE_CTRL, &ctrl);
+ data = ctrl;
+ /* The port bit disables auto-negotiation, the PHY bit enables it. */
+ if (!(val & PHY_AUTO_NEG_ENABLE))
+ data |= PORT_AUTO_NEG_DISABLE;
+ else
+ data &= ~PORT_AUTO_NEG_DISABLE;
+ if (val & PHY_SPEED_100MBIT)
+ data |= PORT_FORCE_100_MBIT;
+ else
+ data &= ~PORT_FORCE_100_MBIT;
+ if (val & PHY_FULL_DUPLEX)
+ data |= PORT_FORCE_FULL_DUPLEX;
+ else
+ data &= ~PORT_FORCE_FULL_DUPLEX;
+ if (data != ctrl)
+ ksz_pwrite8(sw, p, P_FORCE_CTRL, data);
+ ksz_pread8(sw, p, P_NEG_RESTART_CTRL, &restart);
+ data = restart;
+ if (val & PHY_LED_DISABLE)
+ data |= PORT_LED_OFF;
+ else
+ data &= ~PORT_LED_OFF;
+ if (val & PHY_TRANSMIT_DISABLE)
+ data |= PORT_TX_DISABLE;
+ else
+ data &= ~PORT_TX_DISABLE;
+ if (val & PHY_AUTO_NEG_RESTART)
+ data |= PORT_AUTO_NEG_RESTART;
+ else
+ data &= ~(PORT_AUTO_NEG_RESTART);
+ if (val & PHY_POWER_DOWN)
+ data |= PORT_POWER_DOWN;
+ else
+ data &= ~PORT_POWER_DOWN;
+ if (val & PHY_AUTO_MDIX_DISABLE)
+ data |= PORT_AUTO_MDIX_DISABLE;
+ else
+ data &= ~PORT_AUTO_MDIX_DISABLE;
+ if (val & PHY_FORCE_MDIX)
+ data |= PORT_FORCE_MDIX;
+ else
+ data &= ~PORT_FORCE_MDIX;
+ if (val & PHY_LOOPBACK)
+ data |= PORT_PHY_LOOPBACK;
+ else
+ data &= ~PORT_PHY_LOOPBACK;
+ if (data != restart)
+ ksz_pwrite8(sw, p, P_NEG_RESTART_CTRL, data);
+ break;
+ case PHY_REG_AUTO_NEGOTIATION:
+ ksz_pread8(sw, p, P_LOCAL_CTRL, &ctrl);
+ data = ctrl;
+ /* Clear all advertisement bits, then set the requested ones. */
+ data &= ~(PORT_AUTO_NEG_SYM_PAUSE |
+ PORT_AUTO_NEG_100BTX_FD |
+ PORT_AUTO_NEG_100BTX |
+ PORT_AUTO_NEG_10BT_FD |
+ PORT_AUTO_NEG_10BT);
+ if (val & PHY_AUTO_NEG_SYM_PAUSE)
+ data |= PORT_AUTO_NEG_SYM_PAUSE;
+ if (val & PHY_AUTO_NEG_100BTX_FD)
+ data |= PORT_AUTO_NEG_100BTX_FD;
+ if (val & PHY_AUTO_NEG_100BTX)
+ data |= PORT_AUTO_NEG_100BTX;
+ if (val & PHY_AUTO_NEG_10BT_FD)
+ data |= PORT_AUTO_NEG_10BT_FD;
+ if (val & PHY_AUTO_NEG_10BT)
+ data |= PORT_AUTO_NEG_10BT;
+ if (data != ctrl)
+ ksz_pwrite8(sw, p, P_LOCAL_CTRL, data);
+ break;
+ default:
+ break;
+ }
+} /* sw_w_phy */
+
+/*
+ * Split an encoded register number into its three fields.
+ *
+ * On entry *@reg holds the encoded value. On return *@reg is reduced to its
+ * low 12 bits and *@bank holds bits 13..12 shifted down by BANK_SHIFT. The
+ * return value is bits 15..14 shifted down by ADDR_SHIFT.
+ */
+static int ksz_mii_addr(int *reg, int *bank)
+{
+	const int raw = *reg;
+
+	*bank = (raw & 0x3000) >> BANK_SHIFT;
+	*reg = raw & 0x0FFF;
+
+	return (raw & 0xC000) >> ADDR_SHIFT;
+}
+
+/*
+ * Emulated PHY register read for port @phy_id.
+ *
+ * Returns the register value, 0 for PHY registers 6 and above, or 0xffff
+ * when @phy_id is out of range or @regnum encodes an access type that is not
+ * emulated. Unsupported access types trigger a one-time warning instead of
+ * crashing the kernel.
+ */
+static int ksz_phy_read16(struct dsa_switch *ds, int phy_id, int regnum)
+{
+	struct ksz_device *sw = ds->priv;
+	int addr;
+	int bank;
+	u16 data;
+
+	if (phy_id > SWITCH_PORT_NUM + 1)
+		return 0xffff;
+
+	addr = ksz_mii_addr(&regnum, &bank);
+	if (WARN_ON_ONCE(addr >= 6))
+		return 0xffff;
+
+	switch (addr) {
+	case ADDR_8:
+	case ADDR_16:
+	case ADDR_32:
+		/* Direct switch register access is not emulated. */
+		WARN_ON_ONCE(1);
+		return 0xffff;
+	default:
+		break;
+	}
+
+	/* Only PHY registers 0..5 are emulated; the rest read as zero. */
+	if (regnum >= 6)
+		return 0;
+
+	sw_r_phy(sw, phy_id, regnum, &data);
+
+	return data;
+} /* ksz_phy_read16 */
+
+/*
+ * Emulated PHY register write for port @phy_id.
+ *
+ * Writes to PHY registers 6 and above are ignored, as are writes to the
+ * control register that request a reset or power-down. Returns 0, or
+ * -EINVAL when @phy_id is out of range or @regnum encodes an access type
+ * that is not emulated (with a one-time warning).
+ */
+static int ksz_phy_write16(struct dsa_switch *ds, int phy_id, int regnum, u16 val)
+{
+	struct ksz_device *sw = ds->priv;
+	int addr;
+	int bank;
+
+	if (phy_id > SWITCH_PORT_NUM + 1)
+		return -EINVAL;
+
+	addr = ksz_mii_addr(&regnum, &bank);
+
+	switch (addr) {
+	case ADDR_8:
+	case ADDR_16:
+	case ADDR_32:
+		/* Direct switch register access is not emulated. */
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	default:
+		break;
+	}
+
+	/* Only PHY registers 0..5 are emulated; ignore everything else. */
+	if (regnum >= 6)
+		return 0;
+
+	/* PHY device driver resets or powers down the PHY. */
+	if (regnum == 0 && (val & (PHY_RESET | PHY_POWER_DOWN)))
+		return 0;
+
+	sw_w_phy(sw, phy_id, regnum, val);
+
+	return 0;
+} /* ksz_phy_write16 */
diff --git a/drivers/net/dsa/microchip/ksz_priv.h b/drivers/net/dsa/microchip/ksz_priv.h
index 2a98dbd51456..1c73cdb8bbca 100644
--- a/drivers/net/dsa/microchip/ksz_priv.h
+++ b/drivers/net/dsa/microchip/ksz_priv.h
@@ -25,8 +25,6 @@
#include <linux/etherdevice.h>
#include <net/dsa.h>
-#include "ksz_9477_reg.h"
-
struct ksz_io_ops;
struct vlan_table {
@@ -60,6 +58,8 @@ struct ksz_device {
struct vlan_table *vlan_cache;
u64 mib_value[TOTAL_SWITCH_COUNTER_NUM];
+
+ struct bin_attribute regs_attr;
};
struct ksz_io_ops {
@@ -174,6 +174,7 @@ static inline int ksz_write32(struct ksz_device *dev, u32 reg, u32 value)
static inline void ksz_pread8(struct ksz_device *dev, int port, int offset,
u8 *data)
{
+ //printk("pread8 %d %d -> %d\n", port, offset, PORT_CTRL_ADDR(port, offset));
ksz_read8(dev, PORT_CTRL_ADDR(port, offset), data);
}
diff --git a/drivers/net/dsa/microchip/ksz_spi.c b/drivers/net/dsa/microchip/ksz_spi.c
index c51946983bed..22ee313052dc 100644
--- a/drivers/net/dsa/microchip/ksz_spi.c
+++ b/drivers/net/dsa/microchip/ksz_spi.c
@@ -23,6 +23,7 @@
#include <linux/module.h>
#include <linux/spi/spi.h>
+#include "ksz_9477_reg.h"
#include "ksz_priv.h"
/* SPI frame opcodes */
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index de66ca8e6201..6eb094d5bb02 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -29,7 +29,7 @@
* (eg, 0x00=port1, 0x02=port3, 0x06=port7)
*/
-#define KSZ_INGRESS_TAG_LEN 2
+#define KSZ_INGRESS_TAG_LEN 1
#define KSZ_EGRESS_TAG_LEN 1
static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -69,8 +69,7 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
}
tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN);
- tag[0] = 0;
- tag[1] = 1 << p->dp->index; /* destination port */
+ tag[0] = 1 << p->dp->index; /* destination port */
return nskb;
}
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]
^ permalink raw reply related
* [PATCH net-next 2/4] net/mlx5: Add SRIOV VGT+ support
From: Saeed Mahameed @ 2017-08-27 11:06 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Eugenia Emantayev, Mohamad Haj Yahia, Saeed Mahameed
In-Reply-To: <20170827110618.20599-1-saeedm@mellanox.com>
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Implementing the VGT+ feature via acl tables.
The acl tables will hold the actual needed rules which is only the
intersection of the requested vlan-ids list and the allowed vlan-ids
list from the administrator.
Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 28 ++
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 496 +++++++++++++++++-----
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 31 +-
drivers/net/ethernet/mellanox/mlx5/core/vport.c | 19 +-
include/linux/mlx5/vport.h | 6 +-
5 files changed, 458 insertions(+), 122 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index fdc2b92f020b..1a2ebe0e79ae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3388,6 +3388,32 @@ static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos,
vlan, qos);
}
+/*
+ * Add the VLAN range @start_vid..@end_vid to the trunk of VF @vf.
+ *
+ * Only 802.1Q is accepted; any other @vlan_proto fails with
+ * -EPROTONOSUPPORT. The request is handed to the e-switch with vf + 1 as
+ * the vport number.
+ */
+static int mlx5e_add_vf_vlan_trunk_range(struct net_device *dev, int vf,
+					 u16 start_vid, u16 end_vid,
+					 __be16 vlan_proto)
+{
+	struct mlx5e_priv *priv;
+
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
+	priv = netdev_priv(dev);
+
+	return mlx5_eswitch_add_vport_trunk_range(priv->mdev->priv.eswitch,
+						  vf + 1, start_vid, end_vid);
+}
+
+/*
+ * Remove the VLAN range @start_vid..@end_vid from the trunk of VF @vf.
+ *
+ * Only 802.1Q is accepted; any other @vlan_proto fails with
+ * -EPROTONOSUPPORT. The request is handed to the e-switch with vf + 1 as
+ * the vport number.
+ */
+static int mlx5e_del_vf_vlan_trunk_range(struct net_device *dev, int vf,
+					 u16 start_vid, u16 end_vid,
+					 __be16 vlan_proto)
+{
+	struct mlx5e_priv *priv;
+
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
+	priv = netdev_priv(dev);
+
+	return mlx5_eswitch_del_vport_trunk_range(priv->mdev->priv.eswitch,
+						  vf + 1, start_vid, end_vid);
+}
+
static int mlx5e_set_vf_spoofchk(struct net_device *dev, int vf, bool setting)
{
struct mlx5e_priv *priv = netdev_priv(dev);
@@ -3733,6 +3759,8 @@ static const struct net_device_ops mlx5e_netdev_ops = {
/* SRIOV E-Switch NDOs */
.ndo_set_vf_mac = mlx5e_set_vf_mac,
.ndo_set_vf_vlan = mlx5e_set_vf_vlan,
+ .ndo_add_vf_vlan_trunk_range = mlx5e_add_vf_vlan_trunk_range,
+ .ndo_del_vf_vlan_trunk_range = mlx5e_del_vf_vlan_trunk_range,
.ndo_set_vf_spoofchk = mlx5e_set_vf_spoofchk,
.ndo_set_vf_trust = mlx5e_set_vf_trust,
.ndo_set_vf_rate = mlx5e_set_vf_rate,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 6b84c1113301..a8e8670c7c8d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -60,12 +60,14 @@ struct vport_addr {
enum {
UC_ADDR_CHANGE = BIT(0),
MC_ADDR_CHANGE = BIT(1),
+ VLAN_CHANGE = BIT(2),
PROMISC_CHANGE = BIT(3),
};
/* Vport context events */
#define SRIOV_VPORT_EVENTS (UC_ADDR_CHANGE | \
MC_ADDR_CHANGE | \
+ VLAN_CHANGE | \
PROMISC_CHANGE)
static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport,
@@ -681,6 +683,45 @@ static void esw_update_vport_addr_list(struct mlx5_eswitch *esw,
kfree(mac_list);
}
+/*
+ * Recompute the vport's ACL VLAN bitmap as the intersection (bitwise AND)
+ * of the VLANs the vport requested and the VLANs in its 802.1Q trunk bitmap.
+ */
+static void esw_update_acl_trunk_bitmap(struct mlx5_eswitch *esw, u32 vport_num)
+{
+	struct mlx5_vport *vp = esw->vports + vport_num;
+
+	bitmap_and(vp->acl_vlan_8021q_bitmap,
+		   vp->req_vlan_bitmap,
+		   vp->info.vlan_trunk_8021q_bitmap,
+		   VLAN_N_VID);
+}
+
+static int esw_vport_egress_config(struct mlx5_eswitch *esw,
+ struct mlx5_vport *vport);
+static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
+ struct mlx5_vport *vport);
+
+/*
+ * Sync the vport's requested VLAN list from the vport context.
+ *
+ * The requested bitmap is cleared and, for an enabled vport, refilled by
+ * querying the NIC vport. The ACL bitmap and the egress/ingress ACL
+ * configuration are refreshed only when the list actually changed.
+ */
+static void esw_update_vport_vlan_list(struct mlx5_eswitch *esw, u32 vport_num)
+{
+	struct mlx5_vport *vp = &esw->vports[vport_num];
+	DECLARE_BITMAP(delta, VLAN_N_VID);
+
+	/* Keep the previous request for comparison, then start from empty. */
+	bitmap_copy(delta, vp->req_vlan_bitmap, VLAN_N_VID);
+	bitmap_zero(vp->req_vlan_bitmap, VLAN_N_VID);
+
+	if (!vp->enabled)
+		return;
+
+	if (mlx5_query_nic_vport_vlans(esw->dev, vport_num, vp->req_vlan_bitmap))
+		return;
+
+	/* XOR leaves a bit set for every VLAN that was added or removed. */
+	bitmap_xor(delta, delta, vp->req_vlan_bitmap, VLAN_N_VID);
+	if (bitmap_weight(delta, VLAN_N_VID) == 0)
+		return;
+
+	esw_update_acl_trunk_bitmap(esw, vport_num);
+	esw_vport_egress_config(esw, vp);
+	esw_vport_ingress_config(esw, vp);
+}
+
/* Sync vport UC/MC list from vport context
* Must be called after esw_update_vport_addr_list
*/
@@ -812,6 +853,9 @@ static void esw_vport_change_handle_locked(struct mlx5_vport *vport)
MLX5_NVPRT_LIST_TYPE_MC);
}
+ if (vport->enabled_events & VLAN_CHANGE)
+ esw_update_vport_vlan_list(esw, vport->vport);
+
if (vport->enabled_events & PROMISC_CHANGE) {
esw_update_vport_rx_mode(esw, vport->vport);
if (!IS_ERR_OR_NULL(vport->allmulti_rule))
@@ -844,18 +888,20 @@ static int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw,
struct mlx5_vport *vport)
{
int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+ struct mlx5_flow_group *untagged_grp = NULL;
struct mlx5_flow_group *vlan_grp = NULL;
struct mlx5_flow_group *drop_grp = NULL;
struct mlx5_core_dev *dev = esw->dev;
struct mlx5_flow_namespace *root_ns;
struct mlx5_flow_table *acl;
+ /* The egress acl table contains 3 groups:
+ * 1)Allow tagged traffic with vlan_tag=vst_vlan_id/vgt+_vlan_id
+ * 2)Allow untagged traffic
+ * 3)Drop all other traffic
+ */
+ int table_size = VLAN_N_VID + 2;
void *match_criteria;
u32 *flow_group_in;
- /* The egress acl table contains 2 rules:
- * 1)Allow traffic with vlan_tag=vst_vlan_id
- * 2)Drop all other traffic.
- */
- int table_size = 2;
int err = 0;
if (!MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support))
@@ -887,11 +933,26 @@ static int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw,
MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
+
+ /* Create flow group for allowed untagged flow rule */
MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
- MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid);
MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0);
+ untagged_grp = mlx5_create_flow_group(acl, flow_group_in);
+ if (IS_ERR(untagged_grp)) {
+ err = PTR_ERR(untagged_grp);
+ esw_warn(dev, "Failed to create E-Switch vport[%d] egress untagged flow group, err(%d)\n",
+ vport->vport, err);
+ goto out;
+ }
+
+ /* Create flow group for allowed tagged flow rules */
+ MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
+ MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid);
+ MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1);
+ MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, VLAN_N_VID);
+
vlan_grp = mlx5_create_flow_group(acl, flow_group_in);
if (IS_ERR(vlan_grp)) {
err = PTR_ERR(vlan_grp);
@@ -900,9 +961,10 @@ static int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw,
goto out;
}
+ /* Create flow group for drop rule */
memset(flow_group_in, 0, inlen);
- MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1);
- MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1);
+ MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, VLAN_N_VID + 1);
+ MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, VLAN_N_VID + 1);
drop_grp = mlx5_create_flow_group(acl, flow_group_in);
if (IS_ERR(drop_grp)) {
err = PTR_ERR(drop_grp);
@@ -914,25 +976,45 @@ static int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw,
vport->egress.acl = acl;
vport->egress.drop_grp = drop_grp;
vport->egress.allowed_vlans_grp = vlan_grp;
+ vport->egress.allow_untagged_grp = untagged_grp;
+
out:
+ if (err) {
+ if (!IS_ERR_OR_NULL(vlan_grp))
+ mlx5_destroy_flow_group(vlan_grp);
+ if (!IS_ERR_OR_NULL(untagged_grp))
+ mlx5_destroy_flow_group(untagged_grp);
+ if (!IS_ERR_OR_NULL(acl))
+ mlx5_destroy_flow_table(acl);
+ }
+
kvfree(flow_group_in);
- if (err && !IS_ERR_OR_NULL(vlan_grp))
- mlx5_destroy_flow_group(vlan_grp);
- if (err && !IS_ERR_OR_NULL(acl))
- mlx5_destroy_flow_table(acl);
return err;
}
static void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw,
struct mlx5_vport *vport)
{
- if (!IS_ERR_OR_NULL(vport->egress.allowed_vlan))
- mlx5_del_flow_rules(vport->egress.allowed_vlan);
+ struct mlx5_acl_vlan *trunk_vlan_rule, *tmp;
+
+ if (!IS_ERR_OR_NULL(vport->egress.allowed_vst_vlan))
+ mlx5_del_flow_rules(vport->egress.allowed_vst_vlan);
+
+ list_for_each_entry_safe(trunk_vlan_rule, tmp,
+ &vport->egress.allowed_vlans_rules, list) {
+ mlx5_del_flow_rules(trunk_vlan_rule->acl_vlan_rule);
+ list_del(&trunk_vlan_rule->list);
+ kfree(trunk_vlan_rule);
+ }
if (!IS_ERR_OR_NULL(vport->egress.drop_rule))
mlx5_del_flow_rules(vport->egress.drop_rule);
- vport->egress.allowed_vlan = NULL;
+ if (!IS_ERR_OR_NULL(vport->egress.allow_untagged_rule))
+ mlx5_del_flow_rules(vport->egress.allow_untagged_rule);
+
+ vport->egress.allow_untagged_rule = NULL;
+ vport->egress.allowed_vst_vlan = NULL;
vport->egress.drop_rule = NULL;
}
@@ -945,9 +1027,12 @@ static void esw_vport_disable_egress_acl(struct mlx5_eswitch *esw,
esw_debug(esw->dev, "Destroy vport[%d] E-Switch egress ACL\n", vport->vport);
esw_vport_cleanup_egress_rules(esw, vport);
+ mlx5_destroy_flow_group(vport->egress.allow_untagged_grp);
mlx5_destroy_flow_group(vport->egress.allowed_vlans_grp);
mlx5_destroy_flow_group(vport->egress.drop_grp);
mlx5_destroy_flow_table(vport->egress.acl);
+
+ vport->egress.allow_untagged_grp = NULL;
vport->egress.allowed_vlans_grp = NULL;
vport->egress.drop_grp = NULL;
vport->egress.acl = NULL;
@@ -956,11 +1041,15 @@ static void esw_vport_disable_egress_acl(struct mlx5_eswitch *esw,
static int esw_vport_enable_ingress_acl(struct mlx5_eswitch *esw,
struct mlx5_vport *vport)
{
+ bool need_vlan_filter = !!bitmap_weight(vport->info.vlan_trunk_8021q_bitmap,
+ VLAN_N_VID);
int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+ struct mlx5_flow_group *untagged_spoof_grp = NULL;
+ struct mlx5_flow_group *tagged_spoof_grp = NULL;
+ struct mlx5_flow_group *drop_grp = NULL;
struct mlx5_core_dev *dev = esw->dev;
struct mlx5_flow_namespace *root_ns;
struct mlx5_flow_table *acl;
- struct mlx5_flow_group *g;
void *match_criteria;
u32 *flow_group_in;
/* The ingress acl table contains 4 groups
@@ -969,10 +1058,11 @@ static int esw_vport_enable_ingress_acl(struct mlx5_eswitch *esw,
* 1 drop rule from the last group):
* 1)Allow untagged traffic with smac=original mac.
* 2)Allow untagged traffic.
- * 3)Allow traffic with smac=original mac.
+ * 3)Allow tagged traffic with smac=original mac.
* 4)Drop all other traffic.
*/
- int table_size = 4;
+ int table_size = need_vlan_filter ? 8192 : 4;
+ int allow_grp_sz = 1;
int err = 0;
if (!MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support))
@@ -1006,76 +1096,71 @@ static int esw_vport_enable_ingress_acl(struct mlx5_eswitch *esw,
match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
- MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
- MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16);
- MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0);
+
+ if (vport->info.vlan || vport->info.qos || need_vlan_filter)
+ MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
+
+ if (vport->info.spoofchk) {
+ MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16);
+ MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0);
+ }
+
MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0);
- g = mlx5_create_flow_group(acl, flow_group_in);
- if (IS_ERR(g)) {
- err = PTR_ERR(g);
+ untagged_spoof_grp = mlx5_create_flow_group(acl, flow_group_in);
+ if (IS_ERR(untagged_spoof_grp)) {
+ err = PTR_ERR(untagged_spoof_grp);
esw_warn(dev, "Failed to create E-Switch vport[%d] ingress untagged spoofchk flow group, err(%d)\n",
vport->vport, err);
goto out;
}
- vport->ingress.allow_untagged_spoofchk_grp = g;
+
+ if (!need_vlan_filter)
+ goto drop_grp;
memset(flow_group_in, 0, inlen);
MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+ if (vport->info.spoofchk) {
+ MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16);
+ MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0);
+ }
MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
+ MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid);
MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1);
- MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1);
+ MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, VLAN_N_VID);
+ allow_grp_sz = VLAN_N_VID + 1;
- g = mlx5_create_flow_group(acl, flow_group_in);
- if (IS_ERR(g)) {
- err = PTR_ERR(g);
- esw_warn(dev, "Failed to create E-Switch vport[%d] ingress untagged flow group, err(%d)\n",
- vport->vport, err);
- goto out;
- }
- vport->ingress.allow_untagged_only_grp = g;
-
- memset(flow_group_in, 0, inlen);
- MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
- MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16);
- MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0);
- MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 2);
- MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 2);
-
- g = mlx5_create_flow_group(acl, flow_group_in);
- if (IS_ERR(g)) {
- err = PTR_ERR(g);
+ tagged_spoof_grp = mlx5_create_flow_group(acl, flow_group_in);
+ if (IS_ERR(tagged_spoof_grp)) {
+ err = PTR_ERR(tagged_spoof_grp);
esw_warn(dev, "Failed to create E-Switch vport[%d] ingress spoofchk flow group, err(%d)\n",
vport->vport, err);
goto out;
}
- vport->ingress.allow_spoofchk_only_grp = g;
+drop_grp:
memset(flow_group_in, 0, inlen);
- MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 3);
- MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 3);
+ MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, allow_grp_sz);
+ MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, allow_grp_sz);
- g = mlx5_create_flow_group(acl, flow_group_in);
- if (IS_ERR(g)) {
- err = PTR_ERR(g);
+ drop_grp = mlx5_create_flow_group(acl, flow_group_in);
+ if (IS_ERR(drop_grp)) {
+ err = PTR_ERR(drop_grp);
esw_warn(dev, "Failed to create E-Switch vport[%d] ingress drop flow group, err(%d)\n",
vport->vport, err);
goto out;
}
- vport->ingress.drop_grp = g;
+ vport->ingress.allow_untagged_spoofchk_grp = untagged_spoof_grp;
+ vport->ingress.allow_tagged_spoofchk_grp = tagged_spoof_grp;
+ vport->ingress.drop_grp = drop_grp;
out:
if (err) {
- if (!IS_ERR_OR_NULL(vport->ingress.allow_spoofchk_only_grp))
- mlx5_destroy_flow_group(
- vport->ingress.allow_spoofchk_only_grp);
- if (!IS_ERR_OR_NULL(vport->ingress.allow_untagged_only_grp))
- mlx5_destroy_flow_group(
- vport->ingress.allow_untagged_only_grp);
- if (!IS_ERR_OR_NULL(vport->ingress.allow_untagged_spoofchk_grp))
- mlx5_destroy_flow_group(
- vport->ingress.allow_untagged_spoofchk_grp);
+ if (!IS_ERR_OR_NULL(tagged_spoof_grp))
+ mlx5_destroy_flow_group(tagged_spoof_grp);
+ if (!IS_ERR_OR_NULL(untagged_spoof_grp))
+ mlx5_destroy_flow_group(untagged_spoof_grp);
if (!IS_ERR_OR_NULL(vport->ingress.acl))
mlx5_destroy_flow_table(vport->ingress.acl);
}
@@ -1087,14 +1172,23 @@ static int esw_vport_enable_ingress_acl(struct mlx5_eswitch *esw,
static void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw,
struct mlx5_vport *vport)
{
+ struct mlx5_acl_vlan *trunk_vlan_rule, *tmp;
+
if (!IS_ERR_OR_NULL(vport->ingress.drop_rule))
mlx5_del_flow_rules(vport->ingress.drop_rule);
- if (!IS_ERR_OR_NULL(vport->ingress.allow_rule))
- mlx5_del_flow_rules(vport->ingress.allow_rule);
+ list_for_each_entry_safe(trunk_vlan_rule, tmp,
+ &vport->ingress.allowed_vlans_rules, list) {
+ mlx5_del_flow_rules(trunk_vlan_rule->acl_vlan_rule);
+ list_del(&trunk_vlan_rule->list);
+ kfree(trunk_vlan_rule);
+ }
+
+ if (!IS_ERR_OR_NULL(vport->ingress.allow_untagged_rule))
+ mlx5_del_flow_rules(vport->ingress.allow_untagged_rule);
vport->ingress.drop_rule = NULL;
- vport->ingress.allow_rule = NULL;
+ vport->ingress.allow_untagged_rule = NULL;
}
static void esw_vport_disable_ingress_acl(struct mlx5_eswitch *esw,
@@ -1106,23 +1200,32 @@ static void esw_vport_disable_ingress_acl(struct mlx5_eswitch *esw,
esw_debug(esw->dev, "Destroy vport[%d] E-Switch ingress ACL\n", vport->vport);
esw_vport_cleanup_ingress_rules(esw, vport);
- mlx5_destroy_flow_group(vport->ingress.allow_spoofchk_only_grp);
- mlx5_destroy_flow_group(vport->ingress.allow_untagged_only_grp);
- mlx5_destroy_flow_group(vport->ingress.allow_untagged_spoofchk_grp);
- mlx5_destroy_flow_group(vport->ingress.drop_grp);
+ if (!IS_ERR_OR_NULL(vport->ingress.allow_tagged_spoofchk_grp))
+ mlx5_destroy_flow_group(vport->ingress.allow_tagged_spoofchk_grp);
+
+ if (!IS_ERR_OR_NULL(vport->ingress.allow_untagged_spoofchk_grp))
+ mlx5_destroy_flow_group(vport->ingress.allow_untagged_spoofchk_grp);
+
+ if (!IS_ERR_OR_NULL(vport->ingress.drop_grp))
+ mlx5_destroy_flow_group(vport->ingress.drop_grp);
+
mlx5_destroy_flow_table(vport->ingress.acl);
vport->ingress.acl = NULL;
vport->ingress.drop_grp = NULL;
- vport->ingress.allow_spoofchk_only_grp = NULL;
- vport->ingress.allow_untagged_only_grp = NULL;
+ vport->ingress.allow_tagged_spoofchk_grp = NULL;
vport->ingress.allow_untagged_spoofchk_grp = NULL;
}
static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
struct mlx5_vport *vport)
{
+ bool need_vlan_filter = !!bitmap_weight(vport->info.vlan_trunk_8021q_bitmap,
+ VLAN_N_VID);
+ struct mlx5_acl_vlan *trunk_vlan_rule;
struct mlx5_flow_act flow_act = {0};
struct mlx5_flow_spec *spec;
+ bool need_acl_table = true;
+ u16 vlan_id = 0;
int err = 0;
u8 *smac_v;
@@ -1133,9 +1236,19 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
return -EPERM;
}
+ if ((vport->info.vlan || vport->info.qos) && need_vlan_filter) {
+ mlx5_core_warn(esw->dev,
+ "vport[%d] configure ingress rules failed, Cannot enable both VGT+ and VST\n",
+ vport->vport);
+ return -EPERM;
+ }
+
+ need_acl_table = vport->info.vlan || vport->info.qos || vport->info.spoofchk
+ || need_vlan_filter;
+
esw_vport_cleanup_ingress_rules(esw, vport);
- if (!vport->info.vlan && !vport->info.qos && !vport->info.spoofchk) {
+ if (!need_acl_table) {
esw_vport_disable_ingress_acl(esw, vport);
return 0;
}
@@ -1158,7 +1271,10 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
goto out;
}
- if (vport->info.vlan || vport->info.qos)
+ spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+ flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+
+ if (vport->info.vlan || vport->info.qos || need_vlan_filter)
MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag);
if (vport->info.spoofchk) {
@@ -1170,20 +1286,53 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
ether_addr_copy(smac_v, vport->info.mac);
}
- spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
- flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW;
- vport->ingress.allow_rule =
- mlx5_add_flow_rules(vport->ingress.acl, spec,
- &flow_act, NULL, 0);
- if (IS_ERR(vport->ingress.allow_rule)) {
- err = PTR_ERR(vport->ingress.allow_rule);
- esw_warn(esw->dev,
- "vport[%d] configure ingress allow rule, err(%d)\n",
- vport->vport, err);
- vport->ingress.allow_rule = NULL;
- goto out;
+ /* Allow untagged */
+ if (!need_vlan_filter ||
+ (need_vlan_filter && test_bit(0, vport->info.vlan_trunk_8021q_bitmap))) {
+ vport->ingress.allow_untagged_rule =
+ mlx5_add_flow_rules(vport->ingress.acl, spec,
+ &flow_act, NULL, 0);
+ if (IS_ERR(vport->ingress.allow_untagged_rule)) {
+ err = PTR_ERR(vport->ingress.allow_untagged_rule);
+ esw_warn(esw->dev,
+ "vport[%d] configure ingress allow rule, err(%d)\n",
+ vport->vport, err);
+ vport->ingress.allow_untagged_rule = NULL;
+ goto out;
+ }
+ }
+
+ if (!need_vlan_filter)
+ goto drop_rule;
+
+ MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag);
+ MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag);
+ MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid);
+
+ /* VGT+ rules */
+ for_each_set_bit(vlan_id, vport->acl_vlan_8021q_bitmap, VLAN_N_VID) {
+ trunk_vlan_rule = kzalloc(sizeof(*trunk_vlan_rule), GFP_KERNEL);
+ if (!trunk_vlan_rule) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid,
+ vlan_id);
+ trunk_vlan_rule->acl_vlan_rule =
+ mlx5_add_flow_rules(vport->ingress.acl, spec, &flow_act, NULL, 0);
+ if (IS_ERR(trunk_vlan_rule->acl_vlan_rule)) {
+ err = PTR_ERR(trunk_vlan_rule->acl_vlan_rule);
+ esw_warn(esw->dev,
+ "vport[%d] configure ingress allowed vlan rule failed, err(%d)\n",
+ vport->vport, err);
+ trunk_vlan_rule->acl_vlan_rule = NULL;
+ goto out;
+ }
+ list_add(&trunk_vlan_rule->list, &vport->ingress.allowed_vlans_rules);
}
+drop_rule:
memset(spec, 0, sizeof(*spec));
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
vport->ingress.drop_rule =
@@ -1208,13 +1357,19 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
static int esw_vport_egress_config(struct mlx5_eswitch *esw,
struct mlx5_vport *vport)
{
+ bool need_vlan_filter = !!bitmap_weight(vport->info.vlan_trunk_8021q_bitmap,
+ VLAN_N_VID);
+ bool need_acl_table = vport->info.vlan || vport->info.qos ||
+ need_vlan_filter;
+ struct mlx5_acl_vlan *trunk_vlan_rule;
struct mlx5_flow_act flow_act = {0};
struct mlx5_flow_spec *spec;
+ u16 vlan_id = 0;
int err = 0;
esw_vport_cleanup_egress_rules(esw, vport);
- if (!vport->info.vlan && !vport->info.qos) {
+ if (!need_acl_table) {
esw_vport_disable_egress_acl(esw, vport);
return 0;
}
@@ -1237,24 +1392,66 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
goto out;
}
- /* Allowed vlan rule */
MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag);
+ spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+ flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+
+ /* Allow untagged */
+ if (need_vlan_filter && test_bit(0, vport->info.vlan_trunk_8021q_bitmap)) {
+ vport->egress.allow_untagged_rule =
+ mlx5_add_flow_rules(vport->egress.acl, spec,
+ &flow_act, NULL, 0);
+ if (IS_ERR(vport->egress.allow_untagged_rule)) {
+ err = PTR_ERR(vport->egress.allow_untagged_rule);
+ esw_warn(esw->dev,
+ "vport[%d] configure egress allow rule, err(%d)\n",
+ vport->vport, err);
+ vport->egress.allow_untagged_rule = NULL;
+ }
+ }
+
+ /* Allowed vlan rule */
MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag);
MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid);
- MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vport->info.vlan);
- spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
- flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW;
- vport->egress.allowed_vlan =
- mlx5_add_flow_rules(vport->egress.acl, spec,
- &flow_act, NULL, 0);
- if (IS_ERR(vport->egress.allowed_vlan)) {
- err = PTR_ERR(vport->egress.allowed_vlan);
- esw_warn(esw->dev,
- "vport[%d] configure egress allowed vlan rule failed, err(%d)\n",
- vport->vport, err);
- vport->egress.allowed_vlan = NULL;
- goto out;
+ /* VST rule */
+ if (vport->info.vlan || vport->info.qos) {
+ MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vport->info.vlan);
+
+ vport->egress.allowed_vst_vlan =
+ mlx5_add_flow_rules(vport->egress.acl, spec,
+ &flow_act, NULL, 0);
+ if (IS_ERR(vport->egress.allowed_vst_vlan)) {
+ err = PTR_ERR(vport->egress.allowed_vst_vlan);
+ esw_warn(esw->dev,
+ "vport[%d] configure egress allowed vlan rule failed, err(%d)\n",
+ vport->vport, err);
+ vport->egress.allowed_vst_vlan = NULL;
+ goto out;
+ }
+ }
+
+ /* VGT+ rules */
+ for_each_set_bit(vlan_id, vport->acl_vlan_8021q_bitmap, VLAN_N_VID) {
+ trunk_vlan_rule = kzalloc(sizeof(*trunk_vlan_rule), GFP_KERNEL);
+ if (!trunk_vlan_rule) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid,
+ vlan_id);
+ trunk_vlan_rule->acl_vlan_rule =
+ mlx5_add_flow_rules(vport->egress.acl, spec, &flow_act, NULL, 0);
+ if (IS_ERR(trunk_vlan_rule->acl_vlan_rule)) {
+ err = PTR_ERR(trunk_vlan_rule->acl_vlan_rule);
+ esw_warn(esw->dev,
+ "vport[%d] configure egress allowed vlan rule failed, err(%d)\n",
+ vport->vport, err);
+ trunk_vlan_rule->acl_vlan_rule = NULL;
+ goto out;
+ }
+ list_add(&trunk_vlan_rule->list, &vport->egress.allowed_vlans_rules);
}
/* Drop others rule (star rule) */
@@ -1271,6 +1468,8 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
vport->egress.drop_rule = NULL;
}
out:
+ if (err)
+ esw_vport_cleanup_egress_rules(esw, vport);
kvfree(spec);
return err;
}
@@ -1465,6 +1664,11 @@ static void esw_enable_vport(struct mlx5_eswitch *esw, int vport_num,
esw_debug(esw->dev, "Enabling VPORT(%d)\n", vport_num);
+ bitmap_zero(vport->req_vlan_bitmap, VLAN_N_VID);
+ bitmap_zero(vport->acl_vlan_8021q_bitmap, VLAN_N_VID);
+ bitmap_zero(vport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID);
+ INIT_LIST_HEAD(&vport->egress.allowed_vlans_rules);
+ INIT_LIST_HEAD(&vport->ingress.allowed_vlans_rules);
/* Restore old vport configuration */
esw_apply_vport_conf(esw, vport);
@@ -1824,6 +2028,8 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
ivi->trusted = evport->info.trusted;
ivi->min_tx_rate = evport->info.min_rate;
ivi->max_tx_rate = evport->info.max_rate;
+ bitmap_copy((unsigned long *)ivi->trunk_8021q,
+ evport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID);
mutex_unlock(&esw->state_lock);
return 0;
@@ -1843,6 +2049,14 @@ int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
mutex_lock(&esw->state_lock);
evport = &esw->vports[vport];
+ if (bitmap_weight(evport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID)) {
+ err = -EPERM;
+ mlx5_core_warn(esw->dev,
+ "VST is not allowed when operating in VGT+ mode vport(%d)\n",
+ vport);
+ goto unlock;
+ }
+
err = modify_esw_vport_cvlan(esw->dev, vport, vlan, qos, set_flags);
if (err)
goto unlock;
@@ -2018,6 +2232,90 @@ int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport,
return err;
}
+/* Apply a changed admin trunk bitmap to a vport.
+ *
+ * @old_trunk is the trunk bitmap as it was before the caller modified
+ * evport->info.vlan_trunk_8021q_bitmap. If the two are identical nothing
+ * is done. Otherwise the ACL trunk bitmap is recomputed and, for an
+ * enabled vport in SRIOV_LEGACY mode, the egress and ingress ACLs are
+ * reprogrammed. When reprogramming fails the old trunk is restored and
+ * the ACLs are configured again from it.
+ *
+ * Returns 0 on success or the error from the failing ACL configuration.
+ */
+static int mlx5_eswitch_update_vport_trunk(struct mlx5_eswitch *esw,
+					   struct mlx5_vport *evport,
+					   unsigned long *old_trunk)
+{
+	DECLARE_BITMAP(changed_vlans, VLAN_N_VID);
+	int err;
+
+	bitmap_xor(changed_vlans, old_trunk,
+		   evport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID);
+	if (bitmap_weight(changed_vlans, VLAN_N_VID) == 0)
+		return 0;
+
+	esw_update_acl_trunk_bitmap(esw, evport->vport);
+	if (!evport->enabled || esw->mode != SRIOV_LEGACY)
+		return 0;
+
+	err = esw_vport_egress_config(esw, evport);
+	if (err)
+		goto rollback;
+
+	err = esw_vport_ingress_config(esw, evport);
+	if (err)
+		goto rollback;
+
+	return 0;
+
+rollback:
+	/* Put the previous trunk back and rebuild both ACLs from it */
+	bitmap_copy(evport->info.vlan_trunk_8021q_bitmap, old_trunk, VLAN_N_VID);
+	esw_update_acl_trunk_bitmap(esw, evport->vport);
+	esw_vport_egress_config(esw, evport);
+	esw_vport_ingress_config(esw, evport);
+
+	return err;
+}
+
+/* Allow an inclusive range of VLAN ids on a vport's 802.1Q trunk (VGT+).
+ *
+ * Sets bits [start_vlan, end_vlan] in the vport's admin trunk bitmap and
+ * hands the previous bitmap to mlx5_eswitch_update_vport_trunk() so the
+ * vport ACLs follow the change.
+ *
+ * Returns 0 on success, -EPERM when ESW_ALLOWED() fails or the vport has a
+ * VST vlan/qos configured, -EINVAL for an illegal vport or VLAN range, or
+ * the error returned by the trunk update.
+ */
+int mlx5_eswitch_add_vport_trunk_range(struct mlx5_eswitch *esw,
+				       int vport, u16 start_vlan, u16 end_vlan)
+{
+	DECLARE_BITMAP(prev_vport_bitmap, VLAN_N_VID);
+	struct mlx5_vport *evport;
+	int err = 0;
+
+	if (!ESW_ALLOWED(esw))
+		return -EPERM;
+	/* Valid bit indexes are 0..VLAN_N_VID - 1; accepting end_vlan ==
+	 * VLAN_N_VID would let bitmap_set() write one bit past the bitmap.
+	 */
+	if (!LEGAL_VPORT(esw, vport) || end_vlan >= VLAN_N_VID || start_vlan > end_vlan)
+		return -EINVAL;
+
+	mutex_lock(&esw->state_lock);
+	evport = &esw->vports[vport];
+
+	if (evport->info.vlan || evport->info.qos) {
+		err = -EPERM;
+		mlx5_core_warn(esw->dev,
+			       "VGT+ is not allowed when operating in VST mode vport(%d)\n",
+			       vport);
+		goto unlock;
+	}
+
+	bitmap_copy(prev_vport_bitmap, evport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID);
+	bitmap_set(evport->info.vlan_trunk_8021q_bitmap, start_vlan,
+		   end_vlan - start_vlan + 1);
+	err = mlx5_eswitch_update_vport_trunk(esw, evport, prev_vport_bitmap);
+
+unlock:
+	mutex_unlock(&esw->state_lock);
+
+	return err;
+}
+
+/* Remove an inclusive range of VLAN ids from a vport's 802.1Q trunk (VGT+).
+ *
+ * Clears bits [start_vlan, end_vlan] in the vport's admin trunk bitmap and
+ * hands the previous bitmap to mlx5_eswitch_update_vport_trunk() so the
+ * vport ACLs follow the change.
+ *
+ * Returns 0 on success, -EPERM when ESW_ALLOWED() fails, -EINVAL for an
+ * illegal vport or VLAN range, or the error returned by the trunk update.
+ */
+int mlx5_eswitch_del_vport_trunk_range(struct mlx5_eswitch *esw,
+				       int vport, u16 start_vlan, u16 end_vlan)
+{
+	DECLARE_BITMAP(prev_vport_bitmap, VLAN_N_VID);
+	struct mlx5_vport *evport;
+	int err = 0;
+
+	if (!ESW_ALLOWED(esw))
+		return -EPERM;
+	/* Valid bit indexes are 0..VLAN_N_VID - 1; accepting end_vlan ==
+	 * VLAN_N_VID would let bitmap_clear() write one bit past the bitmap.
+	 */
+	if (!LEGAL_VPORT(esw, vport) || end_vlan >= VLAN_N_VID || start_vlan > end_vlan)
+		return -EINVAL;
+
+	mutex_lock(&esw->state_lock);
+	evport = &esw->vports[vport];
+	bitmap_copy(prev_vport_bitmap, evport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID);
+	bitmap_clear(evport->info.vlan_trunk_8021q_bitmap, start_vlan,
+		     end_vlan - start_vlan + 1);
+	err = mlx5_eswitch_update_vport_trunk(esw, evport, prev_vport_bitmap);
+	mutex_unlock(&esw->state_lock);
+
+	return err;
+}
+
int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
int vport,
struct ifla_vf_stats *vf_stats)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 565c8b7a399a..39ac2037b666 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -35,6 +35,8 @@
#include <linux/if_ether.h>
#include <linux/if_link.h>
+#include <linux/if_vlan.h>
+#include <linux/bitmap.h>
#include <net/devlink.h>
#include <linux/mlx5/device.h>
#include "lib/mpfs.h"
@@ -53,6 +55,9 @@ enum {
#define MLX5_MAX_MC_PER_VPORT(dev) \
(1 << MLX5_CAP_GEN(dev, log_max_current_mc_list))
+#define MLX5_MAX_VLAN_PER_VPORT(dev) \
+ (1 << MLX5_CAP_GEN(dev, log_max_vlan_list))
+
#define FDB_UPLINK_VPORT 0xffff
#define MLX5_MIN_BW_SHARE 1
@@ -63,19 +68,22 @@ enum {
struct vport_ingress {
struct mlx5_flow_table *acl;
struct mlx5_flow_group *allow_untagged_spoofchk_grp;
- struct mlx5_flow_group *allow_spoofchk_only_grp;
- struct mlx5_flow_group *allow_untagged_only_grp;
+ struct mlx5_flow_group *allow_tagged_spoofchk_grp;
struct mlx5_flow_group *drop_grp;
- struct mlx5_flow_handle *allow_rule;
+ struct mlx5_flow_handle *allow_untagged_rule;
+ struct list_head allowed_vlans_rules;
struct mlx5_flow_handle *drop_rule;
};
struct vport_egress {
struct mlx5_flow_table *acl;
+ struct mlx5_flow_group *allow_untagged_grp;
struct mlx5_flow_group *allowed_vlans_grp;
struct mlx5_flow_group *drop_grp;
- struct mlx5_flow_handle *allowed_vlan;
+ struct mlx5_flow_handle *allowed_vst_vlan;
struct mlx5_flow_handle *drop_rule;
+ struct mlx5_flow_handle *allow_untagged_rule;
+ struct list_head allowed_vlans_rules;
};
struct mlx5_vport_info {
@@ -88,6 +96,8 @@ struct mlx5_vport_info {
u32 max_rate;
bool spoofchk;
bool trusted;
+ /* the admin approved vlan list */
+ DECLARE_BITMAP(vlan_trunk_8021q_bitmap, VLAN_N_VID);
};
struct mlx5_vport {
@@ -95,6 +105,10 @@ struct mlx5_vport {
int vport;
struct hlist_head uc_list[MLX5_L2_ADDR_HASH_SIZE];
struct hlist_head mc_list[MLX5_L2_ADDR_HASH_SIZE];
+ /* The requested vlan list from the vport side */
+ DECLARE_BITMAP(req_vlan_bitmap, VLAN_N_VID);
+ /* Actual accepted vlans on the acl tables */
+ DECLARE_BITMAP(acl_vlan_8021q_bitmap, VLAN_N_VID);
struct mlx5_flow_handle *promisc_rule;
struct mlx5_flow_handle *allmulti_rule;
struct work_struct vport_change_handler;
@@ -133,6 +147,11 @@ struct mlx5_eswitch_fdb {
};
};
+/* One per-VLAN allow rule installed in a vport ACL table; entries are
+ * linked on the vport's ingress or egress allowed_vlans_rules list.
+ */
+struct mlx5_acl_vlan {
+ struct mlx5_flow_handle *acl_vlan_rule;
+ struct list_head list;
+};
+
struct mlx5_esw_sq {
struct mlx5_flow_handle *send_to_vport_rule;
struct list_head list;
@@ -218,6 +237,10 @@ int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport,
u32 max_rate, u32 min_rate);
int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
int vport, struct ifla_vf_info *ivi);
+int mlx5_eswitch_add_vport_trunk_range(struct mlx5_eswitch *esw,
+ int vport, u16 start_vlan, u16 end_vlan);
+int mlx5_eswitch_del_vport_trunk_range(struct mlx5_eswitch *esw,
+ int vport, u16 start_vlan, u16 end_vlan);
int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
int vport,
struct ifla_vf_stats *vf_stats);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 5abfec1c3399..c3afc7af6280 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -381,28 +381,18 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev,
}
EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_list);
-int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev,
- u32 vport,
- u16 vlans[],
- int *size)
+int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, u32 vport,
+ unsigned long *vlans)
{
u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)];
void *nic_vport_ctx;
int req_list_size;
- int max_list_size;
int out_sz;
void *out;
int err;
int i;
- req_list_size = *size;
- max_list_size = 1 << MLX5_CAP_GEN(dev, log_max_vlan_list);
- if (req_list_size > max_list_size) {
- mlx5_core_warn(dev, "Requested list size (%d) > (%d) max list size\n",
- req_list_size, max_list_size);
- req_list_size = max_list_size;
- }
-
+ req_list_size = 1 << MLX5_CAP_GEN(dev, log_max_vlan_list);
out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) +
req_list_size * MLX5_ST_SZ_BYTES(vlan_layout);
@@ -429,12 +419,11 @@ int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev,
req_list_size = MLX5_GET(nic_vport_context, nic_vport_ctx,
allowed_list_size);
- *size = req_list_size;
for (i = 0; i < req_list_size; i++) {
void *vlan_addr = MLX5_ADDR_OF(nic_vport_context,
nic_vport_ctx,
current_uc_mac_address[i]);
- vlans[i] = MLX5_GET(vlan_layout, vlan_addr, vlan);
+ bitmap_set(vlans, MLX5_GET(vlan_layout, vlan_addr, vlan), 1);
}
out:
kfree(out);
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 656c70b65dd2..a285bd04eefb 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -97,10 +97,8 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev,
int promisc_uc,
int promisc_mc,
int promisc_all);
-int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev,
- u32 vport,
- u16 vlans[],
- int *size);
+int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, u32 vport,
+ unsigned long *vlans);
int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev,
u16 vlans[],
int list_size);
--
2.13.0
^ permalink raw reply related
* [PATCH net-next 1/4] net: Add SRIOV VGT+ support
From: Saeed Mahameed @ 2017-08-27 11:06 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Eugenia Emantayev, Mohamad Haj Yahia, Saeed Mahameed
In-Reply-To: <20170827110618.20599-1-saeedm@mellanox.com>
From: Mohamad Haj Yahia <mohamad@mellanox.com>
VGT+ is a security feature that gives the administrator the ability to
control the list of allowed vlan-ids that can be transmitted/received
from/to the VF.
The allowed vlan-ids list is called "trunk".
The admin can add/remove a range of allowed vlan-ids via the ip tool.
Example:
After the following series of configuration commands:
1) ip link set eth3 vf 0 trunk add 10 100 (allow vlan-id 10-100, default tpid 0x8100)
2) ip link set eth3 vf 0 trunk add 105 proto 802.1q (allow vlan-id 105 tpid 0x8100)
3) ip link set eth3 vf 0 trunk add 105 proto 802.1ad (allow vlan-id 105 tpid 0x88a8)
4) ip link set eth3 vf 0 trunk rem 90 (block vlan-id 90)
5) ip link set eth3 vf 0 trunk rem 50 60 (block vlan-ids 50-60)
The VF 0 can only communicate on vlan-ids: 10-49,61-89,91-100,105 with
tpid 0x8100 and vlan-id 105 with tpid 0x88a8.
For this purpose we added the following netlink sr-iov commands:
1) IFLA_VF_VLAN_RANGE: used to add/remove allowed vlan-ids range.
We added the ifla_vf_vlan_range struct to specify the range we want to
add/remove from the userspace.
We added ndo_add_vf_vlan_trunk_range and ndo_del_vf_vlan_trunk_range
netdev ops to add/remove allowed vlan-ids range in the netdev.
2) IFLA_VF_VLAN_TRUNK: used to query the allowed vlan-ids trunk.
We added trunk bitmap to the ifla_vf_info struct to get the current
allowed vlan-ids trunk from the netdev.
We added ifla_vf_vlan_trunk struct for sending the allowed vlan-ids
trunk to the userspace.
Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
include/linux/if_link.h | 2 +
include/linux/netdevice.h | 12 +++++
include/uapi/linux/if_link.h | 20 ++++++++
net/core/rtnetlink.c | 109 +++++++++++++++++++++++++++++++------------
4 files changed, 114 insertions(+), 29 deletions(-)
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 0b17c585b5cd..da70af27e42e 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -25,6 +25,8 @@ struct ifla_vf_info {
__u32 max_tx_rate;
__u32 rss_query_en;
__u32 trusted;
+ __u64 trunk_8021q[VF_VLAN_BITMAP];
+ __u64 trunk_8021ad[VF_VLAN_BITMAP];
__be16 vlan_proto;
};
#endif /* _LINUX_IF_LINK_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c5475b37a631..10633cabc58f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -959,6 +959,10 @@ struct xfrmdev_ops {
* Hash Key. This is needed since on some devices VF share this information
* with PF and querying it may introduce a theoretical security risk.
* int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
+ * int (*ndo_add_vf_vlan_trunk_range)(struct net_device *dev, int vf,
+ * u16 start_vid, u16 end_vid, __be16 proto);
+ * int (*ndo_del_vf_vlan_trunk_range)(struct net_device *dev, int vf,
+ * u16 start_vid, u16 end_vid, __be16 proto);
* int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
* int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
* void *type_data);
@@ -1208,6 +1212,14 @@ struct net_device_ops {
int (*ndo_set_vf_rss_query_en)(
struct net_device *dev,
int vf, bool setting);
+ int (*ndo_add_vf_vlan_trunk_range)(
+ struct net_device *dev,
+ int vf, u16 start_vid,
+ u16 end_vid, __be16 proto);
+ int (*ndo_del_vf_vlan_trunk_range)(
+ struct net_device *dev,
+ int vf, u16 start_vid,
+ u16 end_vid, __be16 proto);
int (*ndo_setup_tc)(struct net_device *dev,
enum tc_setup_type type,
void *type_data);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8d062c58d5cb..3aa895c5fbc1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -168,6 +168,8 @@ enum {
#ifndef __KERNEL__
#define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg))))
#define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg))
+#define BITS_PER_BYTE 8
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#endif
enum {
@@ -645,6 +647,8 @@ enum {
IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */
IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */
IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */
+ IFLA_VF_VLAN_RANGE, /* add/delete vlan range filtering */
+ IFLA_VF_VLAN_TRUNK, /* vlan trunk filtering */
__IFLA_VF_MAX,
};
@@ -669,6 +673,7 @@ enum {
#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1)
#define MAX_VLAN_LIST_LEN 1
+#define VF_VLAN_N_VID 4096
struct ifla_vf_vlan_info {
__u32 vf;
@@ -677,6 +682,21 @@ struct ifla_vf_vlan_info {
__be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */
};
+struct ifla_vf_vlan_range {
+ __u32 vf;
+ __u32 start_vid; /* 1 - 4095 */
+ __u32 end_vid; /* 1 - 4095 */
+ __u32 setting;
+ __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */
+};
+
+#define VF_VLAN_BITMAP DIV_ROUND_UP(VF_VLAN_N_VID, sizeof(__u64) * BITS_PER_BYTE)
+struct ifla_vf_vlan_trunk {
+ __u32 vf;
+ __u64 allowed_vlans_8021q_bm[VF_VLAN_BITMAP];
+ __u64 allowed_vlans_8021ad_bm[VF_VLAN_BITMAP];
+};
+
struct ifla_vf_tx_rate {
__u32 vf;
__u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a78fd61da0ec..56909f11d88e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -827,6 +827,7 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
nla_total_size(MAX_VLAN_LIST_LEN *
sizeof(struct ifla_vf_vlan_info)) +
nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_vlan_trunk)) +
nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
nla_total_size(sizeof(struct ifla_vf_rate)) +
nla_total_size(sizeof(struct ifla_vf_link_state)) +
@@ -1098,31 +1099,43 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct ifla_vf_link_state vf_linkstate;
struct ifla_vf_vlan_info vf_vlan_info;
struct ifla_vf_spoofchk vf_spoofchk;
+ struct ifla_vf_vlan_trunk *vf_trunk;
struct ifla_vf_tx_rate vf_tx_rate;
struct ifla_vf_stats vf_stats;
struct ifla_vf_trust vf_trust;
struct ifla_vf_vlan vf_vlan;
struct ifla_vf_rate vf_rate;
struct ifla_vf_mac vf_mac;
- struct ifla_vf_info ivi;
+ struct ifla_vf_info *ivi;
- memset(&ivi, 0, sizeof(ivi));
+ ivi = kzalloc(sizeof(*ivi), GFP_KERNEL);
+ if (!ivi)
+ return -ENOMEM;
+
+ vf_trunk = kzalloc(sizeof(*vf_trunk), GFP_KERNEL);
+ if (!vf_trunk) {
+ kfree(ivi);
+ return -ENOMEM;
+ }
/* Not all SR-IOV capable drivers support the
* spoofcheck and "RSS query enable" query. Preset to
* -1 so the user space tool can detect that the driver
* didn't report anything.
*/
- ivi.spoofchk = -1;
- ivi.rss_query_en = -1;
- ivi.trusted = -1;
+ ivi->spoofchk = -1;
+ ivi->rss_query_en = -1;
+ ivi->trusted = -1;
+ memset(ivi->mac, 0, sizeof(ivi->mac));
+ memset(ivi->trunk_8021q, 0, sizeof(ivi->trunk_8021q));
+ memset(ivi->trunk_8021ad, 0, sizeof(ivi->trunk_8021ad));
/* The default value for VF link state is "auto"
* IFLA_VF_LINK_STATE_AUTO which equals zero
*/
- ivi.linkstate = 0;
+ ivi->linkstate = 0;
/* VLAN Protocol by default is 802.1Q */
- ivi.vlan_proto = htons(ETH_P_8021Q);
- if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
+ ivi->vlan_proto = htons(ETH_P_8021Q);
+ if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, ivi))
return 0;
memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
@@ -1135,21 +1148,24 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_spoofchk.vf =
vf_linkstate.vf =
vf_rss_query_en.vf =
- vf_trust.vf = ivi.vf;
-
- memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
- vf_vlan.vlan = ivi.vlan;
- vf_vlan.qos = ivi.qos;
- vf_vlan_info.vlan = ivi.vlan;
- vf_vlan_info.qos = ivi.qos;
- vf_vlan_info.vlan_proto = ivi.vlan_proto;
- vf_tx_rate.rate = ivi.max_tx_rate;
- vf_rate.min_tx_rate = ivi.min_tx_rate;
- vf_rate.max_tx_rate = ivi.max_tx_rate;
- vf_spoofchk.setting = ivi.spoofchk;
- vf_linkstate.link_state = ivi.linkstate;
- vf_rss_query_en.setting = ivi.rss_query_en;
- vf_trust.setting = ivi.trusted;
+ vf_trunk->vf =
+ vf_trust.vf = ivi->vf;
+
+ memcpy(vf_mac.mac, ivi->mac, sizeof(ivi->mac));
+ memcpy(vf_trunk->allowed_vlans_8021q_bm, ivi->trunk_8021q, sizeof(ivi->trunk_8021q));
+ memcpy(vf_trunk->allowed_vlans_8021ad_bm, ivi->trunk_8021ad, sizeof(ivi->trunk_8021ad));
+ vf_vlan.vlan = ivi->vlan;
+ vf_vlan.qos = ivi->qos;
+ vf_vlan_info.vlan = ivi->vlan;
+ vf_vlan_info.qos = ivi->qos;
+ vf_vlan_info.vlan_proto = ivi->vlan_proto;
+ vf_tx_rate.rate = ivi->max_tx_rate;
+ vf_rate.min_tx_rate = ivi->min_tx_rate;
+ vf_rate.max_tx_rate = ivi->max_tx_rate;
+ vf_spoofchk.setting = ivi->spoofchk;
+ vf_linkstate.link_state = ivi->linkstate;
+ vf_rss_query_en.setting = ivi->rss_query_en;
+ vf_trust.setting = ivi->trusted;
vf = nla_nest_start(skb, IFLA_VF_INFO);
if (!vf)
goto nla_put_vfinfo_failure;
@@ -1167,7 +1183,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
sizeof(vf_rss_query_en),
&vf_rss_query_en) ||
nla_put(skb, IFLA_VF_TRUST,
- sizeof(vf_trust), &vf_trust))
+ sizeof(vf_trust), &vf_trust) ||
+ nla_put(skb, IFLA_VF_VLAN_TRUNK,
+ sizeof(*vf_trunk), vf_trunk))
goto nla_put_vf_failure;
vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST);
if (!vfvlanlist)
@@ -1202,12 +1220,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
}
nla_nest_end(skb, vfstats);
nla_nest_end(skb, vf);
+ kfree(vf_trunk);
+ kfree(ivi);
return 0;
nla_put_vf_failure:
nla_nest_cancel(skb, vf);
nla_put_vfinfo_failure:
nla_nest_cancel(skb, vfinfo);
+ kfree(vf_trunk);
+ kfree(ivi);
return -EMSGSIZE;
}
@@ -1784,6 +1806,26 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
return err;
}
+ if (tb[IFLA_VF_VLAN_RANGE]) {
+ struct ifla_vf_vlan_range *ivvr =
+ nla_data(tb[IFLA_VF_VLAN_RANGE]);
+ bool add = !!ivvr->setting;
+
+ err = -EOPNOTSUPP;
+ if (add && ops->ndo_add_vf_vlan_trunk_range)
+ err = ops->ndo_add_vf_vlan_trunk_range(dev, ivvr->vf,
+ ivvr->start_vid,
+ ivvr->end_vid,
+ ivvr->vlan_proto);
+ else if (!add && ops->ndo_del_vf_vlan_trunk_range)
+ err = ops->ndo_del_vf_vlan_trunk_range(dev, ivvr->vf,
+ ivvr->start_vid,
+ ivvr->end_vid,
+ ivvr->vlan_proto);
+ if (err < 0)
+ return err;
+ }
+
if (tb[IFLA_VF_VLAN_LIST]) {
struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
struct nlattr *attr;
@@ -1815,21 +1857,30 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_TX_RATE]) {
struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
- struct ifla_vf_info ivf;
+ struct ifla_vf_info *ivf;
+
+ ivf = kzalloc(sizeof(*ivf), GFP_KERNEL);
+ if (!ivf)
+ return -ENOMEM;
err = -EOPNOTSUPP;
if (ops->ndo_get_vf_config)
- err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
- if (err < 0)
+ err = ops->ndo_get_vf_config(dev, ivt->vf, ivf);
+ if (err < 0) {
+ kfree(ivf);
return err;
+ }
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_rate)
err = ops->ndo_set_vf_rate(dev, ivt->vf,
- ivf.min_tx_rate,
+ ivf->min_tx_rate,
ivt->rate);
- if (err < 0)
+ if (err < 0) {
+ kfree(ivf);
return err;
+ }
+ kfree(ivf);
}
if (tb[IFLA_VF_RATE]) {
--
2.13.0
^ permalink raw reply related
* [PATCH net-next 4/4] net/mlx5e: E-switch, Add steering drop counters
From: Saeed Mahameed @ 2017-08-27 11:06 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, Eugenia Emantayev, Saeed Mahameed
In-Reply-To: <20170827110618.20599-1-saeedm@mellanox.com>
From: Eugenia Emantayev <eugenia@mellanox.com>
Add flow counters to count packets dropped due to drop rules
configured in eswitch egress and ingress ACLs.
These counters count VF violations and incoming traffic drops, and are
presented on the hypervisor via the standard 'ip -s link show' command.
Example: "ip -s link show dev enp5s0f0"
6: enp5s0f0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000
link/ether 24:8a:07:a5:28:f0 brd ff:ff:ff:ff:ff:ff
RX: bytes packets errors dropped overrun mcast
0 0 0 0 0 2
TX: bytes packets errors dropped carrier collsns
1406 17 0 0 0 0
vf 0 MAC 00:00:ca:fe:ca:fe, vlan 5, spoof checking off, link-state auto, trust off, query_rss off
RX: bytes packets mcast bcast dropped
1666 29 14 32 0
TX: bytes packets dropped
2880 44 2412
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 97 ++++++++++++++++++++--
drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 +
.../net/ethernet/mellanox/mlx5/core/fs_counters.c | 6 ++
3 files changed, 98 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index a8e8670c7c8d..6c992e43e397 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -37,6 +37,7 @@
#include <linux/mlx5/fs.h>
#include "mlx5_core.h"
#include "eswitch.h"
+#include "fs_core.h"
#define UPLINK_VPORT 0xFFFF
@@ -1007,8 +1008,14 @@ static void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw,
kfree(trunk_vlan_rule);
}
- if (!IS_ERR_OR_NULL(vport->egress.drop_rule))
+ if (!IS_ERR_OR_NULL(vport->egress.drop_rule)) {
+ struct mlx5_fc *drop_counter =
+ mlx5_flow_rule_counter(vport->egress.drop_rule);
+
mlx5_del_flow_rules(vport->egress.drop_rule);
+ if (drop_counter)
+ mlx5_fc_destroy(vport->dev, drop_counter);
+ }
if (!IS_ERR_OR_NULL(vport->egress.allow_untagged_rule))
mlx5_del_flow_rules(vport->egress.allow_untagged_rule);
@@ -1174,8 +1181,14 @@ static void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw,
{
struct mlx5_acl_vlan *trunk_vlan_rule, *tmp;
- if (!IS_ERR_OR_NULL(vport->ingress.drop_rule))
+ if (!IS_ERR_OR_NULL(vport->ingress.drop_rule)) {
+ struct mlx5_fc *drop_counter =
+ mlx5_flow_rule_counter(vport->ingress.drop_rule);
+
mlx5_del_flow_rules(vport->ingress.drop_rule);
+ if (drop_counter)
+ mlx5_fc_destroy(vport->dev, drop_counter);
+ }
list_for_each_entry_safe(trunk_vlan_rule, tmp,
&vport->ingress.allowed_vlans_rules, list) {
@@ -1222,6 +1235,8 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
bool need_vlan_filter = !!bitmap_weight(vport->info.vlan_trunk_8021q_bitmap,
VLAN_N_VID);
struct mlx5_acl_vlan *trunk_vlan_rule;
+ struct mlx5_flow_destination dest;
+ struct mlx5_fc *counter = NULL;
struct mlx5_flow_act flow_act = {0};
struct mlx5_flow_spec *spec;
bool need_acl_table = true;
@@ -1333,18 +1348,33 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
}
drop_rule:
+ /* Alloc ingress drop flow counter */
+ counter = mlx5_fc_create(esw->dev, false);
+ if (IS_ERR(counter)) {
+ esw_warn(esw->dev,
+ "vport[%d] configure ingress drop rule counter failed\n",
+ vport->vport);
+ counter = NULL;
+ } else {
+ dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+ dest.counter = counter;
+ }
+
+ /* Drop others rule (star rule) */
memset(spec, 0, sizeof(*spec));
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+ if (counter)
+ flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
vport->ingress.drop_rule =
- mlx5_add_flow_rules(vport->ingress.acl, spec,
- &flow_act, NULL, 0);
+ mlx5_add_flow_rules(vport->ingress.acl, spec, &flow_act, &dest, 1);
if (IS_ERR(vport->ingress.drop_rule)) {
err = PTR_ERR(vport->ingress.drop_rule);
esw_warn(esw->dev,
"vport[%d] configure ingress drop rule, err(%d)\n",
vport->vport, err);
vport->ingress.drop_rule = NULL;
- goto out;
+ if (counter)
+ mlx5_fc_destroy(vport->dev, counter);
}
out:
@@ -1362,6 +1392,8 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
bool need_acl_table = vport->info.vlan || vport->info.qos ||
need_vlan_filter;
struct mlx5_acl_vlan *trunk_vlan_rule;
+ struct mlx5_flow_destination dest;
+ struct mlx5_fc *counter = NULL;
struct mlx5_flow_act flow_act = {0};
struct mlx5_flow_spec *spec;
u16 vlan_id = 0;
@@ -1454,18 +1486,33 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
list_add(&trunk_vlan_rule->list, &vport->egress.allowed_vlans_rules);
}
+ /* Alloc egress drop flow counter */
+ counter = mlx5_fc_create(esw->dev, false);
+ if (IS_ERR(counter)) {
+ esw_warn(esw->dev,
+ "vport[%d] configure egress drop rule counter failed\n",
+ vport->vport);
+ counter = NULL;
+ } else {
+ dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+ dest.counter = counter;
+ }
+
/* Drop others rule (star rule) */
memset(spec, 0, sizeof(*spec));
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+ if (counter)
+ flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
vport->egress.drop_rule =
- mlx5_add_flow_rules(vport->egress.acl, spec,
- &flow_act, NULL, 0);
+ mlx5_add_flow_rules(vport->egress.acl, spec, &flow_act, &dest, 1);
if (IS_ERR(vport->egress.drop_rule)) {
err = PTR_ERR(vport->egress.drop_rule);
esw_warn(esw->dev,
"vport[%d] configure egress drop rule failed, err(%d)\n",
vport->vport, err);
vport->egress.drop_rule = NULL;
+ if (counter)
+ mlx5_fc_destroy(vport->dev, counter);
}
out:
if (err)
@@ -2316,6 +2363,38 @@ int mlx5_eswitch_del_vport_trunk_range(struct mlx5_eswitch *esw,
return err;
}
+static int mlx5_eswitch_query_vport_drop_stats(struct mlx5_core_dev *dev,
+ int vport_idx,
+ u64 *rx_dropped,
+ u64 *tx_dropped)
+{
+ struct mlx5_eswitch *esw = dev->priv.eswitch;
+ struct mlx5_vport *vport = &esw->vports[vport_idx];
+ struct mlx5_fc *drop_counter;
+ u16 idx = 0;
+ u64 dummy;
+
+ if (!vport->enabled)
+ return 0;
+
+ if (vport->egress.drop_rule) {
+ drop_counter = mlx5_flow_rule_counter(vport->egress.drop_rule);
+ if (drop_counter) {
+ idx = drop_counter->id;
+ mlx5_fc_query(dev, idx, rx_dropped, &dummy);
+ }
+ }
+
+ if (vport->ingress.drop_rule) {
+ drop_counter = mlx5_flow_rule_counter(vport->ingress.drop_rule);
+ if (drop_counter) {
+ idx = drop_counter->id;
+ mlx5_fc_query(dev, idx, tx_dropped, &dummy);
+ }
+ }
+ return 0;
+}
+
int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
int vport,
struct ifla_vf_stats *vf_stats)
@@ -2376,6 +2455,10 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
vf_stats->broadcast =
MLX5_GET_CTR(out, received_eth_broadcast.packets);
+ mlx5_eswitch_query_vport_drop_stats(esw->dev, vport,
+ &vf_stats->rx_dropped,
+ &vf_stats->tx_dropped);
+
free_out:
kvfree(out);
return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 5509a752f98e..e86d75fbc0f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -221,6 +221,8 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
unsigned long delay);
void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
unsigned long interval);
+int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id,
+ u64 *packets, u64 *bytes);
int mlx5_init_fs(struct mlx5_core_dev *dev);
void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 89d1f8650033..b7ab929d5f8e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -312,6 +312,12 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
}
}
+int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id,
+ u64 *packets, u64 *bytes)
+{
+ return mlx5_cmd_fc_query(dev, id, packets, bytes);
+}
+
void mlx5_fc_query_cached(struct mlx5_fc *counter,
u64 *bytes, u64 *packets, u64 *lastuse)
{
--
2.13.0
^ permalink raw reply related
* [PATCH net-next 0/4] SRIOV VF VGT+ and violation counters support
From: Saeed Mahameed @ 2017-08-27 11:06 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, Eugenia Emantayev, Saeed Mahameed
Hi Dave
This series provides two SRIOV-related security features (VGT+ and VF violation counters).
VGT+ is a security feature that gives the administrator the ability to control
the list of allowed VGT vlan IDs that can be transmitted/received from/to the VF.
The allowed VGT vlan IDs list is called "trunk".
Admin can add/remove a range of allowed vlan-ids via iptool:
ip link set { DEVICE } [ vf NUM [ trunk { add | rem } START-VLAN-ID [ END-VLAN-ID ] [ proto VLAN-PROTO ] ] ]
Example:
After this series of configuration :
1) ip link set eth3 vf 0 trunk add 10 100 (allow vlan-id 10-100, default tpid 0x8100)
2) ip link set eth3 vf 0 trunk add 105 proto 802.1q (allow vlan-id 105 tpid 0x8100)
3) ip link set eth3 vf 0 trunk add 105 proto 802.1ad (allow vlan-id 105 tpid 0x88a8)
4) ip link set eth3 vf 0 trunk rem 90 (block vlan-id 90)
5) ip link set eth3 vf 0 trunk rem 50 60 (block vlan-ids 50-60)
VF 0 can only communicate on vlan-ids: 10-49,61-89,91-100,105 with tpid 0x8100 and vlan-id 105 with tpid 0x88a8.
For this purpose following net_device callbacks were added:
int (*ndo_add_vf_vlan_trunk_range)(struct net_device *dev, int vf, u16 start_vid, u16 end_vid, __be16 proto);
int (*ndo_del_vf_vlan_trunk_range)(struct net_device *dev, int vf, u16 start_vid, u16 end_vid, __be16 proto);
This feature is implemented and demonstrated in mlx5 via ACL steering tables and vlan rules attached to the VF's
corresponding E-Switch vport.
In addition to VGT+ we introduce a new set of counters to the VF statistics, to count traffic violating
VF ACL rules (such as VGT+ violations). For that we extend the current ifla_vf_stats to include rx_dropped/tx_dropped,
to be reported per VF.
Example:
> ip link set eth3 vf 0 trunk add 10 100
If VF 0 transmits 2412 packets on a vlan id not in the [10,100] range, they will be dropped and reported in the hypervisor
via:
> ip -s link show dev enp5s0f0
6: enp5s0f0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000
[...]
vf 0 MAC 00:00:ca:fe:ca:fe, vlan 5, spoof checking off, link-state auto, trust off, query_rss off
RX: bytes packets mcast bcast dropped
1666 29 14 32 0
TX: bytes packets dropped
2880 44 2412
Thanks,
Saeed.
Eugenia Emantayev (2):
net/core: Add violation counters to VF statistics
net/mlx5e: E-switch, Add steering drop counters
Mohamad Haj Yahia (2):
net: Add SRIOV VGT+ support
net/mlx5: Add SRIOV VGT+ support
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 28 +
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 589 +++++++++++++++++----
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 31 +-
drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 +
.../net/ethernet/mellanox/mlx5/core/fs_counters.c | 6 +
drivers/net/ethernet/mellanox/mlx5/core/vport.c | 19 +-
include/linux/if_link.h | 4 +
include/linux/mlx5/vport.h | 6 +-
include/linux/netdevice.h | 12 +
include/uapi/linux/if_link.h | 22 +
net/core/rtnetlink.c | 119 +++--
11 files changed, 681 insertions(+), 157 deletions(-)
--
2.13.0
^ permalink raw reply
* [PATCH net-next 3/4] net/core: Add violation counters to VF statistics
From: Saeed Mahameed @ 2017-08-27 11:06 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, Eugenia Emantayev, Saeed Mahameed
In-Reply-To: <20170827110618.20599-1-saeedm@mellanox.com>
From: Eugenia Emantayev <eugenia@mellanox.com>
Add receive and transmit violation counters to be
displayed in iproute2 VF statistics.
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
include/linux/if_link.h | 2 ++
include/uapi/linux/if_link.h | 2 ++
net/core/rtnetlink.c | 10 +++++++++-
3 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index da70af27e42e..ebf3448acb5b 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -12,6 +12,8 @@ struct ifla_vf_stats {
__u64 tx_bytes;
__u64 broadcast;
__u64 multicast;
+ __u64 rx_dropped;
+ __u64 tx_dropped;
};
struct ifla_vf_info {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 3aa895c5fbc1..68cd31b281a1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -743,6 +743,8 @@ enum {
IFLA_VF_STATS_BROADCAST,
IFLA_VF_STATS_MULTICAST,
IFLA_VF_STATS_PAD,
+ IFLA_VF_STATS_RX_DROPPED,
+ IFLA_VF_STATS_TX_DROPPED,
__IFLA_VF_STATS_MAX,
};
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 56909f11d88e..1a653bb00d6e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -845,6 +845,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_MULTICAST */
nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_DROPPED */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_DROPPED */
+ nla_total_size_64bit(sizeof(__u64)) +
nla_total_size(sizeof(struct ifla_vf_trust)));
return size;
} else
@@ -1214,7 +1218,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+ vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+ vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+ vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
nla_nest_cancel(skb, vfstats);
goto nla_put_vf_failure;
}
--
2.13.0
^ permalink raw reply related
* (unknown),
From: agar2000 @ 2017-08-27 10:55 UTC (permalink / raw)
To: netdev
[-- Attachment #1: MAIL_34929959_netdev.zip --]
[-- Type: application/zip, Size: 72397 bytes --]
^ permalink raw reply
* Re: [PATCH net] bridge: check for null fdb->dst before notifying switchdev drivers
From: Arkadi Sharshevsky @ 2017-08-27 8:46 UTC (permalink / raw)
To: Roopa Prabhu, davem; +Cc: netdev
In-Reply-To: <1503807228-16281-1-git-send-email-roopa@cumulusnetworks.com>
On 08/27/2017 07:13 AM, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
> current switchdev drivers don't seem to support offloading fdb
> entries pointing to the bridge device which have fdb->dst
> not set to any port. This patch adds a NULL fdb->dst check in
> the switchdev notifier code.
>
> This patch fixes the below NULL ptr dereference:
> $bridge fdb add 00:02:00:00:00:33 dev br0 self
>
> [ 69.953374] BUG: unable to handle kernel NULL pointer dereference at
> 0000000000000008
> [ 69.954044] IP: br_switchdev_fdb_notify+0x29/0x80
> [ 69.954044] PGD 66527067
> [ 69.954044] P4D 66527067
> [ 69.954044] PUD 7899c067
> [ 69.954044] PMD 0
> [ 69.954044]
> [ 69.954044] Oops: 0000 [#1] SMP
> [ 69.954044] Modules linked in:
> [ 69.954044] CPU: 1 PID: 3074 Comm: bridge Not tainted 4.13.0-rc6+ #1
> [ 69.954044] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org
> 04/01/2014
> [ 69.954044] task: ffff88007b827140 task.stack: ffffc90001564000
> [ 69.954044] RIP: 0010:br_switchdev_fdb_notify+0x29/0x80
> [ 69.954044] RSP: 0018:ffffc90001567918 EFLAGS: 00010246
> [ 69.954044] RAX: 0000000000000000 RBX: ffff8800795e0880 RCX:
> 00000000000000c0
> [ 69.954044] RDX: ffffc90001567920 RSI: 000000000000001c RDI:
> ffff8800795d0600
> [ 69.954044] RBP: ffffc90001567938 R08: ffff8800795d0600 R09:
> 0000000000000000
> [ 69.954044] R10: ffffc90001567a88 R11: ffff88007b849400 R12:
> ffff8800795e0880
> [ 69.954044] R13: ffff8800795d0600 R14: ffffffff81ef8880 R15:
> 000000000000001c
> [ 69.954044] FS: 00007f93d3085700(0000) GS:ffff88007fd00000(0000)
> knlGS:0000000000000000
> [ 69.954044] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 69.954044] CR2: 0000000000000008 CR3: 0000000066551000 CR4:
> 00000000000006e0
> [ 69.954044] Call Trace:
> [ 69.954044] fdb_notify+0x3f/0xf0
> [ 69.954044] __br_fdb_add.isra.12+0x1a7/0x370
> [ 69.954044] br_fdb_add+0x178/0x280
> [ 69.954044] rtnl_fdb_add+0x10a/0x200
> [ 69.954044] rtnetlink_rcv_msg+0x1b4/0x240
> [ 69.954044] ? skb_free_head+0x21/0x40
> [ 69.954044] ? rtnl_calcit.isra.18+0xf0/0xf0
> [ 69.954044] netlink_rcv_skb+0xed/0x120
> [ 69.954044] rtnetlink_rcv+0x15/0x20
> [ 69.954044] netlink_unicast+0x180/0x200
> [ 69.954044] netlink_sendmsg+0x291/0x370
> [ 69.954044] ___sys_sendmsg+0x180/0x2e0
> [ 69.954044] ? filemap_map_pages+0x2db/0x370
> [ 69.954044] ? do_wp_page+0x11d/0x420
> [ 69.954044] ? __handle_mm_fault+0x794/0xd80
> [ 69.954044] ? vma_link+0xcb/0xd0
> [ 69.954044] __sys_sendmsg+0x4c/0x90
> [ 69.954044] SyS_sendmsg+0x12/0x20
> [ 69.954044] do_syscall_64+0x63/0xe0
> [ 69.954044] entry_SYSCALL64_slow_path+0x25/0x25
> [ 69.954044] RIP: 0033:0x7f93d2bad690
> [ 69.954044] RSP: 002b:00007ffc7217a638 EFLAGS: 00000246 ORIG_RAX:
> 000000000000002e
> [ 69.954044] RAX: ffffffffffffffda RBX: 00007ffc72182eac RCX:
> 00007f93d2bad690
> [ 69.954044] RDX: 0000000000000000 RSI: 00007ffc7217a670 RDI:
> 0000000000000003
> [ 69.954044] RBP: 0000000059a1f7f8 R08: 0000000000000006 R09:
> 000000000000000a
> [ 69.954044] R10: 00007ffc7217a400 R11: 0000000000000246 R12:
> 00007ffc7217a670
> [ 69.954044] R13: 00007ffc72182a98 R14: 00000000006114c0 R15:
> 00007ffc72182aa0
> [ 69.954044] Code: 1f 00 66 66 66 66 90 55 48 89 e5 48 83 ec 20 f6 47
> 20 04 74 0a 83 fe 1c 74 09 83 fe 1d 74 2c c9 66 90 c3 48 8b 47 10 48 8d
> 55 e8 <48> 8b 70 08 0f b7 47 1e 48 83 c7 18 48 89 7d f0 bf 03 00 00 00
> [ 69.954044] RIP: br_switchdev_fdb_notify+0x29/0x80 RSP:
> ffffc90001567918
> [ 69.954044] CR2: 0000000000000008
> [ 69.954044] ---[ end trace 03e9eec4a82c238b ]---
>
> Fixes: 6b26b51b1d13 ("net: bridge: Add support for notifying devices about FDB add/del")
> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
> ---
> net/bridge/br_switchdev.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
> index 181a44d..f6b1c7d 100644
> --- a/net/bridge/br_switchdev.c
> +++ b/net/bridge/br_switchdev.c
> @@ -115,7 +115,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
> void
> br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
> {
> - if (!fdb->added_by_user)
> + if (!fdb->added_by_user || !fdb->dst)
> return;
>
> switch (type) {
>
Thanks, missed that.
Arkadi
^ permalink raw reply
* Re: [Intel-wired-lan] [PATCH] e1000e: apply burst mode settings only on default
From: Neftin, Sasha @ 2017-08-27 8:34 UTC (permalink / raw)
To: Willem de Bruijn, jeffrey.t.kirsher, alexander.h.duyck,
raanan.avargil, dima.ruinskiy
Cc: netdev, Willem de Bruijn, intel-wired-lan
In-Reply-To: <e589e146-3536-9ef3-486c-f5d115eb83cf@intel.com>
On 8/27/2017 11:32, Neftin, Sasha wrote:
> On 8/27/2017 11:30, Neftin, Sasha wrote:
>> On 8/25/2017 18:06, Willem de Bruijn wrote:
>>> From: Willem de Bruijn <willemb@google.com>
>>>
>>> Devices that support FLAG2_DMA_BURST have different default values
>>> for RDTR and RADV. Apply burst mode default settings only when no
>>> explicit value was passed at module load.
>>>
>>> The RDTR default is zero. If the module is loaded for low latency
>>> operation with RxIntDelay=0, do not override this value with a burst
>>> default of 32.
>>>
>>> Move the decision to apply burst values earlier, where explicitly
>>> initialized module variables can be distinguished from defaults.
>>>
>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>> ---
>>> drivers/net/ethernet/intel/e1000e/e1000.h | 4 ----
>>> drivers/net/ethernet/intel/e1000e/netdev.c | 8 --------
>>> drivers/net/ethernet/intel/e1000e/param.c | 16 +++++++++++++++-
>>> 3 files changed, 15 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h
>>> b/drivers/net/ethernet/intel/e1000e/e1000.h
>>> index 98e68888abb1..2311b31bdcac 100644
>>> --- a/drivers/net/ethernet/intel/e1000e/e1000.h
>>> +++ b/drivers/net/ethernet/intel/e1000e/e1000.h
>>> @@ -94,10 +94,6 @@ struct e1000_info;
>>> */
>>> #define E1000_CHECK_RESET_COUNT 25
>>> -#define DEFAULT_RDTR 0
>>> -#define DEFAULT_RADV 8
>>> -#define BURST_RDTR 0x20
>>> -#define BURST_RADV 0x20
>>> #define PCICFG_DESC_RING_STATUS 0xe4
>>> #define FLUSH_DESC_REQUIRED 0x100
>>> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c
>>> b/drivers/net/ethernet/intel/e1000e/netdev.c
>>> index 327dfe5bedc0..47b89aac7969 100644
>>> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
>>> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
>>> @@ -3223,14 +3223,6 @@ static void e1000_configure_rx(struct
>>> e1000_adapter *adapter)
>>> */
>>> ew32(RXDCTL(0), E1000_RXDCTL_DMA_BURST_ENABLE);
>>> ew32(RXDCTL(1), E1000_RXDCTL_DMA_BURST_ENABLE);
>>> -
>>> - /* override the delay timers for enabling bursting, only if
>>> - * the value was not set by the user via module options
>>> - */
>>> - if (adapter->rx_int_delay == DEFAULT_RDTR)
>>> - adapter->rx_int_delay = BURST_RDTR;
>>> - if (adapter->rx_abs_int_delay == DEFAULT_RADV)
>>> - adapter->rx_abs_int_delay = BURST_RADV;
>>> }
>>> /* set the Receive Delay Timer Register */
>>> diff --git a/drivers/net/ethernet/intel/e1000e/param.c
>>> b/drivers/net/ethernet/intel/e1000e/param.c
>>> index 6d8c39abee16..bb696c98f9b0 100644
>>> --- a/drivers/net/ethernet/intel/e1000e/param.c
>>> +++ b/drivers/net/ethernet/intel/e1000e/param.c
>>> @@ -73,17 +73,25 @@ E1000_PARAM(TxAbsIntDelay, "Transmit Absolute
>>> Interrupt Delay");
>>> /* Receive Interrupt Delay in units of 1.024 microseconds
>>> * hardware will likely hang if you set this to anything but zero.
>>> *
>>> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
>>> + *
>>> * Valid Range: 0-65535
>>> */
>>> E1000_PARAM(RxIntDelay, "Receive Interrupt Delay");
>>> +#define DEFAULT_RDTR 0
>>> +#define BURST_RDTR 0x20
>>> #define MAX_RXDELAY 0xFFFF
>>> #define MIN_RXDELAY 0
>>> /* Receive Absolute Interrupt Delay in units of 1.024 microseconds
>>> + *
>>> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
>>> *
>>> * Valid Range: 0-65535
>>> */
>>> E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay");
>>> +#define DEFAULT_RADV 8
>>> +#define BURST_RADV 0x20
>>> #define MAX_RXABSDELAY 0xFFFF
>>> #define MIN_RXABSDELAY 0
>>> @@ -297,6 +305,9 @@ void e1000e_check_options(struct e1000_adapter
>>> *adapter)
>>> .max = MAX_RXDELAY } }
>>> };
>>> + if (adapter->flags2 & FLAG2_DMA_BURST)
>>> + opt.def = BURST_RDTR;
>>> +
>>> if (num_RxIntDelay > bd) {
>>> adapter->rx_int_delay = RxIntDelay[bd];
>>> e1000_validate_option(&adapter->rx_int_delay, &opt,
>>> @@ -307,7 +318,7 @@ void e1000e_check_options(struct e1000_adapter
>>> *adapter)
>>> }
>>> /* Receive Absolute Interrupt Delay */
>>> {
>>> - static const struct e1000_option opt = {
>>> + static struct e1000_option opt = {
>>> .type = range_option,
>>> .name = "Receive Absolute Interrupt Delay",
>>> .err = "using default of "
>>> @@ -317,6 +328,9 @@ void e1000e_check_options(struct e1000_adapter
>>> *adapter)
>>> .max = MAX_RXABSDELAY } }
>>> };
>>> + if (adapter->flags2 & FLAG2_DMA_BURST)
>>> + opt.def = BURST_RADV;
>>> +
>>> if (num_RxAbsIntDelay > bd) {
>>> adapter->rx_abs_int_delay = RxAbsIntDelay[bd];
>>> e1000_validate_option(&adapter->rx_abs_int_delay, &opt,
>>
>> This patch looks good to me, but I would like to hear a second opinion.
>>
>> _______________________________________________
>> Intel-wired-lan mailing list
>> Intel-wired-lan@osuosl.org
>> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan
>
>
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan@osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan
^ permalink raw reply
* Re: [Intel-wired-lan] [PATCH] e1000e: apply burst mode settings only on default
From: Neftin, Sasha @ 2017-08-27 8:32 UTC (permalink / raw)
To: Willem de Bruijn, jeffrey.t.kirsher
Cc: netdev, Willem de Bruijn, intel-wired-lan
In-Reply-To: <291b863f-552e-2f3b-f658-e812d0848949@intel.com>
On 8/27/2017 11:30, Neftin, Sasha wrote:
> On 8/25/2017 18:06, Willem de Bruijn wrote:
>> From: Willem de Bruijn <willemb@google.com>
>>
>> Devices that support FLAG2_DMA_BURST have different default values
>> for RDTR and RADV. Apply burst mode default settings only when no
>> explicit value was passed at module load.
>>
>> The RDTR default is zero. If the module is loaded for low latency
>> operation with RxIntDelay=0, do not override this value with a burst
>> default of 32.
>>
>> Move the decision to apply burst values earlier, where explicitly
>> initialized module variables can be distinguished from defaults.
>>
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>> ---
>> drivers/net/ethernet/intel/e1000e/e1000.h | 4 ----
>> drivers/net/ethernet/intel/e1000e/netdev.c | 8 --------
>> drivers/net/ethernet/intel/e1000e/param.c | 16 +++++++++++++++-
>> 3 files changed, 15 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h
>> b/drivers/net/ethernet/intel/e1000e/e1000.h
>> index 98e68888abb1..2311b31bdcac 100644
>> --- a/drivers/net/ethernet/intel/e1000e/e1000.h
>> +++ b/drivers/net/ethernet/intel/e1000e/e1000.h
>> @@ -94,10 +94,6 @@ struct e1000_info;
>> */
>> #define E1000_CHECK_RESET_COUNT 25
>> -#define DEFAULT_RDTR 0
>> -#define DEFAULT_RADV 8
>> -#define BURST_RDTR 0x20
>> -#define BURST_RADV 0x20
>> #define PCICFG_DESC_RING_STATUS 0xe4
>> #define FLUSH_DESC_REQUIRED 0x100
>> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c
>> b/drivers/net/ethernet/intel/e1000e/netdev.c
>> index 327dfe5bedc0..47b89aac7969 100644
>> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
>> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
>> @@ -3223,14 +3223,6 @@ static void e1000_configure_rx(struct
>> e1000_adapter *adapter)
>> */
>> ew32(RXDCTL(0), E1000_RXDCTL_DMA_BURST_ENABLE);
>> ew32(RXDCTL(1), E1000_RXDCTL_DMA_BURST_ENABLE);
>> -
>> - /* override the delay timers for enabling bursting, only if
>> - * the value was not set by the user via module options
>> - */
>> - if (adapter->rx_int_delay == DEFAULT_RDTR)
>> - adapter->rx_int_delay = BURST_RDTR;
>> - if (adapter->rx_abs_int_delay == DEFAULT_RADV)
>> - adapter->rx_abs_int_delay = BURST_RADV;
>> }
>> /* set the Receive Delay Timer Register */
>> diff --git a/drivers/net/ethernet/intel/e1000e/param.c
>> b/drivers/net/ethernet/intel/e1000e/param.c
>> index 6d8c39abee16..bb696c98f9b0 100644
>> --- a/drivers/net/ethernet/intel/e1000e/param.c
>> +++ b/drivers/net/ethernet/intel/e1000e/param.c
>> @@ -73,17 +73,25 @@ E1000_PARAM(TxAbsIntDelay, "Transmit Absolute
>> Interrupt Delay");
>> /* Receive Interrupt Delay in units of 1.024 microseconds
>> * hardware will likely hang if you set this to anything but zero.
>> *
>> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
>> + *
>> * Valid Range: 0-65535
>> */
>> E1000_PARAM(RxIntDelay, "Receive Interrupt Delay");
>> +#define DEFAULT_RDTR 0
>> +#define BURST_RDTR 0x20
>> #define MAX_RXDELAY 0xFFFF
>> #define MIN_RXDELAY 0
>> /* Receive Absolute Interrupt Delay in units of 1.024 microseconds
>> + *
>> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
>> *
>> * Valid Range: 0-65535
>> */
>> E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay");
>> +#define DEFAULT_RADV 8
>> +#define BURST_RADV 0x20
>> #define MAX_RXABSDELAY 0xFFFF
>> #define MIN_RXABSDELAY 0
>> @@ -297,6 +305,9 @@ void e1000e_check_options(struct e1000_adapter
>> *adapter)
>> .max = MAX_RXDELAY } }
>> };
>> + if (adapter->flags2 & FLAG2_DMA_BURST)
>> + opt.def = BURST_RDTR;
>> +
>> if (num_RxIntDelay > bd) {
>> adapter->rx_int_delay = RxIntDelay[bd];
>> e1000_validate_option(&adapter->rx_int_delay, &opt,
>> @@ -307,7 +318,7 @@ void e1000e_check_options(struct e1000_adapter
>> *adapter)
>> }
>> /* Receive Absolute Interrupt Delay */
>> {
>> - static const struct e1000_option opt = {
>> + static struct e1000_option opt = {
>> .type = range_option,
>> .name = "Receive Absolute Interrupt Delay",
>> .err = "using default of "
>> @@ -317,6 +328,9 @@ void e1000e_check_options(struct e1000_adapter
>> *adapter)
>> .max = MAX_RXABSDELAY } }
>> };
>> + if (adapter->flags2 & FLAG2_DMA_BURST)
>> + opt.def = BURST_RADV;
>> +
>> if (num_RxAbsIntDelay > bd) {
>> adapter->rx_abs_int_delay = RxAbsIntDelay[bd];
>> e1000_validate_option(&adapter->rx_abs_int_delay, &opt,
>
> This patch looks good to me, but I would like to hear a second opinion.
>
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan@osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan
^ permalink raw reply
* Re: [patch net-next 11/12] mlxsw: spectrum_dpipe: Add support for IPv4 host table dump
From: Arkadi Sharshevsky @ 2017-08-27 8:31 UTC (permalink / raw)
To: David Ahern, Jiri Pirko, netdev; +Cc: davem, idosch, mlxsw
In-Reply-To: <a4335a01-a98b-573c-9747-b4039188340c@gmail.com>
On 08/25/2017 10:51 PM, David Ahern wrote:
> On 8/25/17 2:26 AM, Arkadi Sharshevsky wrote:
>>
>>
>> On 08/24/2017 10:26 PM, David Ahern wrote:
>>> On 8/23/17 11:40 PM, Jiri Pirko wrote:
>>>> +static int
>>>> +mlxsw_sp_dpipe_table_host_entries_get(struct mlxsw_sp *mlxsw_sp,
>>>> + struct devlink_dpipe_entry *entry,
>>>> + bool counters_enabled,
>>>> + struct devlink_dpipe_dump_ctx *dump_ctx,
>>>> + int type)
>>>> +{
>>>> + int rif_neigh_count = 0;
>>>> + int rif_neigh_skip = 0;
>>>> + int neigh_count = 0;
>>>> + int rif_count;
>>>> + int i, j;
>>>> + int err;
>>>> +
>>>> + rtnl_lock();
>>>
>>> Why does a h/w driver dumping its tables need the rtnl lock?
>>>
>>
>> This table represents the hw IPv4 arp table, and the
>> driver depends on rtnl to be held.
>>
>
> Meaning mlxsw does not have its own locks protecting data structures --
> e.g., rif adds and deletes, so it is relying on rtnl?
>
> Also, this dpipe capability seems to be just dumping data structures
> maintained by the driver. ie., you can compare the mlxsw view of
> networking state to IPv4 and IPv6 level tables. Any plans to offer a
> command that reads data from the h/w and passes that back to the user?
> i.e, a command to compare kernel tables to h/w state?
>
So this infra should provide several things-
1) Reveal the interactions between various hardware tables
2) Counters for this tables
3) Debuggability
The first two can be achieved right now. Regarding debuggability, which
is a bit vague, the current assumption is that the driver's internal data
structures are synced with the hardware (which is not always true), and maybe
are not synced with the kernel, so this can be achieved right now by
dumping the internal state of the driver. Furthermore, the counters are
dumped from the hardware and give the user an additional indication.
I completely agree that the hardware should be dumped in order to
validate that the internal data structures are really synced with HW. This
could be useful for observing data corruptions inside the ASIC and
various complex bugs.
In order to address that, I thought about maybe adding a flag called
"validate_hw" so that during the dump the driver<-->hw state could be
validated.
What do you think about it?
Thanks,
Arkadi
^ permalink raw reply
* Re: [Intel-wired-lan] [PATCH] e1000e: apply burst mode settings only on default
From: Neftin, Sasha @ 2017-08-27 8:30 UTC (permalink / raw)
To: Willem de Bruijn, jeffrey.t.kirsher
Cc: netdev, Willem de Bruijn, intel-wired-lan
In-Reply-To: <20170825150626.2843-1-willemdebruijn.kernel@gmail.com>
On 8/25/2017 18:06, Willem de Bruijn wrote:
> From: Willem de Bruijn <willemb@google.com>
>
> Devices that support FLAG2_DMA_BURST have different default values
> for RDTR and RADV. Apply burst mode default settings only when no
> explicit value was passed at module load.
>
> The RDTR default is zero. If the module is loaded for low latency
> operation with RxIntDelay=0, do not override this value with a burst
> default of 32.
>
> Move the decision to apply burst values earlier, where explicitly
> initialized module variables can be distinguished from defaults.
>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
> drivers/net/ethernet/intel/e1000e/e1000.h | 4 ----
> drivers/net/ethernet/intel/e1000e/netdev.c | 8 --------
> drivers/net/ethernet/intel/e1000e/param.c | 16 +++++++++++++++-
> 3 files changed, 15 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
> index 98e68888abb1..2311b31bdcac 100644
> --- a/drivers/net/ethernet/intel/e1000e/e1000.h
> +++ b/drivers/net/ethernet/intel/e1000e/e1000.h
> @@ -94,10 +94,6 @@ struct e1000_info;
> */
> #define E1000_CHECK_RESET_COUNT 25
>
> -#define DEFAULT_RDTR 0
> -#define DEFAULT_RADV 8
> -#define BURST_RDTR 0x20
> -#define BURST_RADV 0x20
> #define PCICFG_DESC_RING_STATUS 0xe4
> #define FLUSH_DESC_REQUIRED 0x100
>
> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
> index 327dfe5bedc0..47b89aac7969 100644
> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
> @@ -3223,14 +3223,6 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
> */
> ew32(RXDCTL(0), E1000_RXDCTL_DMA_BURST_ENABLE);
> ew32(RXDCTL(1), E1000_RXDCTL_DMA_BURST_ENABLE);
> -
> - /* override the delay timers for enabling bursting, only if
> - * the value was not set by the user via module options
> - */
> - if (adapter->rx_int_delay == DEFAULT_RDTR)
> - adapter->rx_int_delay = BURST_RDTR;
> - if (adapter->rx_abs_int_delay == DEFAULT_RADV)
> - adapter->rx_abs_int_delay = BURST_RADV;
> }
>
> /* set the Receive Delay Timer Register */
> diff --git a/drivers/net/ethernet/intel/e1000e/param.c b/drivers/net/ethernet/intel/e1000e/param.c
> index 6d8c39abee16..bb696c98f9b0 100644
> --- a/drivers/net/ethernet/intel/e1000e/param.c
> +++ b/drivers/net/ethernet/intel/e1000e/param.c
> @@ -73,17 +73,25 @@ E1000_PARAM(TxAbsIntDelay, "Transmit Absolute Interrupt Delay");
> /* Receive Interrupt Delay in units of 1.024 microseconds
> * hardware will likely hang if you set this to anything but zero.
> *
> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
> + *
> * Valid Range: 0-65535
> */
> E1000_PARAM(RxIntDelay, "Receive Interrupt Delay");
> +#define DEFAULT_RDTR 0
> +#define BURST_RDTR 0x20
> #define MAX_RXDELAY 0xFFFF
> #define MIN_RXDELAY 0
>
> /* Receive Absolute Interrupt Delay in units of 1.024 microseconds
> + *
> + * Burst variant is used as default if device has FLAG2_DMA_BURST.
> *
> * Valid Range: 0-65535
> */
> E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay");
> +#define DEFAULT_RADV 8
> +#define BURST_RADV 0x20
> #define MAX_RXABSDELAY 0xFFFF
> #define MIN_RXABSDELAY 0
>
> @@ -297,6 +305,9 @@ void e1000e_check_options(struct e1000_adapter *adapter)
> .max = MAX_RXDELAY } }
> };
>
> + if (adapter->flags2 & FLAG2_DMA_BURST)
> + opt.def = BURST_RDTR;
> +
> if (num_RxIntDelay > bd) {
> adapter->rx_int_delay = RxIntDelay[bd];
> e1000_validate_option(&adapter->rx_int_delay, &opt,
> @@ -307,7 +318,7 @@ void e1000e_check_options(struct e1000_adapter *adapter)
> }
> /* Receive Absolute Interrupt Delay */
> {
> - static const struct e1000_option opt = {
> + static struct e1000_option opt = {
> .type = range_option,
> .name = "Receive Absolute Interrupt Delay",
> .err = "using default of "
> @@ -317,6 +328,9 @@ void e1000e_check_options(struct e1000_adapter *adapter)
> .max = MAX_RXABSDELAY } }
> };
>
> + if (adapter->flags2 & FLAG2_DMA_BURST)
> + opt.def = BURST_RADV;
> +
> if (num_RxAbsIntDelay > bd) {
> adapter->rx_abs_int_delay = RxAbsIntDelay[bd];
> e1000_validate_option(&adapter->rx_abs_int_delay, &opt,
This patch looks good to me, but I would like to hear a second opinion.
^ permalink raw reply
* Re: [PATCH] pktgen: add a new sample script for 40G and above link testing
From: Tariq Toukan @ 2017-08-27 8:25 UTC (permalink / raw)
To: Robert Hoo, davem, tariqt, brouer, kyle.leet; +Cc: netdev, robert.hu
In-Reply-To: <1503653196-64418-1-git-send-email-robert.hu@linux.intel.com>
On 25/08/2017 12:26 PM, Robert Hoo wrote:
> (Sorry for yesterday's wrong sending, I finally fixed my MTA and git
> send-email settings.)
>
> It's hard to benchmark 40G+ network bandwidth using ordinary
> tools like iperf, netperf (see reference 1).
> Pktgen, packet generator from Kernel space, shall be a candidate.
> I then tried with pktgen multiqueue sample scripts, but still
> cannot reach line rate.
Try samples 03 and 04.
> I then derived this NUMA awared irq affinity sample script from
> multi-queue sample one, successfully benchmarked 40G link. I think this can
> also be useful for 100G reference, though I haven't got device to test yet.
>
> This script simply does:
> Detect $DEV's NUMA node belonging.
> Bind each thread (processor from that NUMA node) with each $DEV queue's
> irq affinity, 1:1 mapping.
> How many '-t' threads input determines how many queues will be
> utilized.
I agree this is an essential capability.
This was the main reason I added support for the -f argument.
Using it, I could choose cores of local NUMA, especially for single
thread, or when cores of the NUMA are sequential.
>
> Tested with Intel XL710 NIC with Cisco 3172 switch.
>
> It would be even slightly better if the irqbalance service is turned
> off outside.
>
> References:
> https://people.netfilter.org/hawk/presentations/LCA2015/net_stack_challenges_100G_LCA2015.pdf
> http://www.intel.cn/content/dam/www/public/us/en/documents/reference-guides/xl710-x710-performance-tuning-linux-guide.pdf
>
> Signed-off-by: Robert Hoo <robert.hu@linux.intel.com>
> ---
Regards,
Tariq Toukan
^ permalink raw reply
* [PATCH] be2net: Fix some u16 fields appropriately
From: Haishuang Yan @ 2017-08-27 7:24 UTC (permalink / raw)
To: Sathya Perla, Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur
Cc: netdev, linux-kernel, Haishuang Yan
In be_tx_compl_process, frag_index is declared as u32, so it's better to
declare last_index as u32 also.
CC: Ajit Khaparde <ajit.khaparde@broadcom.com>
Fixes: b0fd2eb28bd4 ("be2net: Declare some u16 fields as u32 to improve
performance")
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
---
drivers/net/ethernet/emulex/benet/be.h | 2 +-
drivers/net/ethernet/emulex/benet/be_main.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index 674cf9d..2ba4d61 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -255,7 +255,7 @@ struct be_tx_stats {
/* Structure to hold some data of interest obtained from a TX CQE */
struct be_tx_compl_info {
u8 status; /* Completion status */
- u16 end_index; /* Completed TXQ Index */
+ u32 end_index; /* Completed TXQ Index */
};
struct be_tx_obj {
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 319eee3..3645344 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -2606,7 +2606,7 @@ static struct be_tx_compl_info *be_tx_compl_get(struct be_tx_obj *txo)
}
static u16 be_tx_compl_process(struct be_adapter *adapter,
- struct be_tx_obj *txo, u16 last_index)
+ struct be_tx_obj *txo, u32 last_index)
{
struct sk_buff **sent_skbs = txo->sent_skb_list;
struct be_queue_info *txq = &txo->q;
--
1.8.3.1
^ permalink raw reply related
* [PATCH] igb: check memory allocation failure
From: Christophe JAILLET @ 2017-08-27 6:39 UTC (permalink / raw)
To: jeffrey.t.kirsher
Cc: intel-wired-lan, netdev, linux-kernel, kernel-janitors,
Christophe JAILLET
Check for memory allocation failures and return -ENOMEM in such cases, as
already done for other memory allocations in this function.
This avoids a NULL pointer dereference.
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
drivers/net/ethernet/intel/igb/igb_main.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index fd4a46b03cc8..837d9b46a390 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3162,6 +3162,8 @@ static int igb_sw_init(struct igb_adapter *adapter)
/* Setup and initialize a copy of the hw vlan table array */
adapter->shadow_vfta = kcalloc(E1000_VLAN_FILTER_TBL_SIZE, sizeof(u32),
GFP_ATOMIC);
+ if (!adapter->shadow_vfta)
+ return -ENOMEM;
/* This call may decrease the number of queues */
if (igb_init_interrupt_scheme(adapter, true)) {
--
2.11.0
^ permalink raw reply related
* [PATCH net] bridge: check for null fdb->dst before notifying switchdev drivers
From: Roopa Prabhu @ 2017-08-27 4:13 UTC (permalink / raw)
To: davem; +Cc: netdev, arkadis
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Current switchdev drivers don't seem to support offloading fdb
entries pointing to the bridge device, which have fdb->dst
not set to any port. This patch adds a NULL fdb->dst check in
the switchdev notifier code.
This patch fixes the below NULL ptr dereference:
$bridge fdb add 00:02:00:00:00:33 dev br0 self
[ 69.953374] BUG: unable to handle kernel NULL pointer dereference at
0000000000000008
[ 69.954044] IP: br_switchdev_fdb_notify+0x29/0x80
[ 69.954044] PGD 66527067
[ 69.954044] P4D 66527067
[ 69.954044] PUD 7899c067
[ 69.954044] PMD 0
[ 69.954044]
[ 69.954044] Oops: 0000 [#1] SMP
[ 69.954044] Modules linked in:
[ 69.954044] CPU: 1 PID: 3074 Comm: bridge Not tainted 4.13.0-rc6+ #1
[ 69.954044] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org
04/01/2014
[ 69.954044] task: ffff88007b827140 task.stack: ffffc90001564000
[ 69.954044] RIP: 0010:br_switchdev_fdb_notify+0x29/0x80
[ 69.954044] RSP: 0018:ffffc90001567918 EFLAGS: 00010246
[ 69.954044] RAX: 0000000000000000 RBX: ffff8800795e0880 RCX:
00000000000000c0
[ 69.954044] RDX: ffffc90001567920 RSI: 000000000000001c RDI:
ffff8800795d0600
[ 69.954044] RBP: ffffc90001567938 R08: ffff8800795d0600 R09:
0000000000000000
[ 69.954044] R10: ffffc90001567a88 R11: ffff88007b849400 R12:
ffff8800795e0880
[ 69.954044] R13: ffff8800795d0600 R14: ffffffff81ef8880 R15:
000000000000001c
[ 69.954044] FS: 00007f93d3085700(0000) GS:ffff88007fd00000(0000)
knlGS:0000000000000000
[ 69.954044] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 69.954044] CR2: 0000000000000008 CR3: 0000000066551000 CR4:
00000000000006e0
[ 69.954044] Call Trace:
[ 69.954044] fdb_notify+0x3f/0xf0
[ 69.954044] __br_fdb_add.isra.12+0x1a7/0x370
[ 69.954044] br_fdb_add+0x178/0x280
[ 69.954044] rtnl_fdb_add+0x10a/0x200
[ 69.954044] rtnetlink_rcv_msg+0x1b4/0x240
[ 69.954044] ? skb_free_head+0x21/0x40
[ 69.954044] ? rtnl_calcit.isra.18+0xf0/0xf0
[ 69.954044] netlink_rcv_skb+0xed/0x120
[ 69.954044] rtnetlink_rcv+0x15/0x20
[ 69.954044] netlink_unicast+0x180/0x200
[ 69.954044] netlink_sendmsg+0x291/0x370
[ 69.954044] ___sys_sendmsg+0x180/0x2e0
[ 69.954044] ? filemap_map_pages+0x2db/0x370
[ 69.954044] ? do_wp_page+0x11d/0x420
[ 69.954044] ? __handle_mm_fault+0x794/0xd80
[ 69.954044] ? vma_link+0xcb/0xd0
[ 69.954044] __sys_sendmsg+0x4c/0x90
[ 69.954044] SyS_sendmsg+0x12/0x20
[ 69.954044] do_syscall_64+0x63/0xe0
[ 69.954044] entry_SYSCALL64_slow_path+0x25/0x25
[ 69.954044] RIP: 0033:0x7f93d2bad690
[ 69.954044] RSP: 002b:00007ffc7217a638 EFLAGS: 00000246 ORIG_RAX:
000000000000002e
[ 69.954044] RAX: ffffffffffffffda RBX: 00007ffc72182eac RCX:
00007f93d2bad690
[ 69.954044] RDX: 0000000000000000 RSI: 00007ffc7217a670 RDI:
0000000000000003
[ 69.954044] RBP: 0000000059a1f7f8 R08: 0000000000000006 R09:
000000000000000a
[ 69.954044] R10: 00007ffc7217a400 R11: 0000000000000246 R12:
00007ffc7217a670
[ 69.954044] R13: 00007ffc72182a98 R14: 00000000006114c0 R15:
00007ffc72182aa0
[ 69.954044] Code: 1f 00 66 66 66 66 90 55 48 89 e5 48 83 ec 20 f6 47
20 04 74 0a 83 fe 1c 74 09 83 fe 1d 74 2c c9 66 90 c3 48 8b 47 10 48 8d
55 e8 <48> 8b 70 08 0f b7 47 1e 48 83 c7 18 48 89 7d f0 bf 03 00 00 00
[ 69.954044] RIP: br_switchdev_fdb_notify+0x29/0x80 RSP:
ffffc90001567918
[ 69.954044] CR2: 0000000000000008
[ 69.954044] ---[ end trace 03e9eec4a82c238b ]---
Fixes: 6b26b51b1d13 ("net: bridge: Add support for notifying devices about FDB add/del")
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
net/bridge/br_switchdev.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 181a44d..f6b1c7d 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -115,7 +115,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
void
br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
{
- if (!fdb->added_by_user)
+ if (!fdb->added_by_user || !fdb->dst)
return;
switch (type) {
--
2.1.4
^ permalink raw reply related
* Re: [PATCH RFC WIP 0/5] IGMP snooping for local traffic
From: Andrew Lunn @ 2017-08-26 23:02 UTC (permalink / raw)
To: Nikolay Aleksandrov
Cc: netdev, Vivien Didelot, Florian Fainelli, jiri, roopa, stephen,
bridge
In-Reply-To: <c3f5050f-7a38-c28c-31d0-df5909259fb1@cumulusnetworks.com>
> Hi Andrew,
>
> Have you taken a look at mglist (the boolean, probably needs a rename) ? It is for
> exactly that purpose, to track which groups the bridge is interested in.
> I assume I'm forgetting or missing something here.
>
> > performed. With a pure software bridge, it is not required. All
> > mulitcast frames are passed to the brX interface, and the network
>
> If mglist (again the boolean) is false then they won't be passed up.
>
> > stack filters them, as it does for any interface. However, when
> > hardware offload is involved, things change. We should program the
> > hardware to only send multcast packets to the host when the host has
> > in interest in them.
>
> Granted the boolean mglist might need some changes (esp. with host group leave)
> but I think it can be used to program switchdev for host join/leave, can't
> we adjust its behaviour instead of introducing this complexity and avoid many
> headaches ?
I would like to avoid this complexity as well. I will take a look at
mglist. Thanks for the hint.
Andrew
^ permalink raw reply
* Re: [PATCH RFC WIP 0/5] IGMP snooping for local traffic
From: Nikolay Aleksandrov @ 2017-08-26 22:40 UTC (permalink / raw)
To: Andrew Lunn, netdev; +Cc: Florian Fainelli, Vivien Didelot, roopa, bridge, jiri
In-Reply-To: <c3f5050f-7a38-c28c-31d0-df5909259fb1@cumulusnetworks.com>
On 27.08.2017 01:17, Nikolay Aleksandrov wrote:
> On 26/08/17 23:56, Andrew Lunn wrote:
>> This is a WIP patchset i would like comments on from bridge, switchdev
>> and hardware offload people.
>>
>> The linux bridge supports IGMP snooping. It will listen to IGMP
>> reports on bridge ports and keep track of which groups have been
>> joined on an interface. It will then forward multicast based on this
>> group membership.
>>
>> When the bridge adds or removed groups from an interface, it uses
>> switchdev to request the hardware add an mdb to a port, so the
>> hardware can perform the selective forwarding between ports.
>>
>> What is not covered by the current bridge code, is IGMP joins/leaves
>> from the host on the brX interface. No such monitoring is
>
> Hi Andrew,
>
> Have you taken a look at mglist (the boolean, probably needs a rename) ? It is for
> exactly that purpose, to track which groups the bridge is interested in.
> I assume I'm forgetting or missing something here.
>
>> performed. With a pure software bridge, it is not required. All
>> mulitcast frames are passed to the brX interface, and the network
>
> If mglist (again the boolean) is false then they won't be passed up.
>
>> stack filters them, as it does for any interface. However, when
>> hardware offload is involved, things change. We should program the
>> hardware to only send multcast packets to the host when the host has
>> in interest in them.
>
> Granted the boolean mglist might need some changes (esp. with host group leave)
> but I think it can be used to program switchdev for host join/leave, can't
> we adjust its behaviour instead of introducing this complexity and avoid many
> headaches ?
>
>>
>> Thus we need to perform IGMP snooping on the brX interface, just like
>> any other interface of the bridge. However, currently the brX
>> interface is missing all the needed data structures to do this. There
>> is no net_bridge_port structure for the brX interface. This strucuture
>> is created when an interface is added to the bridge. But the brX
>> interface is not a member of the bridge. So this patchset makes the
>> brX interface a first class member of the bridge. When the brX
>> interface is opened, the interface is added to the bridge. A
>> net_bridge_port is allocated for it, and IGMP snooping is performed as
>> usual.
>
> I have actually discussed this idea long time ago with Vlad and it has very nice
> upsides (most important one removing br/port checks everywhere) but it blows up
> fast with special cases for the bridge and things look very similar. You'll need
> to rework the whole bridge and turn every bridge special case into either a port
> generic one or again bridge-specific special case but with a check for the new flag.
> I will not point out every bug that comes out of this, but registering the bridge
> rx handler to itself is simply wrong on many levels and breaks many setups.
This was a digression about making the bridge a proper port of itself
(e.g. port 0, linked and all), it is only tangential to this
implementation as it doesn't link the new port.
>
>>
>> There are some complexities here. Some assumptions are broken, like
>> the master interface of a port interface is the bridge interface. The
>> brX interface cannot be its own master. The use of
>> netdev_master_upper_dev_get() within the bridge code has been changed
>> to reflecit this. The bridge receive handler needs to not process
>> frames for the brX interface, etc.
>>
>> The interface downward to the hardware is also an issue. The code
>> presented here is a hack and needs to change. But that is secondary
>> and can be solved once it is agreed how the bridge needs to change to
>> support this use case.
>
> Definitely agree with this statement. :-)
>
>>
>> Comment welcome and wanted.
>>
>> Andrew
>>
>> Andrew Lunn (5):
>> net: rtnetlink: Handle bridge port without upper device
>> net: bridge: Skip receive handler on brX interface
>> net: bridge: Make the brX interface a member of the bridge
>> net: dsa: HACK: Handle MDB add/remove for none-switch ports
>> net: dsa: Don't include CPU port when adding MDB to a port
>>
>> include/linux/if_bridge.h | 1 +
>> net/bridge/br_device.c | 12 ++++++++++--
>> net/bridge/br_if.c | 37 ++++++++++++++++++++++++-------------
>> net/bridge/br_input.c | 4 ++++
>> net/bridge/br_mdb.c | 2 --
>> net/bridge/br_multicast.c | 7 ++++---
>> net/bridge/br_private.h | 1 +
>> net/core/rtnetlink.c | 23 +++++++++++++++++++++--
>> net/dsa/port.c | 19 +++++++++++++++++--
>> net/dsa/switch.c | 2 +-
>> 10 files changed, 83 insertions(+), 25 deletions(-)
>>
>
^ permalink raw reply
* Re: [PATCH RFC WIP 0/5] IGMP snooping for local traffic
From: Nikolay Aleksandrov @ 2017-08-26 22:17 UTC (permalink / raw)
To: Andrew Lunn, netdev
Cc: Vivien Didelot, Florian Fainelli, jiri, roopa, stephen, bridge
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>
On 26/08/17 23:56, Andrew Lunn wrote:
> This is a WIP patchset i would like comments on from bridge, switchdev
> and hardware offload people.
>
> The linux bridge supports IGMP snooping. It will listen to IGMP
> reports on bridge ports and keep track of which groups have been
> joined on an interface. It will then forward multicast based on this
> group membership.
>
> When the bridge adds or removed groups from an interface, it uses
> switchdev to request the hardware add an mdb to a port, so the
> hardware can perform the selective forwarding between ports.
>
> What is not covered by the current bridge code, is IGMP joins/leaves
> from the host on the brX interface. No such monitoring is
Hi Andrew,
Have you taken a look at mglist (the boolean, probably needs a rename) ? It is for
exactly that purpose, to track which groups the bridge is interested in.
I assume I'm forgetting or missing something here.
> performed. With a pure software bridge, it is not required. All
> mulitcast frames are passed to the brX interface, and the network
If mglist (again the boolean) is false then they won't be passed up.
> stack filters them, as it does for any interface. However, when
> hardware offload is involved, things change. We should program the
> hardware to only send multcast packets to the host when the host has
> in interest in them.
Granted the boolean mglist might need some changes (esp. with host group leave)
but I think it can be used to program switchdev for host join/leave, can't
we adjust its behaviour instead of introducing this complexity and avoid many
headaches ?
>
> Thus we need to perform IGMP snooping on the brX interface, just like
> any other interface of the bridge. However, currently the brX
> interface is missing all the needed data structures to do this. There
> is no net_bridge_port structure for the brX interface. This strucuture
> is created when an interface is added to the bridge. But the brX
> interface is not a member of the bridge. So this patchset makes the
> brX interface a first class member of the bridge. When the brX
> interface is opened, the interface is added to the bridge. A
> net_bridge_port is allocated for it, and IGMP snooping is performed as
> usual.
I have actually discussed this idea long time ago with Vlad and it has very nice
upsides (most important one removing br/port checks everywhere) but it blows up
fast with special cases for the bridge and things look very similar. You'll need
to rework the whole bridge and turn every bridge special case into either a port
generic one or again bridge-specific special case but with a check for the new flag.
I will not point out every bug that comes out of this, but registering the bridge
rx handler to itself is simply wrong on many levels and breaks many setups.
>
> There are some complexities here. Some assumptions are broken, like
> the master interface of a port interface is the bridge interface. The
> brX interface cannot be its own master. The use of
> netdev_master_upper_dev_get() within the bridge code has been changed
> to reflecit this. The bridge receive handler needs to not process
> frames for the brX interface, etc.
>
> The interface downward to the hardware is also an issue. The code
> presented here is a hack and needs to change. But that is secondary
> and can be solved once it is agreed how the bridge needs to change to
> support this use case.
Definitely agree with this statement. :-)
>
> Comment welcome and wanted.
>
> Andrew
>
> Andrew Lunn (5):
> net: rtnetlink: Handle bridge port without upper device
> net: bridge: Skip receive handler on brX interface
> net: bridge: Make the brX interface a member of the bridge
> net: dsa: HACK: Handle MDB add/remove for none-switch ports
> net: dsa: Don't include CPU port when adding MDB to a port
>
> include/linux/if_bridge.h | 1 +
> net/bridge/br_device.c | 12 ++++++++++--
> net/bridge/br_if.c | 37 ++++++++++++++++++++++++-------------
> net/bridge/br_input.c | 4 ++++
> net/bridge/br_mdb.c | 2 --
> net/bridge/br_multicast.c | 7 ++++---
> net/bridge/br_private.h | 1 +
> net/core/rtnetlink.c | 23 +++++++++++++++++++++--
> net/dsa/port.c | 19 +++++++++++++++++--
> net/dsa/switch.c | 2 +-
> 10 files changed, 83 insertions(+), 25 deletions(-)
>
^ permalink raw reply
* Re: [PATCH v4 4/5] net: stmmac: dwmac-sun8i: choose internal PHY via phy-is-integrated
From: Andrew Lunn @ 2017-08-26 21:20 UTC (permalink / raw)
To: Corentin Labbe
Cc: robh+dt, mark.rutland, maxime.ripard, wens, linux,
peppe.cavallaro, alexandre.torgue, f.fainelli, icenowy, netdev,
devicetree, linux-arm-kernel, linux-kernel
In-Reply-To: <20170826073311.25612-5-clabbe.montjoie@gmail.com>
Hi Corentin
I think we have now all agreed this is an mdio-mux, plus it is also an
MII mux. We should represent that in device tree. This patchset does
this. However, as it is now, the mux structure in DT is ignored. All
it does is search for the phy-is-integrated flags and goes on that.
I made the comment that the device tree representation cannot be
implemented using an MDIO mux driver, because of driver loading
issues. However, the core of the MDIO mux code is just a library,
symbols exported as GPL, free for anything to use.
What I think should happen is the mdio-mux is implemented inside the
MAC driver, using the mux-core as a library. The device tree structure
of a mux is then reflected within Linux. The mux switch callback is
implemented within the MAC driver. So it can reset the MAC when the
mux is switched. The 'phy-is-integrated' property is then no longer
needed.
I would suggest a binding something like:
emac: ethernet@1c0b000 {
compatible = "allwinner,sun8i-h3-emac";
syscon = <&syscon>;
reg = <0x01c0b000 0x104>;
interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "macirq";
resets = <&ccu RST_BUS_EMAC>;
reset-names = "stmmaceth";
clocks = <&ccu CLK_BUS_EMAC>;
clock-names = "stmmaceth";
#address-cells = <1>;
#size-cells = <0>;
phy-handle = <&int_mii_phy>;
phy-mode = "mii";
allwinner,leds-active-low;
mdio: mdio {
#address-cells = <1>;
#size-cells = <0>;
}
mdio-mux {
#address-cells = <1>;
#size-cells = <0>;
mdio@0 {
reg = <0>;
#address-cells = <1>;
#size-cells = <0>;
int_mii_phy: ethernet-phy@1 {
reg = <1>;
clocks = <&ccu CLK_BUS_EPHY>;
resets = <&ccu RST_BUS_EPHY>;
};
};
ext_mdio: mdio@0 {
#address-cells = <1>;
#size-cells = <0>;
ext_rgmii_phy: ethernet-phy@1 {
reg = <1>;
};
};
};
};
Andrew
^ permalink raw reply
* [PATCH RFC WIP 4/5] net: dsa: HACK: Handle MDB add/remove for none-switch ports
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
To: netdev
Cc: Vivien Didelot, Florian Fainelli, nikolay, jiri, roopa, stephen,
bridge, Andrew Lunn
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>
When there is a mdb added to a port which is not in the switch, we
need the switch to forward traffic for the group to the software
bridge, so it can forward it out the non-switch port.
The current implementation is a hack and will be replaced. Currently
only the bridge soft interface is supported. When there is a
join/leave on the soft interface, switchdev calls are made on the soft
interface device, brX. This does not have a switchdev ops structure
registered, so all lower interfaces of brX get their switchdev
function called. These are switch ports, and do have switchdev ops. By
comparing the original interface to the called interface, we can
determine this is not for a switch port, and add/remove the mdb to the
CPU port.
---
net/dsa/port.c | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/net/dsa/port.c b/net/dsa/port.c
index d6e07176df3f..d8e4bfefd97d 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -194,8 +194,15 @@ int dsa_port_mdb_add(struct dsa_port *dp,
.mdb = mdb,
};
- pr_info("dsa_port_mdb_add: %d %d", info.sw_index, info.port);
-
+ if (dp->netdev != mdb->obj.orig_dev) {
+ /* Not a port for this switch, so forward
+ * multicast out the CPU port to the bridge.
+ */
+ struct dsa_switch_tree *dst = dp->ds->dst;
+ struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
+ info.port = cpu_dp->index;
+ return dsa_port_notify(cpu_dp, DSA_NOTIFIER_MDB_ADD, &info);
+ }
return dsa_port_notify(dp, DSA_NOTIFIER_MDB_ADD, &info);
}
@@ -208,6 +215,14 @@ int dsa_port_mdb_del(struct dsa_port *dp,
.mdb = mdb,
};
+ if (dp->netdev != mdb->obj.orig_dev) {
+ struct dsa_switch_tree *dst = dp->ds->dst;
+ struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
+
+ info.port = cpu_dp->index;
+ return dsa_port_notify(cpu_dp, DSA_NOTIFIER_MDB_DEL, &info);
+ }
+
return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, &info);
}
--
2.14.1
^ permalink raw reply related
* [PATCH RFC WIP 3/5] net: bridge: Make the brX interface a member of the bridge
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
To: netdev
Cc: Vivien Didelot, Florian Fainelli, nikolay, jiri, roopa, stephen,
bridge, Andrew Lunn
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>
In order to perform IGMP snooping on the brX interface, it has to be
part of the bridge, so that the code snooping on normal bridge ports
keeps track of IGMP joins and leaves.
When the brX interface is opened, add the interface to the bridge.
When the brX interface is closed, remove it from the bridge.
This port does however need some special handling. So add a bridge
port flag, BR_SOFT_INTERFACE, indicating a port is the soft interface
of the bridge.
When the port is added to the bridge, the netdev for this port cannot
be linked to the master device, since it is the master device.
Similarly when removing the port, it cannot be unlinked from the
master device.
With the brX interface now being a member of the bridge, and having
all associated structures, we can process IGMP messages sent by the
interface. This is done by the br_multicast_rcv() function, which
takes the bridge_port structure as a parameter. This cannot be easily
found, so keep track of it in the net_bridge structure.
---
include/linux/if_bridge.h | 1 +
net/bridge/br_device.c | 12 ++++++++++--
net/bridge/br_if.c | 37 ++++++++++++++++++++++++-------------
net/bridge/br_mdb.c | 2 --
net/bridge/br_multicast.c | 7 ++++---
net/bridge/br_private.h | 1 +
6 files changed, 40 insertions(+), 20 deletions(-)
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 3cd18ac0697f..8a03821d1827 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -49,6 +49,7 @@ struct br_ip_list {
#define BR_MULTICAST_TO_UNICAST BIT(12)
#define BR_VLAN_TUNNEL BIT(13)
#define BR_BCAST_FLOOD BIT(14)
+#define BR_SOFT_INTERFACE BIT(15)
#define BR_DEFAULT_AGEING_TIME (300 * HZ)
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 861ae2a165f4..f27ca62fd4a5 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -69,7 +69,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_flood(br, skb, BR_PKT_MULTICAST, false, true);
goto out;
}
- if (br_multicast_rcv(br, NULL, skb, vid)) {
+ if (br_multicast_rcv(br, br->local_port, skb, vid)) {
kfree_skb(skb);
goto out;
}
@@ -133,6 +133,14 @@ static void br_dev_uninit(struct net_device *dev)
static int br_dev_open(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
+ int err;
+
+ err = br_add_if(br, br->dev);
+ if (err)
+ return err;
+
+ br->local_port = list_first_or_null_rcu(&br->port_list,
+ struct net_bridge_port, list);
netdev_update_features(dev);
netif_start_queue(dev);
@@ -161,7 +169,7 @@ static int br_dev_stop(struct net_device *dev)
netif_stop_queue(dev);
- return 0;
+ return br_del_if(br, br->dev);
}
static void br_get_stats64(struct net_device *dev,
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f3aef22931ab..49208e774191 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -284,7 +284,8 @@ static void del_nbp(struct net_bridge_port *p)
nbp_update_port_count(br);
- netdev_upper_dev_unlink(dev, br->dev);
+ if (!(p->flags & BR_SOFT_INTERFACE))
+ netdev_upper_dev_unlink(dev, br->dev);
dev->priv_flags &= ~IFF_BRIDGE_PORT;
@@ -362,6 +363,8 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
p->priority = 0x8000 >> BR_PORT_BITS;
p->port_no = index;
p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
+ if (br->dev == dev)
+ p->flags |= BR_SOFT_INTERFACE;
br_init_port(p);
br_set_state(p, BR_STATE_DISABLED);
br_stp_port_timer_init(p);
@@ -500,8 +503,11 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
return -EINVAL;
/* No bridging of bridges */
- if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
- return -ELOOP;
+ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) {
+ /* Unless it is our own soft interface */
+ if (br->dev != dev)
+ return -ELOOP;
+ }
/* Device is already being bridged */
if (br_port_exists(dev))
@@ -540,9 +546,11 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
dev->priv_flags |= IFF_BRIDGE_PORT;
- err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL);
- if (err)
- goto err5;
+ if (!(p->flags & BR_SOFT_INTERFACE)) {
+ err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL);
+ if (err)
+ goto err5;
+ }
err = nbp_switchdev_mark_set(p);
if (err)
@@ -563,13 +571,15 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
else
netdev_set_rx_headroom(dev, br_hr);
- if (br_fdb_insert(br, p, dev->dev_addr, 0))
- netdev_err(dev, "failed insert local address bridge forwarding table\n");
+ if (!(p->flags & BR_SOFT_INTERFACE)) {
+ if (br_fdb_insert(br, p, dev->dev_addr, 0))
+ netdev_err(dev, "failed insert local address bridge forwarding table\n");
- err = nbp_vlan_init(p);
- if (err) {
- netdev_err(dev, "failed to initialize vlan filtering on this port\n");
- goto err7;
+ err = nbp_vlan_init(p);
+ if (err) {
+ netdev_err(dev, "failed to initialize vlan filtering on this port\n");
+ goto err7;
+ }
}
spin_lock_bh(&br->lock);
@@ -597,7 +607,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
br_fdb_delete_by_port(br, p, 0, 1);
nbp_update_port_count(br);
err6:
- netdev_upper_dev_unlink(dev, br->dev);
+ if (!(p->flags & BR_SOFT_INTERFACE))
+ netdev_upper_dev_unlink(dev, br->dev);
err5:
dev->priv_flags &= ~IFF_BRIDGE_PORT;
netdev_rx_handler_unregister(dev);
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index a0b11e7d67d9..47f0d9b4221d 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -117,8 +117,6 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
struct br_mdb_entry e;
port = p->port;
- if (!port)
- continue;
memset(&e, 0, sizeof(e));
e.ifindex = port->dev->ifindex;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index dae3af1f531a..f1bf9ec15de8 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -915,7 +915,7 @@ static void __br_multicast_send_query(struct net_bridge *br,
if (!skb)
return;
- if (port) {
+ if (port && !(port->flags & BR_SOFT_INTERFACE)) {
skb->dev = port->dev;
br_multicast_count(br, port, skb, igmp_type,
BR_MCAST_DIR_TX);
@@ -944,8 +944,9 @@ static void br_multicast_send_query(struct net_bridge *br,
memset(&br_group.u, 0, sizeof(br_group.u));
- if (port ? (own_query == &port->ip4_own_query) :
- (own_query == &br->ip4_own_query)) {
+ if (port && !(port->flags & BR_SOFT_INTERFACE) ?
+ (own_query == &port->ip4_own_query) :
+ (own_query == &br->ip4_own_query)) {
other_query = &br->ip4_other_query;
br_group.proto = htons(ETH_P_IP);
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index fd9ee73e0a6d..c4b99a35abb0 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -296,6 +296,7 @@ struct net_bridge {
spinlock_t lock;
spinlock_t hash_lock;
struct list_head port_list;
+ struct net_bridge_port *local_port;
struct net_device *dev;
struct pcpu_sw_netstats __percpu *stats;
/* These fields are accessed on each packet */
--
2.14.1
^ permalink raw reply related
* [PATCH RFC WIP 2/5] net: bridge: Skip receive handler on brX interface
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
To: netdev
Cc: Vivien Didelot, Florian Fainelli, nikolay, jiri, roopa, stephen,
bridge, Andrew Lunn
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>
The brX interface will soon become a member of the bridge. As such, it
will get a receive handler assigned. However, we don't want to handle
packets received on this soft interface. So detect the condition and
say all the packets pass.
---
net/bridge/br_input.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7637f58c1226..38c2a41968f2 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -267,6 +267,10 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
return RX_HANDLER_CONSUMED;
p = br_port_get_rcu(skb->dev);
+
+ if (p->dev == p->br->dev)
+ return RX_HANDLER_PASS;
+
if (p->flags & BR_VLAN_TUNNEL) {
if (br_handle_ingress_vlan_tunnel(skb, p,
nbp_vlan_group_rcu(p)))
--
2.14.1
^ permalink raw reply related
* [PATCH RFC WIP 1/5] net: rtnetlink: Handle bridge port without upper device
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
To: netdev
Cc: Vivien Didelot, Florian Fainelli, nikolay, jiri, roopa, stephen,
bridge, Andrew Lunn
In-Reply-To: <1503780970-10312-1-git-send-email-andrew@lunn.ch>
The brX interface will, with a following patch, become a member of the
bridge. It however cannot be a slave interface, since it would have to
be a slave of itself. netdev_master_upper_dev_get() returns NULL as a
result. Handle this NULL, by knowing this bridge slave must also be
the master, i.e. what we are looking for.
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
---
net/core/rtnetlink.c | 23 +++++++++++++++++++++--
1 file changed, 21 insertions(+), 2 deletions(-)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9201e3621351..2673eb430b6f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3093,8 +3093,12 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
(dev->priv_flags & IFF_BRIDGE_PORT)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
- const struct net_device_ops *ops = br_dev->netdev_ops;
+ const struct net_device_ops *ops;
+ if (!br_dev)
+ br_dev = dev;
+
+ ops = br_dev->netdev_ops;
err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
nlh->nlmsg_flags);
if (err)
@@ -3197,7 +3201,12 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
(dev->priv_flags & IFF_BRIDGE_PORT)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
- const struct net_device_ops *ops = br_dev->netdev_ops;
+ const struct net_device_ops *ops;
+
+ if (!br_dev)
+ br_dev = dev;
+
+ ops = br_dev->netdev_ops;
if (ops->ndo_fdb_del)
err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
@@ -3332,6 +3341,8 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (!br_idx) { /* user did not specify a specific bridge */
if (dev->priv_flags & IFF_BRIDGE_PORT) {
br_dev = netdev_master_upper_dev_get(dev);
+ if (!br_dev)
+ br_dev = dev;
cops = br_dev->netdev_ops;
}
} else {
@@ -3410,6 +3421,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
int err = 0;
+ if (!br_dev)
+ br_dev = dev;
+
nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
if (nlh == NULL)
return -EMSGSIZE;
@@ -3647,6 +3661,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ if (!br_dev)
+ br_dev = dev;
if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) {
err = -EOPNOTSUPP;
@@ -3723,6 +3739,9 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ if (!br_dev)
+ br_dev = dev;
+
if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) {
err = -EOPNOTSUPP;
goto out;
--
2.14.1
^ permalink raw reply related
* [PATCH RFC WIP 0/5] IGMP snooping for local traffic
From: Andrew Lunn @ 2017-08-26 20:56 UTC (permalink / raw)
To: netdev
Cc: Vivien Didelot, Florian Fainelli, nikolay, jiri, roopa, stephen,
bridge, Andrew Lunn
This is a WIP patchset i would like comments on from bridge, switchdev
and hardware offload people.
The linux bridge supports IGMP snooping. It will listen to IGMP
reports on bridge ports and keep track of which groups have been
joined on an interface. It will then forward multicast based on this
group membership.
When the bridge adds or removes groups from an interface, it uses
switchdev to request the hardware add an mdb to a port, so the
hardware can perform the selective forwarding between ports.
What is not covered by the current bridge code, is IGMP joins/leaves
from the host on the brX interface. No such monitoring is
performed. With a pure software bridge, it is not required. All
multicast frames are passed to the brX interface, and the network
stack filters them, as it does for any interface. However, when
hardware offload is involved, things change. We should program the
hardware to only send multicast packets to the host when the host has
an interest in them.
Thus we need to perform IGMP snooping on the brX interface, just like
any other interface of the bridge. However, currently the brX
interface is missing all the needed data structures to do this. There
is no net_bridge_port structure for the brX interface. This structure
is created when an interface is added to the bridge. But the brX
interface is not a member of the bridge. So this patchset makes the
brX interface a first class member of the bridge. When the brX
interface is opened, the interface is added to the bridge. A
net_bridge_port is allocated for it, and IGMP snooping is performed as
usual.
There are some complexities here. Some assumptions are broken, like
the master interface of a port interface is the bridge interface. The
brX interface cannot be its own master. The use of
netdev_master_upper_dev_get() within the bridge code has been changed
to reflect this. The bridge receive handler needs to not process
frames for the brX interface, etc.
The interface downward to the hardware is also an issue. The code
presented here is a hack and needs to change. But that is secondary
and can be solved once it is agreed how the bridge needs to change to
support this use case.
Comments welcome and wanted.
Andrew
Andrew Lunn (5):
net: rtnetlink: Handle bridge port without upper device
net: bridge: Skip receive handler on brX interface
net: bridge: Make the brX interface a member of the bridge
net: dsa: HACK: Handle MDB add/remove for none-switch ports
net: dsa: Don't include CPU port when adding MDB to a port
include/linux/if_bridge.h | 1 +
net/bridge/br_device.c | 12 ++++++++++--
net/bridge/br_if.c | 37 ++++++++++++++++++++++++-------------
net/bridge/br_input.c | 4 ++++
net/bridge/br_mdb.c | 2 --
net/bridge/br_multicast.c | 7 ++++---
net/bridge/br_private.h | 1 +
net/core/rtnetlink.c | 23 +++++++++++++++++++++--
net/dsa/port.c | 19 +++++++++++++++++--
net/dsa/switch.c | 2 +-
10 files changed, 83 insertions(+), 25 deletions(-)
--
2.14.1
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox