Netdev List
 help / color / mirror / Atom feed
* [PATCH v12 03/10] netdev: cavium: octeon: Add Octeon III BGX Ethernet Nexus
From: Steven J. Hill @ 2018-06-27 21:25 UTC (permalink / raw)
  To: netdev; +Cc: Carlos Munoz, Chandrakala Chavva, Steven J. Hill
In-Reply-To: <1530134719-19407-1-git-send-email-steven.hill@cavium.com>

From: Carlos Munoz <cmunoz@cavium.com>

Add the BGX nexus architeture for Octeon III BGX Ethernet.

Signed-off-by: Carlos Munoz <cmunoz@cavium.com>
Signed-off-by: Steven J. Hill <Steven.Hill@cavium.com>
---
 .../net/ethernet/cavium/octeon/octeon3-bgx-nexus.c | 670 +++++++++++++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-bgx.h   | 281 +++++----
 2 files changed, 831 insertions(+), 120 deletions(-)
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c

diff --git a/drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c b/drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c
new file mode 100644
index 0000000..fced298
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c
@@ -0,0 +1,670 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Octeon III BGX Nexus Ethernet driver
+ *
+ * Copyright (C) 2018 Cavium, Inc.
+ */
+
+#include <linux/ctype.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+
+#include "octeon3.h"
+
+static atomic_t request_mgmt_once;
+static atomic_t load_driver_once;
+static atomic_t pki_id;
+
+static char *mix_port;
+module_param(mix_port, charp, 0444);
+MODULE_PARM_DESC(mix_port, "Specifies which ports connect to MIX interfaces.");
+
+static char *pki_port;
+module_param(pki_port, charp, 0444);
+MODULE_PARM_DESC(pki_port, "Specifies which ports connect to the PKI.");
+
+#define MAX_MIX_PER_NODE	2
+#define MAX_MIX			(MAX_NODES * MAX_MIX_PER_NODE)
+
+/* struct mix_port_lmac - Describes a lmac that connects to a mix port. The lmac
+ *			  must be on the same node as the mix.
+ * @node: Node of the lmac.
+ * @bgx: Bgx of the lmac.
+ * @lmac: Lmac index.
+ */
+struct mix_port_lmac {
+	int node;
+	int bgx;
+	int lmac;
+};
+
+/* mix_ports_lmacs contains all the lmacs connected to mix ports */
+static struct mix_port_lmac mix_port_lmacs[MAX_MIX];
+
+/* pki_ports keeps track of the lmacs connected to the pki */
+static bool pki_ports[MAX_NODES][MAX_BGX_PER_NODE][MAX_LMAC_PER_BGX];
+
+/* Created platform devices get added to this list */
+static struct list_head pdev_list;
+static struct mutex pdev_list_lock;
+
+/* Created platform device use this structure to add themselves to the list */
+struct pdev_list_item {
+	struct list_head list;
+	struct platform_device *pdev;
+};
+
+/* is_lmac_to_mix - Search the list of lmacs connected to mix'es for a match.
+ * @node: Numa node of lmac to search for.
+ * @bgx: Bgx of lmac to search for.
+ * @lmac: Lmac index to search for.
+ *
+ * Returns true if the lmac is connected to a mix.
+ * Returns false if the lmac is not connected to a mix.
+ */
+static bool is_lmac_to_mix(int node, int bgx, int lmac)
+{
+	int i;
+
+	for (i = 0; i < MAX_MIX; i++) {
+		if (mix_port_lmacs[i].node == node &&
+		    mix_port_lmacs[i].bgx == bgx &&
+		    mix_port_lmacs[i].lmac == lmac)
+			return true;
+	}
+
+	return false;
+}
+
+/* is_lmac_to_pki - Search the list of lmacs connected to the pki for a match.
+ * @node: Numa node of lmac to search for.
+ * @bgx: Bgx of lmac to search for.
+ * @lmac: Lmac index to search for.
+ *
+ * Returns true if the lmac is connected to the pki.
+ * Returns false if the lmac is not connected to the pki.
+ */
+static bool is_lmac_to_pki(int node, int bgx, int lmac)
+{
+	return pki_ports[node][bgx][lmac];
+}
+
+/* is_lmac_to_xcv - Check if this lmac is connected to the xcv block (rgmii).
+ * @of_node: Device node to check.
+ *
+ * Returns true if the lmac is connected to the xcv port.
+ * Returns false if the lmac is not connected to the xcv port.
+ */
+static bool is_lmac_to_xcv(struct device_node *of_node)
+{
+	return of_device_is_compatible(of_node, "cavium,octeon-7360-xcv");
+}
+
+static int bgx_probe(struct platform_device *pdev)
+{
+	struct platform_device *new_dev, *pki_dev;
+	struct mac_platform_data platform_data;
+	int i, interface, numa_node, r = 0;
+	struct device_node *child;
+	const __be32 *reg;
+	u64 addr, data;
+	char id[64];
+	u32 port;
+
+	reg = of_get_property(pdev->dev.of_node, "reg", NULL);
+	addr = of_translate_address(pdev->dev.of_node, reg);
+	interface = (addr >> 24) & 0xf;
+	numa_node = (addr >> 36) & 0x7;
+
+	/* Assign 8 CAM entries per LMAC */
+	for (i = 0; i < 32; i++) {
+		data = i >> 3;
+		oct_csr_write(data,
+			      BGX_CMR_RX_ADRX_CAM(numa_node, interface, i));
+	}
+
+	for_each_available_child_of_node(pdev->dev.of_node, child) {
+		struct pdev_list_item *pdev_item;
+		bool is_mix = false;
+		bool is_pki = false;
+		bool is_xcv = false;
+
+		if (!of_device_is_compatible(child, "cavium,octeon-7890-bgx-port") &&
+		    !of_device_is_compatible(child, "cavium,octeon-7360-xcv"))
+			continue;
+		r = of_property_read_u32(child, "reg", &port);
+		if (r)
+			return -ENODEV;
+
+		is_mix = is_lmac_to_mix(numa_node, interface, port);
+		is_pki = is_lmac_to_pki(numa_node, interface, port);
+		is_xcv = is_lmac_to_xcv(child);
+
+		/* Check if this port should be configured */
+		if (!is_mix && !is_pki)
+			continue;
+
+		/* Connect to PKI/PKO */
+		data = oct_csr_read(BGX_CMR_CONFIG(numa_node, interface, port));
+		if (is_mix)
+			data |= BGX_CMR_CONFIG_MIX_EN;
+		else
+			data &= ~BGX_CMR_CONFIG_MIX_EN;
+		oct_csr_write(data, BGX_CMR_CONFIG(numa_node, interface, port));
+
+		/* Unreset the mix bgx interface or it will interfare with the
+		 * other ports.
+		 */
+		if (is_mix) {
+			data = oct_csr_read(BGX_CMR_GLOBAL_CONFIG(numa_node,
+								  interface));
+			if (!port)
+				data &= ~BGX_CMR_GLOBAL_CONFIG_CMR_MIX0_RST;
+			else if (port == 1)
+				data &= ~BGX_CMR_GLOBAL_CONFIG_CMR_MIX1_RST;
+			oct_csr_write(data, BGX_CMR_GLOBAL_CONFIG(numa_node,
+								  interface));
+		}
+
+		snprintf(id, sizeof(id), "%llx.%u.ethernet-mac",
+			 (unsigned long long)addr, port);
+		new_dev = of_platform_device_create(child, id, &pdev->dev);
+		if (!new_dev) {
+			dev_err(&pdev->dev, "Error creating %s\n", id);
+			continue;
+		}
+		platform_data.mac_type = BGX_MAC;
+		platform_data.numa_node = numa_node;
+		platform_data.interface = interface;
+		platform_data.port = port;
+		if (is_xcv)
+			platform_data.src_type = XCV;
+		else
+			platform_data.src_type = QLM;
+
+		/* Add device to the list of created devices so we can remove it
+		 * on exit.
+		 */
+		pdev_item = kmalloc(sizeof(*pdev_item), GFP_KERNEL);
+		pdev_item->pdev = new_dev;
+		mutex_lock(&pdev_list_lock);
+		list_add(&pdev_item->list, &pdev_list);
+		mutex_unlock(&pdev_list_lock);
+
+		atomic_inc(&pki_id);
+		pki_dev = platform_device_register_data(&new_dev->dev,
+				is_mix ? "octeon_mgmt" : "ethernet-mac-pki",
+				(int)atomic_read(&pki_id),
+				&platform_data, sizeof(platform_data));
+		dev_info(&pdev->dev, "Created %s %u: %p\n",
+			 is_mix ? "MIX" : "PKI", pki_dev->id, pki_dev);
+
+		/* Add device to the list of created devices so we can remove it
+		 * on exit.
+		 */
+		pdev_item = kmalloc(sizeof(*pdev_item), GFP_KERNEL);
+		pdev_item->pdev = pki_dev;
+		mutex_lock(&pdev_list_lock);
+		list_add(&pdev_item->list, &pdev_list);
+		mutex_unlock(&pdev_list_lock);
+
+#ifdef CONFIG_NUMA
+		new_dev->dev.numa_node = pdev->dev.numa_node;
+		pki_dev->dev.numa_node = pdev->dev.numa_node;
+#endif
+		/* One time request driver module */
+		if (is_mix) {
+			if (atomic_cmpxchg(&request_mgmt_once, 0, 1) == 0)
+				request_module_nowait("octeon_mgmt");
+		}
+		if (is_pki) {
+			if (atomic_cmpxchg(&load_driver_once, 0, 1) == 0)
+				request_module_nowait("octeon3-ethernet");
+		}
+	}
+
+	dev_info(&pdev->dev, "Probed\n");
+	return 0;
+}
+
+/* bgx_mix_init_from_fdt - Initialize the list of lmacs that connect to mix
+ *			   ports from information in the device tree.
+ *
+ * Returns 0 if successful.
+ * Returns <0 for error codes.
+ */
+static int bgx_mix_init_from_fdt(void)
+{
+	struct device_node *node, *parent = NULL;
+	int mix = 0;
+
+	for_each_compatible_node(node, NULL, "cavium,octeon-7890-mix") {
+		struct device_node *lmac_fdt_node;
+		const __be32 *reg;
+		u64 addr;
+
+		/* Get the fdt node of the lmac connected to this mix */
+		lmac_fdt_node = of_parse_phandle(node, "cavium,mac-handle", 0);
+		if (!lmac_fdt_node)
+			goto err;
+
+		/* Get the numa node and bgx of the lmac */
+		parent = of_get_parent(lmac_fdt_node);
+		if (!parent)
+			goto err;
+		reg = of_get_property(parent, "reg", NULL);
+		if (!reg)
+			goto err;
+		addr = of_translate_address(parent, reg);
+		of_node_put(parent);
+		parent = NULL;
+
+		mix_port_lmacs[mix].node = (addr >> 36) & 0x7;
+		mix_port_lmacs[mix].bgx = (addr >> 24) & 0xf;
+
+		/* Get the lmac index */
+		reg = of_get_property(lmac_fdt_node, "reg", NULL);
+		if (!reg)
+			goto err;
+
+		mix_port_lmacs[mix].lmac = *reg;
+
+		mix++;
+		if (mix >= MAX_MIX)
+			break;
+	}
+
+	return 0;
+err:
+	pr_warn("Invalid device tree mix port information\n");
+	for (mix = 0; mix < MAX_MIX; mix++) {
+		mix_port_lmacs[mix].node = -1;
+		mix_port_lmacs[mix].bgx = -1;
+		mix_port_lmacs[mix].lmac = -1;
+	}
+	if (parent)
+		of_node_put(parent);
+
+	return -EINVAL;
+}
+
+/* bgx_mix_init_from_param - Initialize the list of lmacs that connect to mix
+ *			     ports from information in the "mix_port" parameter.
+ *			     The mix_port parameter format is as follows:
+ *			     mix_port=nbl
+ *			     where:
+ *				n = node
+ *				b = bgx
+ *				l = lmac
+ *			     There can be up to 4 lmacs defined separated by
+ *			     commas. For example to select node0, bgx0, lmac0
+ *			     and node0, bgx4, lamc0, the mix_port parameter
+ *			     would be: mix_port=000,040
+ *
+ * Returns 0 if successful.
+ * Returns <0 for error codes.
+ */
+static int bgx_mix_init_from_param(void)
+{
+	char *p = mix_port;
+	int i, mix = 0;
+
+	while (*p) {
+		int node = -1;
+		int bgx = -1;
+		int lmac = -1;
+
+		if (strlen(p) < 3)
+			goto err;
+
+		/* Get the numa node */
+		if (!isdigit(*p))
+			goto err;
+		node = *p - '0';
+		if (node >= MAX_NODES)
+			goto err;
+
+		/* Get the bgx */
+		p++;
+		if (!isdigit(*p))
+			goto err;
+		bgx = *p - '0';
+		if (bgx >= MAX_BGX_PER_NODE)
+			goto err;
+
+		/* Get the lmac index */
+		p++;
+		if (!isdigit(*p))
+			goto err;
+		lmac = *p - '0';
+		if (lmac >= 2)
+			goto err;
+
+		/* Only one lmac0 and one lmac1 per node is supported */
+		for (i = 0; i < MAX_MIX; i++) {
+			if (mix_port_lmacs[i].node == node &&
+			    mix_port_lmacs[i].lmac == lmac)
+				goto err;
+		}
+
+		mix_port_lmacs[mix].node = node;
+		mix_port_lmacs[mix].bgx = bgx;
+		mix_port_lmacs[mix].lmac = lmac;
+
+		p++;
+		if (*p == ',')
+			p++;
+
+		mix++;
+		if (mix >= MAX_MIX)
+			break;
+	}
+
+	return 0;
+err:
+	pr_warn("Invalid parameter mix_port=%s\n", mix_port);
+	for (mix = 0; mix < MAX_MIX; mix++) {
+		mix_port_lmacs[mix].node = -1;
+		mix_port_lmacs[mix].bgx = -1;
+		mix_port_lmacs[mix].lmac = -1;
+	}
+	return -EINVAL;
+}
+
+/* bgx_mix_port_lmacs_init - Initialize the mix_port_lmacs variable with the
+ *			     lmacs that connect to mic ports.
+ *
+ * Returns 0 if successful.
+ * Returns <0 for error codes.
+ */
+static int bgx_mix_port_lmacs_init(void)
+{
+	int mix;
+
+	/* Start with no mix ports configured */
+	for (mix = 0; mix < MAX_MIX; mix++) {
+		mix_port_lmacs[mix].node = -1;
+		mix_port_lmacs[mix].bgx = -1;
+		mix_port_lmacs[mix].lmac = -1;
+	}
+
+	/* Check if no mix port should be configured */
+	if (mix_port && !strcmp(mix_port, "none"))
+		return 0;
+
+	/* Configure the mix ports using information from the device tree if no
+	 * parameter was passed. Otherwise, use the information in the module
+	 * parameter.
+	 */
+	if (!mix_port)
+		bgx_mix_init_from_fdt();
+	else
+		bgx_mix_init_from_param();
+
+	return 0;
+}
+
+/* bgx_parse_pki_elem - Parse a single element (node, bgx, or lmac) out a pki
+ *			lmac string and set its bitmap accordingly.
+ * @str: Pki lmac string to parse.
+ * @bitmap: Updated with the bits selected by str.
+ * @size: Maximum size of the bitmap.
+ *
+ * Returns number of characters processed from str.
+ * Returns <0 for error codes.
+ */
+static int bgx_parse_pki_elem(const char *str, unsigned long *bitmap, int size)
+{
+	const char *p = str;
+	int bit, len = -1;
+
+	if (*p == 0) {
+		/* If identifier is missing, the whole subset is allowed */
+		bitmap_set(bitmap, 0, size);
+		len = 0;
+	} else if (*p == '*') {
+		/* If identifier is an asterisk, the whole subset is allowed */
+		bitmap_set(bitmap, 0, size);
+		len = 1;
+	} else if (isdigit(*p)) {
+		/* If identifier is a digit, only the bit corresponding to the
+		 * digit is set.
+		 */
+		bit = *p - '0';
+		if (bit < size) {
+			bitmap_set(bitmap, bit, 1);
+			len = 1;
+		}
+	} else if (*p == '[') {
+		/* If identifier is a bracket, all the bits corresponding to
+		 * the digits inside the bracket are set.
+		 */
+		p++;
+		len = 1;
+		do {
+			if (isdigit(*p)) {
+				bit = *p - '0';
+				if (bit < size)
+					bitmap_set(bitmap, bit, 1);
+				else
+					return -1;
+			} else {
+				return -1;
+			}
+			p++;
+			len++;
+		} while (*p != ']');
+		len++;
+	} else {
+		len = -1;
+	}
+
+	return len;
+}
+
+/* bgx_pki_bitmap_set - Set the bitmap bits for all elements (node, bgx, and
+ *			lmac) selected by a pki lmac string.
+ * @str: Pki lmac string to process.
+ * @node: Updated with the nodes specified in the pki lmac string.
+ * @bgx: Updated with the bgx's specified in the pki lmac string.
+ * @lmac: Updated with the lmacs specified in the pki lmac string.
+ *
+ * Returns 0 if successful.
+ * Returns <0 for error codes.
+ */
+static unsigned long bgx_pki_bitmap_set(const char *str, unsigned long *node,
+					unsigned long *bgx, unsigned long *lmac)
+{
+	const char *p = str;
+	int len;
+
+	/* Parse the node */
+	len = bgx_parse_pki_elem(p, node, MAX_NODES);
+	if (len < 0)
+		goto err;
+
+	/* Parse the bgx */
+	p += len;
+	len = bgx_parse_pki_elem(p, bgx, MAX_BGX_PER_NODE);
+	if (len < 0)
+		goto err;
+
+	/* Parse the lmac */
+	p += len;
+	len = bgx_parse_pki_elem(p, lmac, MAX_LMAC_PER_BGX);
+	if (len < 0)
+		goto err;
+
+	return 0;
+err:
+	bitmap_zero(node, MAX_NODES);
+	bitmap_zero(bgx, MAX_BGX_PER_NODE);
+	bitmap_zero(lmac, MAX_LMAC_PER_BGX);
+	return len;
+}
+
+/* bgx_pki_init_from_param - Initialize the list of lmacs that connect to the
+ *			     pki from information in the "pki_port" parameter.
+ *
+ *			     The pki_port parameter format is as follows:
+ *			     pki_port=nbl
+ *			     where:
+ *				n = node
+ *				b = bgx
+ *				l = lmac
+ *
+ *			     Commas must be used to separate multiple lmacs:
+ *			     pki_port=000,100,110
+ *
+ *			     Asterisks (*) specify all possible characters in
+ *			     the subset:
+ *			     pki_port=00* (all lmacs of node0 bgx0).
+ *
+ *			     Missing lmacs identifiers default to all
+ *			     possible characters in the subset:
+ *			     pki_port=00 (all lmacs on node0 bgx0)
+ *
+ *			     Brackets ('[' and ']') specify the valid
+ *			     characters in the subset:
+ *			     pki_port=00[01] (lmac0 and lmac1 of node0 bgx0).
+ *
+ * Returns 0 if successful.
+ * Returns <0 for error codes.
+ */
+static int bgx_pki_init_from_param(void)
+{
+	DECLARE_BITMAP(lmac_bitmap, MAX_LMAC_PER_BGX);
+	DECLARE_BITMAP(bgx_bitmap, MAX_BGX_PER_NODE);
+	DECLARE_BITMAP(node_bitmap, MAX_NODES);
+	char *cur, *next;
+
+	/* Parse each comma separated lmac specifier */
+	cur = pki_port;
+	while (cur) {
+		unsigned long bgx, lmac, node;
+
+		bitmap_zero(node_bitmap, BITS_PER_LONG);
+		bitmap_zero(bgx_bitmap, BITS_PER_LONG);
+		bitmap_zero(lmac_bitmap, BITS_PER_LONG);
+
+		next = strchr(cur, ',');
+		if (next)
+			*next++ = '\0';
+
+		/* Convert the specifier into a bitmap */
+		bgx_pki_bitmap_set(cur, node_bitmap, bgx_bitmap, lmac_bitmap);
+
+		/* Mark the lmacs to be connected to the pki */
+		for_each_set_bit(node, node_bitmap, MAX_NODES) {
+			for_each_set_bit(bgx, bgx_bitmap, MAX_BGX_PER_NODE) {
+				for_each_set_bit(lmac, lmac_bitmap,
+						 MAX_LMAC_PER_BGX)
+					pki_ports[node][bgx][lmac] = true;
+			}
+		}
+
+		cur = next;
+	}
+
+	return 0;
+}
+
+/* bgx_pki_ports_init - Initialize the pki_ports variable with the lmacs that
+ *			connect to the pki.
+ *
+ * Returns 0 if successful.
+ * Returns <0 for error codes.
+ */
+static int bgx_pki_ports_init(void)
+{
+	bool def_val;
+	int i, j, k;
+
+	/* Whether all ports default to connect to the pki or not depend on the
+	 * passed module parameter (if any).
+	 */
+	if (pki_port)
+		def_val = false;
+	else
+		def_val = true;
+
+	for (i = 0; i < MAX_NODES; i++) {
+		for (j = 0; j < MAX_BGX_PER_NODE; j++) {
+			for (k = 0; k < MAX_LMAC_PER_BGX; k++)
+				pki_ports[i][j][k] = def_val;
+		}
+	}
+
+	/* Check if ports have to be individually configured */
+	if (pki_port && strcmp(pki_port, "none"))
+		bgx_pki_init_from_param();
+
+	return 0;
+}
+
+static int bgx_remove(struct platform_device *pdev)
+{
+	return 0;
+}
+
+static void bgx_shutdown(struct platform_device *pdev)
+{
+}
+
+static const struct of_device_id bgx_match[] = {
+	{
+		.compatible = "cavium,octeon-7890-bgx",
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(of, bgx_match);
+
+static struct platform_driver bgx_driver = {
+	.probe		= bgx_probe,
+	.remove		= bgx_remove,
+	.shutdown       = bgx_shutdown,
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= KBUILD_MODNAME,
+		.of_match_table = bgx_match,
+	},
+};
+
+/* Allow bgx_port driver to force this driver to load */
+void bgx_nexus_load(void)
+{
+}
+EXPORT_SYMBOL(bgx_nexus_load);
+
+static int __init bgx_driver_init(void)
+{
+	INIT_LIST_HEAD(&pdev_list);
+	mutex_init(&pdev_list_lock);
+	bgx_mix_port_lmacs_init();
+	bgx_pki_ports_init();
+
+	return platform_driver_register(&bgx_driver);
+}
+
+static void __exit bgx_driver_exit(void)
+{
+	struct pdev_list_item *pdev_item;
+
+	mutex_lock(&pdev_list_lock);
+	while (!list_empty(&pdev_list)) {
+		pdev_item = list_first_entry(&pdev_list,
+					     struct pdev_list_item, list);
+		list_del(&pdev_item->list);
+		platform_device_unregister(pdev_item->pdev);
+		kfree(pdev_item);
+	}
+	mutex_unlock(&pdev_list_lock);
+
+	platform_driver_unregister(&bgx_driver);
+}
+
+module_init(bgx_driver_init);
+module_exit(bgx_driver_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cavium, Inc. <support@caviumnetworks.com>");
+MODULE_DESCRIPTION("Cavium, Inc. BGX MAC Nexus driver.");
diff --git a/drivers/net/ethernet/cavium/octeon/octeon3-bgx.h b/drivers/net/ethernet/cavium/octeon/octeon3-bgx.h
index df794f5..58ed870 100644
--- a/drivers/net/ethernet/cavium/octeon/octeon3-bgx.h
+++ b/drivers/net/ethernet/cavium/octeon/octeon3-bgx.h
@@ -8,14 +8,83 @@
 
 #include <linux/bitops.h>
 
-#define BGX_CMR_CHAN_MSK_MASK		GENMASK_ULL(15, 0)
-#define BGX_CMR_CHAN_MSK_SHIFT		16
-#define BGX_CMR_CONFIG_DATA_PKT_RX_EN	BIT(14)
-#define BGX_CMR_CONFIG_DATA_PKT_TX_EN	BIT(13)
-#define BGX_CMR_CONFIG_ENABLE	BIT(15)
-#define BGX_CMR_CONFIG_LMAC_TYPE_MASK	GENMASK_ULL(10, 8)
-#define BGX_CMR_CONFIG_LMAC_TYPE_SHIFT	8
-#define BGX_CMR_CONFIG_MIX_EN		BIT(11)
+#define BGX_RX_FIFO_SIZE		(64 * 1024)
+#define BGX_TX_FIFO_SIZE		(32 * 1024)
+
+/* Bgx register definitions */
+#define BGX_BASE			0x11800e0000000ull
+#define BGX_ADDR(node, bgx, index)	(BGX_BASE + SET_XKPHYS +	       \
+					 NODE_OFFSET(node) + ((bgx) << 24) +   \
+					 ((index) << 20))
+#define BGX_ADDR_CAM(n, b, mac)		(BGX_ADDR(n, b, 0) + ((mac) << 3))
+
+#define BGX_CMR_CONFIG(n, b, i)		(BGX_ADDR(n, b, i) + 0x00000)
+#define BGX_CMR_GLOBAL_CONFIG(n, b)	(BGX_ADDR(n, b, 0) + 0x00008)
+#define BGX_CMR_RX_ID_MAP(n, b, i)	(BGX_ADDR(n, b, i) + 0x00028)
+#define BGX_CMR_RX_BP_ON(n, b, i)	(BGX_ADDR(n, b, i) + 0x00088)
+#define BGX_CMR_RX_ADR_CTL(n, b, i)	(BGX_ADDR(n, b, i) + 0x000a0)
+#define BGX_CMR_RX_FIFO_LEN(n, b, i)	(BGX_ADDR(n, b, i) + 0x000c0)
+#define BGX_CMR_RX_ADRX_CAM(n, b, m)	(BGX_ADDR_CAM(n, b, m) + 0x00100)
+#define BGX_CMR_CHAN_MSK_AND(n, b)	(BGX_ADDR(n, b, 0) + 0x00200)
+#define BGX_CMR_CHAN_MSK_OR(n, b)	(BGX_ADDR(n, b, 0) + 0x00208)
+#define BGX_CMR_TX_FIFO_LEN(n, b, i)	(BGX_ADDR(n, b, i) + 0x00418)
+#define BGX_CMR_TX_LMACS(n, b)		(BGX_ADDR(n, b, 0) + 0x01000)
+
+#define BGX_SPU_CONTROL1(n, b, i)	(BGX_ADDR(n, b, i) + 0x10000)
+#define BGX_SPU_STATUS1(n, b, i)	(BGX_ADDR(n, b, i) + 0x10008)
+#define BGX_SPU_STATUS2(n, b, i)	(BGX_ADDR(n, b, i) + 0x10020)
+#define BGX_SPU_BX_STATUS(n, b, i)	(BGX_ADDR(n, b, i) + 0x10028)
+#define BGX_SPU_BR_STATUS1(n, b, i)	(BGX_ADDR(n, b, i) + 0x10030)
+#define BGX_SPU_BR_STATUS2(n, b, i)	(BGX_ADDR(n, b, i) + 0x10038)
+#define BGX_SPU_BR_BIP_ERR_CNT(n, b, i)	(BGX_ADDR(n, b, i) + 0x10058)
+#define BGX_SPU_BR_PMD_CONTROL(n, b, i)	(BGX_ADDR(n, b, i) + 0x10068)
+#define BGX_SPU_BR_PMD_LP_CUP(n, b, i)	(BGX_ADDR(n, b, i) + 0x10078)
+#define BGX_SPU_BR_PMD_LD_CUP(n, b, i)	(BGX_ADDR(n, b, i) + 0x10088)
+#define BGX_SPU_BR_PMD_LD_REP(n, b, i)	(BGX_ADDR(n, b, i) + 0x10090)
+#define BGX_SPU_FEC_CONTROL(n, b, i)	(BGX_ADDR(n, b, i) + 0x100a0)
+#define BGX_SPU_AN_CONTROL(n, b, i)	(BGX_ADDR(n, b, i) + 0x100c8)
+#define BGX_SPU_AN_STATUS(n, b, i)	(BGX_ADDR(n, b, i) + 0x100d0)
+#define BGX_SPU_AN_ADV(n, b, i)		(BGX_ADDR(n, b, i) + 0x100d8)
+#define BGX_SPU_MISC_CONTROL(n, b, i)	(BGX_ADDR(n, b, i) + 0x10218)
+#define BGX_SPU_INT(n, b, i)		(BGX_ADDR(n, b, i) + 0x10220)
+#define BGX_SPU_DBG_CONTROL(n, b)	(BGX_ADDR(n, b, 0) + 0x10300)
+
+#define BGX_SMU_RX_INT(n, b, i)		(BGX_ADDR(n, b, i) + 0x20000)
+#define BGX_SMU_RX_FRM_CTL(n, b, i)	(BGX_ADDR(n, b, i) + 0x20008)
+#define BGX_SMU_RX_JABBER(n, b, i)	(BGX_ADDR(n, b, i) + 0x20018)
+#define BGX_SMU_RX_CTL(n, b, i)		(BGX_ADDR(n, b, i) + 0x20030)
+#define BGX_SMU_TX_APPEND(n, b, i)	(BGX_ADDR(n, b, i) + 0x20100)
+#define BGX_SMU_TX_MIN_PKT(n, b, i)	(BGX_ADDR(n, b, i) + 0x20118)
+#define BGX_SMU_TX_INT(n, b, i)		(BGX_ADDR(n, b, i) + 0x20140)
+#define BGX_SMU_TX_CTL(n, b, i)		(BGX_ADDR(n, b, i) + 0x20160)
+#define BGX_SMU_TX_THRESH(n, b, i)	(BGX_ADDR(n, b, i) + 0x20168)
+#define BGX_SMU_CTRL(n, b, i)		(BGX_ADDR(n, b, i) + 0x20200)
+
+#define BGX_GMP_PCS_MR_CONTROL(n, b, i)	(BGX_ADDR(n, b, i) + 0x30000)
+#define BGX_GMP_PCS_MR_STATUS(n, b, i)	(BGX_ADDR(n, b, i) + 0x30008)
+#define BGX_GMP_PCS_AN_ADV(n, b, i)	(BGX_ADDR(n, b, i) + 0x30010)
+#define BGX_GMP_PCS_LINK_TIMER(n, b, i)	(BGX_ADDR(n, b, i) + 0x30040)
+#define BGX_GMP_PCS_SGM_AN_ADV(n, b, i)	(BGX_ADDR(n, b, i) + 0x30068)
+#define BGX_GMP_PCS_MISC_CTL(n, b, i)	(BGX_ADDR(n, b, i) + 0x30078)
+#define BGX_GMP_GMI_PRT_CFG(n, b, i)	(BGX_ADDR(n, b, i) + 0x38010)
+#define BGX_GMP_GMI_RX_FRM_CTL(n, b, i)	(BGX_ADDR(n, b, i) + 0x38018)
+#define BGX_GMP_GMI_RX_JABBER(n, b, i)	(BGX_ADDR(n, b, i) + 0x38038)
+#define BGX_GMP_GMI_TX_THRESH(n, b, i)	(BGX_ADDR(n, b, i) + 0x38210)
+#define BGX_GMP_GMI_TX_APPEND(n, b, i)	(BGX_ADDR(n, b, i) + 0x38218)
+#define BGX_GMP_GMI_TX_SLOT(n, b, i)	(BGX_ADDR(n, b, i) + 0x38220)
+#define BGX_GMP_GMI_TX_BURST(n, b, i)	(BGX_ADDR(n, b, i) + 0x38228)
+#define BGX_GMP_GMI_TX_MIN_PKT(n, b, i)	(BGX_ADDR(n, b, i) + 0x38240)
+#define BGX_GMP_GMI_TX_SGMII_CTL(n, b, i) (BGX_ADDR(n, b, i) + 0x38300)
+
+/* Bgx register bitfields */
+#define BGX_CMR_CHAN_MSK_MASK			GENMASK_ULL(15, 0)
+#define BGX_CMR_CHAN_MSK_SHIFT			16
+#define BGX_CMR_CONFIG_DATA_PKT_RX_EN		BIT(14)
+#define BGX_CMR_CONFIG_DATA_PKT_TX_EN		BIT(13)
+#define BGX_CMR_CONFIG_ENABLE			BIT(15)
+#define BGX_CMR_CONFIG_LMAC_TYPE_MASK		GENMASK_ULL(10, 8)
+#define BGX_CMR_CONFIG_LMAC_TYPE_SHIFT		8
+#define BGX_CMR_CONFIG_MIX_EN			BIT(11)
 #define BGX_CMR_GLOBAL_CONFIG_CMR_MIX0_RST	BIT(3)
 #define BGX_CMR_GLOBAL_CONFIG_CMR_MIX1_RST	BIT(4)
 #define BGX_CMR_RX_ADR_CTL_ACCEPT_ALL_MCST	BIT(1)
@@ -23,128 +92,100 @@
 #define BGX_CMR_RX_ADR_CTL_CAM_ACCEPT		BIT(3)
 #define BGX_CMR_RX_ADR_CTL_MCST_MODE_MASK	GENMASK_ULL(2, 1)
 #define BGX_CMR_RX_ADR_CTL_USE_CAM_FILTER	BIT(2)
-#define BGX_CMR_RX_ADRX_CAM_EN		BIT(48)
-#define BGX_CMR_RX_ADRX_CAM_ID_SHIFT	52
-#define BGX_CMR_RX_FIFO_LEN_MASK	GENMASK_ULL(12, 0)
-#define BGX_CMR_RX_ID_MAP_PKND_MASK	GENMASK_ULL(7, 0)
-#define BGX_CMR_RX_ID_MAP_RID_MASK	GENMASK_ULL(14, 8)
-#define BGX_CMR_RX_ID_MAP_RID_SHIFT	8
-#define BGX_CMR_TX_FIFO_LEN_LMAC_IDLE	BIT(13)
-#define BGX_CMR_TX_LMACS_MASK		GENMASK_ULL(2, 0)
-#define BGX_GMP_GMI_PRT_CFG_DUPLEX	BIT(2)
-#define BGX_GMP_GMI_PRT_CFG_RX_IDLE	BIT(12)
-#define BGX_GMP_GMI_PRT_CFG_SLOTTIME	BIT(3)
-#define BGX_GMP_GMI_PRT_CFG_SPEED	BIT(1)
-#define BGX_GMP_GMI_PRT_CFG_SPEED_MSB	BIT(8)
-#define BGX_GMP_GMI_PRT_CFG_TX_IDLE	BIT(13)
+#define BGX_CMR_RX_ADRX_CAM_EN			BIT(48)
+#define BGX_CMR_RX_ADRX_CAM_ID_SHIFT		52
+#define BGX_CMR_RX_FIFO_LEN_MASK		GENMASK_ULL(12, 0)
+#define BGX_CMR_RX_ID_MAP_PKND_MASK		GENMASK_ULL(7, 0)
+#define BGX_CMR_RX_ID_MAP_RID_MASK		GENMASK_ULL(14, 8)
+#define BGX_CMR_RX_ID_MAP_RID_SHIFT		8
+#define BGX_CMR_TX_FIFO_LEN_LMAC_IDLE		BIT(13)
+#define BGX_CMR_TX_LMACS_MASK			GENMASK_ULL(2, 0)
+
+#define BGX_SPU_AN_ADV_FEC_ABLE			BIT(46)
+#define BGX_SPU_AN_ADV_FEC_REQ			BIT(47)
+#define BGX_SPU_AN_ADV_A100G_CR10		BIT(26)
+#define BGX_SPU_AN_ADV_A40G_CR4			BIT(25)
+#define BGX_SPU_AN_ADV_A40G_KR4			BIT(24)
+#define BGX_SPU_AN_ADV_A10G_KR			BIT(23)
+#define BGX_SPU_AN_ADV_A10G_KX4			BIT(22)
+#define BGX_SPU_AN_ADV_A1G_KX			BIT(21)
+#define BGX_SPU_AN_ADV_RF			BIT(13)
+#define BGX_SPU_AN_ADV_XNP_ABLE			BIT(12)
+#define BGX_SPU_AN_CONTROL_XNP_EN		BIT(13)
+#define BGX_SPU_AN_CONTROL_AN_EN		BIT(12)
+#define BGX_SPU_AN_CONTROL_AN_RESTART		BIT(9)
+#define BGX_SPU_AN_STATUS_AN_COMPLETE		BIT(5)
+#define BGX_SPU_BR_PMD_CONTROL_TRAIN_EN		BIT(1)
+#define BGX_SPU_BR_PMD_CONTROL_TRAIN_RESTART	BIT(0)
+#define BGX_SPU_BR_STATUS1_BLK_LOCK		BIT(0)
+#define BGX_SPU_BR_STATUS2_LATCHED_LOCK		BIT(15)
+#define BGX_SPU_BR_STATUS2_LATCHED_BER		BIT(14)
+#define BGX_SPU_BX_STATUS_ALIGND		BIT(12)
+#define BGX_SPU_CONTROL1_RESET			BIT(15)
+#define BGX_SPU_CONTROL1_LOOPBACK		BIT(14)
+#define BGX_SPU_CONTROL1_LO_PWR			BIT(11)
+#define BGX_SPU_DBG_CONTROL_US_CLK_PERIOD_MASK	GENMASK_ULL(43, 32)
+#define BGX_SPU_DBG_CONTROL_US_CLK_PERIOD_SHIFT	32
+#define BGX_SPU_DBG_CONTROL_AN_NONCE_MATCH_DIS	BIT(29)
+#define BGX_SPU_DBG_CONTROL_AN_ARB_LINK_CHK_EN	BIT(18)
+#define BGX_SPU_FEC_CONTROL_FEC_EN		BIT(0)
+#define BGX_SPU_INT_AN_COMPLETE			BIT(12)
+#define BGX_SPU_INT_AN_LINK_GOOD		BIT(11)
+#define BGX_SPU_INT_AN_PAGE_RX			BIT(10)
+#define BGX_SPU_INT_TRAINING_FAILURE		BIT(14)
+#define BGX_SPU_INT_TRAINING_DONE		BIT(13)
+#define BGX_SPU_MISC_CONTROL_RX_PACKET_DIS	BIT(12)
+#define BGX_SPU_MISC_CONTROL_INTLV_RDISP	BIT(10)
+#define BGX_SPU_STATUS1_RCV_LINK		BIT(2)
+#define BGX_SPU_STATUS2_RCVFLT			BIT(10)
+
+#define BGX_SMU_CTRL_TX_IDLE			BIT(1)
+#define BGX_SMU_CTRL_RX_IDLE			BIT(0)
+#define BGX_SMU_RX_CTL_STATUS_MASK		GENMASK_ULL(1, 0)
+#define BGX_SMU_TX_APPEND_FCS_C			BIT(3)
+#define BGX_SMU_TX_APPEND_FCS_D			BIT(2)
+#define BGX_SMU_TX_APPEND_PAD			BIT(1)
+#define BGX_SMU_TX_CTL_LS_MASK			GENMASK_ULL(5, 4)
+#define BGX_SMU_TX_CTL_UNI_EN			BIT(1)
+#define BGX_SMU_TX_CTL_DIC_EN			BIT(0)
+
+#define BGX_GMP_GMI_PRT_CFG_TX_IDLE		BIT(13)
+#define BGX_GMP_GMI_PRT_CFG_RX_IDLE		BIT(12)
+#define BGX_GMP_GMI_PRT_CFG_SPEED_MSB		BIT(8)
+#define BGX_GMP_GMI_PRT_CFG_SLOTTIME		BIT(3)
+#define BGX_GMP_GMI_PRT_CFG_DUPLEX		BIT(2)
+#define BGX_GMP_GMI_PRT_CFG_SPEED		BIT(1)
 #define BGX_GMP_GMI_TX_APPEND_FCS		BIT(2)
 #define BGX_GMP_GMI_TX_APPEND_PAD		BIT(1)
 #define BGX_GMP_GMI_TX_APPEND_PREAMBLE		BIT(0)
 #define BGX_GMP_GMI_TX_THRESH_DEFAULT		0x20
-#define BGX_GMP_PCS_AN_ADV_FULL_DUPLEX		BIT(5)
-#define BGX_GMP_PCS_AN_ADV_HALF_DUPLEX		BIT(6)
-#define BGX_GMP_PCS_AN_ADV_PAUSE_ASYMMETRIC	2
-#define BGX_GMP_PCS_AN_ADV_PAUSE_BOTH		3
-#define BGX_GMP_PCS_AN_ADV_PAUSE_MASK	GENMASK_ULL(8, 7)
+#define BGX_GMP_PCS_AN_ADV_REM_FLT_MASK		GENMASK_ULL(13, 12)
+#define BGX_GMP_PCS_AN_ADV_PAUSE_MASK		GENMASK_ULL(8, 7)
+#define BGX_GMP_PCS_AN_ADV_PAUSE_SHIFT		7
 #define BGX_GMP_PCS_AN_ADV_PAUSE_NONE		0
-#define BGX_GMP_PCS_AN_ADV_PAUSE_SHIFT	7
 #define BGX_GMP_PCS_AN_ADV_PAUSE_SYMMETRIC	1
-#define BGX_GMP_PCS_AN_ADV_REM_FLT_MASK	GENMASK_ULL(13, 12)
+#define BGX_GMP_PCS_AN_ADV_PAUSE_ASYMMETRIC	2
+#define BGX_GMP_PCS_AN_ADV_PAUSE_BOTH		3
+#define BGX_GMP_PCS_AN_ADV_HALF_DUPLEX		BIT(6)
+#define BGX_GMP_PCS_AN_ADV_FULL_DUPLEX		BIT(5)
 #define BGX_GMP_PCS_LINK_TIMER_COUNT_SHIFT	10
-#define BGX_GMP_PCS_MISC_CTL_GMXENO	BIT(11)
-#define BGX_GMP_PCS_MISC_CTL_MAC_PHY	BIT(9)
-#define BGX_GMP_PCS_MISC_CTL_MODE	BIT(8)
+#define BGX_GMP_PCS_MISC_CTL_GMXENO		BIT(11)
+#define BGX_GMP_PCS_MISC_CTL_MAC_PHY		BIT(9)
+#define BGX_GMP_PCS_MISC_CTL_MODE		BIT(8)
 #define BGX_GMP_PCS_MISC_CTL_SAMP_PT_MASK	GENMASK_ULL(6, 0)
-#define BGX_GMP_PCS_MR_CONTROL_AN_EN	BIT(12)
-#define BGX_GMP_PCS_MR_CONTROL_PWR_DN	BIT(11)
-#define BGX_GMP_PCS_MR_CONTROL_RESET	BIT(15)
-#define BGX_GMP_PCS_MR_CONTROL_RST_AN	BIT(9)
-#define BGX_GMP_PCS_MR_CONTROL_SPDLSB	BIT(13)
-#define BGX_GMP_PCS_MR_CONTROL_SPDMSB	BIT(6)
-#define BGX_GMP_PCS_MR_STATUS_AN_CPT	BIT(5)
+#define BGX_GMP_PCS_MR_CONTROL_RESET		BIT(15)
+#define BGX_GMP_PCS_MR_CONTROL_SPDLSB		BIT(13)
+#define BGX_GMP_PCS_MR_CONTROL_AN_EN		BIT(12)
+#define BGX_GMP_PCS_MR_CONTROL_PWR_DN		BIT(11)
+#define BGX_GMP_PCS_MR_CONTROL_RST_AN		BIT(9)
+#define BGX_GMP_PCS_MR_CONTROL_SPDMSB		BIT(6)
+#define BGX_GMP_PCS_MR_STATUS_AN_CPT		BIT(5)
 #define BGX_GMP_PCS_SGM_AN_ADV_DUPLEX_FULL	BIT(12)
-#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_10		0
-#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_10000	3
-#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_1000	2
-#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_100	1
 #define BGX_GMP_PCS_SGM_AN_ADV_SPEED_MASK	GENMASK_ULL(11, 10)
 #define BGX_GMP_PCS_SGM_AN_ADV_SPEED_SHIFT	10
-#define BGX_SMU_CTRL_RX_IDLE		BIT(0)
-#define BGX_SMU_CTRL_TX_IDLE		BIT(1)
-#define BGX_SMU_RX_CTL_STATUS_MASK	GENMASK_ULL(1, 0)
-#define BGX_SMU_TX_APPEND_FCS_C		BIT(3)
-#define BGX_SMU_TX_APPEND_FCS_D			BIT(2)
-#define BGX_SMU_TX_APPEND_PAD			BIT(1)
-#define BGX_SMU_TX_CTL_DIC_EN	BIT(0)
-#define BGX_SMU_TX_CTL_LS_MASK		GENMASK_ULL(5, 4)
-#define BGX_SMU_TX_CTL_UNI_EN	BIT(1)
-#define BGX_SPU_AN_ADV_A100G_CR10	BIT(26)
-#define BGX_SPU_AN_ADV_A10G_KR		BIT(23)
-#define BGX_SPU_AN_ADV_A10G_KX4		BIT(22)
-#define BGX_SPU_AN_ADV_A1G_KX		BIT(21)
-#define BGX_SPU_AN_ADV_A40G_CR4		BIT(25)
-#define BGX_SPU_AN_ADV_A40G_KR4		BIT(24)
-#define BGX_SPU_AN_ADV_FEC_ABLE		BIT(46)
-#define BGX_SPU_AN_ADV_FEC_REQ		BIT(47)
-#define BGX_SPU_AN_ADV_RF		BIT(13)
-#define BGX_SPU_AN_ADV_XNP_ABLE		BIT(12)
-#define BGX_SPU_AN_CONTROL_AN_EN	BIT(12)
-#define BGX_SPU_AN_CONTROL_AN_RESTART	BIT(9)
-#define BGX_SPU_AN_CONTROL_XNP_EN	BIT(13)
-#define BGX_SPU_AN_STATUS_AN_COMPLETE	BIT(5)
-#define BGX_SPU_BR_PMD_CONTROL_TRAIN_EN		BIT(1)
-#define BGX_SPU_BR_PMD_CONTROL_TRAIN_RESTART	BIT(0)
-#define BGX_SPU_BR_STATUS1_BLK_LOCK		BIT(0)
-#define BGX_SPU_BR_STATUS2_LATCHED_BER		BIT(14)
-#define BGX_SPU_BR_STATUS2_LATCHED_LOCK		BIT(15)
-#define BGX_SPU_BX_STATUS_ALIGND	BIT(12)
-#define BGX_SPU_CONTROL1_LOOPBACK	BIT(14)
-#define BGX_SPU_CONTROL1_LO_PWR	BIT(11)
-#define BGX_SPU_CONTROL1_RESET	BIT(15)
-#define BGX_SPU_DBG_CONTROL_AN_ARB_LINK_CHK_EN	BIT(18)
-#define BGX_SPU_DBG_CONTROL_AN_NONCE_MATCH_DIS	BIT(29)
-#define BGX_SPU_DBG_CONTROL_US_CLK_PERIOD_MASK	GENMASK_ULL(43, 32)
-#define BGX_SPU_DBG_CONTROL_US_CLK_PERIOD_SHIFT	32
-#define BGX_SPU_FEC_CONTROL_FEC_EN	BIT(0)
-#define BGX_SPU_INT_AN_COMPLETE		BIT(12)
-#define BGX_SPU_INT_AN_LINK_GOOD	BIT(11)
-#define BGX_SPU_INT_AN_PAGE_RX		BIT(10)
-#define BGX_SPU_INT_TRAINING_DONE	BIT(13)
-#define BGX_SPU_INT_TRAINING_FAILURE	BIT(14)
-#define BGX_SPU_MISC_CONTROL_INTLV_RDISP	BIT(10)
-#define BGX_SPU_MISC_CONTROL_RX_PACKET_DIS	BIT(12)
-#define BGX_SPU_STATUS1_RCV_LINK	BIT(2)
-#define BGX_SPU_STATUS2_RCVFLT		BIT(10)
-#define GSER_BR_RX_CTL_RXT_EER		BIT(15)
-#define GSER_BR_RX_CTL_RXT_ESV		BIT(14)
-#define GSER_BR_RX_CTL_RXT_SWM		BIT(2)
-#define GSER_BR_RX_EER_RXT_EER         BIT(15)
-#define GSER_BR_RX_EER_RXT_ESV         BIT(14)
-#define GSER_LANE_LBERT_CFG_LBERT_PM_EN		BIT(6)
-#define GSER_LANE_MODE_LMODE_MASK	GENMASK_ULL(3, 0)
-#define GSER_LANE_MODE_MASK	GENMASK_ULL(3, 0)
-#define GSER_LANE_PCS_CTLIFC_0_CFG_TX_COEFF_REQ_OVRRD_VAL	BIT(12)
-#define GSER_LANE_PCS_CTLIFC_2_CFG_TX_COEFF_REQ_OVRRD_EN	BIT(7)
-#define GSER_LANE_PCS_CTLIFC_2_CTLIFC_OVRRD_REQ			BIT(15)
-#define GSER_LANE_P_MODE_1_VMA_MM	BIT(14)
-#define GSER_PHY_CTL_PHY_PD	BIT(0)
-#define GSER_PHY_CTL_PHY_RESET	BIT(1)
-#define GSER_RX_EIE_DETSTS_CDRLOCK_SHIFT	8
-#define XCV_BATCH_CRD_RET_CRD_RET	BIT(0)
-#define XCV_COMP_CTL_DRV_BYP		BIT(63)
-#define XCV_CTL_LPBK_INT	BIT(2)
-#define XCV_CTL_SPEED_MASK	GENMASK_ULL(1, 0)
-#define XCV_DLL_CTL_CLKRX_BYP		BIT(23)
-#define XCV_DLL_CTL_CLKRX_SET_MASK	GENMASK_ULL(22, 16)
-#define XCV_DLL_CTL_CLKTX_BYP		BIT(15)
-#define XCV_DLL_CTL_REFCLK_SEL_MASK	GENMASK_ULL(1, 0)
-#define XCV_RESET_CLKRST	BIT(15)
-#define XCV_RESET_COMP		BIT(7)
-#define XCV_RESET_DLLRST	BIT(11)
-#define XCV_RESET_ENABLE	BIT(63)
-#define XCV_RESET_RX_DAT_RST_N	BIT(0)
-#define XCV_RESET_RX_PKT_RST_N	BIT(1)
-#define XCV_RESET_TX_DAT_RST_N	BIT(2)
-#define XCV_RESET_TX_PKT_RST_N	BIT(3)
+#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_10		0
+#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_100	1
+#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_1000	2
+#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_10000	3
 
 #endif /* _OCTEON3_BGX_H_ */
-- 
2.1.4

^ permalink raw reply related

* [PATCH v12 02/10] netdev: cavium: octeon: Header for Octeon III BGX Ethernet
From: Steven J. Hill @ 2018-06-27 21:25 UTC (permalink / raw)
  To: netdev; +Cc: Carlos Munoz, Chandrakala Chavva, Steven J. Hill
In-Reply-To: <1530134719-19407-1-git-send-email-steven.hill@cavium.com>

From: Carlos Munoz <cmunoz@cavium.com>

Add the common header file used by the Octeon III BGX Ethernet
driver.

Signed-off-by: Carlos Munoz <cmunoz@cavium.com>
Signed-off-by: Steven J. Hill <Steven.Hill@cavium.com>
---
 drivers/net/ethernet/cavium/octeon/octeon3-bgx.h | 150 +++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3.h     | 330 +++++++++++++++++++++++
 2 files changed, 480 insertions(+)
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx.h
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3.h

diff --git a/drivers/net/ethernet/cavium/octeon/octeon3-bgx.h b/drivers/net/ethernet/cavium/octeon/octeon3-bgx.h
new file mode 100644
index 0000000..df794f5
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3-bgx.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Octeon III BGX Nexus Ethernet driver
+ *
+ * Copyright (C) 2018 Cavium, Inc.
+ */
+#ifndef _OCTEON3_BGX_H_
+#define _OCTEON3_BGX_H_
+
+#include <linux/bitops.h>
+
+#define BGX_CMR_CHAN_MSK_MASK		GENMASK_ULL(15, 0)
+#define BGX_CMR_CHAN_MSK_SHIFT		16
+#define BGX_CMR_CONFIG_DATA_PKT_RX_EN	BIT(14)
+#define BGX_CMR_CONFIG_DATA_PKT_TX_EN	BIT(13)
+#define BGX_CMR_CONFIG_ENABLE	BIT(15)
+#define BGX_CMR_CONFIG_LMAC_TYPE_MASK	GENMASK_ULL(10, 8)
+#define BGX_CMR_CONFIG_LMAC_TYPE_SHIFT	8
+#define BGX_CMR_CONFIG_MIX_EN		BIT(11)
+#define BGX_CMR_GLOBAL_CONFIG_CMR_MIX0_RST	BIT(3)
+#define BGX_CMR_GLOBAL_CONFIG_CMR_MIX1_RST	BIT(4)
+#define BGX_CMR_RX_ADR_CTL_ACCEPT_ALL_MCST	BIT(1)
+#define BGX_CMR_RX_ADR_CTL_BCST_ACCEPT		BIT(0)
+#define BGX_CMR_RX_ADR_CTL_CAM_ACCEPT		BIT(3)
+#define BGX_CMR_RX_ADR_CTL_MCST_MODE_MASK	GENMASK_ULL(2, 1)
+#define BGX_CMR_RX_ADR_CTL_USE_CAM_FILTER	BIT(2)
+#define BGX_CMR_RX_ADRX_CAM_EN		BIT(48)
+#define BGX_CMR_RX_ADRX_CAM_ID_SHIFT	52
+#define BGX_CMR_RX_FIFO_LEN_MASK	GENMASK_ULL(12, 0)
+#define BGX_CMR_RX_ID_MAP_PKND_MASK	GENMASK_ULL(7, 0)
+#define BGX_CMR_RX_ID_MAP_RID_MASK	GENMASK_ULL(14, 8)
+#define BGX_CMR_RX_ID_MAP_RID_SHIFT	8
+#define BGX_CMR_TX_FIFO_LEN_LMAC_IDLE	BIT(13)
+#define BGX_CMR_TX_LMACS_MASK		GENMASK_ULL(2, 0)
+#define BGX_GMP_GMI_PRT_CFG_DUPLEX	BIT(2)
+#define BGX_GMP_GMI_PRT_CFG_RX_IDLE	BIT(12)
+#define BGX_GMP_GMI_PRT_CFG_SLOTTIME	BIT(3)
+#define BGX_GMP_GMI_PRT_CFG_SPEED	BIT(1)
+#define BGX_GMP_GMI_PRT_CFG_SPEED_MSB	BIT(8)
+#define BGX_GMP_GMI_PRT_CFG_TX_IDLE	BIT(13)
+#define BGX_GMP_GMI_TX_APPEND_FCS		BIT(2)
+#define BGX_GMP_GMI_TX_APPEND_PAD		BIT(1)
+#define BGX_GMP_GMI_TX_APPEND_PREAMBLE		BIT(0)
+#define BGX_GMP_GMI_TX_THRESH_DEFAULT		0x20
+#define BGX_GMP_PCS_AN_ADV_FULL_DUPLEX		BIT(5)
+#define BGX_GMP_PCS_AN_ADV_HALF_DUPLEX		BIT(6)
+#define BGX_GMP_PCS_AN_ADV_PAUSE_ASYMMETRIC	2
+#define BGX_GMP_PCS_AN_ADV_PAUSE_BOTH		3
+#define BGX_GMP_PCS_AN_ADV_PAUSE_MASK	GENMASK_ULL(8, 7)
+#define BGX_GMP_PCS_AN_ADV_PAUSE_NONE		0
+#define BGX_GMP_PCS_AN_ADV_PAUSE_SHIFT	7
+#define BGX_GMP_PCS_AN_ADV_PAUSE_SYMMETRIC	1
+#define BGX_GMP_PCS_AN_ADV_REM_FLT_MASK	GENMASK_ULL(13, 12)
+#define BGX_GMP_PCS_LINK_TIMER_COUNT_SHIFT	10
+#define BGX_GMP_PCS_MISC_CTL_GMXENO	BIT(11)
+#define BGX_GMP_PCS_MISC_CTL_MAC_PHY	BIT(9)
+#define BGX_GMP_PCS_MISC_CTL_MODE	BIT(8)
+#define BGX_GMP_PCS_MISC_CTL_SAMP_PT_MASK	GENMASK_ULL(6, 0)
+#define BGX_GMP_PCS_MR_CONTROL_AN_EN	BIT(12)
+#define BGX_GMP_PCS_MR_CONTROL_PWR_DN	BIT(11)
+#define BGX_GMP_PCS_MR_CONTROL_RESET	BIT(15)
+#define BGX_GMP_PCS_MR_CONTROL_RST_AN	BIT(9)
+#define BGX_GMP_PCS_MR_CONTROL_SPDLSB	BIT(13)
+#define BGX_GMP_PCS_MR_CONTROL_SPDMSB	BIT(6)
+#define BGX_GMP_PCS_MR_STATUS_AN_CPT	BIT(5)
+#define BGX_GMP_PCS_SGM_AN_ADV_DUPLEX_FULL	BIT(12)
+#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_10		0
+#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_10000	3
+#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_1000	2
+#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_100	1
+#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_MASK	GENMASK_ULL(11, 10)
+#define BGX_GMP_PCS_SGM_AN_ADV_SPEED_SHIFT	10
+#define BGX_SMU_CTRL_RX_IDLE		BIT(0)
+#define BGX_SMU_CTRL_TX_IDLE		BIT(1)
+#define BGX_SMU_RX_CTL_STATUS_MASK	GENMASK_ULL(1, 0)
+#define BGX_SMU_TX_APPEND_FCS_C		BIT(3)
+#define BGX_SMU_TX_APPEND_FCS_D			BIT(2)
+#define BGX_SMU_TX_APPEND_PAD			BIT(1)
+#define BGX_SMU_TX_CTL_DIC_EN	BIT(0)
+#define BGX_SMU_TX_CTL_LS_MASK		GENMASK_ULL(5, 4)
+#define BGX_SMU_TX_CTL_UNI_EN	BIT(1)
+#define BGX_SPU_AN_ADV_A100G_CR10	BIT(26)
+#define BGX_SPU_AN_ADV_A10G_KR		BIT(23)
+#define BGX_SPU_AN_ADV_A10G_KX4		BIT(22)
+#define BGX_SPU_AN_ADV_A1G_KX		BIT(21)
+#define BGX_SPU_AN_ADV_A40G_CR4		BIT(25)
+#define BGX_SPU_AN_ADV_A40G_KR4		BIT(24)
+#define BGX_SPU_AN_ADV_FEC_ABLE		BIT(46)
+#define BGX_SPU_AN_ADV_FEC_REQ		BIT(47)
+#define BGX_SPU_AN_ADV_RF		BIT(13)
+#define BGX_SPU_AN_ADV_XNP_ABLE		BIT(12)
+#define BGX_SPU_AN_CONTROL_AN_EN	BIT(12)
+#define BGX_SPU_AN_CONTROL_AN_RESTART	BIT(9)
+#define BGX_SPU_AN_CONTROL_XNP_EN	BIT(13)
+#define BGX_SPU_AN_STATUS_AN_COMPLETE	BIT(5)
+#define BGX_SPU_BR_PMD_CONTROL_TRAIN_EN		BIT(1)
+#define BGX_SPU_BR_PMD_CONTROL_TRAIN_RESTART	BIT(0)
+#define BGX_SPU_BR_STATUS1_BLK_LOCK		BIT(0)
+#define BGX_SPU_BR_STATUS2_LATCHED_BER		BIT(14)
+#define BGX_SPU_BR_STATUS2_LATCHED_LOCK		BIT(15)
+#define BGX_SPU_BX_STATUS_ALIGND	BIT(12)
+#define BGX_SPU_CONTROL1_LOOPBACK	BIT(14)
+#define BGX_SPU_CONTROL1_LO_PWR	BIT(11)
+#define BGX_SPU_CONTROL1_RESET	BIT(15)
+#define BGX_SPU_DBG_CONTROL_AN_ARB_LINK_CHK_EN	BIT(18)
+#define BGX_SPU_DBG_CONTROL_AN_NONCE_MATCH_DIS	BIT(29)
+#define BGX_SPU_DBG_CONTROL_US_CLK_PERIOD_MASK	GENMASK_ULL(43, 32)
+#define BGX_SPU_DBG_CONTROL_US_CLK_PERIOD_SHIFT	32
+#define BGX_SPU_FEC_CONTROL_FEC_EN	BIT(0)
+#define BGX_SPU_INT_AN_COMPLETE		BIT(12)
+#define BGX_SPU_INT_AN_LINK_GOOD	BIT(11)
+#define BGX_SPU_INT_AN_PAGE_RX		BIT(10)
+#define BGX_SPU_INT_TRAINING_DONE	BIT(13)
+#define BGX_SPU_INT_TRAINING_FAILURE	BIT(14)
+#define BGX_SPU_MISC_CONTROL_INTLV_RDISP	BIT(10)
+#define BGX_SPU_MISC_CONTROL_RX_PACKET_DIS	BIT(12)
+#define BGX_SPU_STATUS1_RCV_LINK	BIT(2)
+#define BGX_SPU_STATUS2_RCVFLT		BIT(10)
+#define GSER_BR_RX_CTL_RXT_EER		BIT(15)
+#define GSER_BR_RX_CTL_RXT_ESV		BIT(14)
+#define GSER_BR_RX_CTL_RXT_SWM		BIT(2)
+#define GSER_BR_RX_EER_RXT_EER         BIT(15)
+#define GSER_BR_RX_EER_RXT_ESV         BIT(14)
+#define GSER_LANE_LBERT_CFG_LBERT_PM_EN		BIT(6)
+#define GSER_LANE_MODE_LMODE_MASK	GENMASK_ULL(3, 0)
+#define GSER_LANE_MODE_MASK	GENMASK_ULL(3, 0)
+#define GSER_LANE_PCS_CTLIFC_0_CFG_TX_COEFF_REQ_OVRRD_VAL	BIT(12)
+#define GSER_LANE_PCS_CTLIFC_2_CFG_TX_COEFF_REQ_OVRRD_EN	BIT(7)
+#define GSER_LANE_PCS_CTLIFC_2_CTLIFC_OVRRD_REQ			BIT(15)
+#define GSER_LANE_P_MODE_1_VMA_MM	BIT(14)
+#define GSER_PHY_CTL_PHY_PD	BIT(0)
+#define GSER_PHY_CTL_PHY_RESET	BIT(1)
+#define GSER_RX_EIE_DETSTS_CDRLOCK_SHIFT	8
+#define XCV_BATCH_CRD_RET_CRD_RET	BIT(0)
+#define XCV_COMP_CTL_DRV_BYP		BIT(63)
+#define XCV_CTL_LPBK_INT	BIT(2)
+#define XCV_CTL_SPEED_MASK	GENMASK_ULL(1, 0)
+#define XCV_DLL_CTL_CLKRX_BYP		BIT(23)
+#define XCV_DLL_CTL_CLKRX_SET_MASK	GENMASK_ULL(22, 16)
+#define XCV_DLL_CTL_CLKTX_BYP		BIT(15)
+#define XCV_DLL_CTL_REFCLK_SEL_MASK	GENMASK_ULL(1, 0)
+#define XCV_RESET_CLKRST	BIT(15)
+#define XCV_RESET_COMP		BIT(7)
+#define XCV_RESET_DLLRST	BIT(11)
+#define XCV_RESET_ENABLE	BIT(63)
+#define XCV_RESET_RX_DAT_RST_N	BIT(0)
+#define XCV_RESET_RX_PKT_RST_N	BIT(1)
+#define XCV_RESET_TX_DAT_RST_N	BIT(2)
+#define XCV_RESET_TX_PKT_RST_N	BIT(3)
+
+#endif /* _OCTEON3_BGX_H_ */
diff --git a/drivers/net/ethernet/cavium/octeon/octeon3.h b/drivers/net/ethernet/cavium/octeon/octeon3.h
new file mode 100644
index 0000000..99c092d
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3.h
@@ -0,0 +1,330 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Octeon III BGX Ethernet Driver
+ *
+ * Copyright (C) 2018 Cavium, Inc.
+ */
+#ifndef _OCTEON3_H_
+#define _OCTEON3_H_
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+
+#include <asm/octeon/octeon.h>
+
+#include "octeon3-bgx.h"
+#include "octeon3-pki.h"
+#include "octeon3-pko.h"
+#include "octeon3-sso.h"
+
+#define MAX_CORES			48
+#define MAX_NODES			2
+#define NODE_MASK			(MAX_NODES - 1)
+#define MAX_BGX_PER_NODE		6
+#define MAX_LMAC_PER_BGX		4
+
+#define IOBDMA_ORDERED_IO_ADDR		0xffffffffffffa200ull
+#define LMTDMA_ORDERED_IO_ADDR		0xffffffffffffa400ull
+#define SCRATCH_BASE_ADDR		0xffffffffffff8000ull
+
+#define PKO_LMTLINE			2ull
+#define LMTDMA_SCR_OFFSET		(PKO_LMTLINE * CVMX_CACHE_LINE_SIZE)
+#define PKO_LMTDMA_SCRADDR_SHIFT	3
+
+/* Registers are accessed via xkphys */
+#define SET_XKPHYS			BIT_ULL(63)
+#define NODE_OFFSET(node)		((node) * 0x1000000000ull)
+
+/* DPI registers */
+#define DPI_BASE			0x1df0000000000ull
+#define DPI_ADDR(n)			(DPI_BASE + SET_XKPHYS + NODE_OFFSET(n))
+#define DPI_CTL(n)			(DPI_ADDR(n) + 0x00040)
+
+/* Gser register definitions */
+#define GSER_BASE			0x1180090000000ull
+#define GSER_ADDR(n, gser)		(GSER_BASE + SET_XKPHYS +	       \
+					 NODE_OFFSET(n) + ((gser) << 24))
+#define GSER_LANE_OFFSET(lane)		((lane) << 20)
+#define GSER_LANE_OFFSET1(lane)		((lane) << 7)
+#define GSER_LANE_OFFSET2(lane)		((lane) << 5)
+
+#define GSER_LANE_ADDR(n, g, l)		(GSER_ADDR(n, g) + GSER_LANE_OFFSET(l))
+#define GSER_LANE_ADDR1(n, g, l)	(GSER_ADDR(n, g) + GSER_LANE_OFFSET1(l))
+#define GSER_LANE_ADDR2(n, g, l)	(GSER_ADDR(n, g) + GSER_LANE_OFFSET2(l))
+
+#define GSER_PHY_CTL(n, g)		(GSER_LANE_ADDR(n, g, 0) + 0x000000)
+#define GSER_CFG(n, g)			(GSER_LANE_ADDR(n, g, 0) + 0x000080)
+#define GSER_LANE_MODE(n, g)		(GSER_LANE_ADDR(n, g, 0) + 0x000118)
+#define GSER_RX_EIE_DETSTS(n, g)	(GSER_LANE_ADDR(n, g, 0) + 0x000150)
+#define GSER_LANE_LBERT_CFG(n, g, l)	(GSER_LANE_ADDR(n, g, l) + 0x4c0020)
+#define GSER_LANE_PCS_CTLIFC_0(n, g, l)	(GSER_LANE_ADDR(n, g, l) + 0x4c0060)
+#define GSER_LANE_PCS_CTLIFC_2(n, g, l)	(GSER_LANE_ADDR(n, g, l) + 0x4c0070)
+#define GSER_BR_RX_CTL(n, g, l)		(GSER_LANE_ADDR1(n, g, l) + 0x000400)
+#define GSER_BR_RX_EER(n, g, l)		(GSER_LANE_ADDR1(n, g, l) + 0x000418)
+#define GSER_LANE_P_MODE_1(n, g, m)	(GSER_LANE_ADDR2(n, g, m) + 0x4e0048)
+
+/* Gser register bitfields */
+#define GSER_PHY_CTL_PHY_RESET		BIT(1)
+#define GSER_PHY_CTL_PHY_PD		BIT(0)
+#define GSER_CFG_BGX			BIT(2)
+#define GSER_LANE_MODE_LMODE_MASK	GENMASK_ULL(3, 0)
+#define GSER_RX_EIE_DETSTS_CDRLCK_SHIFT	8
+#define GSER_LANE_LBERT_CFG_LBERT_PM_EN	BIT(6)
+#define GSER_LANE_PCS_CTLIFC_0_CFG_TX_COEFF_REQ_OVRRD_VAL	BIT(12)
+#define GSER_LANE_PCS_CTLIFC_2_CTLIFC_OVRRD_REQ			BIT(15)
+#define GSER_LANE_PCS_CTLIFC_2_CFG_TX_COEFF_REQ_OVRRD_EN	BIT(7)
+#define GSER_BR_RX_CTL_RXT_EER		BIT(15)
+#define GSER_BR_RX_CTL_RXT_ESV		BIT(14)
+#define GSER_BR_RX_CTL_RXT_SWM		BIT(2)
+#define GSER_BR_RX_EER_RXT_EER		BIT(15)
+#define GSER_BR_RX_EER_RXT_ESV		BIT(14)
+#define GSER_LANE_P_MODE_1_VMA_MM	BIT(14)
+
+/* XCV register definitions */
+#define XCV_BASE			0x11800db000000ull
+#define XCV_ADDR(n)			(XCV_BASE + SET_XKPHYS + NODE_OFFSET(n))
+
+#define XCV_RESET(n)			(XCV_ADDR(n) + 0x0000)
+#define XCV_DLL_CTL(n)			(XCV_ADDR(n) + 0x0010)
+#define XCV_COMP_CTL(n)			(XCV_ADDR(n) + 0x0020)
+#define XCV_CTL(n)			(XCV_ADDR(n) + 0x0030)
+#define XCV_INT(n)			(XCV_ADDR(n) + 0x0040)
+#define XCV_INBND_STATUS(n)		(XCV_ADDR(n) + 0x0080)
+#define XCV_BATCH_CRD_RET(n)		(XCV_ADDR(n) + 0x0100)
+
+/* XCV register bitfields */
+#define XCV_RESET_ENABLE		BIT(63)
+#define XCV_RESET_CLKRST		BIT(15)
+#define XCV_RESET_DLLRST		BIT(11)
+#define XCV_RESET_COMP			BIT(7)
+#define XCV_RESET_TX_PKT_RST_N		BIT(3)
+#define XCV_RESET_TX_DAT_RST_N		BIT(2)
+#define XCV_RESET_RX_PKT_RST_N		BIT(1)
+#define XCV_RESET_RX_DAT_RST_N		BIT(0)
+#define XCV_DLL_CTL_CLKRX_BYP		BIT(23)
+#define XCV_DLL_CTL_CLKRX_SET_MASK	GENMASK_ULL(22, 16)
+#define XCV_DLL_CTL_CLKTX_BYP		BIT(15)
+#define XCV_DLL_CTL_REFCLK_SEL_MASK	GENMASK_ULL(1, 0)
+#define XCV_COMP_CTL_DRV_BYP		BIT(63)
+#define XCV_CTL_LPBK_INT		BIT(2)
+#define XCV_CTL_SPEED_MASK		GENMASK_ULL(1, 0)
+#define XCV_BATCH_CRD_RET_CRD_RET	BIT(0)
+
+enum octeon3_mac_type {
+	BGX_MAC,
+	SRIO_MAC
+};
+
+enum octeon3_src_type {
+	QLM,
+	XCV
+};
+
+struct mac_platform_data {
+	enum octeon3_mac_type mac_type;
+	enum octeon3_src_type src_type;
+	int interface;
+	int numa_node;
+	int port;
+};
+
+struct bgx_port_netdev_priv {
+	struct bgx_port_priv *bgx_priv;
+};
+
+union wqe_word0 {
+	u64 u64;
+	struct {
+		__BITFIELD_FIELD(u64 rsvd_0:4,
+		__BITFIELD_FIELD(u64 aura:12,
+		__BITFIELD_FIELD(u64 rsvd_1:1,
+		__BITFIELD_FIELD(u64 apad:3,
+		__BITFIELD_FIELD(u64 channel:12,
+		__BITFIELD_FIELD(u64 bufs:8,
+		__BITFIELD_FIELD(u64 style:8,
+		__BITFIELD_FIELD(u64 rsvd_2:10,
+		__BITFIELD_FIELD(u64 pknd:6,
+		;)))))))))
+	};
+};
+
+union wqe_word1 {
+	u64 u64;
+	struct {
+		__BITFIELD_FIELD(u64 len:16,
+		__BITFIELD_FIELD(u64 rsvd_0:2,
+		__BITFIELD_FIELD(u64 rsvd_1:2,
+		__BITFIELD_FIELD(u64 grp:10,
+		__BITFIELD_FIELD(u64 tag_type:2,
+		__BITFIELD_FIELD(u64 tag:32,
+		;))))))
+	};
+};
+
+union wqe_word2 {
+	u64 u64;
+	struct {
+		__BITFIELD_FIELD(u64 software:1,
+		__BITFIELD_FIELD(u64 lg_hdr_type:5,
+		__BITFIELD_FIELD(u64 lf_hdr_type:5,
+		__BITFIELD_FIELD(u64 le_hdr_type:5,
+		__BITFIELD_FIELD(u64 ld_hdr_type:5,
+		__BITFIELD_FIELD(u64 lc_hdr_type:5,
+		__BITFIELD_FIELD(u64 lb_hdr_type:5,
+		__BITFIELD_FIELD(u64 is_la_ether:1,
+		__BITFIELD_FIELD(u64 rsvd_0:8,
+		__BITFIELD_FIELD(u64 vlan_valid:1,
+		__BITFIELD_FIELD(u64 vlan_stacked:1,
+		__BITFIELD_FIELD(u64 stat_inc:1,
+		__BITFIELD_FIELD(u64 pcam_flag4:1,
+		__BITFIELD_FIELD(u64 pcam_flag3:1,
+		__BITFIELD_FIELD(u64 pcam_flag2:1,
+		__BITFIELD_FIELD(u64 pcam_flag1:1,
+		__BITFIELD_FIELD(u64 is_frag:1,
+		__BITFIELD_FIELD(u64 is_l3_bcast:1,
+		__BITFIELD_FIELD(u64 is_l3_mcast:1,
+		__BITFIELD_FIELD(u64 is_l2_bcast:1,
+		__BITFIELD_FIELD(u64 is_l2_mcast:1,
+		__BITFIELD_FIELD(u64 is_raw:1,
+		__BITFIELD_FIELD(u64 err_level:3,
+		__BITFIELD_FIELD(u64 err_code:8,
+		;))))))))))))))))))))))))
+	};
+};
+
+union buf_ptr {
+	u64 u64;
+	struct {
+		__BITFIELD_FIELD(u64 size:16,
+		__BITFIELD_FIELD(u64 packet_outside_wqe:1,
+		__BITFIELD_FIELD(u64 rsvd0:5,
+		__BITFIELD_FIELD(u64 addr:42,
+		;))))
+	};
+};
+
+union wqe_word4 {
+	u64 u64;
+	struct {
+		__BITFIELD_FIELD(u64 ptr_vlan:8,
+		__BITFIELD_FIELD(u64 ptr_layer_g:8,
+		__BITFIELD_FIELD(u64 ptr_layer_f:8,
+		__BITFIELD_FIELD(u64 ptr_layer_e:8,
+		__BITFIELD_FIELD(u64 ptr_layer_d:8,
+		__BITFIELD_FIELD(u64 ptr_layer_c:8,
+		__BITFIELD_FIELD(u64 ptr_layer_b:8,
+		__BITFIELD_FIELD(u64 ptr_layer_a:8,
+		;))))))))
+	};
+};
+
+struct wqe {
+	union wqe_word0	word0;
+	union wqe_word1	word1;
+	union wqe_word2	word2;
+	union buf_ptr packet_ptr;
+	union wqe_word4	word4;
+	u64 wqe_data[11];
+};
+
+enum port_mode {
+	PORT_MODE_DISABLED,
+	PORT_MODE_SGMII,
+	PORT_MODE_RGMII,
+	PORT_MODE_XAUI,
+	PORT_MODE_RXAUI,
+	PORT_MODE_XLAUI,
+	PORT_MODE_XFI,
+	PORT_MODE_10G_KR,
+	PORT_MODE_40G_KR4
+};
+
+enum lane_mode {
+	R_25G_REFCLK100,
+	R_5G_REFCLK100,
+	R_8G_REFCLK100,
+	R_125G_REFCLK15625_KX,
+	R_3125G_REFCLK15625_XAUI,
+	R_103125G_REFCLK15625_KR,
+	R_125G_REFCLK15625_SGMII,
+	R_5G_REFCLK15625_QSGMII,
+	R_625G_REFCLK15625_RXAUI,
+	R_25G_REFCLK125,
+	R_5G_REFCLK125,
+	R_8G_REFCLK125
+};
+
+struct port_status {
+	int link;
+	int duplex;
+	int speed;
+};
+
+static inline u64 oct_csr_read(u64 addr)
+{
+	return __raw_readq((void __iomem *)addr);
+}
+
+static inline void oct_csr_write(u64 data, u64 addr)
+{
+	__raw_writeq(data, (void __iomem *)addr);
+}
+
+extern int ilk0_lanes;
+extern int ilk1_lanes;
+
+void bgx_nexus_load(void);
+
+int bgx_port_allocate_pknd(int node);
+int bgx_port_get_pknd(int node, int bgx, int index);
+enum port_mode bgx_port_get_mode(int node, int bgx, int index);
+int bgx_port_get_qlm(int node, int bgx, int index);
+void bgx_port_set_netdev(struct device *dev, struct net_device *netdev);
+int bgx_port_enable(struct net_device *netdev);
+int bgx_port_disable(struct net_device *netdev);
+const u8 *bgx_port_get_mac(struct net_device *netdev);
+void bgx_port_set_rx_filtering(struct net_device *netdev);
+int bgx_port_change_mtu(struct net_device *netdev, int new_mtu);
+int bgx_port_ethtool_get_link_ksettings(struct net_device *netdev,
+					struct ethtool_link_ksettings *cmd);
+int bgx_port_ethtool_get_settings(struct net_device *netdev,
+				  struct ethtool_cmd *cmd);
+int bgx_port_ethtool_set_settings(struct net_device *netdev,
+				  struct ethtool_cmd *cmd);
+int bgx_port_ethtool_nway_reset(struct net_device *netdev);
+int bgx_port_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd);
+
+void bgx_port_mix_assert_reset(struct net_device *netdev, int mix, bool v);
+
+int octeon3_pki_vlan_init(int node);
+int octeon3_pki_cluster_init(int node, struct platform_device *pdev);
+int octeon3_pki_ltype_init(int node);
+int octeon3_pki_enable(int node);
+int octeon3_pki_port_init(int node, int aura, int grp, int skip, int mb_size,
+			  int pknd, int num_rx_cxt);
+int octeon3_pki_get_stats(int node, int pknd, u64 *packets, u64 *octets,
+			  u64 *dropped);
+int octeon3_pki_set_ptp_skip(int node, int pknd, int skip);
+int octeon3_pki_port_shutdown(int node, int pknd);
+void octeon3_pki_shutdown(int node);
+
+void octeon3_sso_pass1_limit(int node, int grp);
+int octeon3_sso_init(int node, int aura);
+void octeon3_sso_shutdown(int node, int aura);
+int octeon3_sso_alloc_groups(int node, int *groups, int cnt, int start);
+void octeon3_sso_free_groups(int node, int *groups, int cnt);
+void octeon3_sso_irq_set(int node, int grp, bool en);
+
+int octeon3_pko_interface_init(int node, int interface, int index,
+			       enum octeon3_mac_type mac_type, int ipd_port);
+int octeon3_pko_activate_dq(int node, int dq, int cnt);
+int octeon3_pko_get_fifo_size(int node, int interface, int index,
+			      enum octeon3_mac_type mac_type);
+int octeon3_pko_set_mac_options(int node, int interface, int index,
+				enum octeon3_mac_type mac_type, bool fcs_en,
+				bool pad_en, int fcs_sop_off);
+int octeon3_pko_init_global(int node, int aura);
+int octeon3_pko_interface_uninit(int node, const int *dq, int num_dq);
+int octeon3_pko_exit_global(int node);
+
+#endif /* _OCTEON3_H_ */
-- 
2.1.4

^ permalink raw reply related

* [PATCH v12 01/10] dt-bindings: Add Cavium Octeon Common Ethernet Interface.
From: Steven J. Hill @ 2018-06-27 21:25 UTC (permalink / raw)
  To: netdev; +Cc: Carlos Munoz, Chandrakala Chavva, Steven J. Hill
In-Reply-To: <1530134719-19407-1-git-send-email-steven.hill@cavium.com>

From: Carlos Munoz <cmunoz@cavium.com>

Add bindings for Common Ethernet Interface (BGX) block.

Signed-off-by: Carlos Munoz <cmunoz@cavium.com>
Signed-off-by: Steven J. Hill <Steven.Hill@cavium.com>
---
 .../devicetree/bindings/net/cavium-bgx.txt         | 59 ++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/cavium-bgx.txt

diff --git a/Documentation/devicetree/bindings/net/cavium-bgx.txt b/Documentation/devicetree/bindings/net/cavium-bgx.txt
new file mode 100644
index 0000000..21c9606
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/cavium-bgx.txt
@@ -0,0 +1,59 @@
+* Common Ethernet Interface (BGX) block
+
+Properties:
+
+- compatible: "cavium,octeon-7890-bgx": Compatibility with all cn7xxx SOCs.
+
+- reg: The base address of the BGX block.
+
+- #address-cells: Must be <1>.
+
+- #size-cells: Must be <0>.  BGX addresses have no size component.
+
+Typically a BGX block has several children each representing a ethernet
+interface.
+
+Example:
+
+	ethernet-mac-nexus@11800e0000000 {
+		compatible = "cavium,octeon-7890-bgx";
+		reg = <0x00011800 0xe0000000 0x00000000 0x01000000>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		ethernet-mac@0 {
+			...
+			reg = <0>;
+		};
+	};
+
+
+* Ethernet Interface (BGX port) connects to PKI/PKO
+
+Properties:
+
+- compatible: "cavium,octeon-7890-bgx-port": Compatibility with all cn7xxx
+  SOCs.
+
+- reg: The index of the interface withing the BGX block.
+
+- local-mac-address: Mac address for the interface.
+
+- phy-handle: phandle to the phy node connected to the interface.
+
+
+* Ethernet Interface (BGX port) connects to XCV
+
+
+Properties:
+
+- compatible: "cavium,octeon-7360-xcv": Compatibility with cn73xx SOCs.
+
+- reg: The index of the interface withing the BGX block.
+
+- local-mac-address: Mac address for the interface.
+
+- phy-handle: phandle to the phy node connected to the interface.
+
+- cavium,rx-clk-delay-bypass: Set to <1> to bypass the rx clock delay setting.
+  Needed by the Micrel PHY.
-- 
2.1.4

^ permalink raw reply related

* [PATCH v12 00/10] netdev: octeon-ethernet: Add Cavium Octeon III support.
From: Steven J. Hill @ 2018-06-27 21:25 UTC (permalink / raw)
  To: netdev; +Cc: Chandrakala Chavva

Add the Cavium OCTEON III network driver. There are some corresponding
MIPS architecture support changes which will be upstreamed separately.

Changes in v12:

o Complete reorganization of driver files and defined all bitfields
  used in the driver.
o Implemented suggested changes from Andrew Lunn.
o Ran checkpatch and did whitespace cleanups.

Carlos Munoz (9):
  dt-bindings: Add Cavium Octeon Common Ethernet Interface.
  netdev: cavium: octeon: Header for Octeon III BGX Ethernet
  netdev: cavium: octeon: Add Octeon III BGX Ethernet Nexus
  netdev: cavium: octeon: Add Octeon III BGX Ports
  netdev: cavium: octeon: Add Octeon III PKI Support
  netdev: cavium: octeon: Add Octeon III PKO Support
  netdev: cavium: octeon: Add Octeon III SSO Support
  netdev: cavium: octeon: Add Octeon III BGX Ethernet core
  netdev: cavium: octeon: Add Octeon III BGX Ethernet building

David Daney (1):
  MAINTAINERS: Add entry for
    drivers/net/ethernet/cavium/octeon/octeon3-*

 .../devicetree/bindings/net/cavium-bgx.txt         |   59 +
 MAINTAINERS                                        |    6 +
 drivers/net/ethernet/cavium/Kconfig                |   22 +-
 drivers/net/ethernet/cavium/octeon/Makefile        |    8 +-
 .../net/ethernet/cavium/octeon/octeon3-bgx-nexus.c |  670 ++++++
 .../net/ethernet/cavium/octeon/octeon3-bgx-port.c  | 2192 ++++++++++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-bgx.h   |  191 ++
 drivers/net/ethernet/cavium/octeon/octeon3-core.c  | 2363 ++++++++++++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-pki.c   |  789 +++++++
 drivers/net/ethernet/cavium/octeon/octeon3-pki.h   |  113 +
 drivers/net/ethernet/cavium/octeon/octeon3-pko.c   | 1638 ++++++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-pko.h   |  159 ++
 drivers/net/ethernet/cavium/octeon/octeon3-sso.c   |  221 ++
 drivers/net/ethernet/cavium/octeon/octeon3-sso.h   |   89 +
 drivers/net/ethernet/cavium/octeon/octeon3.h       |  330 +++
 15 files changed, 8848 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/cavium-bgx.txt
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx-port.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx.h
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-core.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-pki.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-pki.h
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-pko.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-pko.h
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-sso.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-sso.h
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3.h

-- 
2.1.4

^ permalink raw reply

* Re: [RFC v2, net-next, PATCH 0/4] Add switchdev on TI-CPSW
From: Grygorii Strashko @ 2018-06-27 21:05 UTC (permalink / raw)
  To: Ilias Apalodimas, Andrew Lunn
  Cc: netdev, ivan.khoronzhuk, nsekhar, jiri, ivecera, f.fainelli,
	francois.ozog, yogeshs, spatton, Jose.Abreu
In-Reply-To: <20180618174922.GA28394@apalos>



On 06/18/2018 12:49 PM, Ilias Apalodimas wrote:
> On Mon, Jun 18, 2018 at 07:30:25PM +0200, Andrew Lunn wrote:
>> On Mon, Jun 18, 2018 at 07:46:02PM +0300, Ilias Apalodimas wrote:
>>> On Mon, Jun 18, 2018 at 06:28:36PM +0200, Andrew Lunn wrote:
>>>>> Yes, if the CPU port is added on the VLAN then unregistered multicast traffic
>>>>> (and thus IGMP joins) will reach the CPU port and everything will work as
>>>>> expected. I think we should not consider this as a "problem" as long as it's
>>>>> descibed properly in Documentation. This switch is excected to support this.
>>>>
>>>> Back to the two e1000e. What would you expect to happen with them?
>>>> Either IGMP snooping needs to work, or your don't do snooping at
>>>> all.
>>> That's a different use case
>>
>> I disagree. That is the exact same use case. I add ports to a bridge
>> and i expect the bridge to either do IGMP snooping, or just forward
>> all multicast. That is the users expectations. That is how the Linux
>> network stack works. If the hardware has limitations you want to try
>> to hide them from the user.
> Why is this a limitation? Isn't it proper functionality?
> If you configure the CPU port as "passthrough" (which again is
> the default) then everything works just like e1000e. The user doesn't have to do
> anything at all, MDBs are added/deleted based on proper timers/joins etc.
> If the user chooses to use the cpu port as a kind of internal L-2 filter,
> that's up to him, but having hardware do the filtering for you isn't something
> i'd call a limitation.
> 
> I am not sure what's the case here though. The hardware operates as you want
> by defaulti. As added functionality the user can, if he chooses to, add the
> MDBs manually instead of having some piece of code do that for him.
> This was clearly described in the first RFC since it was the only reason to add
> a CPU port. Is there a problem with the user controlling these capabilities of
> the hardware?
>>>> So by default, it just needs to work. You can give the user the option
>>>> to shoot themselves in the foot, but they need to actively pull the
>>>> trigger to blow their own foot off.
>>
>>> Yes it does by default. I don't consider it "foot shooting" though.
>>> If we stop thinking about switches connected to user environments
>>
>> I never think about switches. I think about a block of acceleration
>> hardware, which i try to offload Linux networking to.  And if the
>> hardware cannot accelerate Linux network functions properly, i don't
>> try to offload it. That way it always operates in the same way, and
>> the user expectations are clear.

Yeh. Sry, but the user expectations depends on application area. 
For the industrial & automotive application It's not only about acceleration only - it's
about filtering (hm, hw filtering is acceleration also :), which is the strict requirement.
Yes, CPSW will help Linux networking stuck to accelerate packet forwarding,
but it's expected to do this only for *allowed* traffic.
One of the main points is to prevent DoS attacks when Linux Host will not be able to
perform required operations and will stuck processing some network (mcast) traffic. 
Example, remote control/monitoring stations where one Port connected
to the private sensor/control network and another is wan/trunk port. Plus, Linux Host is running 
industrial application which monitor/control some hw by itself - activity on WAN port should
have minimal effect on ability of Linux Host industrial application to perform it target function
and prevent packets flooding to the private network.
As result, after boot, such systems are expected to work following the rule:
 "everything is prohibited which is not explicitly allowed".
And that's exactly what CPSW allows to achieve by filling manually/statically ALE VLAN/FDB/MDB tables and
this way prevent IGMP/.. Flood Attacks.

Hope, above will help understand our use-cases and why we "annoyingly" asking about
possibility to do manual/static configuration and not rely on IGMP or other great, but not 
applicable for all use cases, networking technologies.

> The acceleration block is working properly here. The user has the ability to
> instruct the acceleration block not to accelerate all the traffic but specific
> cases he chooses to. Isn't that a valid use case since the hardware supports
> that ?

-- 
regards,
-grygorii

^ permalink raw reply

* Re: [PATCH bpf 4/4] xsk: fix potential race in SKB TX completion code
From: Daniel Borkmann @ 2018-06-27 21:00 UTC (permalink / raw)
  To: Eric Dumazet, Magnus Karlsson, bjorn.topel, ast, netdev; +Cc: qi.z.zhang, pavel
In-Reply-To: <54fa1b60-ebae-94a2-8036-06df6c05307a@gmail.com>

On 06/27/2018 05:55 PM, Eric Dumazet wrote:
> On 06/27/2018 07:02 AM, Magnus Karlsson wrote:
>> There was a potential race in the TX completion code for
>> the SKB case when the TX napi thread and the error path
>> of the sendmsg code could both call the SKB destructor
>> at the same time. Fixed by introducing a spin_lock in the
>> destructor.
> 
> Wow, what is the impact on performance ?
> 
> Please describe a bit more what is the problem.

Agree, this should at min be better clarified in the commit log.

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH net-next 2/3] rds: Enable RDS IPv6 support
From: Santosh Shilimkar @ 2018-06-27 20:45 UTC (permalink / raw)
  To: Ka-Cheong Poon, Sowmini Varadhan; +Cc: netdev, davem, rds-devel
In-Reply-To: <e83dbf4a-b813-4edc-9245-d738faaedb4b@oracle.com>

On 6/27/2018 3:07 AM, Ka-Cheong Poon wrote:
> On 06/26/2018 09:08 PM, Sowmini Varadhan wrote:
>> On (06/26/18 21:02), Ka-Cheong Poon wrote:

[...]

> 
> 
> I don't expect RDS apps will want to use link local address
> in the first place.  In fact, most normal network apps don't.
>
This is not true.

> 
>> You're not doing this for IPv4 and RDS today (you dont have to do this
>> for UDP, afaik)
> 
> 
> Do you know of any IPv4 RDS app which uses IPv4 link local
> address?  In fact, IPv4 link local address is explicitly
> disallowed for active active bonding.
>
Yes. Cluster-ware HAIP makes use of link local addresses. That
check was mainly because of RDMA CM issues but that only means
active-active isn't used. The bonding works just fine and if
needed cluster-ware can also use TCP transport.

Lets not add this new behavior for link local and its
actually not relevant to really v6 addressing support.

Regards,
Santosh

^ permalink raw reply

* [RFC PATCH] net, mm: account sock objects to kmemcg
From: Shakeel Butt @ 2018-06-27 20:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Johannes Weiner, Vladimir Davydov, Greg Thelen, Roman Gushchin,
	David S . Miller, Eric Dumazet, Kirill Tkhai, linux-kernel,
	netdev, linux-mm, Shakeel Butt

Currently the kernel accounts the memory for network traffic through
mem_cgroup_[un]charge_skmem() interface. However the memory accounted
only includes the truesize of sk_buff which does not include the size of
sock objects. In our production environment, with opt-out kmem
accounting, the sock kmem caches (TCP[v6], UDP[v6], RAW[v6], UNIX) are
among the top most charged kmem caches and consume a significant amount
of memory which can not be left as system overhead. So, this patch
converts the kmem caches of more important sock objects to SLAB_ACCOUNT.

Signed-off-by: Shakeel Butt <shakeelb@google.com>
---
 net/ipv4/raw.c      | 1 +
 net/ipv4/tcp_ipv4.c | 2 +-
 net/ipv4/udp.c      | 1 +
 net/ipv6/raw.c      | 1 +
 net/ipv6/tcp_ipv6.c | 2 +-
 net/ipv6/udp.c      | 1 +
 net/unix/af_unix.c  | 1 +
 7 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index abb3c9490c55..2c4b04c6461a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -988,6 +988,7 @@ struct proto raw_prot = {
 	.hash		   = raw_hash_sk,
 	.unhash		   = raw_unhash_sk,
 	.obj_size	   = sizeof(struct raw_sock),
+	.slab_flags	   = SLAB_ACCOUNT,
 	.useroffset	   = offsetof(struct raw_sock, filter),
 	.usersize	   = sizeof_field(struct raw_sock, filter),
 	.h.raw_hash	   = &raw_v4_hashinfo,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fed3f1c66167..9ae31979aefa 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2459,7 +2459,7 @@ struct proto tcp_prot = {
 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
-	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
+	.slab_flags		= SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT,
 	.twsk_prot		= &tcp_timewait_sock_ops,
 	.rsk_prot		= &tcp_request_sock_ops,
 	.h.hashinfo		= &tcp_hashinfo,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9bb27df4dac5..26e07b8a83cc 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2657,6 +2657,7 @@ struct proto udp_prot = {
 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_udp_wmem_min),
 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_udp_rmem_min),
 	.obj_size		= sizeof(struct udp_sock),
+	.slab_flags		= SLAB_ACCOUNT,
 	.h.udp_table		= &udp_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt	= compat_udp_setsockopt,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ce6f0d15b5dd..044ed44e7c16 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1272,6 +1272,7 @@ struct proto rawv6_prot = {
 	.hash		   = raw_hash_sk,
 	.unhash		   = raw_unhash_sk,
 	.obj_size	   = sizeof(struct raw6_sock),
+	.slab_flags	   = SLAB_ACCOUNT,
 	.useroffset	   = offsetof(struct raw6_sock, filter),
 	.usersize	   = sizeof_field(struct raw6_sock, filter),
 	.h.raw_hash	   = &raw_v6_hashinfo,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b620d9b72e59..7187609ca25f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1973,7 +1973,7 @@ struct proto tcpv6_prot = {
 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp6_sock),
-	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
+	.slab_flags		= SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT,
 	.twsk_prot		= &tcp6_timewait_sock_ops,
 	.rsk_prot		= &tcp6_request_sock_ops,
 	.h.hashinfo		= &tcp_hashinfo,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e6645cae403e..47c9a3c74981 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1582,6 +1582,7 @@ struct proto udpv6_prot = {
 	.sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
 	.sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
 	.obj_size		= sizeof(struct udp6_sock),
+	.slab_flags		= SLAB_ACCOUNT,
 	.h.udp_table		= &udp_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt	= compat_udpv6_setsockopt,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 95b02a71fd47..5e3e377a7269 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -742,6 +742,7 @@ static struct proto unix_proto = {
 	.name			= "UNIX",
 	.owner			= THIS_MODULE,
 	.obj_size		= sizeof(struct unix_sock),
+	.slab_flags		= SLAB_ACCOUNT,
 };
 
 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
-- 
2.18.0.rc2.346.g013aa6912e-goog

^ permalink raw reply related

* Re: [RFC v2, net-next, PATCH 4/4] net/cpsw_switchdev: add switchdev mode of operation on cpsw driver
From: Arnd Bergmann @ 2018-06-27 20:40 UTC (permalink / raw)
  To: Grygorii Strashko
  Cc: Ilias Apalodimas, Ivan Vecera, Florian Fainelli, Andrew Lunn,
	Networking, ivan.khoronzhuk, Sekhar Nori,
	Jiří Pírko, Francois Ozog, yogeshs, spatton,
	Jose.Abreu
In-Reply-To: <8e908165-9412-57b8-bc30-5b337fdd4033@ti.com>

On Wed, Jun 27, 2018 at 9:18 PM, Grygorii Strashko
<grygorii.strashko@ti.com> wrote:
> On 06/22/2018 02:45 AM, Ilias Apalodimas wrote:
>> On Thu, Jun 21, 2018 at 05:31:31PM +0200, Arnd Bergmann wrote:
>>> On Thu, Jun 21, 2018 at 2:45 PM, Ilias Apalodimas
>>> <ilias.apalodimas@linaro.org> wrote:
>>>> On Thu, Jun 21, 2018 at 02:19:55PM +0200, Ivan Vecera wrote:
>>>
>>
>> If people like this idea, i can send a V3 with these changes.
>
> Nop. I do not think this is good idea, because "dual_mac" mode has very strict
> meaning and requirements. In "dual_mac" mode both port should be teated and work
> as *separate network devices" (like two, not connected PCI eth cards) - the fact that
> it's implemented on top of hw, which can do packet switching doesn't matter here and just a
> technical solution.
> Main requirements:
> 1) No packet forwarding is allowed inside hw under any circumstances, only Linux
>    Host SW can consume or forward packets
> 2) One interface should not block another inside CPSW hw which implies special FIFOs/HW
>  configuration

Could you explain the reasoning behind those requirements? I honestly don't
see what difference it makes, given that a new driver with switchdev support
would look exactly like the dual_emac mode as long as you don't add the
two interfaces into a bridge, and the user-visible behavior is already required
to be the same.

> As per, above switchdev functionality doesn't make too much sense in "dual_mac" mode and
> introducing dual meaning for this mode is not a good choice either.
>
> Again, as discussed, option 4 is considered as preferred.

Do you mean creating an incompatible binding that could be implemented by
the same driver, or duplicating the driver with one copy of the old binding
and one copy for the new binding?

      Arnd

^ permalink raw reply

* Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.
From: Flavio Leitner @ 2018-06-27 20:19 UTC (permalink / raw)
  To: Cong Wang
  Cc: Eric Dumazet, Linux Kernel Network Developers, Paolo Abeni,
	David Miller, Florian Westphal, NetFilter
In-Reply-To: <CAM_iQpWOxCk0GYPEAwrdrPg91mhyHR==pzhRGF3fS9+pF493xA@mail.gmail.com>

On Wed, Jun 27, 2018 at 12:06:16PM -0700, Cong Wang wrote:
> On Wed, Jun 27, 2018 at 5:32 AM Flavio Leitner <fbl@redhat.com> wrote:
> >
> > On Tue, Jun 26, 2018 at 06:28:27PM -0700, Cong Wang wrote:
> > > On Tue, Jun 26, 2018 at 5:39 PM Flavio Leitner <fbl@redhat.com> wrote:
> > > >
> > > > On Tue, Jun 26, 2018 at 05:29:51PM -0700, Cong Wang wrote:
> > > > > On Tue, Jun 26, 2018 at 4:33 PM Flavio Leitner <fbl@redhat.com> wrote:
> > > > > >
> > > > > > It is still isolated, the sk carries the netns info and it is
> > > > > > orphaned when it re-enters the stack.
> > > > >
> > > > > Then what difference does your patch make?
> > > >
> > > > Don't forget it is fixing two issues.
> > >
> > > Sure. I am only talking about TSQ from the very beginning.
> > > Let me rephrase my above question:
> > > What difference does your patch make to TSQ?
> >
> > It avoids burstiness.
> 
> Never even mentioned in changelog or in your patch. :-/

It's part of queueing and helping the bufferbloat problem in the
commit message.

> > > > > Before your patch:
> > > > > veth orphans skb in its xmit
> > > > >
> > > > > After your patch:
> > > > > RX orphans it when re-entering stack (as you claimed, I don't know)
> > > >
> > > > ip_rcv, and equivalents.
> > >
> > > ip_rcv() is L3, we enter a stack from L1. So your above claim is incorrect. :)
> >
> > Maybe you found a problem, could you please point me to where in
> > between L1 to L3 the socket is relevant?
> 
> Of course, ingress qdisc is in L2. Do I need to say more? This
> is where we can re-route the packets, for example, redirecting it to
> yet another netns. This is in fact what we use in production, not anything
> that only in my imagination.
> 
> You really have to think about why you allow a different netns influence
> another netns by holding the skb to throttle the source TCP socket.

Maybe I wasn't clear and you didn't understand the question. Please find
a spot where the preserved socket is used incorrectly.

> > > which means you have to update Documentation/networking/ip-sysctl.txt
> > > too.
> >
> > How it is never targeted? Whole point is to avoid queueing traffic.
> 
> What queues? You really need to define it, seriously.
> 
> 
> > Would you be okay if I include this chunk?
> 
> No, still lack of an explanation why it comes across netns for
> a good reason.

Because it doesn't. Since you talk more about veth, let's pick it
as an example. The TX is nothing more than add to the CPU backlog,
right? That is netns agnostic. The same for processing that queue
which will push the skb anyways and will call skb_orphan().

How can one netns avoid/delay the skb_orphan()? And even if does
that, what gain will you have to allow queuing of more and more
packets in the CPU backlog? It is stalled.

> > diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> > index ce8fbf5aa63c..f4c042be0216 100644
> > --- a/Documentation/networking/ip-sysctl.txt
> > +++ b/Documentation/networking/ip-sysctl.txt
> > @@ -733,11 +733,11 @@ tcp_limit_output_bytes - INTEGER
> >         Controls TCP Small Queue limit per tcp socket.
> >         TCP bulk sender tends to increase packets in flight until it
> >         gets losses notifications. With SNDBUF autotuning, this can
> > -       result in a large amount of packets queued in qdisc/device
> > -       on the local machine, hurting latency of other flows, for
> > -       typical pfifo_fast qdiscs.
> > -       tcp_limit_output_bytes limits the number of bytes on qdisc
> > -       or device to reduce artificial RTT/cwnd and reduce bufferbloat.
> > +       result in a large amount of packets queued on the local machine
> > +       (e.g.: qdiscs, CPU backlog, or device) hurting latency of other
> 
> 
> Apparently CPU backlog never happens when leaving the host.

-- 
Flavio

^ permalink raw reply

* Re: Fwd: [PATCH 0/6] offload Linux LAG devices to the TC datapath
From: Or Gerlitz @ 2018-06-27 20:13 UTC (permalink / raw)
  To: John Hurley
  Cc: Or Gerlitz, Jakub Kicinski, Jiri Pirko, Linux Netdev List,
	Simon Horman, Andy Gospodarek
In-Reply-To: <CAK+XE==PXr2pcss8ZGz2-Urp694ym0EpCNo368dg2CJKQ34_2g@mail.gmail.com>

On Tue, Jun 26, 2018 at 9:16 PM, John Hurley <john.hurley@netronome.com> wrote:
> On Tue, Jun 26, 2018 at 3:57 PM, Or Gerlitz <ogerlitz@mellanox.com> wrote:
>>> -------- Forwarded Message --------
>>> Subject: [PATCH 0/6] offload Linux LAG devices to the TC datapath
>>> Date: Thu, 21 Jun 2018 14:35:55 +0100
>>> From: John Hurley <john.hurley@netronome.com>
>>> To: dev@openvswitch.org, roid@mellanox.com, gavi@mellanox.com, paulb@mellanox.com, fbl@sysclose.org, simon.horman@netronome.com
>>> CC: John Hurley <john.hurley@netronome.com>
>>>
>>> This patchset extends OvS TC and the linux-netdev implementation to
>>> support the offloading of Linux Link Aggregation devices (LAG) and their
>>> slaves. TC blocks are used to provide this offload. Blocks, in TC, group
>>> together a series of qdiscs. If a filter is added to one of these qdiscs
>>> then it applied to all. Similarly, if a packet is matched on one of the
>>> grouped qdiscs then the stats for the entire block are increased. The
>>> basis of the LAG offload is that the LAG master (attached to the OvS
>>> bridge) and slaves that may exist outside of OvS are all added to the same
>>> TC block. OvS can then control the filters and collect the stats on the
>>> slaves via its interaction with the LAG master.
>>>
>>> The TC API is extended within OvS to allow the addition of a block id to
>>> ingress qdisc adds. Block ids are then assigned to each LAG master that is
>>> attached to the OvS bridge. The linux netdev netlink socket is used to
>>> monitor slave devices. If a LAG slave is found whose master is on the bridge
>>> then it is added to the same block as its master. If the underlying slaves
>>> belong to an offloadable device then the Linux LAG device can be offloaded
>>> to hardware.
>>
>> Guys (J/J/J),
>>
>> Doing this here b/c
>>
>> a. this has impact on the kernel side of things
>>
>> b. I am more of a netdev and not openvswitch citizen..
>>
>> some comments,
>>
>> 1. this + Jakub's patch for the reply are really a great design
>>
>> 2. re the egress side of things. Some NIC HWs can't just use LAG
>> as the egress port destination of an ACL (tc rule) and the HW rule
>> needs to be duplicated to both HW ports. So... in that case, you
>> see the HW driver doing the duplication (:() or we can somehow
>> make it happen from user-space?
>>
>
> Hi Or,
> I'm not sure how rule duplication would work for rules that egress to
> a LAG device.
> Perhaps this could be done for an active/backup mode where user-space
> adds a rule to 1 port and deletes from another as appropriate.
> For load balancing modes where the egress port is selected based on a
> hash of packet fields, it would be a lot more complicated.
> OvS can do this with its own bonds as far as I'm aware but (if
> recirculation is turned off) it basically creates exact match datapath
> entries for each packet flow.
> Perhaps I do not fully understand your question?

Hi John,

Some NICs don't support egress lag hashing, still they can provide HW
high-availability(HA) and load-balancing (LB)-- specifically here we
are referring
to get a VF netdev HA and LB without any action on their side, once we bond
the uplink reps, apply your patch on ovs and some more..

So the use-case I am targeting (1) does it with kernel bond/team
(2) uses the LAG/802.3ad mode of bonding/teaming

and needs to duplicate rules where the egress is the bond.

>
>> 3. for the case of overlay networks, e.g OVS based vxlan tunnel, the
>> ingress (decap) rule is set on the vxlan device. Jakub, you mentioned
>> a possible kernel patch to the HW (nfp, mlx5) drivers to have them bind
>> to the tunnel device for ingress rules. If we have agreed way to identify
>> uplink representors, can we do that from ovs too? does it matter if we are
>> bonding + encapsulating or just encapsulating? note that under encap scheme
>> the bond is typically not part of the OVS bridge.
>>
>
> If we have a way to bind the HW drivers to tunnel devs for ingress
> rules then this should work fine with OvS (possibly requiring a small
> patch - Id need to check).
>
> In terms of bonding + encap this probably needs to be handled in the
> hw itself for the same reason I mentioned in point 2.

so we have two cases where the stack/ovs can't do and the hw driver
needs to act, lets try to improve and reduce that..

^ permalink raw reply

* Re: [PATCH 0/6] offload Linux LAG devices to the TC datapath
From: Or Gerlitz @ 2018-06-27 20:07 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Or Gerlitz, John Hurley, Jiri Pirko, Linux Netdev List,
	ASAP_Direct_Dev, Simon Horman, Andy Gospodarek
In-Reply-To: <20180626153140.6a06eb97@cakuba.netronome.com>

On Wed, Jun 27, 2018 at 1:31 AM, Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
> On Tue, 26 Jun 2018 17:57:08 +0300, Or Gerlitz wrote:

>> 2. re the egress side of things. Some NIC HWs can't just use LAG
>> as the egress port destination of an ACL (tc rule) and the HW rule
>> needs to be duplicated to both HW ports. So... in that case, you
>> see the HW driver doing the duplication (:() or we can somehow
>> make it happen from user-space?

> It's the TC core that does the duplication.  Drivers which don't need
> the duplication (e.g. mlxsw) will not register a new callback for each
> port on which shared block is bound.  They will keep one list of rules,
> and a list of ports that those rules apply to.

[snip]

> Drivers which need duplication (multiplication) (all NICs?) have to
> register a new callback for each port bound to a shared block.  And TC
> will call those drivers as many times as they have callbacks registered
> == as many times as they have ports bound to the block.  Each time
> callback is invoked the driver will figure out the ingress port based
> on the cb_priv and use <ingress, cookie> as the key in its rule table
> (or have a separate rule table per ingress port).

[snip snip]

> I may be wrong, but I think you split the rules tables per port for mlx5

correct,  currently I have a rule table per physical port.

> So again you just register a callback every time shared block is bound,
> and then TC core will send add/remove rule commands down to the driver,
> relaying existing rules as well if needed.

Let's see, the NIC uplink rep port devices were bounded (say) by ovs to
a shared-block because they are the lower devices (hate the slavish jargon)
of a bond device.

Next, the TC stack will invoke the callback over these ports, when ingress
rule is added on the bond.

But we are talking on ingress rule set on a non-uplink rep (VF rep) port,
where bonding is the egress of the rule. I guess the callback which you probably
refer to (you hinted there below) is the egdev one, correct? you are suggesting
that bonding will do egdev registration... I am a bit confused.

> Does that clarify things or were you asking more about the active
> passive thing John mentioned or some way to save rule space?

no (didn't refer to active-passive) and no (didn't look to save rule space)
yes for active-active in a HW that needs duplicated rules (NICs).

>> 3. for the case of overlay networks, e.g OVS based vxlan tunnel, the
>> ingress (decap) rule is set on the vxlan device. Jakub, you mentioned
>> a possible kernel patch to the HW (nfp, mlx5) drivers to have them bind
>> to the tunnel device for ingress rules. If we have agreed way to identify
>> uplink representors, can we do that from ovs too?
>
> I'm not sure, there can be multiple tunnel devices.  Plus we really
> want to know the tunnel type, e.g. vxlan vs geneve, so simple shared
> block propagation will probably not cut it.  If that's what you're
> referring to.

isn't knowing the tunnel type already missing today? I saw you
started patches the tunnel key set action for Geneve, does upstream
+ the patches you sent works or more is missing to get geneve encap
through the TC stack?

>> does it matter if we are bonding + encapsulating or just
>> encapsulating? note that under encap scheme the bond is typically not
>> part of the OVS bridge.

> I don't think that matters in general, driver doing bonding offload
> should just start recognizing the bond master as "their port" and
> register an egdev callback for redirects to master today (or equivalent
> in the new scheme once that materializes...)

^ permalink raw reply

* Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.
From: Cong Wang @ 2018-06-27 19:55 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Flavio Leitner, Linux Kernel Network Developers, Paolo Abeni,
	David Miller, Florian Westphal, NetFilter
In-Reply-To: <096ada36-8e05-c330-e5b3-3f6fcc77aea2@gmail.com>

On Wed, Jun 27, 2018 at 12:33 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 06/27/2018 11:59 AM, Cong Wang wrote:
>
> >
> > IIRC, this skb_orphan() was introduced much earlier than TSQ, probably
> > from the beginning of veth.
>
> Sigh
>
> SO_SNDBUF was invented years ago before veth.

Yeah, probably when there was only one stack on one host.
SO_SNDBUF is aligned to networking stack basis.

>
> You focus on TSQ while it is only one of the many things that are broken.
>

I think it is the opposite: this patchset _potentially_ breaks things, not fixes
anything.


> >
> > Leaving the stack should be effectively equivalent to leaving the host,
> > from the view of network isolation.
> >
>
>
> Having a UDP socket being able to burn a cpu and fill a qdisc is a major bug.
>


Very true, network isolation never isolates CPU or memory.
It is cpuset's job to provide physical CPU isolation, not networking
namespace. I don't want to defend this, but it is the current design.


> Bu default (blocking send() syscalls) the following loop should
> block the thread if socket sk_wmem_alloc hits sk_sndbuf, this is
> the beauty of backpressure.
>
> while (1)
>     send(fd, ...);
>
> With skb_orphan(), sk_wmem_alloc will stay around 0, so the loop will burn a cpu
> and fill a qdisc, eventually breaking "network isolation", since other sockets
> might be unable to send a single packet.

Won't the same happen when congestion on a physical connection
between two hosts? Does 'host isolation' break too?

>
> If you have a concrete case where the skb_orphan() is needed, then you will have
> to add a parameter to let the admin opt-in for this.

Please see the other reply from me, where I list 3 or 4 reasons.

^ permalink raw reply

* Re: [PATCH v2 net-next 0/2] net: preserve sock reference when scrubbing the skb.
From: Cong Wang @ 2018-06-27 19:39 UTC (permalink / raw)
  To: Flavio Leitner
  Cc: Linux Kernel Network Developers, Eric Dumazet, Paolo Abeni,
	David Miller, Florian Westphal, NetFilter
In-Reply-To: <20180627133426.3858-1-fbl@redhat.com>

Let me rephrase why I don't like this patchset:

1. Let's forget about TSQ for a moment, skb_orphan() before leaving
the stack is not just reasonable but also aligning to network isolation
design. You can't claim skb_orphan() is broken from beginning, it is
designed in this way and it is intentional.

2. Now, let's consider the current TSQ behavior (without any patch):

2a. For packets leaving the host or just leaving the stack to another
netns, there is no difference, and this should be expected from user's
point of view, because I don't need to think about its destination to
decide how I should configure tcp_limit_output_bytes.

2b. The hidden pipeline behind TSQ is well defined, that is, any
queues in between L4 and L2, most importantly qdisc. I can easily
predict the number of queues my packets will go through with a
given configuration. This also aligns with 2a.

2c. Isolation is respected as it should. TCP sockets in this netns
won't be influenced by any factor in another netns.

Now with your patchset:

2a. There is an apparent difference for packets leaving the host
and for packets just leaving this stack.

2b. You extend the pipeline to another netns's L3, which means
the number of queues is now unpredictable.

2c. Isolation is now slightly broken, the other netns could influence
the source netns.

I don't see you have any good argument on any of these 3 points.

^ permalink raw reply

* Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.
From: Eric Dumazet @ 2018-06-27 19:33 UTC (permalink / raw)
  To: Cong Wang, Eric Dumazet
  Cc: Flavio Leitner, Linux Kernel Network Developers, Paolo Abeni,
	David Miller, Florian Westphal, NetFilter
In-Reply-To: <CAM_iQpUnyVHPb96TTNeNo0Ydu9D9s4HqVMwr-YX822euSPXngw@mail.gmail.com>



On 06/27/2018 11:59 AM, Cong Wang wrote:

> 
> IIRC, this skb_orphan() was introduced much earlier than TSQ, probably
> from the beginning of veth.

Sigh

SO_SNDBUF was invented years ago before veth.

You focus on TSQ while it is only one of the many things that are broken.

> 
> Leaving the stack should be effectively equivalent to leaving the host,
> from the view of network isolation.
> 


Having a UDP socket being able to burn a cpu and fill a qdisc is a major bug.

Bu default (blocking send() syscalls) the following loop should
block the thread if socket sk_wmem_alloc hits sk_sndbuf, this is
the beauty of backpressure.

while (1)
    send(fd, ...);  

With skb_orphan(), sk_wmem_alloc will stay around 0, so the loop will burn a cpu
and fill a qdisc, eventually breaking "network isolation", since other sockets
might be unable to send a single packet.

If you have a concrete case where the skb_orphan() is needed, then you will have
to add a parameter to let the admin opt-in for this.

^ permalink raw reply

* Re: [RFC v2, net-next, PATCH 4/4] net/cpsw_switchdev: add switchdev mode of operation on cpsw driver
From: Grygorii Strashko @ 2018-06-27 19:18 UTC (permalink / raw)
  To: Ilias Apalodimas, Arnd Bergmann
  Cc: Ivan Vecera, Florian Fainelli, Andrew Lunn, Networking,
	ivan.khoronzhuk, Sekhar Nori, Jiří Pírko,
	Francois Ozog, yogeshs, spatton, Jose.Abreu
In-Reply-To: <20180622074532.GA27414@apalos>



On 06/22/2018 02:45 AM, Ilias Apalodimas wrote:
> On Thu, Jun 21, 2018 at 05:31:31PM +0200, Arnd Bergmann wrote:
>> On Thu, Jun 21, 2018 at 2:45 PM, Ilias Apalodimas
>> <ilias.apalodimas@linaro.org> wrote:
>>> On Thu, Jun 21, 2018 at 02:19:55PM +0200, Ivan Vecera wrote:
>>
>>> The driver is currently widely used and that's the reason we tried to avoid
>>> rewriting it. The current driver uses a DTS option to distinguish between two
>>> existing modes. This patch just adds a third one. So to my understanding we
>>> have the following options:
>>> 1. The driver already uses DTS to configure the hardware mode. Although this is
>>> techincally wrong, we can add a third mode on DTS called 'switchdev;', get rid
>>> of the .config option and keep the configuration method common (although not
>>> optimal).
>>> 2. Keep the .config option which overrides the 2 existing modes.
>>> 3. Introduce a devlink option. If this is applied for all 3 modes, it will break
>>> backwards compatibility, so it's not an option. If it's introduced for
>>> configuring 'switchdev' mode only, we fall into the same pitfall as option 2),
>>> we have something that overrides our current config, slightly better though
>>> since it's not a compile time option.
>>> 4. rewrite the driver
>>
>> As I understand it, the switchdev support can also be added without
>> becoming incompatible with the existing behavior, this is how I would
>> suggest it gets added in a way that keeps the existing DT binding and
>> user view while adding switchdev:
>>
>> * In non-"dual-emac" mode, show only one network device that is
>>    configured as a transparent switch as today. Any users that today
>>    add TI's third-party ioctl interface with a non-upstreamable patch
>>    can keep using this mode and try to forward-port that patch.
> Correct
>> * In "dual-emac" mode (as selected through DT), the hardware is
>>     configured to be viewed as two separate network devices as before,
>>     regardless of kernel configuration. Users can add the two device
>>     to a bridge device as before, and enabling switchdev support in
>>     the kernel configuration (based on your patch series) would change
>>     nothing else than using hardware support in the background to
>>     reconfigure the HW VLAN settings.
>>
>> This does not require using devlink, adding a third mode, or changing
>> the DT binding or the user-visible behavior when switchdev is enabled,
>> but should get all the features you want.
>>
> Correct again. This is doable and the changes on the current patchset are
> somewhat trivial (detecting a bridge and making the configuration changes
> on the fly).
>>> If it was a brand new driver, i'd definitely pick 4. Since it's a pre-existing
>>> driver though i can't rule out the rest of the options.
>>
>> I think the suggestion was to have a new driver with a new binding
>> so that the DT could choose between the two drivers, one with
>> somewhat obscure behavior and the other with proper behavior.
>>
>> However, from what I can tell, the only requirement to get a somewhat
>> reasonable behavior is that you enable "dual-emac" mode in DT
>> to get switchdev support. It would be trivial to add a new compatible
>> value that only allows that mode along with supporting switchdev,
>> but I don't think that's necessary here.
>>
>> Writing a new driver might also be a good idea (depending on the
>> quality of the existing one, I haven't looked in detail), but again
>> I would see no reason for the new driver to be incompatible with
>> the existing binding, so a gradual cleanup seems like a better
>> approach.
> Agree
>>
> 
> If people like this idea, i can send a V3 with these changes.

Nop. I do not think this is good idea, because "dual_mac" mode has very strict
meaning and requirements. In "dual_mac" mode both port should be teated and work
as *separate network devices" (like two, not connected PCI eth cards) - the fact that
it's implemented on top of hw, which can do packet switching doesn't matter here and just a
technical solution.
Main requirements:
1) No packet forwarding is allowed inside hw under any circumstances, only Linux
   Host SW can consume or forward packets
2) One interface should not block another inside CPSW hw which implies special FIFOs/HW
 configuration
As per, above switchdev functionality doesn't make too much sense in "dual_mac" mode and
introducing dual meaning for this mode is not a good choice either.

Again, as discussed, option 4 is considered as preferred.

-- 
regards,
-grygorii

^ permalink raw reply

* Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.
From: Cong Wang @ 2018-06-27 19:06 UTC (permalink / raw)
  To: Flavio Leitner
  Cc: Eric Dumazet, Linux Kernel Network Developers, Paolo Abeni,
	David Miller, Florian Westphal, NetFilter
In-Reply-To: <20180627123155.GW19565@plex.lan>

On Wed, Jun 27, 2018 at 5:32 AM Flavio Leitner <fbl@redhat.com> wrote:
>
> On Tue, Jun 26, 2018 at 06:28:27PM -0700, Cong Wang wrote:
> > On Tue, Jun 26, 2018 at 5:39 PM Flavio Leitner <fbl@redhat.com> wrote:
> > >
> > > On Tue, Jun 26, 2018 at 05:29:51PM -0700, Cong Wang wrote:
> > > > On Tue, Jun 26, 2018 at 4:33 PM Flavio Leitner <fbl@redhat.com> wrote:
> > > > >
> > > > > It is still isolated, the sk carries the netns info and it is
> > > > > orphaned when it re-enters the stack.
> > > >
> > > > Then what difference does your patch make?
> > >
> > > Don't forget it is fixing two issues.
> >
> > Sure. I am only talking about TSQ from the very beginning.
> > Let me rephrase my above question:
> > What difference does your patch make to TSQ?
>
> It avoids burstiness.

Never even mentioned in changelog or in your patch. :-/


>
> > > > Before your patch:
> > > > veth orphans skb in its xmit
> > > >
> > > > After your patch:
> > > > RX orphans it when re-entering stack (as you claimed, I don't know)
> > >
> > > ip_rcv, and equivalents.
> >
> > ip_rcv() is L3, we enter a stack from L1. So your above claim is incorrect. :)
>
> Maybe you found a problem, could you please point me to where in
> between L1 to L3 the socket is relevant?
>

Of course, ingress qdisc is in L2. Do I need to say more? This
is where we can re-route the packets, for example, redirecting it to
yet another netns. This is in fact what we use in production, not anything
that only in my imagination.

You really have to think about why you allow a different netns influence
another netns by holding the skb to throttle the source TCP socket.


>
> > > > And for veth pair:
> > > > xmit from one side is RX for the other side
> > > > So, where is the queueing? Where is the buffer bloat? GRO list??
> > >
> > > CPU backlog.
> >
> > Yeah, but this is never targeted by TSQ:
> >
> >         tcp_limit_output_bytes limits the number of bytes on qdisc
> >         or device to reduce artificial RTT/cwnd and reduce bufferbloat.
> >
> > which means you have to update Documentation/networking/ip-sysctl.txt
> > too.
>
> How it is never targeted? Whole point is to avoid queueing traffic.

What queues? You really need to define it, seriously.


> Would you be okay if I include this chunk?

No, still lack of an explanation why it comes across netns for
a good reason.

>
> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> index ce8fbf5aa63c..f4c042be0216 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -733,11 +733,11 @@ tcp_limit_output_bytes - INTEGER
>         Controls TCP Small Queue limit per tcp socket.
>         TCP bulk sender tends to increase packets in flight until it
>         gets losses notifications. With SNDBUF autotuning, this can
> -       result in a large amount of packets queued in qdisc/device
> -       on the local machine, hurting latency of other flows, for
> -       typical pfifo_fast qdiscs.
> -       tcp_limit_output_bytes limits the number of bytes on qdisc
> -       or device to reduce artificial RTT/cwnd and reduce bufferbloat.
> +       result in a large amount of packets queued on the local machine
> +       (e.g.: qdiscs, CPU backlog, or device) hurting latency of other


Apparently CPU backlog never happens when leaving the host.

^ permalink raw reply

* Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.
From: Cong Wang @ 2018-06-27 18:59 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Flavio Leitner, Linux Kernel Network Developers, Paolo Abeni,
	David Miller, Florian Westphal, NetFilter
In-Reply-To: <d1818974-5445-4dc2-1c3c-2b67d2826d6f@gmail.com>

On Tue, Jun 26, 2018 at 7:35 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 06/26/2018 05:44 PM, Cong Wang wrote:
>
> > With this, a netns could totally throttle a TCP socket in a different
> > netns by holding the packets infinitely (e.g. putting them in a loop).
> > This is where the isolation breaks.
> >
>
> That is fine, really. Admin error -> Working as intended.

The point is never it is an error or not, the point is one netns could
influence another one with this change.

>
> The current scrubbing is simply wrong, not documented, and added by someone
> who had absolutely not intended all the side effects.
>

IIRC, this skb_orphan() was introduced much earlier than TSQ, probably
from the beginning of veth.

Leaving the stack should be effectively equivalent to leaving the host,
from the view of network isolation.

^ permalink raw reply

* Re: [PATCH net-next 1/1] tc-testing: initial version of tunnel_key unit tests
From: Lucas Bates @ 2018-06-27 18:50 UTC (permalink / raw)
  To: Davide Caratti
  Cc: Keara Leibovitz, David Miller, Linux Kernel Network Developers,
	Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <b5a9955f3f64ad5c017e0a995ae2d1580f08092e.camel@redhat.com>

On Tue, Jun 26, 2018 at 10:51 AM, Davide Caratti <dcaratti@redhat.com> wrote:
> On Tue, 2018-06-26 at 09:17 -0400, Keara Leibovitz wrote:
>> Create unittests for the tc tunnel_key action.
>>
>>
>> Signed-off-by: Keara Leibovitz <kleib@mojatatu.com>
>> ---
>>  .../tc-testing/tc-tests/actions/tunnel_key.json    | 676 +++++++++++++++++++++
>>  1 file changed, 676 insertions(+)
>>  create mode 100644 tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json
>>
>> diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json
>> new file mode 100644
>> index 000000000000..bfe522ac8177
>
> hello Keara!
>
> I think the 'teardown' stage in some of these tests should be reviewed.
> Those that are meant to test invalid configurations (like dc6b) should
> allow non-zero exit codes in the teardown stage, if the wrong
> configuration is catched by the userspace TC tool, before talking to the
> kernel.
>
> Otherwise, those tests will fail when they are invoked one by one with the
> act_tunnel_key module unloaded.
>
Hi Davide, I thought I'd weigh in here.

In the short term, I think this is reasonable, but it's not a feasible
long-term solution.  Here's why:

Allowing non-zero exit codes on setup and teardown was a precaution
that needed to be implemented as flushing actions in a freshly-booted
kernel returned errors - certain actions would only allow you to flush
after that action had been added.

But, doing this on so many test cases means that we can lose control
of the test environment, especially since a lot of commands get copied
between test cases.  One test's command under test becomes the next
test case's setup command, etc.  This can cause false results and
potentially waste a lot of time for someone trying to track down a
bug... Or cause bugs to be missed.

So, how to fix: we've had some discussions about it already.  Jiri had
requested the addition of a config file (like the one at
tools/testing/selftests/net/forwarding/config, and maybe an addition
to the README for tdc for explanation.  People would then possibly be
restricted to running one test case file at a time based on what
options they had loaded...  This is still not ideal.

I think the best possible fix is to add a new plugin for tdc to
exclude tests based on the kernel config.  This would require the
addition of a new optional field to the test case format, where any
and all included modules required for the test to work would be
listed.  The plugin would look at this information, do its best to
determine if the currently running kernel supports it, and allows the
test to run or be skipped as a result.

Let me show an example of the new field:
>> --- /dev/null
>> +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json
>> @@ -0,0 +1,676 @@
>>
> ...
>
>> +    {
>> +        "id": "dc6b",
>> +        "name": "Add tunnel_key set action with missing mandatory src_ip parameter",
>> +        "category": [
>> +            "actions",
>> +            "tunnel_key"
>> +        ],
               "reqModules": [
                   "CONFIG_NET_ACT_TUNNEL_KEY"
               ],
>> +        "setup": [
>> +            [
>> +                "$TC actions flush action tunnel_key",
>> +                0,
>> +                1,
>> +                255
>> +            ]
>> +        ],
>> +        "cmdUnderTest": "$TC actions add action tunnel_key set dst_ip 20.20.20.2 id 100",
>> +        "expExitCode": "255",
>> +        "verifyCmd": "$TC actions list action tunnel_key",
>> +        "matchPattern": "action order [0-9]+: tunnel_key set.*dst_ip 20.20.20.2.*key_id 100",
>> +        "matchCount": "0",
>> +        "teardown": [
>> +            "$TC actions flush action tunnel_key"
>> +        ]
>> +    },

As we venture into more and more complicated tests, where different
modules would start getting mixed together, this might be the most
effective route.

This plugin will require some changes I've made to our local version
of tdc that I've been testing out - they change the way tdc handles
its test results, and also give it the ability to skip tests without
affecting the rest of the test run.

Until I'm able to submit everything, I'd be OK with having Keara add
the non-zero exit codes to the teardown on her tests.  In the meantime
we'll get the README updated and config file added as well.

How does this sound?


- Lucas

^ permalink raw reply

* Re: [PATCH rdma-next 08/12] overflow.h: Add arithmetic shift helper
From: Kees Cook @ 2018-06-27 18:44 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Rasmus Villemoes, Leon Romanovsky, Doug Ledford, Leon Romanovsky,
	RDMA mailing list, Hadar Hen Zion, Matan Barak, Michael J Ruhl,
	Noa Osherovich, Raed Salem, Yishai Hadas, Saeed Mahameed,
	linux-netdev, LKML
In-Reply-To: <20180627181012.GM20754@mellanox.com>

On Wed, Jun 27, 2018 at 11:10 AM, Jason Gunthorpe <jgg@mellanox.com> wrote:
> Leon? Seems like agreement, Can you work with this version?
>
> #include <stdint.h>
> #include <stdbool.h>
> #include <assert.h>
>
> #define u64 uint64_t
>
> /*
>  * Compute *d = (a << s)
>  *
>  * Returns true if '*d' cannot hold the result or 'a << s' doesn't make sense.
>  * - 'a << s' causes bits to be lost when stored in d
>  * - 's' is garbage (eg negative) or so large that a << s is guarenteed to be 0
>  * - 'a' is negative
>  * - 'a << s' sets the sign bit, if any, in '*d'
>  * *d is not defined if false is returned.
>  */
> #define check_shift_overflow(a, s, d)                                          \
>         ({                                                                     \
>                 typeof(a) _a = a;                                              \
>                 typeof(s) _s = s;                                              \
>                 typeof(d) _d = d;                                              \
>                 u64 _a_full = _a;                                              \
>                 unsigned int _to_shift =                                       \
>                         _s >= 0 && _s < 8 * sizeof(*d) ? _s : 0;               \
>                                                                                \
>                 *_d = (_a_full << _to_shift);                                  \
>                                                                                \
>                 (_to_shift != _s || *_d < 0 || _a < 0 ||                       \
>                  (*_d >> _to_shift) != a);                                     \
>         })
>
> int main(int argc, const char *argv[])
> {
>         int32_t s32;
>         uint32_t u32;
>
>         assert(check_shift_overflow(1, 0, &s32) == false && s32 == (1 << 0));
>         assert(check_shift_overflow(1, 1, &s32) == false && s32 == (1 << 1));
>         assert(check_shift_overflow(1, 30, &s32) == false && s32 == (1 << 30));
>         assert(check_shift_overflow(1, 31, &s32) == true);
>         assert(check_shift_overflow(1, 32, &s32) == true);
>         assert(check_shift_overflow(-1, 1, &s32) == true);
>         assert(check_shift_overflow(-1, 0, &s32) == true);
>
>         assert(check_shift_overflow(1, 0, &u32) == false && u32 == (1 << 0));
>         assert(check_shift_overflow(1, 1, &u32) == false && u32 == (1 << 1));
>         assert(check_shift_overflow(1, 30, &u32) == false && u32 == (1 << 30));
>         assert(check_shift_overflow(1, 31, &u32) == false && u32 == (1UL << 31));
>         assert(check_shift_overflow(1, 32, &u32) == true);
>         assert(check_shift_overflow(-1, 1, &u32) == true);
>         assert(check_shift_overflow(-1, 0, &u32) == true);
>
>         assert(check_shift_overflow(0xFFFFFFFF, 0, &u32) == false && u32 == (0xFFFFFFFFUL << 0));
>         assert(check_shift_overflow(0xFFFFFFFF, 1, &u32) == true);
>         assert(check_shift_overflow(0xFFFFFFFF, 0, &s32) == true);
>         assert(check_shift_overflow(0xFFFFFFFF, 1, &s32) == true);
> }

Oh yes, please include these tests in lib/test_overflow.c too! Nice. :)

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply

* Re: [patch net-next 0/9] net: sched: introduce chain templates support with offloading to mlxsw
From: Jakub Kicinski @ 2018-06-27 18:36 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Linux Netdev List, David Miller, Jamal Hadi Salim, Cong Wang,
	Simon Horman, John Hurley, David Ahern, mlxsw
In-Reply-To: <20180627075017.GA2007@nanopsycho>

On Wed, 27 Jun 2018 09:50:17 +0200, Jiri Pirko wrote:
> Tue, Jun 26, 2018 at 11:18:58PM CEST, jakub.kicinski@netronome.com wrote:
> >On Tue, 26 Jun 2018 09:12:17 +0200, Jiri Pirko wrote:  
> >> Tue, Jun 26, 2018 at 09:00:45AM CEST, jakub.kicinski@netronome.com wrote:  
> >> >On Mon, Jun 25, 2018 at 11:43 PM, Jiri Pirko <jiri@resnulli.us> wrote:    
> >> >> Tue, Jun 26, 2018 at 06:58:50AM CEST, jakub.kicinski@netronome.com wrote:    
> >> >>>On Mon, 25 Jun 2018 23:01:39 +0200, Jiri Pirko wrote:    
> >> >>>> From: Jiri Pirko <jiri@mellanox.com>
> >> >>>>
> >> >>>> For the TC clsact offload these days, some of HW drivers need
> >> >>>> to hold a magic ball. The reason is, with the first inserted rule inside
> >> >>>> HW they need to guess what fields will be used for the matching. If
> >> >>>> later on this guess proves to be wrong and user adds a filter with a
> >> >>>> different field to match, there's a problem. Mlxsw resolves it now with
> >> >>>> couple of patterns. Those try to cover as many match fields as possible.
> >> >>>> This aproach is far from optimal, both performance-wise and scale-wise.
> >> >>>> Also, there is a combination of filters that in certain order won't
> >> >>>> succeed.
> >> >>>>
> >> >>>> Most of the time, when user inserts filters in chain, he knows right away
> >> >>>> how the filters are going to look like - what type and option will they
> >> >>>> have. For example, he knows that he will only insert filters of type
> >> >>>> flower matching destination IP address. He can specify a template that
> >> >>>> would cover all the filters in the chain.    
> >> >>>
> >> >>>Perhaps it's lack of sleep, but this paragraph threw me a little off
> >> >>>the track.  IIUC the goal of this set is to provide a way to inform the
> >> >>>HW about expected matches before any rule is programmed into the HW.
> >> >>>Not before any rule is added to a particular chain.  One can just use
> >> >>>the first rule in the chain to make a guess about the chain, but thanks
> >> >>>to this set user can configure *all* chains before any rules are added.    
> >> >>
> >> >> The template is per-chain. User can use template for chain x and
> >> >> not-use it for chain y. Up to him.    
> >> >
> >> >Makes sense.
> >> >
> >> >I can't help but wonder if it'd be better to associate the
> >> >constraints/rules with chains instead of creating a new "template"
> >> >object.  It seems more natural to create a chain with specific
> >> >constraints in place than add and delete template of which there can
> >> >be at most one to a chain...  Perhaps that's more about the user space
> >> >tc command line.  Anyway, not a strong objection, just a thought.    
> >> 
> >> Hmm. I don't think it is good idea. User should see the template in a
> >> "show" command per chain. We would have to have 2 show commands, one to
> >> list the template objects and one to list templates per chains. It makes
> >> things more complicated for no good reason. I think that this simple
> >> chain-lock is easier and serves the purpose.  
> >
> >Hm, I think the dump is fine, what I was thinking about was:
> >
> ># tc chain add dev dummy0 ingress chain_index 22 \
> >     ^^^^^
> >	template proto ip \
> >	^^^^^^^^
> >	flower dst_mac 00:00:00:00:00:00/00:00:00:00:FF:FF  
> 
> Okay, I got it. I see 2 issues.
> 1) user might expect to add a chain without the template. But that does
>    not make sense really. Chains are created/deleted implicitly
>    according to refcount.
> 2) there is not chain object like this available to user. Adding it just
>    for template looks odd. Also, the "filter" and "template" are very
>    much alike. They both are added to a chain, they both implicitly
>    create chain if it does not exist, etc.

Yeah, that part makes is tricky :/

> if you don't like "tc filter template add dev dummy0 ingress", how
> about:
> "tc template add dev dummy0 ingress ..."
> "tc template add dev dummy0 ingress chain 22 ..."
> that makes more sense I think.

Mmm..  how about:

 tc chaintemplate add dev dummy0 ingress...

or

 tc restrictedchain add dev dummy0 ingress chain_index XX template ...

^ permalink raw reply

* [PATCH 1/2] net: phy: DP83TC811: Add INT_STAT3
From: Dan Murphy @ 2018-06-27 18:16 UTC (permalink / raw)
  To: andrew, f.fainelli; +Cc: netdev, Dan Murphy

Add INT_STAT3 interrupt setting and clearing.

Also fixed writing to INT_STAT2 when disabling
the interrupts as there was a double write to
INT_STAT1.

Signed-off-by: Dan Murphy <dmurphy@ti.com>
---
 drivers/net/phy/dp83tc811.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/dp83tc811.c b/drivers/net/phy/dp83tc811.c
index 081d99aa3985..f8653f5d8789 100644
--- a/drivers/net/phy/dp83tc811.c
+++ b/drivers/net/phy/dp83tc811.c
@@ -21,6 +21,7 @@
 #define MII_DP83811_SGMII_CTRL	0x09
 #define MII_DP83811_INT_STAT1	0x12
 #define MII_DP83811_INT_STAT2	0x13
+#define MII_DP83811_INT_STAT3	0x18
 #define MII_DP83811_RESET_CTRL	0x1f
 
 #define DP83811_HW_RESET	BIT(15)
@@ -44,6 +45,11 @@
 #define DP83811_OVERVOLTAGE_INT_EN	BIT(6)
 #define DP83811_UNDERVOLTAGE_INT_EN	BIT(7)
 
+/* INT_STAT3 bits */
+#define DP83811_LPS_INT_EN	BIT(0)
+#define DP83811_NO_FRAME_INT_EN	BIT(3)
+#define DP83811_POR_DONE_INT_EN	BIT(4)
+
 #define MII_DP83811_RXSOP1	0x04a5
 #define MII_DP83811_RXSOP2	0x04a6
 #define MII_DP83811_RXSOP3	0x04a7
@@ -81,6 +87,10 @@ static int dp83811_ack_interrupt(struct phy_device *phydev)
 	if (err < 0)
 		return err;
 
+	err = phy_read(phydev, MII_DP83811_INT_STAT3);
+	if (err < 0)
+		return err;
+
 	return 0;
 }
 
@@ -216,13 +226,29 @@ static int dp83811_config_intr(struct phy_device *phydev)
 				DP83811_UNDERVOLTAGE_INT_EN);
 
 		err = phy_write(phydev, MII_DP83811_INT_STAT2, misr_status);
+		if (err < 0)
+			return err;
+
+		misr_status = phy_read(phydev, MII_DP83811_INT_STAT3);
+		if (misr_status < 0)
+			return misr_status;
+
+		misr_status |= (DP83811_LPS_INT_EN |
+				DP83811_NO_FRAME_INT_EN |
+				DP83811_POR_DONE_INT_EN);
+
+		err = phy_write(phydev, MII_DP83811_INT_STAT3, misr_status);
 
 	} else {
 		err = phy_write(phydev, MII_DP83811_INT_STAT1, 0);
 		if (err < 0)
 			return err;
 
-		err = phy_write(phydev, MII_DP83811_INT_STAT1, 0);
+		err = phy_write(phydev, MII_DP83811_INT_STAT2, 0);
+		if (err < 0)
+			return err;
+
+		err = phy_write(phydev, MII_DP83811_INT_STAT3, 0);
 	}
 
 	return err;
-- 
2.17.0.582.gccdcbd54c

^ permalink raw reply related

* [PATCH 2/2] net: phy: DP83TC811: Fix SGMII enable/disable
From: Dan Murphy @ 2018-06-27 18:16 UTC (permalink / raw)
  To: andrew, f.fainelli; +Cc: netdev, Dan Murphy
In-Reply-To: <20180627181618.23463-1-dmurphy@ti.com>

If SGMII was selected in the DT then the device should
write the SGMII enable bit.

If SGMII is not selected in the DT then the SGMII bit
should be disabled.

Signed-off-by: Dan Murphy <dmurphy@ti.com>
---
 arch/arm/configs/omap2plus_defconfig |  1 +
 drivers/net/phy/dp83tc811.c          | 20 +++++++++-----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 06fb948ecfb3..30857d5b7a6c 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -182,6 +182,7 @@ CONFIG_TI_CPTS=y
 CONFIG_AT803X_PHY=y
 CONFIG_DP83848_PHY=y
 CONFIG_DP83867_PHY=y
+CONFIG_DP83TC811_PHY=y
 CONFIG_MICREL_PHY=y
 CONFIG_SMSC_PHY=y
 CONFIG_PPP=m
diff --git a/drivers/net/phy/dp83tc811.c b/drivers/net/phy/dp83tc811.c
index f8653f5d8789..78cad134a79e 100644
--- a/drivers/net/phy/dp83tc811.c
+++ b/drivers/net/phy/dp83tc811.c
@@ -284,21 +284,19 @@ static int dp83811_config_init(struct phy_device *phydev)
 	if (err < 0)
 		return err;
 
+	value = phy_read(phydev, MII_DP83811_SGMII_CTRL);
 	if (phydev->interface == PHY_INTERFACE_MODE_SGMII) {
-		value = phy_read(phydev, MII_DP83811_SGMII_CTRL);
-		if (!(value & DP83811_SGMII_EN)) {
-			err = phy_write(phydev, MII_DP83811_SGMII_CTRL,
+		err = phy_write(phydev, MII_DP83811_SGMII_CTRL,
 					(DP83811_SGMII_EN | value));
-			if (err < 0)
-				return err;
-		} else {
-			err = phy_write(phydev, MII_DP83811_SGMII_CTRL,
-					(~DP83811_SGMII_EN & value));
-			if (err < 0)
-				return err;
-		}
+	} else {
+		err = phy_write(phydev, MII_DP83811_SGMII_CTRL,
+				(~DP83811_SGMII_EN & value));
 	}
 
+	if (err < 0)
+
+		return err;
+
 	value = DP83811_WOL_MAGIC_EN | DP83811_WOL_SECURE_ON | DP83811_WOL_EN;
 
 	return phy_write_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG,
-- 
2.17.0.582.gccdcbd54c

^ permalink raw reply related

* Re: [PATCH net-next] liquidio: fix kernel panic when NIC firmware is older than 1.7.2
From: Felix Manlunas @ 2018-06-28  5:18 UTC (permalink / raw)
  To: Shannon Nelson
  Cc: davem, netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	ricardo.farrington
In-Reply-To: <5670609b-5056-305d-ee0f-8fed471d8381@oracle.com>

On Tue, Jun 26, 2018 at 09:03:25AM -0700, Shannon Nelson wrote:
> On 6/26/2018 4:58 AM, Felix Manlunas wrote:
> > From: Rick Farrington <ricardo.farrington@cavium.com>
> > 
> > Pre-1.7.2 NIC firmware does not support (and does not respond to) the "get
> > speed" command which is sent by the 1.7.2 driver during modprobe.  Due to a
> > bug in older firmware (with respect to unknown commands), this unsupported
> > command causes a cascade of errors that ends in a kernel panic.
> > 
> > Fix it by making the sending of the "get speed" command conditional on the
> > firmware version.
> > 
> > Signed-off-by: Rick Farrington <ricardo.farrington@cavium.com>
> > Acked-by: Derek Chickles <derek.chickles@cavium.com>
> > Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
> > ---
> > Note: To avoid checkpatch.pl "WARNING: line over 80 characters", the comma
> >        that separates the arguments in the call to strcmp() was placed one
> >        line below the usual spot.
> > 
> >   drivers/net/ethernet/cavium/liquidio/lio_main.c | 11 ++++++++++-
> >   1 file changed, 10 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
> > index 7cb4e75..f83f884 100644
> > --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
> > +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
> > @@ -3671,7 +3671,16 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
> >                       OCTEON_CN2350_25GB_SUBSYS_ID ||
> >                   octeon_dev->subsystem_id ==
> >                       OCTEON_CN2360_25GB_SUBSYS_ID) {
> > -                     liquidio_get_speed(lio);
> > +                     /* speed control unsupported in f/w older than 1.7.2 */
> > +                     if (strcmp(octeon_dev->fw_info.liquidio_firmware_version
> > +                        , "1.7.2") < 0) {
> 
> Will the liquidio_firmware_version ever end up something like 1.7.10?
> If so, this strcmp() may not do what you want.
> 
> sln

Yes, it's possible that the liquidio_firmware_version will reach 1.7.10.

We agree that using strcmp() will not give the correct result for that case,
so please disregard this patch.

We will submit a V2 patch.

Felix

^ permalink raw reply

* Re: [PATCH rdma-next 08/12] overflow.h: Add arithmetic shift helper
From: Leon Romanovsky @ 2018-06-27 18:22 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Rasmus Villemoes, Doug Ledford, Kees Cook, RDMA mailing list,
	Hadar Hen Zion, Matan Barak, Michael J Ruhl, Noa Osherovich,
	Raed Salem, Yishai Hadas, Saeed Mahameed, linux-netdev,
	linux-kernel
In-Reply-To: <20180627181012.GM20754@mellanox.com>

On Wed, Jun 27, 2018 at 12:10:12PM -0600, Jason Gunthorpe wrote:
> On Wed, Jun 27, 2018 at 11:36:03AM +0200, Rasmus Villemoes wrote:
> >    OK. The requirement of everything having the same type for the
> >    check_*_overflow when gccs builtins are not available was mostly a
> >    consequence of my inability to implement completely type-generic
> >    versions (but also to enforce some sanity, so people don't do
> >    check_add_overflow( s8, size_t, int*)). There's no gcc builtin for
> >    shift, but if it's relatively simple to one allowing a and *d to have
> >    different types, then why not. It's of course particularly convenient
> >    to allow a bare "1" (i.e. int) as a while having *d have some random
> >    type.
>
> Yes
>
> >    Wouldn't check_shift_overflow(-1, 4, &someint) just put -16 in someint
> >    and report no overflow? That's what I'd expect, if negative values are
> >    to be supported at all.
>
> I would say that is not a desired outcome, bitshift is defined on
> bits, if the caller wanted something defined as signed multiply they
> should use multiply.
>
> IMHO, nobody writes 'a << b' expecting sign preservation..
>
> >    Well, the types you can check at compile-time, the values not, so you
> >    still have to define the result, i.e. contents of *d, for negative
> >    values (even if we decide that "overflow" should always be signalled in
> >    that case).
>
> Why do a need to define a 'result' beyond whatever the not-undefined
> behavior shift expression produces?
>
> >      What about more like this?
> >                check_shift_overflow(a, s, d) ({
> >                    // Shift is always performed on the machine's largest
> >      unsigned
> >                    u64 _a = a;
> >                    typeof(s) _s = s;
> >                    typeof(d) _d = d;
> >                    // Make s safe against UB
> >                    unsigned int _to_shift = _s >= 0 && _s < 8*sizeof(*d) : _s ? 0;
> >                    *_d = (_a << _to_shift);
> >                     // s is malformed
> >                    (_to_shift != _s ||
> >                     // d is a signed type and became negative
> >                     *_d < 0 ||
> >                     // a is a signed type and was negative
> >                     _a < 0 ||
> >                     // Not invertable means a was truncated during
> >      shifting
> >                     (*_d >> _to_shift) != a))
> >                })
> >      I'm not seeing a UB with this?
> >
> >    Something like that might work, but you're not there yet. In
> >    particular, your test for whether a is negative is thwarted by using
> >    u64 for _a and testing _a < 0...
>
> Oops, yes that was intended to be 'a', and of course we need to
> capture it..
>
> Leon? Seems like agreement, Can you work with this version?

Yes, sure, I waited for an agreement.

>
> #include <stdint.h>
> #include <stdbool.h>
> #include <assert.h>
>
> #define u64 uint64_t
>
> /*
>  * Compute *d = (a << s)
>  *
>  * Returns true if '*d' cannot hold the result or 'a << s' doesn't make sense.
>  * - 'a << s' causes bits to be lost when stored in d
>  * - 's' is garbage (eg negative) or so large that a << s is guarenteed to be 0
>  * - 'a' is negative
>  * - 'a << s' sets the sign bit, if any, in '*d'
>  * *d is not defined if false is returned.
>  */
> #define check_shift_overflow(a, s, d)                                          \
> 	({                                                                     \
> 		typeof(a) _a = a;                                              \
> 		typeof(s) _s = s;                                              \
> 		typeof(d) _d = d;                                              \
> 		u64 _a_full = _a;                                              \
> 		unsigned int _to_shift =                                       \
> 			_s >= 0 && _s < 8 * sizeof(*d) ? _s : 0;               \
>                                                                                \
> 		*_d = (_a_full << _to_shift);                                  \
>                                                                                \
> 		(_to_shift != _s || *_d < 0 || _a < 0 ||                       \
> 		 (*_d >> _to_shift) != a);                                     \
> 	})
>
> int main(int argc, const char *argv[])
> {
> 	int32_t s32;
> 	uint32_t u32;
>
> 	assert(check_shift_overflow(1, 0, &s32) == false && s32 == (1 << 0));
> 	assert(check_shift_overflow(1, 1, &s32) == false && s32 == (1 << 1));
> 	assert(check_shift_overflow(1, 30, &s32) == false && s32 == (1 << 30));
> 	assert(check_shift_overflow(1, 31, &s32) == true);
> 	assert(check_shift_overflow(1, 32, &s32) == true);
> 	assert(check_shift_overflow(-1, 1, &s32) == true);
> 	assert(check_shift_overflow(-1, 0, &s32) == true);
>
> 	assert(check_shift_overflow(1, 0, &u32) == false && u32 == (1 << 0));
> 	assert(check_shift_overflow(1, 1, &u32) == false && u32 == (1 << 1));
> 	assert(check_shift_overflow(1, 30, &u32) == false && u32 == (1 << 30));
> 	assert(check_shift_overflow(1, 31, &u32) == false && u32 == (1UL << 31));
> 	assert(check_shift_overflow(1, 32, &u32) == true);
> 	assert(check_shift_overflow(-1, 1, &u32) == true);
> 	assert(check_shift_overflow(-1, 0, &u32) == true);
>
> 	assert(check_shift_overflow(0xFFFFFFFF, 0, &u32) == false && u32 == (0xFFFFFFFFUL << 0));
> 	assert(check_shift_overflow(0xFFFFFFFF, 1, &u32) == true);
> 	assert(check_shift_overflow(0xFFFFFFFF, 0, &s32) == true);
> 	assert(check_shift_overflow(0xFFFFFFFF, 1, &s32) == true);
> }
>
> Thanks,
> Jason

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox