Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 06/10] net: netcp: ethss: get phy-handle only if link interface is MAC-to-PHY
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev, linux-omap, grygorii.strashko, mugunthanvnm, linux-kernel,
	arnd, davem, devicetree, mark.rutland, robh+dt
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2@ti.com>

Currently to parse phy-handle, driver doesn't check if the interface is
MAC to PHY. This patch add this check for all MAC to PHY interface types
supported by the driver.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 drivers/net/ethernet/ti/netcp_ethss.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index cb48f88..9266961 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -2956,7 +2956,9 @@ static int init_slave(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 	}
 
 	slave->open = false;
-	slave->phy_node = of_parse_phandle(node, "phy-handle", 0);
+	if ((slave->link_interface == SGMII_LINK_MAC_PHY) ||
+	    (slave->link_interface == XGMII_LINK_MAC_PHY))
+		slave->phy_node = of_parse_phandle(node, "phy-handle", 0);
 	slave->port_num = gbe_get_slave_port(gbe_dev, slave->slave_num);
 
 	if (slave->link_interface >= XGMII_LINK_MAC_PHY)
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 05/10] net: netcp: store network statistics in 64 bits
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev, linux-omap, grygorii.strashko, mugunthanvnm, linux-kernel,
	arnd, davem, devicetree, mark.rutland, robh+dt
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2@ti.com>

From: Michael Scherban <m-scherban@ti.com>

Previously the network statistics were stored in 32 bit variable
which can cause some stats to roll over after several minutes of
high traffic. This implements 64 bit storage so larger numbers
can be stored.

Signed-off-by: Michael Scherban <m-scherban@ti.com>
Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 drivers/net/ethernet/ti/netcp.h      | 18 ++++++++++
 drivers/net/ethernet/ti/netcp_core.c | 68 +++++++++++++++++++++++++++++-------
 2 files changed, 74 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/ti/netcp.h b/drivers/net/ethernet/ti/netcp.h
index a92abd6..d243c5d 100644
--- a/drivers/net/ethernet/ti/netcp.h
+++ b/drivers/net/ethernet/ti/netcp.h
@@ -23,6 +23,7 @@
 
 #include <linux/netdevice.h>
 #include <linux/soc/ti/knav_dma.h>
+#include <linux/u64_stats_sync.h>
 
 /* Maximum Ethernet frame size supported by Keystone switch */
 #define NETCP_MAX_FRAME_SIZE		9504
@@ -68,6 +69,20 @@ struct netcp_addr {
 	struct list_head	node;
 };
 
+struct netcp_stats {
+	struct u64_stats_sync   syncp_rx ____cacheline_aligned_in_smp;
+	u64                     rx_packets;
+	u64                     rx_bytes;
+	u32                     rx_errors;
+	u32                     rx_dropped;
+
+	struct u64_stats_sync   syncp_tx ____cacheline_aligned_in_smp;
+	u64                     tx_packets;
+	u64                     tx_bytes;
+	u32                     tx_errors;
+	u32                     tx_dropped;
+};
+
 struct netcp_intf {
 	struct device		*dev;
 	struct device		*ndev_dev;
@@ -88,6 +103,9 @@ struct netcp_intf {
 	struct napi_struct	rx_napi;
 	struct napi_struct	tx_napi;
 
+	/* 64-bit netcp stats */
+	struct netcp_stats	stats;
+
 	void			*rx_channel;
 	const char		*dma_chan_name;
 	u32			rx_pool_size;
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 286fd8d..b077ed4 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -629,6 +629,7 @@ static void netcp_free_rx_desc_chain(struct netcp_intf *netcp,
 
 static void netcp_empty_rx_queue(struct netcp_intf *netcp)
 {
+	struct netcp_stats *rx_stats = &netcp->stats;
 	struct knav_dma_desc *desc;
 	unsigned int dma_sz;
 	dma_addr_t dma;
@@ -642,16 +643,17 @@ static void netcp_empty_rx_queue(struct netcp_intf *netcp)
 		if (unlikely(!desc)) {
 			dev_err(netcp->ndev_dev, "%s: failed to unmap Rx desc\n",
 				__func__);
-			netcp->ndev->stats.rx_errors++;
+			rx_stats->rx_errors++;
 			continue;
 		}
 		netcp_free_rx_desc_chain(netcp, desc);
-		netcp->ndev->stats.rx_dropped++;
+		rx_stats->rx_dropped++;
 	}
 }
 
 static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 {
+	struct netcp_stats *rx_stats = &netcp->stats;
 	unsigned int dma_sz, buf_len, org_buf_len;
 	struct knav_dma_desc *desc, *ndesc;
 	unsigned int pkt_sz = 0, accum_sz;
@@ -757,8 +759,8 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 		if (unlikely(ret)) {
 			dev_err(netcp->ndev_dev, "RX hook %d failed: %d\n",
 				rx_hook->order, ret);
-			netcp->ndev->stats.rx_errors++;
 			/* Free the primary descriptor */
+			rx_stats->rx_dropped++;
 			knav_pool_desc_put(netcp->rx_pool, desc);
 			dev_kfree_skb(skb);
 			return 0;
@@ -767,8 +769,10 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 	/* Free the primary descriptor */
 	knav_pool_desc_put(netcp->rx_pool, desc);
 
-	netcp->ndev->stats.rx_packets++;
-	netcp->ndev->stats.rx_bytes += skb->len;
+	u64_stats_update_begin(&rx_stats->syncp_rx);
+	rx_stats->rx_packets++;
+	rx_stats->rx_bytes += skb->len;
+	u64_stats_update_end(&rx_stats->syncp_rx);
 
 	/* push skb up the stack */
 	skb->protocol = eth_type_trans(skb, netcp->ndev);
@@ -777,7 +781,7 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 
 free_desc:
 	netcp_free_rx_desc_chain(netcp, desc);
-	netcp->ndev->stats.rx_errors++;
+	rx_stats->rx_errors++;
 	return 0;
 }
 
@@ -1008,6 +1012,7 @@ static void netcp_free_tx_desc_chain(struct netcp_intf *netcp,
 static int netcp_process_tx_compl_packets(struct netcp_intf *netcp,
 					  unsigned int budget)
 {
+	struct netcp_stats *tx_stats = &netcp->stats;
 	struct knav_dma_desc *desc;
 	struct netcp_tx_cb *tx_cb;
 	struct sk_buff *skb;
@@ -1022,7 +1027,7 @@ static int netcp_process_tx_compl_packets(struct netcp_intf *netcp,
 		desc = knav_pool_desc_unmap(netcp->tx_pool, dma, dma_sz);
 		if (unlikely(!desc)) {
 			dev_err(netcp->ndev_dev, "failed to unmap Tx desc\n");
-			netcp->ndev->stats.tx_errors++;
+			tx_stats->tx_errors++;
 			continue;
 		}
 
@@ -1033,7 +1038,7 @@ static int netcp_process_tx_compl_packets(struct netcp_intf *netcp,
 		netcp_free_tx_desc_chain(netcp, desc, dma_sz);
 		if (!skb) {
 			dev_err(netcp->ndev_dev, "No skb in Tx desc\n");
-			netcp->ndev->stats.tx_errors++;
+			tx_stats->tx_errors++;
 			continue;
 		}
 
@@ -1050,8 +1055,10 @@ static int netcp_process_tx_compl_packets(struct netcp_intf *netcp,
 			netif_wake_subqueue(netcp->ndev, subqueue);
 		}
 
-		netcp->ndev->stats.tx_packets++;
-		netcp->ndev->stats.tx_bytes += skb->len;
+		u64_stats_update_begin(&tx_stats->syncp_tx);
+		tx_stats->tx_packets++;
+		tx_stats->tx_bytes += skb->len;
+		u64_stats_update_end(&tx_stats->syncp_tx);
 		dev_kfree_skb(skb);
 		pkts++;
 	}
@@ -1272,6 +1279,7 @@ static int netcp_tx_submit_skb(struct netcp_intf *netcp,
 static int netcp_ndo_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
 	struct netcp_intf *netcp = netdev_priv(ndev);
+	struct netcp_stats *tx_stats = &netcp->stats;
 	int subqueue = skb_get_queue_mapping(skb);
 	struct knav_dma_desc *desc;
 	int desc_count, ret = 0;
@@ -1287,7 +1295,7 @@ static int netcp_ndo_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 			/* If we get here, the skb has already been dropped */
 			dev_warn(netcp->ndev_dev, "padding failed (%d), packet dropped\n",
 				 ret);
-			ndev->stats.tx_dropped++;
+			tx_stats->tx_dropped++;
 			return ret;
 		}
 		skb->len = NETCP_MIN_PACKET_SIZE;
@@ -1315,7 +1323,7 @@ static int netcp_ndo_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	return NETDEV_TX_OK;
 
 drop:
-	ndev->stats.tx_dropped++;
+	tx_stats->tx_dropped++;
 	if (desc)
 		netcp_free_tx_desc_chain(netcp, desc, sizeof(*desc));
 	dev_kfree_skb(skb);
@@ -1897,12 +1905,46 @@ static int netcp_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
 	return 0;
 }
 
+static struct rtnl_link_stats64 *
+netcp_get_stats(struct net_device *ndev, struct rtnl_link_stats64 *stats)
+{
+	struct netcp_intf *netcp = netdev_priv(ndev);
+	struct netcp_stats *p = &netcp->stats;
+	u64 rxpackets, rxbytes, txpackets, txbytes;
+	unsigned int start;
+
+	do {
+		start = u64_stats_fetch_begin_irq(&p->syncp_rx);
+		rxpackets       = p->rx_packets;
+		rxbytes         = p->rx_bytes;
+	} while (u64_stats_fetch_retry_irq(&p->syncp_rx, start));
+
+	do {
+		start = u64_stats_fetch_begin_irq(&p->syncp_tx);
+		txpackets       = p->tx_packets;
+		txbytes         = p->tx_bytes;
+	} while (u64_stats_fetch_retry_irq(&p->syncp_tx, start));
+
+	stats->rx_packets = rxpackets;
+	stats->rx_bytes = rxbytes;
+	stats->tx_packets = txpackets;
+	stats->tx_bytes = txbytes;
+
+	/* The following are stored as 32 bit */
+	stats->rx_errors = p->rx_errors;
+	stats->rx_dropped = p->rx_dropped;
+	stats->tx_dropped = p->tx_dropped;
+
+	return stats;
+}
+
 static const struct net_device_ops netcp_netdev_ops = {
 	.ndo_open		= netcp_ndo_open,
 	.ndo_stop		= netcp_ndo_stop,
 	.ndo_start_xmit		= netcp_ndo_start_xmit,
 	.ndo_set_rx_mode	= netcp_set_rx_mode,
 	.ndo_do_ioctl           = netcp_ndo_ioctl,
+	.ndo_get_stats64        = netcp_get_stats,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_vlan_rx_add_vid	= netcp_rx_add_vid,
@@ -1949,6 +1991,8 @@ static int netcp_create_interface(struct netcp_device *netcp_device,
 	INIT_LIST_HEAD(&netcp->txhook_list_head);
 	INIT_LIST_HEAD(&netcp->rxhook_list_head);
 	INIT_LIST_HEAD(&netcp->addr_list);
+	u64_stats_init(&netcp->stats.syncp_rx);
+	u64_stats_init(&netcp->stats.syncp_tx);
 	netcp->netcp_device = netcp_device;
 	netcp->dev = netcp_device->device;
 	netcp->ndev = ndev;
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 03/10] net: netcp: extract eflag from desc for rx_hook handling
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev, linux-omap, grygorii.strashko, mugunthanvnm, linux-kernel,
	arnd, davem, devicetree, mark.rutland, robh+dt
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2@ti.com>

Extract the eflag bits from the received desc and pass it down
the rx_hook chain to be available for netcp modules. Also the
psdata and epib data has to be inspected by the netcp modules.
So the desc can be freed only after returning from the rx_hook.
So move knav_pool_desc_put() after the rx_hook processing.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
---
 drivers/net/ethernet/ti/netcp.h      |  1 +
 drivers/net/ethernet/ti/netcp_core.c | 20 +++++++++++++++++---
 include/linux/soc/ti/knav_dma.h      |  2 ++
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ti/netcp.h b/drivers/net/ethernet/ti/netcp.h
index 0f58c58..a92abd6 100644
--- a/drivers/net/ethernet/ti/netcp.h
+++ b/drivers/net/ethernet/ti/netcp.h
@@ -115,6 +115,7 @@ struct netcp_packet {
 	struct sk_buff		*skb;
 	__le32			*epib;
 	u32			*psdata;
+	u32			eflags;
 	unsigned int		psdata_len;
 	struct netcp_intf	*netcp;
 	struct netcp_tx_pipe	*tx_pipe;
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index c243335..a136c56 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -122,6 +122,13 @@ static void get_pkt_info(dma_addr_t *buff, u32 *buff_len, dma_addr_t *ndesc,
 	*ndesc = le32_to_cpu(desc->next_desc);
 }
 
+static void get_desc_info(u32 *desc_info, u32 *pkt_info,
+			  struct knav_dma_desc *desc)
+{
+	*desc_info = le32_to_cpu(desc->desc_info);
+	*pkt_info = le32_to_cpu(desc->packet_info);
+}
+
 static u32 get_sw_data(int index, struct knav_dma_desc *desc)
 {
 	/* No Endian conversion needed as this data is untouched by hw */
@@ -653,6 +660,7 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 	struct netcp_packet p_info;
 	struct sk_buff *skb;
 	void *org_buf_ptr;
+	u32 tmp;
 
 	dma_desc = knav_queue_pop(netcp->rx_queue, &dma_sz);
 	if (!dma_desc)
@@ -724,9 +732,6 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 		knav_pool_desc_put(netcp->rx_pool, ndesc);
 	}
 
-	/* Free the primary descriptor */
-	knav_pool_desc_put(netcp->rx_pool, desc);
-
 	/* check for packet len and warn */
 	if (unlikely(pkt_sz != accum_sz))
 		dev_dbg(netcp->ndev_dev, "mismatch in packet size(%d) & sum of fragments(%d)\n",
@@ -739,6 +744,11 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 	p_info.skb = skb;
 	skb->dev = netcp->ndev;
 	p_info.rxtstamp_complete = false;
+	get_desc_info(&tmp, &p_info.eflags, desc);
+	p_info.epib = desc->epib;
+	p_info.psdata = (u32 __force *)desc->psdata;
+	p_info.eflags = ((p_info.eflags >> KNAV_DMA_DESC_EFLAGS_SHIFT) &
+			 KNAV_DMA_DESC_EFLAGS_MASK);
 	list_for_each_entry(rx_hook, &netcp->rxhook_list_head, list) {
 		int ret;
 
@@ -748,10 +758,14 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 			dev_err(netcp->ndev_dev, "RX hook %d failed: %d\n",
 				rx_hook->order, ret);
 			netcp->ndev->stats.rx_errors++;
+			/* Free the primary descriptor */
+			knav_pool_desc_put(netcp->rx_pool, desc);
 			dev_kfree_skb(skb);
 			return 0;
 		}
 	}
+	/* Free the primary descriptor */
+	knav_pool_desc_put(netcp->rx_pool, desc);
 
 	netcp->ndev->stats.rx_packets++;
 	netcp->ndev->stats.rx_bytes += skb->len;
diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h
index 35cb926..2b78826 100644
--- a/include/linux/soc/ti/knav_dma.h
+++ b/include/linux/soc/ti/knav_dma.h
@@ -41,6 +41,8 @@
 #define KNAV_DMA_DESC_RETQ_SHIFT		0
 #define KNAV_DMA_DESC_RETQ_MASK			MASK(14)
 #define KNAV_DMA_DESC_BUF_LEN_MASK		MASK(22)
+#define KNAV_DMA_DESC_EFLAGS_MASK		MASK(4)
+#define KNAV_DMA_DESC_EFLAGS_SHIFT		20
 
 #define KNAV_DMA_NUM_EPIB_WORDS			4
 #define KNAV_DMA_NUM_PS_WORDS			16
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 10/10] net: netcp: ale: add proper ale entry mask bits for netcp switch ALE
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA, linux-omap-u79uwXL29TY76Z2rM5mHXA,
	grygorii.strashko-l0cyMroinI0, mugunthanvnm-l0cyMroinI0,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, arnd-r2nGTMty4D4,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, devicetree-u79uwXL29TY76Z2rM5mHXA,
	mark.rutland-5wv7dgnIgG8, robh+dt-DgEjT+Ai2ygdnm+yROfE0A
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2-l0cyMroinI0@public.gmane.org>

For NetCP NU Switch ALE, some of the mask bits are different than
defaults used in the driver. Add a new macro DEFINE_ALE_FIELD1 that use
a configurable mask bits and use it in the driver. These bits are set to
correct values by using the new variables added to cpsw_ale structure
and re-used in the macros. The parameter nu_switch_ale is configured by
the caller driver to indicate the ALE is for that switch and is used in
the ALE driver to do customization as needed.

Signed-off-by: Murali Karicheri <m-karicheri2-l0cyMroinI0@public.gmane.org>
Signed-off-by: Sekhar Nori <nsekhar-l0cyMroinI0@public.gmane.org>
---
 drivers/net/ethernet/ti/cpsw_ale.c | 99 ++++++++++++++++++++++++++++++--------
 drivers/net/ethernet/ti/cpsw_ale.h |  4 ++
 2 files changed, 84 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c
index 62a18d6..ddd43e0 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -29,6 +29,7 @@
 
 #define ALE_VERSION_MAJOR(rev, mask) (((rev) >> 8) & (mask))
 #define ALE_VERSION_MINOR(rev)	(rev & 0xff)
+#define ALE_VERSION_1R3		0x0103
 #define ALE_VERSION_1R4		0x0104
 
 /* ALE Registers */
@@ -46,6 +47,7 @@
 #define ALE_UNKNOWNVLAN_UNREG_MCAST_FLOOD	0x94
 #define ALE_UNKNOWNVLAN_REG_MCAST_FLOOD		0x98
 #define ALE_UNKNOWNVLAN_FORCE_UNTAG_EGRESS	0x9C
+#define ALE_VLAN_MASK_MUX(reg)			(0xc0 + (0x4 * (reg)))
 
 #define ALE_TABLE_WRITE		BIT(31)
 
@@ -96,20 +98,34 @@ static inline void cpsw_ale_set_field(u32 *ale_entry, u32 start, u32 bits,
 	cpsw_ale_set_field(ale_entry, start, bits, value);		\
 }
 
+#define DEFINE_ALE_FIELD1(name, start)					\
+static inline int cpsw_ale_get_##name(u32 *ale_entry, u32 bits)		\
+{									\
+	return cpsw_ale_get_field(ale_entry, start, bits);		\
+}									\
+static inline void cpsw_ale_set_##name(u32 *ale_entry, u32 value,	\
+		u32 bits)						\
+{									\
+	cpsw_ale_set_field(ale_entry, start, bits, value);		\
+}
+
 DEFINE_ALE_FIELD(entry_type,		60,	2)
 DEFINE_ALE_FIELD(vlan_id,		48,	12)
 DEFINE_ALE_FIELD(mcast_state,		62,	2)
-DEFINE_ALE_FIELD(port_mask,		66,     3)
+DEFINE_ALE_FIELD1(port_mask,		66)
 DEFINE_ALE_FIELD(super,			65,	1)
 DEFINE_ALE_FIELD(ucast_type,		62,     2)
-DEFINE_ALE_FIELD(port_num,		66,     2)
+DEFINE_ALE_FIELD1(port_num,		66)
 DEFINE_ALE_FIELD(blocked,		65,     1)
 DEFINE_ALE_FIELD(secure,		64,     1)
-DEFINE_ALE_FIELD(vlan_untag_force,	24,	3)
-DEFINE_ALE_FIELD(vlan_reg_mcast,	16,	3)
-DEFINE_ALE_FIELD(vlan_unreg_mcast,	8,	3)
-DEFINE_ALE_FIELD(vlan_member_list,	0,	3)
+DEFINE_ALE_FIELD1(vlan_untag_force,	24)
+DEFINE_ALE_FIELD1(vlan_reg_mcast,	16)
+DEFINE_ALE_FIELD1(vlan_unreg_mcast,	8)
+DEFINE_ALE_FIELD1(vlan_member_list,	0)
 DEFINE_ALE_FIELD(mcast,			40,	1)
+/* ALE NetCP nu switch specific */
+DEFINE_ALE_FIELD(vlan_unreg_mcast_idx,	20,	3)
+DEFINE_ALE_FIELD(vlan_reg_mcast_idx,	44,	3)
 
 /* The MAC address field in the ALE entry cannot be macroized as above */
 static inline void cpsw_ale_get_addr(u32 *ale_entry, u8 *addr)
@@ -235,14 +251,16 @@ static void cpsw_ale_flush_mcast(struct cpsw_ale *ale, u32 *ale_entry,
 {
 	int mask;
 
-	mask = cpsw_ale_get_port_mask(ale_entry);
+	mask = cpsw_ale_get_port_mask(ale_entry,
+				      ale->port_mask_bits);
 	if ((mask & port_mask) == 0)
 		return; /* ports dont intersect, not interested */
 	mask &= ~port_mask;
 
 	/* free if only remaining port is host port */
 	if (mask)
-		cpsw_ale_set_port_mask(ale_entry, mask);
+		cpsw_ale_set_port_mask(ale_entry, mask,
+				       ale->port_mask_bits);
 	else
 		cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE);
 }
@@ -303,7 +321,7 @@ int cpsw_ale_add_ucast(struct cpsw_ale *ale, u8 *addr, int port,
 	cpsw_ale_set_ucast_type(ale_entry, ALE_UCAST_PERSISTANT);
 	cpsw_ale_set_secure(ale_entry, (flags & ALE_SECURE) ? 1 : 0);
 	cpsw_ale_set_blocked(ale_entry, (flags & ALE_BLOCKED) ? 1 : 0);
-	cpsw_ale_set_port_num(ale_entry, port);
+	cpsw_ale_set_port_num(ale_entry, port, ale->port_num_bits);
 
 	idx = cpsw_ale_match_addr(ale, addr, (flags & ALE_VLAN) ? vid : 0);
 	if (idx < 0)
@@ -350,9 +368,11 @@ int cpsw_ale_add_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask,
 	cpsw_ale_set_super(ale_entry, (flags & ALE_BLOCKED) ? 1 : 0);
 	cpsw_ale_set_mcast_state(ale_entry, mcast_state);
 
-	mask = cpsw_ale_get_port_mask(ale_entry);
+	mask = cpsw_ale_get_port_mask(ale_entry,
+				      ale->port_mask_bits);
 	port_mask |= mask;
-	cpsw_ale_set_port_mask(ale_entry, port_mask);
+	cpsw_ale_set_port_mask(ale_entry, port_mask,
+			       ale->port_mask_bits);
 
 	if (idx < 0)
 		idx = cpsw_ale_match_free(ale);
@@ -379,7 +399,8 @@ int cpsw_ale_del_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask,
 	cpsw_ale_read(ale, idx, ale_entry);
 
 	if (port_mask)
-		cpsw_ale_set_port_mask(ale_entry, port_mask);
+		cpsw_ale_set_port_mask(ale_entry, port_mask,
+				       ale->port_mask_bits);
 	else
 		cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE);
 
@@ -388,6 +409,21 @@ int cpsw_ale_del_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask,
 }
 EXPORT_SYMBOL_GPL(cpsw_ale_del_mcast);
 
+/* ALE NetCP NU switch specific vlan functions */
+static void cpsw_ale_set_vlan_mcast(struct cpsw_ale *ale, u32 *ale_entry,
+				    int reg_mcast, int unreg_mcast)
+{
+	int idx;
+
+	/* Set VLAN registered multicast flood mask */
+	idx = cpsw_ale_get_vlan_reg_mcast_idx(ale_entry);
+	writel(reg_mcast, ale->params.ale_regs + ALE_VLAN_MASK_MUX(idx));
+
+	/* Set VLAN unregistered multicast flood mask */
+	idx = cpsw_ale_get_vlan_unreg_mcast_idx(ale_entry);
+	writel(unreg_mcast, ale->params.ale_regs + ALE_VLAN_MASK_MUX(idx));
+}
+
 int cpsw_ale_add_vlan(struct cpsw_ale *ale, u16 vid, int port, int untag,
 		      int reg_mcast, int unreg_mcast)
 {
@@ -401,10 +437,16 @@ int cpsw_ale_add_vlan(struct cpsw_ale *ale, u16 vid, int port, int untag,
 	cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_VLAN);
 	cpsw_ale_set_vlan_id(ale_entry, vid);
 
-	cpsw_ale_set_vlan_untag_force(ale_entry, untag);
-	cpsw_ale_set_vlan_reg_mcast(ale_entry, reg_mcast);
-	cpsw_ale_set_vlan_unreg_mcast(ale_entry, unreg_mcast);
-	cpsw_ale_set_vlan_member_list(ale_entry, port);
+	cpsw_ale_set_vlan_untag_force(ale_entry, untag, ale->vlan_field_bits);
+	if (!ale->params.nu_switch_ale) {
+		cpsw_ale_set_vlan_reg_mcast(ale_entry, reg_mcast,
+					    ale->vlan_field_bits);
+		cpsw_ale_set_vlan_unreg_mcast(ale_entry, unreg_mcast,
+					      ale->vlan_field_bits);
+	} else {
+		cpsw_ale_set_vlan_mcast(ale, ale_entry, reg_mcast, unreg_mcast);
+	}
+	cpsw_ale_set_vlan_member_list(ale_entry, port, ale->vlan_field_bits);
 
 	if (idx < 0)
 		idx = cpsw_ale_match_free(ale);
@@ -430,7 +472,8 @@ int cpsw_ale_del_vlan(struct cpsw_ale *ale, u16 vid, int port_mask)
 	cpsw_ale_read(ale, idx, ale_entry);
 
 	if (port_mask)
-		cpsw_ale_set_vlan_member_list(ale_entry, port_mask);
+		cpsw_ale_set_vlan_member_list(ale_entry, port_mask,
+					      ale->vlan_field_bits);
 	else
 		cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE);
 
@@ -458,12 +501,15 @@ void cpsw_ale_set_allmulti(struct cpsw_ale *ale, int allmulti)
 		if (type != ALE_TYPE_VLAN)
 			continue;
 
-		unreg_mcast = cpsw_ale_get_vlan_unreg_mcast(ale_entry);
+		unreg_mcast =
+			cpsw_ale_get_vlan_unreg_mcast(ale_entry,
+						      ale->vlan_field_bits);
 		if (allmulti)
 			unreg_mcast |= 1;
 		else
 			unreg_mcast &= ~1;
-		cpsw_ale_set_vlan_unreg_mcast(ale_entry, unreg_mcast);
+		cpsw_ale_set_vlan_unreg_mcast(ale_entry, unreg_mcast,
+					      ale->vlan_field_bits);
 		cpsw_ale_write(ale, idx, ale_entry);
 	}
 }
@@ -769,6 +815,14 @@ void cpsw_ale_start(struct cpsw_ale *ale)
 	dev_info(ale->params.dev,
 		 "ALE Table size %ld\n", ale->params.ale_entries);
 
+	/* set default bits for existing h/w */
+	ale->port_mask_bits = 3;
+	ale->port_num_bits = 2;
+	ale->vlan_field_bits = 3;
+
+	/* Set defaults override for ALE on NetCP NU switch and for version
+	 * 1R3
+	 */
 	if (ale->params.nu_switch_ale) {
 		/* Separate registers for unknown vlan configuration.
 		 * Also there are N bits, where N is number of ale
@@ -793,6 +847,13 @@ void cpsw_ale_start(struct cpsw_ale *ale)
 		ale_controls[ALE_PORT_UNTAGGED_EGRESS].shift = 0;
 		ale_controls[ALE_PORT_UNTAGGED_EGRESS].offset =
 					ALE_UNKNOWNVLAN_FORCE_UNTAG_EGRESS;
+		ale->port_mask_bits = ale->params.ale_ports;
+		ale->port_num_bits = ale->params.ale_ports - 1;
+		ale->vlan_field_bits = ale->params.ale_ports;
+	} else if (ale->version == ALE_VERSION_1R3) {
+		ale->port_mask_bits = ale->params.ale_ports;
+		ale->port_num_bits = 3;
+		ale->vlan_field_bits = ale->params.ale_ports;
 	}
 
 	cpsw_ale_control_set(ale, 0, ALE_ENABLE, 1);
diff --git a/drivers/net/ethernet/ti/cpsw_ale.h b/drivers/net/ethernet/ti/cpsw_ale.h
index b1c7954..25d24e8 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.h
+++ b/drivers/net/ethernet/ti/cpsw_ale.h
@@ -39,6 +39,10 @@ struct cpsw_ale {
 	unsigned long		ageout;
 	int			allmulti;
 	u32			version;
+	/* These bits are different on NetCP NU Switch ALE */
+	u32			port_mask_bits;
+	u32			port_num_bits;
+	u32			vlan_field_bits;
 };
 
 enum cpsw_ale_control {
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH net-next 09/10] net: netcp: ale: use ale_status to size the ale table
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA, linux-omap-u79uwXL29TY76Z2rM5mHXA,
	grygorii.strashko-l0cyMroinI0, mugunthanvnm-l0cyMroinI0,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, arnd-r2nGTMty4D4,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, devicetree-u79uwXL29TY76Z2rM5mHXA,
	mark.rutland-5wv7dgnIgG8, robh+dt-DgEjT+Ai2ygdnm+yROfE0A
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2-l0cyMroinI0@public.gmane.org>

ALE h/w on newer version of NetCP (K2E/L/G) does provide a ALE_STATUS
register for the size of the ALE Table implemented in h/w. Currently
for example we set ALE Table size to 1024 for NetCP ALE on
K2E even though the ALE Status/Documentation shows it has 8192 entries.
So take advantage of this register to read the size of ALE table supported
and use that value in the driver for the newer version of NetCP ALE.
For NetCP lite, ALE Table size is much less (64) and indicated by a size
of zero in ALE_STATUS. So use that as a default for now. While at it,
also fix the ale table size on 10G switch to 2048 per User guide
http://www.ti.com/lit/ug/spruhj5/spruhj5.pdf

Signed-off-by: Murali Karicheri <m-karicheri2-l0cyMroinI0@public.gmane.org>
Signed-off-by: Sekhar Nori <nsekhar-l0cyMroinI0@public.gmane.org>
---
 drivers/net/ethernet/ti/cpsw_ale.c    | 31 ++++++++++++++++++++++++++++++-
 drivers/net/ethernet/ti/netcp_ethss.c |  4 +---
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c
index e15db39..62a18d6 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -33,6 +33,7 @@
 
 /* ALE Registers */
 #define ALE_IDVER		0x00
+#define ALE_STATUS		0x04
 #define ALE_CONTROL		0x08
 #define ALE_PRESCALE		0x10
 #define ALE_UNKNOWNVLAN		0x18
@@ -58,6 +59,10 @@
 #define ALE_UCAST_OUI			2
 #define ALE_UCAST_TOUCHED		3
 
+#define ALE_TABLE_SIZE_MULTIPLIER	1024
+#define ALE_STATUS_SIZE_MASK		0x1f
+#define ALE_TABLE_SIZE_DEFAULT		64
+
 static inline int cpsw_ale_get_field(u32 *ale_entry, u32 start, u32 bits)
 {
 	int idx;
@@ -728,7 +733,7 @@ static void cpsw_ale_timer(unsigned long arg)
 
 void cpsw_ale_start(struct cpsw_ale *ale)
 {
-	u32 rev;
+	u32 rev, ale_entries;
 
 	rev = __raw_readl(ale->params.ale_regs + ALE_IDVER);
 	if (!ale->params.major_ver_mask)
@@ -740,6 +745,30 @@ void cpsw_ale_start(struct cpsw_ale *ale)
 		 ALE_VERSION_MAJOR(rev, ale->params.major_ver_mask),
 		 ALE_VERSION_MINOR(rev));
 
+	if (!ale->params.ale_entries) {
+		ale_entries =
+			__raw_readl(ale->params.ale_regs + ALE_STATUS) &
+				    ALE_STATUS_SIZE_MASK;
+		/* ALE available on newer NetCP switches has introduced
+		 * a register, ALE_STATUS, to indicate the size of ALE
+		 * table which shows the size as a multiple of 1024 entries.
+		 * For these, params.ale_entries will be set to zero. So
+		 * read the register and update the value of ale_entries.
+		 * ALE table on NetCP lite, is much smaller and is indicated
+		 * by a value of zero in ALE_STATUS. So use a default value
+		 * of ALE_TABLE_SIZE_DEFAULT for this. Caller is expected
+		 * to set the value of ale_entries for all other versions
+		 * of ALE.
+		 */
+		if (!ale_entries)
+			ale_entries = ALE_TABLE_SIZE_DEFAULT;
+		else
+			ale_entries *= ALE_TABLE_SIZE_MULTIPLIER;
+		ale->params.ale_entries = ale_entries;
+	}
+	dev_info(ale->params.dev,
+		 "ALE Table size %ld\n", ale->params.ale_entries);
+
 	if (ale->params.nu_switch_ale) {
 		/* Separate registers for unknown vlan configuration.
 		 * Also there are N bits, where N is number of ale
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index b37fb73..80d68cb 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -94,7 +94,6 @@
 #define GBENU_CPTS_OFFSET		0x1d000
 #define GBENU_ALE_OFFSET		0x1e000
 #define GBENU_HOST_PORT_NUM		0
-#define GBENU_NUM_ALE_ENTRIES		1024
 
 /* 10G Ethernet SS defines */
 #define XGBE_MODULE_NAME		"netcp-xgbe"
@@ -114,7 +113,7 @@
 #define XGBE10_ALE_OFFSET		0x700
 #define XGBE10_HW_STATS_OFFSET		0x800
 #define XGBE10_HOST_PORT_NUM		0
-#define XGBE10_NUM_ALE_ENTRIES		1024
+#define XGBE10_NUM_ALE_ENTRIES		2048
 
 #define	GBE_TIMER_INTERVAL			(HZ / 2)
 
@@ -3548,7 +3547,6 @@ static int set_gbenu_ethss_priv(struct gbe_priv *gbe_dev,
 	gbe_dev->ale_reg = gbe_dev->switch_regs + GBENU_ALE_OFFSET;
 	gbe_dev->ale_ports = gbe_dev->max_num_ports;
 	gbe_dev->host_port = GBENU_HOST_PORT_NUM;
-	gbe_dev->ale_entries = GBE13_NUM_ALE_ENTRIES;
 	gbe_dev->stats_en_mask = (1 << (gbe_dev->max_num_ports)) - 1;
 
 	/* Subsystem registers */
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH net-next 08/10] net: netcp: ale: update to support unknown vlan controls for NU switch
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev, linux-omap, grygorii.strashko, mugunthanvnm, linux-kernel,
	arnd, davem, devicetree, mark.rutland, robh+dt
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2@ti.com>

In NU Ethernet switch used on some of the Keystone SoCs, there is
separate UNKNOWNVLAN register for membership, unreg mcast flood, reg
mcast flood and force untag egress bits in ALE. So control for these
fields require different address offset, shift and size of field.
As this ALE has the same version number as ALE in CPSW found on other
SoCs, customazation based on version number is not possible. So
use a configuration parameter, nu_switch_ale, to identify the ALE
ALE found in NU Switch. Different treatment is needed for NU Switch
ALE due to difference in the ale table bits, separate unknown vlan
registers etc. The register information available in ale_controls,
needs to be updated to support the netcp NU switch h/w. So it is not
constant array any more since it needs to be updated based
on ALE type. The header of the file is also updated to indicate it
supports N port switch ALE, not just 3 port. The version mask is
3 bits in NU Switch ALE vs 8 bits on other ALE types.

While at it, change the debug print to info print so that ALE
version gets displayed in boot log.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 drivers/net/ethernet/ti/cpsw_ale.c    | 50 +++++++++++++++++++++++++++++++----
 drivers/net/ethernet/ti/cpsw_ale.h    | 13 ++++++++-
 drivers/net/ethernet/ti/netcp_ethss.c |  5 +++-
 3 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c
index 43b061b..e15db39 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -1,5 +1,5 @@
 /*
- * Texas Instruments 3-Port Ethernet Switch Address Lookup Engine
+ * Texas Instruments N-Port Ethernet Switch Address Lookup Engine
  *
  * Copyright (C) 2012 Texas Instruments
  *
@@ -27,8 +27,9 @@
 
 #define BITMASK(bits)		(BIT(bits) - 1)
 
-#define ALE_VERSION_MAJOR(rev)	((rev >> 8) & 0xff)
+#define ALE_VERSION_MAJOR(rev, mask) (((rev) >> 8) & (mask))
 #define ALE_VERSION_MINOR(rev)	(rev & 0xff)
+#define ALE_VERSION_1R4		0x0104
 
 /* ALE Registers */
 #define ALE_IDVER		0x00
@@ -39,6 +40,12 @@
 #define ALE_TABLE		0x34
 #define ALE_PORTCTL		0x40
 
+/* ALE NetCP NU switch specific Registers */
+#define ALE_UNKNOWNVLAN_MEMBER			0x90
+#define ALE_UNKNOWNVLAN_UNREG_MCAST_FLOOD	0x94
+#define ALE_UNKNOWNVLAN_REG_MCAST_FLOOD		0x98
+#define ALE_UNKNOWNVLAN_FORCE_UNTAG_EGRESS	0x9C
+
 #define ALE_TABLE_WRITE		BIT(31)
 
 #define ALE_TYPE_FREE			0
@@ -464,7 +471,7 @@ struct ale_control_info {
 	int		bits;
 };
 
-static const struct ale_control_info ale_controls[ALE_NUM_CONTROLS] = {
+static struct ale_control_info ale_controls[ALE_NUM_CONTROLS] = {
 	[ALE_ENABLE]		= {
 		.name		= "enable",
 		.offset		= ALE_CONTROL,
@@ -724,8 +731,41 @@ void cpsw_ale_start(struct cpsw_ale *ale)
 	u32 rev;
 
 	rev = __raw_readl(ale->params.ale_regs + ALE_IDVER);
-	dev_dbg(ale->params.dev, "initialized cpsw ale revision %d.%d\n",
-		ALE_VERSION_MAJOR(rev), ALE_VERSION_MINOR(rev));
+	if (!ale->params.major_ver_mask)
+		ale->params.major_ver_mask = 0xff;
+	ale->version =
+		(ALE_VERSION_MAJOR(rev, ale->params.major_ver_mask) << 8) |
+		 ALE_VERSION_MINOR(rev);
+	dev_info(ale->params.dev, "initialized cpsw ale version %d.%d\n",
+		 ALE_VERSION_MAJOR(rev, ale->params.major_ver_mask),
+		 ALE_VERSION_MINOR(rev));
+
+	if (ale->params.nu_switch_ale) {
+		/* Separate registers for unknown vlan configuration.
+		 * Also there are N bits, where N is number of ale
+		 * ports and shift value should be 0
+		 */
+		ale_controls[ALE_PORT_UNKNOWN_VLAN_MEMBER].bits =
+					ale->params.ale_ports;
+		ale_controls[ALE_PORT_UNKNOWN_VLAN_MEMBER].offset =
+					ALE_UNKNOWNVLAN_MEMBER;
+		ale_controls[ALE_PORT_UNKNOWN_MCAST_FLOOD].bits =
+					ale->params.ale_ports;
+		ale_controls[ALE_PORT_UNKNOWN_MCAST_FLOOD].shift = 0;
+		ale_controls[ALE_PORT_UNKNOWN_MCAST_FLOOD].offset =
+					ALE_UNKNOWNVLAN_UNREG_MCAST_FLOOD;
+		ale_controls[ALE_PORT_UNKNOWN_REG_MCAST_FLOOD].bits =
+					ale->params.ale_ports;
+		ale_controls[ALE_PORT_UNKNOWN_REG_MCAST_FLOOD].shift = 0;
+		ale_controls[ALE_PORT_UNKNOWN_REG_MCAST_FLOOD].offset =
+					ALE_UNKNOWNVLAN_REG_MCAST_FLOOD;
+		ale_controls[ALE_PORT_UNTAGGED_EGRESS].bits =
+					ale->params.ale_ports;
+		ale_controls[ALE_PORT_UNTAGGED_EGRESS].shift = 0;
+		ale_controls[ALE_PORT_UNTAGGED_EGRESS].offset =
+					ALE_UNKNOWNVLAN_FORCE_UNTAG_EGRESS;
+	}
+
 	cpsw_ale_control_set(ale, 0, ALE_ENABLE, 1);
 	cpsw_ale_control_set(ale, 0, ALE_CLEAR, 1);
 
diff --git a/drivers/net/ethernet/ti/cpsw_ale.h b/drivers/net/ethernet/ti/cpsw_ale.h
index a700189..b1c7954 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.h
+++ b/drivers/net/ethernet/ti/cpsw_ale.h
@@ -1,5 +1,5 @@
 /*
- * Texas Instruments 3-Port Ethernet Switch Address Lookup Engine APIs
+ * Texas Instruments N-Port Ethernet Switch Address Lookup Engine APIs
  *
  * Copyright (C) 2012 Texas Instruments
  *
@@ -21,6 +21,16 @@ struct cpsw_ale_params {
 	unsigned long		ale_ageout;	/* in secs */
 	unsigned long		ale_entries;
 	unsigned long		ale_ports;
+	/* NU Switch has specific handling as number of bits in ALE entries
+	 * are different than other versions of ALE. Also there are specific
+	 * registers for unknown vlan specific fields. So use nu_switch_ale
+	 * to identify this hardware.
+	 */
+	bool			nu_switch_ale;
+	/* mask bit used in NU Switch ALE is 3 bits instead of 8 bits. So
+	 * pass it from caller.
+	 */
+	u32			major_ver_mask;
 };
 
 struct cpsw_ale {
@@ -28,6 +38,7 @@ struct cpsw_ale {
 	struct timer_list	timer;
 	unsigned long		ageout;
 	int			allmulti;
+	u32			version;
 };
 
 enum cpsw_ale_control {
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 4b2a911..b37fb73 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -3716,7 +3716,10 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev,
 	ale_params.ale_ageout	= GBE_DEFAULT_ALE_AGEOUT;
 	ale_params.ale_entries	= gbe_dev->ale_entries;
 	ale_params.ale_ports	= gbe_dev->ale_ports;
-
+	if (IS_SS_ID_MU(gbe_dev)) {
+		ale_params.major_ver_mask = 0x7;
+		ale_params.nu_switch_ale = true;
+	}
 	gbe_dev->ale = cpsw_ale_create(&ale_params);
 	if (!gbe_dev->ale) {
 		dev_err(gbe_dev->dev, "error initializing ale engine\n");
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 07/10] net: netcp: use hw capability to remove FCS word from rx packets
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev, linux-omap, grygorii.strashko, mugunthanvnm, linux-kernel,
	arnd, davem, devicetree, mark.rutland, robh+dt
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2@ti.com>

Some of the newer Ethernet switch hw (such as that on k2e/l/g) can
strip the Etherenet FCS from packet at the port 0 egress of the switch.
So use this capability instead of doing it in software.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 drivers/net/ethernet/ti/netcp.h       |  2 ++
 drivers/net/ethernet/ti/netcp_core.c  |  8 ++++++--
 drivers/net/ethernet/ti/netcp_ethss.c | 10 ++++++++--
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ti/netcp.h b/drivers/net/ethernet/ti/netcp.h
index d243c5d..8900a6f 100644
--- a/drivers/net/ethernet/ti/netcp.h
+++ b/drivers/net/ethernet/ti/netcp.h
@@ -102,6 +102,8 @@ struct netcp_intf {
 	void			*rx_fdq[KNAV_DMA_FDQ_PER_CHAN];
 	struct napi_struct	rx_napi;
 	struct napi_struct	tx_napi;
+#define ETH_SW_CAN_REMOVE_ETH_FCS	BIT(0)
+	u32			hw_cap;
 
 	/* 64-bit netcp stats */
 	struct netcp_stats	stats;
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index b077ed4..68a75cc 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -739,8 +739,12 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp)
 		dev_dbg(netcp->ndev_dev, "mismatch in packet size(%d) & sum of fragments(%d)\n",
 			pkt_sz, accum_sz);
 
-	/* Remove ethernet FCS from the packet */
-	__pskb_trim(skb, skb->len - ETH_FCS_LEN);
+	/* Newer version of the Ethernet switch can trim the Ethernet FCS
+	 * from the packet and is indicated in hw_cap. So trim it only for
+	 * older h/w
+	 */
+	if (!(netcp->hw_cap & ETH_SW_CAN_REMOVE_ETH_FCS))
+		__pskb_trim(skb, skb->len - ETH_FCS_LEN);
 
 	/* Call each of the RX hooks */
 	p_info.skb = skb;
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 9266961..4b2a911 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -133,6 +133,7 @@
 #define MACSL_FULLDUPLEX			BIT(0)
 
 #define GBE_CTL_P0_ENABLE			BIT(2)
+#define ETH_SW_CTL_P0_TX_CRC_REMOVE		BIT(13)
 #define GBE13_REG_VAL_STAT_ENABLE_ALL		0xff
 #define XGBE_REG_VAL_STAT_ENABLE_ALL		0xf
 #define GBE_STATS_CD_SEL			BIT(28)
@@ -2847,7 +2848,7 @@ static int gbe_open(void *intf_priv, struct net_device *ndev)
 	struct netcp_intf *netcp = netdev_priv(ndev);
 	struct gbe_slave *slave = gbe_intf->slave;
 	int port_num = slave->port_num;
-	u32 reg;
+	u32 reg, val;
 	int ret;
 
 	reg = readl(GBE_REG_ADDR(gbe_dev, switch_regs, id_ver));
@@ -2877,7 +2878,12 @@ static int gbe_open(void *intf_priv, struct net_device *ndev)
 	writel(0, GBE_REG_ADDR(gbe_dev, switch_regs, ptype));
 
 	/* Control register */
-	writel(GBE_CTL_P0_ENABLE, GBE_REG_ADDR(gbe_dev, switch_regs, control));
+	val = GBE_CTL_P0_ENABLE;
+	if (IS_SS_ID_MU(gbe_dev)) {
+		val |= ETH_SW_CTL_P0_TX_CRC_REMOVE;
+		netcp->hw_cap = ETH_SW_CAN_REMOVE_ETH_FCS;
+	}
+	writel(val, GBE_REG_ADDR(gbe_dev, switch_regs, control));
 
 	/* All statistics enabled and STAT AB visible by default */
 	writel(gbe_dev->stats_en_mask, GBE_REG_ADDR(gbe_dev, switch_regs,
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 04/10] net: netcp: remove the redundant memmov()
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev, linux-omap, grygorii.strashko, mugunthanvnm, linux-kernel,
	arnd, davem, devicetree, mark.rutland, robh+dt
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2@ti.com>

The psdata is populated with command data by netcp modules
to the tail of the buffer and set_words() copy the same
to the front of the psdata. So remove the redundant memmov
function call.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
---
 drivers/net/ethernet/ti/netcp_core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index a136c56..286fd8d 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1226,9 +1226,9 @@ static int netcp_tx_submit_skb(struct netcp_intf *netcp,
 		/* psdata points to both native-endian and device-endian data */
 		__le32 *psdata = (void __force *)p_info.psdata;
 
-		memmove(p_info.psdata, p_info.psdata + p_info.psdata_len,
-			p_info.psdata_len);
-		set_words(p_info.psdata, p_info.psdata_len, psdata);
+		set_words((u32 *)psdata +
+			  (KNAV_DMA_NUM_PS_WORDS - p_info.psdata_len),
+			  p_info.psdata_len, psdata);
 		tmp |= (p_info.psdata_len & KNAV_DMA_DESC_PSLEN_MASK) <<
 			KNAV_DMA_DESC_PSLEN_SHIFT;
 	}
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 02/10] net: netcp: ethss: add support of 10gbe pcsr link status
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev, linux-omap, grygorii.strashko, mugunthanvnm, linux-kernel,
	arnd, davem, devicetree, mark.rutland, robh+dt
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2@ti.com>

From: WingMan Kwok <w-kwok2@ti.com>

The 10GBASE-R Physical Coding Sublayer (PCS-R) module provides
functionality of a physical coding sublayer (PCS) on data being
transferred between a demuxed XGMII and SerDes supporting a 16
or 32 bit interface.  From the driver point of view, whether
a ethernet link is up or not depends also on the status of the
block-lock bit of the PCSR.  This patch adds the checking of that
bit in order to determine the link status.

Signed-off-by: WingMan Kwok <w-kwok2@ti.com>
Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 .../devicetree/bindings/net/keystone-netcp.txt     |  3 ++
 drivers/net/ethernet/ti/netcp_ethss.c              | 37 ++++++++++++++++++++--
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/keystone-netcp.txt b/Documentation/devicetree/bindings/net/keystone-netcp.txt
index 0854a73..57fc13f 100644
--- a/Documentation/devicetree/bindings/net/keystone-netcp.txt
+++ b/Documentation/devicetree/bindings/net/keystone-netcp.txt
@@ -75,6 +75,9 @@ Required properties:
 - syscon-subsys:	phandle to syscon node of the switch
 			subsystem registers.
 
+- syscon-pcsr:		(10gbe only) phandle to syscon node of the
+			switch PCSR registers.
+
 - reg:		register location and the size for the following register
 		regions in the specified order.
 		- switch subsystem registers
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 473edda1..cb48f88 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -63,6 +63,12 @@
 #define GBE13_ALE_OFFSET		0x600
 #define GBE13_HOST_PORT_NUM		0
 #define GBE13_NUM_ALE_ENTRIES		1024
+/* offset relative to PCSR regmap */
+#define XGBE10_PCSR_OFFSET(x)		((x) * 0x80)
+#define XGBE10_PCSR_RX_STATUS(x)	(XGBE10_PCSR_OFFSET(x) + 0x0C)
+
+#define XGBE10_PCSR_BLOCK_LOCK_MASK	BIT(30)
+#define XGBE10_PCSR_BLOCK_LOCK_SHIFT	30
 
 /* 1G Ethernet NU SS defines */
 #define GBENU_MODULE_NAME		"netcp-gbenu"
@@ -2111,6 +2117,10 @@ static void netcp_ethss_link_state_action(struct gbe_priv *gbe_dev,
 
 	if (phy)
 		phy_print_status(phy);
+	else if (slave->link_interface == XGMII_LINK_MAC_MAC_FORCED) {
+		netdev_printk(KERN_INFO, ndev,
+			      "Link is %s\n", (up ? "Up" : "Down"));
+	}
 }
 
 static bool gbe_phy_link_status(struct gbe_slave *slave)
@@ -2123,18 +2133,29 @@ static void netcp_ethss_update_link_state(struct gbe_priv *gbe_dev,
 					  struct net_device *ndev)
 {
 	int sp = slave->slave_num;
-	int phy_link_state, sgmii_link_state = 1, link_state;
+	int phy_link_state, sw_link_state = 1, link_state, ret;
+	u32 pcsr_rx_stat;
 
 	if (!slave->open)
 		return;
 
 	if (!SLAVE_LINK_IS_XGMII(slave)) {
-		sgmii_link_state =
+		sw_link_state =
 			netcp_sgmii_get_port_link(SGMII_BASE(gbe_dev, sp), sp);
+	} else if (slave->link_interface == XGMII_LINK_MAC_MAC_FORCED) {
+		/* read status from pcsr status reg */
+		ret = regmap_read(gbe_dev->pcsr_regmap,
+				  XGBE10_PCSR_RX_STATUS(sp), &pcsr_rx_stat);
+
+		if (ret)
+			return;
+
+		sw_link_state = (pcsr_rx_stat & XGBE10_PCSR_BLOCK_LOCK_MASK) >>
+				 XGBE10_PCSR_BLOCK_LOCK_SHIFT;
 	}
 
 	phy_link_state = gbe_phy_link_status(slave);
-	link_state = phy_link_state & sgmii_link_state;
+	link_state = phy_link_state & sw_link_state;
 
 	if (atomic_xchg(&slave->link_state, link_state) != link_state)
 		netcp_ethss_link_state_action(gbe_dev, ndev, slave,
@@ -3154,6 +3175,16 @@ static int set_xgbe_ethss10_priv(struct gbe_priv *gbe_dev,
 		return PTR_ERR(gbe_dev->ss_regmap);
 	}
 
+	gbe_dev->pcsr_regmap = syscon_regmap_lookup_by_phandle(node,
+							       "syscon-pcsr");
+
+	if (IS_ERR(gbe_dev->pcsr_regmap)) {
+		dev_err(gbe_dev->dev,
+			"pcsr regmap lookup failed: %ld\n",
+			PTR_ERR(gbe_dev->pcsr_regmap));
+		return PTR_ERR(gbe_dev->pcsr_regmap);
+	}
+
 	ret = of_address_to_resource(node, XGBE_SM_REG_INDEX, &res);
 	if (ret) {
 		dev_err(gbe_dev->dev,
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 01/10] net: netcp: ethss: add support of subsystem register region regmap
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev, linux-omap, grygorii.strashko, mugunthanvnm, linux-kernel,
	arnd, davem, devicetree, mark.rutland, robh+dt
In-Reply-To: <1482271793-7671-1-git-send-email-m-karicheri2@ti.com>

From: WingMan Kwok <w-kwok2@ti.com>

10gbe phy driver needs to access the 10gbe subsystem control
register during phy initialization. To facilitate the shared
access of the subsystem register region between the 10gbe Ethernet
driver and the phy driver, this patch adds support of the
subsystem register region defined by a syscon node in the dts.

Although there is no shared access to the gbe subsystem register
region, using syscon for that is for the sake of consistency.

This change is backward compatible with previously released gbe
devicetree bindings.

Signed-off-by: WingMan Kwok <w-kwok2@ti.com>
Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
---
 .../devicetree/bindings/net/keystone-netcp.txt     |  16 ++-
 drivers/net/ethernet/ti/netcp_ethss.c              | 140 +++++++++++++++++----
 2 files changed, 127 insertions(+), 29 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/keystone-netcp.txt b/Documentation/devicetree/bindings/net/keystone-netcp.txt
index 04ba1dc..0854a73 100644
--- a/Documentation/devicetree/bindings/net/keystone-netcp.txt
+++ b/Documentation/devicetree/bindings/net/keystone-netcp.txt
@@ -72,20 +72,24 @@ Required properties:
 		"ti,netcp-gbe-2" for 1GbE N NetCP 1.5 (N=2)
 		"ti,netcp-xgbe" for 10 GbE
 
+- syscon-subsys:	phandle to syscon node of the switch
+			subsystem registers.
+
 - reg:		register location and the size for the following register
 		regions in the specified order.
 		- switch subsystem registers
+		- sgmii module registers
 		- sgmii port3/4 module registers (only for NetCP 1.4)
 		- switch module registers
 		- serdes registers (only for 10G)
 
 		NetCP 1.4 ethss, here is the order
-			index #0 - switch subsystem registers
+			index #0 - sgmii module registers
 			index #1 - sgmii port3/4 module registers
 			index #2 - switch module registers
 
 		NetCP 1.5 ethss 9 port, 5 port and 2 port
-			index #0 - switch subsystem registers
+			index #0 - sgmii module registers
 			index #1 - switch module registers
 			index #2 - serdes registers
 
@@ -145,6 +149,11 @@ Optional properties:
 
 Example binding:
 
+gbe_subsys: subsys@2090000 {
+	compatible = "syscon";
+	reg = <0x02090000 0x100>;
+};
+
 netcp: netcp@2000000 {
 	reg = <0x2620110 0x8>;
 	reg-names = "efuse";
@@ -163,7 +172,8 @@ netcp: netcp@2000000 {
 		ranges;
 		gbe@90000 {
 			label = "netcp-gbe";
-			reg = <0x90000 0x300>, <0x90400 0x400>, <0x90800 0x700>;
+			syscon-subsys = <&gbe_subsys>;
+			reg = <0x90100 0x200>, <0x90400 0x200>, <0x90800 0x700>;
 			/* enable-ale; */
 			tx-queue = <648>;
 			tx-channel = <8>;
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index c7e547e..473edda1 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -19,9 +19,11 @@
  */
 
 #include <linux/io.h>
+#include <linux/mfd/syscon.h>
 #include <linux/module.h>
 #include <linux/of_mdio.h>
 #include <linux/of_address.h>
+#include <linux/regmap.h>
 #include <linux/if_vlan.h>
 #include <linux/ptp_classify.h>
 #include <linux/net_tstamp.h>
@@ -43,7 +45,10 @@
 #define GBE_MODULE_NAME			"netcp-gbe"
 #define GBE_SS_VERSION_14		0x4ed21104
 
+/* for devicetree backward compatible only */
 #define GBE_SS_REG_INDEX		0
+
+#define GBE_SGMII_REG_INDEX		0
 #define GBE_SGMII34_REG_INDEX		1
 #define GBE_SM_REG_INDEX		2
 /* offset relative to base of GBE_SS_REG_INDEX */
@@ -71,9 +76,11 @@
 #define IS_SS_ID_NU(d) \
 	(GBE_IDENT((d)->ss_version) == GBE_SS_ID_NU)
 
-#define GBENU_SS_REG_INDEX		0
+#define GBENU_SGMII_REG_INDEX		0
 #define GBENU_SM_REG_INDEX		1
+/* offset relative to base of GBE_SS_REG_INDEX */
 #define GBENU_SGMII_MODULE_OFFSET	0x100
+/* offset relative to base of GBENU_SM_REG_INDEX */
 #define GBENU_HOST_PORT_OFFSET		0x1000
 #define GBENU_SLAVE_PORT_OFFSET		0x2000
 #define GBENU_EMAC_OFFSET		0x2330
@@ -82,13 +89,12 @@
 #define GBENU_ALE_OFFSET		0x1e000
 #define GBENU_HOST_PORT_NUM		0
 #define GBENU_NUM_ALE_ENTRIES		1024
-#define GBENU_SGMII_MODULE_SIZE		0x100
 
 /* 10G Ethernet SS defines */
 #define XGBE_MODULE_NAME		"netcp-xgbe"
 #define XGBE_SS_VERSION_10		0x4ee42100
 
-#define XGBE_SS_REG_INDEX		0
+#define XGBE_SGMII_REG_INDEX		0
 #define XGBE_SM_REG_INDEX		1
 #define XGBE_SERDES_REG_INDEX		2
 
@@ -173,6 +179,7 @@
 #define XGBE_SET_REG_OFS(p, rb, rn) p->rb##_ofs.rn = \
 		offsetof(struct xgbe##_##rb, rn)
 #define GBE_REG_ADDR(p, rb, rn) (p->rb + p->rb##_ofs.rn)
+#define GBE_REG_OFS(p, rb, rn) ((p)->rb##_ofs.rn)
 
 #define HOST_TX_PRI_MAP_DEFAULT			0x00000000
 
@@ -225,6 +232,7 @@
 /* The PTP event messages - Sync, Delay_Req, Pdelay_Req, and Pdelay_Resp. */
 #define EVENT_MSG_BITS (BIT(0) | BIT(1) | BIT(2) | BIT(3))
 #endif /* CONFIG_TI_CPTS */
+#define SGMII_MODULE_SIZE			0x100
 
 struct xgbe_ss_regs {
 	u32	id_ver;
@@ -716,7 +724,9 @@ struct gbe_priv {
 	u32				ss_version;
 	u32				stats_en_mask;
 
-	void __iomem			*ss_regs;
+	struct regmap			*ss_regmap;
+	struct regmap			*pcsr_regmap;
+	void __iomem                    *ss_regs;
 	void __iomem			*switch_regs;
 	void __iomem			*host_port_regs;
 	void __iomem			*ale_reg;
@@ -2192,7 +2202,7 @@ static void gbe_port_config(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 			    int max_rx_len)
 {
 	void __iomem *rx_maxlen_reg;
-	u32 xgmii_mode;
+	int ret;
 
 	if (max_rx_len > NETCP_MAX_FRAME_SIZE)
 		max_rx_len = NETCP_MAX_FRAME_SIZE;
@@ -2200,9 +2210,16 @@ static void gbe_port_config(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 	/* Enable correct MII mode at SS level */
 	if ((gbe_dev->ss_version == XGBE_SS_VERSION_10) &&
 	    (slave->link_interface >= XGMII_LINK_MAC_PHY)) {
-		xgmii_mode = readl(GBE_REG_ADDR(gbe_dev, ss_regs, control));
-		xgmii_mode |= (1 << slave->slave_num);
-		writel(xgmii_mode, GBE_REG_ADDR(gbe_dev, ss_regs, control));
+		ret = regmap_update_bits(gbe_dev->ss_regmap,
+					 GBE_REG_OFS(gbe_dev, ss_regs, control),
+					 1 << slave->slave_num,
+					 1 << slave->slave_num);
+
+		if (ret) {
+			dev_err(gbe_dev->dev,
+				"regmap update xgmii mode bit Failed\n");
+			return;
+		}
 	}
 
 	if (IS_SS_ID_MU(gbe_dev))
@@ -3127,35 +3144,46 @@ static int set_xgbe_ethss10_priv(struct gbe_priv *gbe_dev,
 	void __iomem *regs;
 	int ret, i;
 
-	ret = of_address_to_resource(node, XGBE_SS_REG_INDEX, &res);
+	gbe_dev->ss_regmap = syscon_regmap_lookup_by_phandle(node,
+							     "syscon-subsys");
+
+	if (IS_ERR(gbe_dev->ss_regmap)) {
+		dev_err(gbe_dev->dev,
+			"subsys regmap lookup failed: %ld\n",
+			PTR_ERR(gbe_dev->ss_regmap));
+		return PTR_ERR(gbe_dev->ss_regmap);
+	}
+
+	ret = of_address_to_resource(node, XGBE_SM_REG_INDEX, &res);
 	if (ret) {
 		dev_err(gbe_dev->dev,
-			"Can't xlate xgbe of node(%s) ss address at %d\n",
-			node->name, XGBE_SS_REG_INDEX);
+			"Can't xlate xgbe of node(%s) sm address at %d\n",
+			node->name, XGBE_SM_REG_INDEX);
 		return ret;
 	}
 
 	regs = devm_ioremap_resource(gbe_dev->dev, &res);
 	if (IS_ERR(regs)) {
-		dev_err(gbe_dev->dev, "Failed to map xgbe ss register base\n");
+		dev_err(gbe_dev->dev, "Failed to map xgbe sm register base\n");
 		return PTR_ERR(regs);
 	}
-	gbe_dev->ss_regs = regs;
+	gbe_dev->switch_regs = regs;
 
-	ret = of_address_to_resource(node, XGBE_SM_REG_INDEX, &res);
+	ret = of_address_to_resource(node, XGBE_SGMII_REG_INDEX, &res);
 	if (ret) {
 		dev_err(gbe_dev->dev,
-			"Can't xlate xgbe of node(%s) sm address at %d\n",
-			node->name, XGBE_SM_REG_INDEX);
+			"Can't xlate xgbe of node(%s) sgmii address at %d\n",
+			node->name, XGBE_SGMII_REG_INDEX);
 		return ret;
 	}
 
 	regs = devm_ioremap_resource(gbe_dev->dev, &res);
 	if (IS_ERR(regs)) {
-		dev_err(gbe_dev->dev, "Failed to map xgbe sm register base\n");
+		dev_err(gbe_dev->dev,
+			"Failed to map xgbe sgmii register base\n");
 		return PTR_ERR(regs);
 	}
-	gbe_dev->switch_regs = regs;
+	gbe_dev->sgmii_port_regs = regs;
 
 	ret = of_address_to_resource(node, XGBE_SERDES_REG_INDEX, &res);
 	if (ret) {
@@ -3171,6 +3199,8 @@ static int set_xgbe_ethss10_priv(struct gbe_priv *gbe_dev,
 		return PTR_ERR(regs);
 	}
 	gbe_dev->xgbe_serdes_regs = regs;
+	gbe_dev->sgmii_port34_regs = gbe_dev->sgmii_port_regs +
+				     (2 * SGMII_MODULE_SIZE);
 
 	gbe_dev->num_stats_mods = gbe_dev->max_num_ports;
 	gbe_dev->et_stats = xgbe10_et_stats;
@@ -3195,9 +3225,9 @@ static int set_xgbe_ethss10_priv(struct gbe_priv *gbe_dev,
 	}
 
 	gbe_dev->ss_version = XGBE_SS_VERSION_10;
-	gbe_dev->sgmii_port_regs = gbe_dev->ss_regs +
-					XGBE10_SGMII_MODULE_OFFSET;
-	gbe_dev->host_port_regs = gbe_dev->ss_regs + XGBE10_HOST_PORT_OFFSET;
+
+	gbe_dev->host_port_regs = gbe_dev->switch_regs +
+					XGBE10_HOST_PORT_OFFSET;
 
 	for (i = 0; i < gbe_dev->max_num_ports; i++)
 		gbe_dev->hw_stats_regs[i] = gbe_dev->switch_regs +
@@ -3228,8 +3258,8 @@ static int set_xgbe_ethss10_priv(struct gbe_priv *gbe_dev,
 	return 0;
 }
 
-static int get_gbe_resource_version(struct gbe_priv *gbe_dev,
-				    struct device_node *node)
+static int get_gbe_resource_version_ss_regs(struct gbe_priv *gbe_dev,
+					    struct device_node *node)
 {
 	struct resource res;
 	void __iomem *regs;
@@ -3248,8 +3278,27 @@ static int get_gbe_resource_version(struct gbe_priv *gbe_dev,
 		dev_err(gbe_dev->dev, "Failed to map gbe register base\n");
 		return PTR_ERR(regs);
 	}
+
 	gbe_dev->ss_regs = regs;
 	gbe_dev->ss_version = readl(gbe_dev->ss_regs);
+	gbe_dev->ss_regmap = NULL;
+	return 0;
+}
+
+static int get_gbe_resource_version(struct gbe_priv *gbe_dev,
+				    struct device_node *node)
+{
+	gbe_dev->ss_regmap = syscon_regmap_lookup_by_phandle(node,
+							     "syscon-subsys");
+	if (IS_ERR(gbe_dev->ss_regmap)) {
+		dev_dbg(gbe_dev->dev,
+			"subsys regmap lookup failed: %ld. try reg property\n",
+			PTR_ERR(gbe_dev->ss_regmap));
+		return get_gbe_resource_version_ss_regs(gbe_dev, node);
+	}
+
+	regmap_read(gbe_dev->ss_regmap, 0, &gbe_dev->ss_version);
+	gbe_dev->ss_regs = NULL;
 	return 0;
 }
 
@@ -3260,6 +3309,27 @@ static int set_gbe_ethss14_priv(struct gbe_priv *gbe_dev,
 	void __iomem *regs;
 	int i, ret;
 
+	if (gbe_dev->ss_regs) {
+		gbe_dev->sgmii_port_regs = gbe_dev->ss_regs +
+					   GBE13_SGMII_MODULE_OFFSET;
+	} else {
+		ret = of_address_to_resource(node, GBE_SGMII_REG_INDEX, &res);
+		if (ret) {
+			dev_err(gbe_dev->dev,
+				"Can't translate of gbe node(%s) address at index %d\n",
+				node->name, GBE_SGMII_REG_INDEX);
+			return ret;
+		}
+
+		regs = devm_ioremap_resource(gbe_dev->dev, &res);
+		if (IS_ERR(regs)) {
+			dev_err(gbe_dev->dev,
+				"Failed to map gbe sgmii port register base\n");
+			return PTR_ERR(regs);
+		}
+		gbe_dev->sgmii_port_regs = regs;
+	}
+
 	ret = of_address_to_resource(node, GBE_SGMII34_REG_INDEX, &res);
 	if (ret) {
 		dev_err(gbe_dev->dev,
@@ -3314,7 +3384,6 @@ static int set_gbe_ethss14_priv(struct gbe_priv *gbe_dev,
 		return -ENOMEM;
 	}
 
-	gbe_dev->sgmii_port_regs = gbe_dev->ss_regs + GBE13_SGMII_MODULE_OFFSET;
 	gbe_dev->host_port_regs = gbe_dev->switch_regs + GBE13_HOST_PORT_OFFSET;
 
 	/* K2HK has only 2 hw stats modules visible at a time, so
@@ -3402,14 +3471,33 @@ static int set_gbenu_ethss_priv(struct gbe_priv *gbe_dev,
 	}
 	gbe_dev->switch_regs = regs;
 
-	gbe_dev->sgmii_port_regs = gbe_dev->ss_regs + GBENU_SGMII_MODULE_OFFSET;
+	if (gbe_dev->ss_regs) {
+		gbe_dev->sgmii_port_regs = gbe_dev->ss_regs +
+					   GBENU_SGMII_MODULE_OFFSET;
+	} else {
+		ret = of_address_to_resource(node, GBENU_SGMII_REG_INDEX, &res);
+		if (ret) {
+			dev_err(gbe_dev->dev,
+				"Can't translate of gbenu node(%s) addr at index %d\n",
+				node->name, GBENU_SGMII_REG_INDEX);
+			return ret;
+		}
+
+		regs = devm_ioremap_resource(gbe_dev->dev, &res);
+		if (IS_ERR(regs)) {
+			dev_err(gbe_dev->dev,
+				"Failed to map gbenu sgmii port register base\n");
+			return PTR_ERR(regs);
+		}
+		gbe_dev->sgmii_port_regs = regs;
+	}
 
 	/* Although sgmii modules are mem mapped to one contiguous
 	 * region on GBENU devices, setting sgmii_port34_regs allows
 	 * consistent code when accessing sgmii api
 	 */
 	gbe_dev->sgmii_port34_regs = gbe_dev->sgmii_port_regs +
-				     (2 * GBENU_SGMII_MODULE_SIZE);
+				     (2 * SGMII_MODULE_SIZE);
 
 	gbe_dev->host_port_regs = gbe_dev->switch_regs + GBENU_HOST_PORT_OFFSET;
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 00/10] netcp: enhancements and minor fixes
From: Murali Karicheri @ 2016-12-20 22:09 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA, linux-omap-u79uwXL29TY76Z2rM5mHXA,
	grygorii.strashko-l0cyMroinI0, mugunthanvnm-l0cyMroinI0,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, arnd-r2nGTMty4D4,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, devicetree-u79uwXL29TY76Z2rM5mHXA,
	mark.rutland-5wv7dgnIgG8, robh+dt-DgEjT+Ai2ygdnm+yROfE0A

This series is for net-next. This propagates enhancements and minor
bug fixes from internal version of the driver to keep the upstream
in sync. Please review and apply if this looks good.

Tested on all of K2HK/E/L boards.

Thanks
Murali Karicheri

Michael Scherban (1):
  net: netcp: store network statistics in 64 bits

Murali Karicheri (7):
  net: netcp: extract eflag from desc for rx_hook handling
  net: netcp: remove the redundant memmov()
  net: netcp: ethss: get phy-handle only if link interface is MAC-to-PHY
  net: netcp: use hw capability to remove FCS word from rx packets
  net: netcp: ale: update to support unknown vlan controls for NU switch
  net: netcp: ale: use ale_status to size the ale table
  net: netcp: ale: add proper ale entry mask bits for netcp switch ALE

WingMan Kwok (2):
  net: netcp: ethss: add support of subsystem register region regmap
  net: netcp: ethss: add support of 10gbe pcsr link status

 .../devicetree/bindings/net/keystone-netcp.txt     |  19 +-
 drivers/net/ethernet/ti/cpsw_ale.c                 | 180 ++++++++++++++++---
 drivers/net/ethernet/ti/cpsw_ale.h                 |  17 +-
 drivers/net/ethernet/ti/netcp.h                    |  21 +++
 drivers/net/ethernet/ti/netcp_core.c               | 102 ++++++++---
 drivers/net/ethernet/ti/netcp_ethss.c              | 200 +++++++++++++++++----
 include/linux/soc/ti/knav_dma.h                    |   2 +
 7 files changed, 456 insertions(+), 85 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: HalfSipHash Acceptable Usage
From: Theodore Ts'o @ 2016-12-20 21:36 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Jean-Philippe Aumasson, Hannes Frederic Sowa, LKML, Eric Biggers,
	Daniel J . Bernstein, David Laight, David Miller, Andi Kleen,
	George Spelvin, kernel-hardening, Andy Lutomirski,
	Linux Crypto Mailing List, Tom Herbert, Vegard Nossum, Netdev,
	Linus Torvalds
In-Reply-To: <CAHmME9rPmH=wP_eHYopt8ZPG9TSN7bos3fGOuqKL2HjQW-2SWA@mail.gmail.com>

On Mon, Dec 19, 2016 at 06:32:44PM +0100, Jason A. Donenfeld wrote:
> 1) Anything that requires actual long-term security will use
> SipHash2-4, with the 64-bit output and the 128-bit key. This includes
> things like TCP sequence numbers. This seems pretty uncontroversial to
> me. Seem okay to you?

Um, why do TCP sequence numbers need long-term security?  So long as
you rekey every 5 minutes or so, TCP sequence numbers don't need any
more security than that, since even if you break the key used to
generate initial sequence numbers seven a minute or two later, any
pending TCP connections will have timed out long before.

See the security analysis done in RFC 6528[1], where among other
things, it points out why MD5 is acceptable with periodic rekeying,
although there is the concern that this could break certain hueristics
used when establishing new connections during the TIME-WAIT state.

[1] https://tools.ietf.org/html/rfc6528

						- Ted

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2016-12-20 21:02 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


1) Use rb_entry() instead of hardcoded container_of(), from Geliang Tang.

2) Use correct memory barriers in stammac driver, from Pavel Machek.

3) Fix assoc bind address handling in SCTP, from Xin Long.

4) Make the length check for UFO handling consistent between
   __ip_append_data() and ip_finish_output(), from Zheng Li.

5) HSI driver compatible strings were busted fro hix5hd2, from Dongpo
   Li.

6) Handle devm_ioremap() errors properly in cavium driver, from Arvind
   Yadav.

Please pull, thanks a lot!

The following changes since commit 52f40e9d657cc126b766304a5dd58ad73b02ff46:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2016-12-17 20:17:04 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 

for you to fetch changes up to a763f78cea845c91b8d91f93dabf70c407635dc5:

  RDS: use rb_entry() (2016-12-20 14:22:49 -0500)

----------------------------------------------------------------
Arvind Yadav (1):
      net: ethernet: cavium: octeon: octeon_mgmt: Handle return NULL error from devm_ioremap

David S. Miller (4):
      Merge branch 'phy-broken-modes'
      Merge branch 'fsl-fixes'
      Merge branch 'hix5hd2_gmac-compatible-string'
      Merge branch 'sctp-fixes'

Dongpo Li (2):
      net: hix5hd2_gmac: fix compatible strings name
      ARM: dts: hix5hd2: don't change the existing compatible string

Geliang Tang (4):
      net/mlx5: use rb_entry()
      net_sched: sch_fq: use rb_entry()
      net_sched: sch_netem: use rb_entry()
      RDS: use rb_entry()

Jarno Rajahalme (1):
      openvswitch: Add a missing break statement.

Madalin Bucur (4):
      fsl/fman: fix 1G support for QSGMII interfaces
      powerpc: fsl/fman: remove fsl,fman from of_device_ids[]
      fsl/fman: A007273 only applies to PPC SoCs
      fsl/fman: enable compilation on ARM64

Pavel Machek (1):
      stmmac: fix memory barriers

Tobias Klauser (1):
      ethernet: sfc: Add Kconfig entry for vendor Solarflare

WingMan Kwok (2):
      net: netcp: ethss: fix errors in ethtool ops
      net: netcp: ethss: fix 10gbe host port tx pri map configuration

Xin Long (2):
      sctp: reduce indent level in sctp_copy_local_addr_list
      sctp: not copying duplicate addrs to the assoc's bind address list

jbrunet (3):
      net: phy: fix sign type error in genphy_config_eee_advert
      net: phy: use boolean dt properties for eee broken modes
      dt: bindings: net: use boolean dt properties for eee broken modes

zheng li (1):
      ipv4: Should use consistent conditional judgement for ip fragment in __ip_append_data and ip_finish_output

 Documentation/devicetree/bindings/net/hisilicon-hix5hd2-gmac.txt | 13 ++++++++-----
 Documentation/devicetree/bindings/net/phy.txt                    | 10 ++++++++--
 arch/arm/boot/dts/hisi-x5hd2.dtsi                                |  4 ++--
 arch/powerpc/platforms/85xx/corenet_generic.c                    |  3 ---
 drivers/net/ethernet/Kconfig                                     |  1 -
 drivers/net/ethernet/cavium/octeon/octeon_mgmt.c                 |  6 ++++++
 drivers/net/ethernet/freescale/fman/Kconfig                      |  2 +-
 drivers/net/ethernet/freescale/fman/fman.c                       | 15 +++++++++++++++
 drivers/net/ethernet/freescale/fman/mac.c                        |  1 +
 drivers/net/ethernet/hisilicon/hix5hd2_gmac.c                    | 13 +++++++------
 drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c            |  2 +-
 drivers/net/ethernet/sfc/Kconfig                                 | 21 +++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c               |  4 ++--
 drivers/net/ethernet/stmicro/stmmac/enh_desc.c                   |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c                |  8 ++++----
 drivers/net/ethernet/ti/netcp_ethss.c                            | 24 ++++++++++++++++++------
 drivers/net/phy/phy_device.c                                     | 22 +++++++++++++++++-----
 include/dt-bindings/net/mdio.h                                   | 19 -------------------
 net/ipv4/ip_output.c                                             |  2 +-
 net/openvswitch/flow_netlink.c                                   |  1 +
 net/rds/rdma.c                                                   |  2 +-
 net/sched/sch_fq.c                                               | 14 +++++++-------
 net/sched/sch_netem.c                                            |  2 +-
 net/sctp/bind_addr.c                                             |  3 +++
 net/sctp/protocol.c                                              | 40 ++++++++++++++++++++++------------------
 25 files changed, 148 insertions(+), 86 deletions(-)
 delete mode 100644 include/dt-bindings/net/mdio.h

^ permalink raw reply

* [ANNOUNCE] nftables 0.7 release
From: Pablo Neira Ayuso @ 2016-12-20 20:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, netfilter, netfilter-announce, lwn

[-- Attachment #1: Type: text/plain, Size: 10356 bytes --]

Hi!

The Netfilter project proudly presents:

        nftables 0.7

This release contains many accumulated bug fixes and new features
available up to the (upcoming) Linux 4.10-rc1 kernel release.

* Facilitate migration from iptables to nftables:

  At compilation time, you have to pass this option.

  # ./configure --with-xtables

  And libxtables needs to be installed in your system. This allows you
  to list a ruleset containing xt extensions loaded through
  iptables-compat-restore tool. The nft tool provides a native
  translation for iptables extensions (if available).

* Add new fib expression, which can be used to obtain the output
  interface from the route table based on either source or destination
  address of a packet. This can be used to e.g. add reverse path
  filtering, eg. drop if not coming from the same interface packet
  arrived on:

  # nft add rule x prerouting fib saddr . iif oif eq 0 drop

  Accept only if from eth:

  # nft add rule x prerouting fib saddr . iif oif eq "eth0" accept

  Accept if from any valid interface:

  # nft add rule x prerouting fib saddr oif accept

  Querying of address type is also supported, this can be used
  to only accept packets to addresses configured in the same
  interface, eg.

  # nft add rule x prerouting fib daddr . iif type local accept

  Its also possible to use mark and verdict map, eg,

  # nft add rule x prerouting \
        meta mark set 0xdead fib daddr . mark type vmap {
                blackhole : drop,
                prohibit : drop,
                unicast : accept
        }

* Support hashing of any arbitrary key combination, eg.

  # nft add rule x y \
        dnat to jhash ip saddr . tcp dport mod 2 map { \
                0 : 192.168.20.100, \
                1 : 192.168.30.100 \
        }

  Another usecase: Set packet marks based on any arbitrary hashing.

* Add number generation support. Useful for round-robin packet mark
  setting, eg.

  # nft add rule filter prerouting meta mark set numgen inc mod 2

  You can also specify an offset to indicate from what value you want
  to start from.

  The modulus provides the scale of the counting sequence. You can
  also use this from maps, eg.

  # nft add rule nat prerouting \
        dnat to numgen inc mod 2 map { 0 : 192.168.10.100, 1 : 192.168.20.200 }

  So this is distributing new connections in a round-robin fashion
  between 192.168.10.100 and 192.168.20.200. Don't forget the special NAT
  chain semantics: Only the first packet evaluates the rule, follow up
  packets rely on conntrack to apply the NAT information.

  You can also emulate flow distribution with different backend weights
  using intervals, eg.

  # nft add rule nat prerouting \
        dnat to numgen inc mod 10 map { 0-5 : 192.168.10.100, 6-9 : 192.168.20.200 }

* Add quota support, eg.

  # nft add rule filter input \
            flow table http { ip saddr timeout 60s quota over 50 mbytes } drop

  This creates a flow table, where every flow gets a quota of 50
  mbytes. You can also from use simple rules too to enforce quotas, of
  course.

* Introduce routing expression, for routing related data with support
  for nexthop (i.e. the directly connected IP address that an outgoing
  packet is sent to), which can be used either for matching or accounting, eg.

     # nft add rule filter postrouting \
          ip daddr 192.168.1.0/24 rt nexthop != 192.168.0.1 drop

  This will drop any traffic to 192.168.1.0/24 that is not routed via
  192.168.0.1.

     # nft add rule filter postrouting \
          flow table acct { rt nexthop timeout 600s counter }

     # nft add rule ip6 filter postrouting \
          flow table acct { rt nexthop timeout 600s counter }

  These rules count outgoing traffic per nexthop. Note that the timeout
  releases an entry if no traffic is seen for this nexthop within 10
  minutes.

* Notrack support, to explicitly skip connection tracking for matching
  packets, eg.

     # nft add rule ip raw prerouting tcp dport { 80, 443 } notrack

  So you can skip tracking for http and https traffic.

* Support to set non-byte bound packet header fields, including
  checksum adjustment, eg. ip6 ecn set 1.

* Add 'create set' and 'create element' commands, eg.

     # nft add set x y { type ipv4_addr\; }
     # nft create set x y { type ipv4_addr\; }
     <cmdline>:1:1-35: Error: Could not process rule: File exists
     create set x y { type ipv4_addr; }
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     # nft add set x y { type ipv4_addr\; }
     #

  So 'create' bails out if the set already exists, while 'add'
  doesn't, for more ergonomic usage as several users requested on
  the mailing list.

* Allow to use variable reference for set element definitions, eg.

  # cat ruleset.nft
    define s-ext-2-int = { 10.10.10.10 . 25, 10.10.10.10 . 143 }

    table inet forward {
            set s-ext-2-int {
                 type ipv4_addr . inet_service
                 elements = $s-ext-2-int
            }
    }
  # nft -f ruleset.nft

  Useful to improve ruleset maintainability, as you can split out
  variable and set definitions from the filtering policy itself.

* Allow to use variable definitions from element commands, eg.

     define whitelist_v4 = { 1.1.1.1 }

     table inet filter {
        set whitelist_v4 { type ipv4_addr; }
     }

     add element inet filter whitelist_v4 $whitelist_v4

* Add support to flush set. You can use this new command to remove all
  existing elements in a set, eg.

  # nft flush set filter xyz

  Note that this requires (upcoming) Linux kernel 4.10-rc versions.

* Inverted set lookups, eg. tcp dport != { 80, 443 }.

* Honor absolute and relative paths via include file, where:

    include "./ruleset.nft"

  refers to a file in the working directory.

    include "ruleset.nft"

  refers to a file in the nftables root path (via sysconfdir), and:

    include "/etc/nftables/ruleset.nft"

  provides an absolute reference to the file that need to be included.
  This also solves an ambiguity if the same file name is used both under
  sysconfdir and the current working directory.

* Support log flags, to enable logging TCP sequence and options:

     # nft add rule x y log flags tcp sequence,options

  ... IP options, eg:

     # nft add rule x y log flags ip options

  ... socket UID, eg.

     # nft add rule x y log flags skuid

  ... decide ethernet link layer address, eg.

     # nft add rule x y log flags ether

  ... or simply set on all flags:

     # nft add rule x y log flags all

* tc classid parser support, eg.

    nft add rule filter forward meta priority abcd:1234

* Allow numeric connlabels, so if connlabel still works with undefined
  labels, eg. ct label set 2.

* Document log, reject, counter, meta, limit, nat, ct, payload and
  queue statements from nft(8) manpage.

Bugfixes
========

Not strictly limited to this list below, but some highlights:

* Allow split table definitions, eg.

  # cat ruleset.nft
  table inet filter {
       chain ssh {
               type filter hook input priority 0; policy accept;
               tcp dport ssh accept;
       }
  }
  table inet filter {
       chain input {
               type filter hook input priority 1; policy drop;
       }
  }
  # nft -f ruleset.nft

* Use new range expression to represent inverted intervals, eg.
  ip saddr != 1.1.1.1-2.2.2.2, since previously generated bytecode was
  not correct.
* Solve endianness problems with link layer address.
* Fix parser to keep map flag around on definition.
* Skip timeout attribute in dynamic set updates, other kernel bails
  out with EINVAL.
* Restore parsing of dynamic set element updates.
* The time datatype now uses milliseconds, as the kernel expects.
* Allow numeric interface index numbers, eg. in meta iif, oif.
* Fix monitor trace crash with netdev family.
* Flow table with concatenation fixes.
* Keep element comments around when using set intervals.
* Fixed memory corruption in userspace when deleting lots of elements
  in one go via nft -f.
* Several nft internal cache fixes, including cache reset on 'flush
  ruleset'.
* Restore parens on right-hand side of relational expression.
* Replace getnameinfo() by internal lookup table, so we don't rely on
  /etc/services anymore for service names, so we restrict them to
  a well-known set that is supported by our scanner. You can list
  service names via 'nft describe tcp dport'.
* Display symbol table values in the right hostbyte order and
  decimal/hexadecimal representation.
* Fix a nasty bug in the set interval code triggering huge memory
  consumption in userspace for set and map intervals with runtime
  updates.

We also got lots more tests added to our infrastructure to catch up
regressions.

Syntax updates
==============

Several minor syntax updates, although previous syntax has been
preserved by now to facilitate transition, the new one is prefered:

* Consistency grammar fixes: 'snat' and 'dnat' now require 'to', eg.
  snat to 1.2.3.4. For consistency with existing statements such as
  redirect, masquerade, dup and fwd. Moreover, add colon after 'to' in
  'redirect' for consistency with nat and masq statements.

* Allow ct l3proto/protocol without direction since they are unrelated
  to the direction.

* Explicit ruleset exportation, eg. nft export ruleset json, for
  consistency with other existing ruleset commands.

* Always quote user-defined strings from rules when listing them.

* Support for RFC2732 IPv6 address format with brackets, eg.

  dnat to [2001:838:35f:1::]:80

* Allow strings starting by underscores and dots in user-define
  strings, conforming with POSIX.1-2008 (which is simultaneously IEEE
  Std 1003.1-2008).

Resources
=========

The nftables code can be obtained from:

* http://netfilter.org/projects/nftables/downloads.html
* ftp://ftp.netfilter.org/pub/nftables
* git://git.netfilter.org/nftables

To build the code, libnftnl 1.0.7 and libmnl >= 1.0.2 are required:

* http://netfilter.org/projects/libnftnl/index.html
* http://netfilter.org/projects/libmnl/index.html

Visit our wikipage for user documentation at:

* http://wiki.nftables.org

For the manpage reference, check man(8) nft.

In case of bugs and feature request, file them via:

* https://bugzilla.netfilter.org

Make sure you create no duplicates already, thanks!

Happy holidays!

[-- Attachment #2: changes-nftables-0.7.txt --]
[-- Type: text/plain, Size: 9551 bytes --]

Anatole Denis (7):
      evaluate: Add set to cache only when well-formed
      tests: Add regression test for malformed sets
      Revert "evaluate: check for NULL datatype in rhs in lookup expr"
      src: Interpret OP_NEQ against a set as OP_LOOKUP
      tests/py: Unmask negative set lookup
      rule: Introduce helper function cache_flush
      evaluate: Update cache on flush ruleset

Anders K. Pedersen (4):
      rt: introduce routing expression
      Replace tests/files/expr-rt with Python based tests, and replace ether type     with meta nfproto, which generates a bit fewer instructions.
      evaluate: Allow concatenation of rt nexthop etc.
      doc: fix synopsis for ct expression

Arturo Borrero (3):
      tests: shell: delete unused variable in run-tests.sh
      tests: shell: cleanup tempfile handling in testcases/sets/cache_handling_0
      tests: shell: run-tests.sh: use src/nft binary by default

Arturo Borrero Gonzalez (12):
      tests: shell: update kernel modules to clean
      xt: update Arturo Borrero Gonzalez email address
      tests: shell: delete useless stderr output in testcase
      tests: shell: introduce the cache testcases directory
      tests: shell: add a new testcase for ruleset loading bug
      tests: shell: add testcases for comments in set elements
      tests: shell: allow to execute a single testcase
      tests: shell: testcase for adding many set elements
      tests: shell: testcase for deleting many set elements
      tests: shell: another testcase for deleting many set elements
      tests: shell: add a testcase for many defines
      tests: shell: add testcase for different defines usage

Carlos Falgueras García (1):
      src: Simplify parser rule_spec tree

Elise Lennion (4):
      datatype: Replace getnameinfo() by internal lookup table
      datatype: Display pre-defined inet_service values in host byte order
      datatype: Display pre-defined inet_service values in decimal base
      expression: Show the base which pre-defined constants are displayed

Florian Westphal (30):
      payload: don't update protocol context if we can't find a description
      meta: add random support
      meta: add tests for meta random
      ct: use nftables sysconf location for connlabel configuration
      tests: add basic payload tests
      tests: add ether payload set test
      netlink: add __binop_adjust helper
      payload: print base and raw values for unknown payloads
      evaluate: add small helper to check if payload expr needs binop adjustment
      evaluate: add support to set IPv6 non-byte header fields
      netlink: decode payload statment
      tests: ip6 dscp, flowlabel and ecn test cases
      netlink: make checksum fixup work with odd-sized header fields
      tests: ip payload set support for ecn and dscp
      ct: allow numeric conntrack labels
      ct: display bit number instead of raw value
      doc: update meta expression
      doc: payload and conntrack statement
      datatype: ll: use big endian byte ordering
      tests: catch ordering issue w. ether set
      payload: remove byteorder conversion
      meta: permit numeric interface type
      netlink: fix monitor trace crash with netdev family
      meta: fix pkttype name and add 'other' symbol
      utils: provide snprintf helper macro
      ct: allow resolving ct keys at run time
      meta: allow resolving meta keys at run time
      src: add fib expression
      Revert "tests: py: nft-tests.py: Add function for loading and removing kernel modules"
      bison: remove old log level tokens

Jon Jensen (1):
      Correct description of -n/--numeric option

Laura Garcia Liebana (5):
      doc: Update datatypes
      src: add offset attribute for numgen expression
      netlink: fix linearize numgen type
      src: make hash seed attribute optional
      src: add offset attribute for hash expression

Liping Zhang (14):
      tests: shell: make testcases which using tcp/udp port more rubost
      tests: shell: add endless jump loop tests
      parser_bison: keep snat/dnat existing syntax unchanged
      tests: shell: add testcase for reject expr
      meta: fix memory leak in tc classid parser
      tests: py: replace "eth0" with "lo" in dup expr tests
      src: fix compile error due to _UNTIL renamed to _MODULUS in libnftnl
      tests: py: add more test cases for queue expr
      tests: py: fix numgen case failed due to changes in libnftnl
      src: support ct l3proto/protocol without direction syntax
      ct: fix "ct l3proto/protocol" syntax broken
      log: rename the log level "warning" to "warn"
      src: add log flags syntax support
      tests: shell: add test case for inserting element into verdict map

Manuel Johannes Messner (3):
      tests: py: nft-tests.py: Add function for loading and removing kernel modules
      tests: py: any: Make tests more generic by using other interfaces
      tests: py: any: Remove duplicate tests

Nicholas Vinson (1):
      nft: configure.ac: Replace magic dblatex dep.

Pablo Neira (2):
      src: expose delinearize/linearize structures and stmt_error()
      src: trigger layer 4 checksum when pseudoheader fields are modified

Pablo Neira Ayuso (71):
      src: use new definitions from libnftnl
      segtree: don't check for overlaps if set definition is empty
      tests: shell: cover transactions via nft -f using flat syntax
      datatype: time_type should send milliseconds to userspace
      parser_bison: restore parsing of dynamic set element updates
      netlink_linearize: skip NFTNL_EXPR_DYNSET_TIMEOUT attribute if timeout is unset
      include: cache ip_tables.h, ip6_tables.h, arp_tables.h and ebtables.h
      src: add xt compat support
      parser_bison: fix typo in symbol redefinition error reporting
      tests: shell: make sure split table definition works via nft -f
      xt: use struct xt_xlate_{mt,tg}_params
      parser_bison: keep map flag around when flags are specified
      scanner: honor absolute and relative paths via include file
      scanner: don't fall back on current directory if include is not found
      scanner: don't break line on include error message
      tests: tests to include files
      ct: add missing slash to connlabel path
      ct: release ct_label table on exit
      src: quote user-defined strings when used from rule selectors
      src: add 'to' for snat and dnat
      src: support for RFC2732 IPv6 address format with brackets
      parser_bison: missing token string in QUOTED_ASTERISK and ASTERISK_STRING
      scanner: allow strings starting by underscores and dots
      scanner: remove range expression
      src: rename datatype name from tc_handle to classid
      src: simplify classid printing using %x instead of %04x
      src: meta priority support using tc classid
      parser_bison: redirect to :port for consistency with nat/masq statement
      parser_bison: explicit indication on export ruleset
      src: add create set command
      tests: shell: cover add and create set command
      src: create element command
      tests: shell: cover add and create set command
      include: refresh uapi/linux/netfilter/nf_tables.h copy
      tests: py: adapt it to new add element command semantics
      src: add quota statement
      src: add numgen expression
      src: add hash expression
      evaluate: add expr_evaluate_integer()
      evaluate: validate maximum hash and numgen value
      parser_bison: add variable_expr rule
      parser_bison: allow variable references in set elements definition
      tests: py: adapt netlink bytecode output of numgen and hash
      evaluate: display expression, statement and command name on debug
      netlink_delinearize: Avoid potential null pointer deref
      doc: nft: add my copyright statement to the manpage
      doc: nft: document log, reject, counter, meta, limit, nat and queue statements
      src: use new range expression for != [a,b] intervals
      parser_bison: allow to use variable to add/create/delete elements
      src: don't need keyword for log level
      parser: add offset keyword and parser rule
      tests/py: add missing payload test for numgen offset
      netlink_linearize: skip set element expression in flow table key
      segtree: keep element comments in set intervals
      tests: py: add some testcases for log flags
      tests: py: missing range conversion in icmpv6
      src: add notrack support
      mnl: use nftnl_set_elems_nlmsg_build_payload_iter() when deleting elements
      include: refresh nf_tables.h header
      datatype: honor -nn option from inet_service_type_print()
      evaluate: return ctx->table from table_lookup_global()
      src: add support to flush sets
      segtree: wrong prefix expression length on interval_map_decompose()
      segtree: don't trigger error on exact overlaps
      mnl: don't send empty set elements netlink message to kernel
      tests: py: update quota and payload
      netlink_linearize: fix IPv6 layer 4 checksum mangling
      mnl: add mnl_nft_setelem_batch_flush() and use it from netlink_flush_setelems()
      xt: use NFTNL_* definitions
      configure: Bump version to v0.7
      include: Missing noinst_HEADERS updates

Phil Sutter (5):
      evaluate: Fix datalen checks in expr_evaluate_string()
      evaluate: reject: Have a generic fix for missing network context
      evaluate: Avoid undefined behaviour in concat_subtype_id()
      parser_bison: Allow parens on RHS of relational_expr
      tests: py: Test TCP flags match with parentheses


^ permalink raw reply

* [PATCH 5/5 net-next] inet: reset tb->fastreuseport when adding a reuseport sk
From: Josef Bacik @ 2016-12-20 20:07 UTC (permalink / raw)
  To: davem, hannes, kraigatgoog, eric.dumazet, tom, netdev,
	kernel-team
In-Reply-To: <1482264424-15439-1-git-send-email-jbacik@fb.com>

If we have non reuseport sockets on a tb we will set tb->fastreuseport to 0 and
never set it again.  Which means that in the future if we end up adding a bunch
of reuseport sk's to that tb we'll have to do the expensive scan every time.
Instead add a sock_common to the tb so we know what reuseport sk succeeded last.
Once one sk has made it onto the list we know that there are no potential bind
conflicts on the owners list that match that sk's rcv_addr.  So copy the sk's
common into our tb->fastsock and set tb->fastruseport to FASTREUSESOCK_STRICT so
we know we have to do an extra check for subsequent reuseport sockets and skip
the expensive bind conflict check.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 include/net/inet_hashtables.h   |  4 ++++
 net/ipv4/inet_connection_sock.c | 53 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 50f635c..b776401 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -74,12 +74,16 @@ struct inet_ehash_bucket {
  * users logged onto your box, isn't it nice to know that new data
  * ports are created in O(1) time?  I thought so. ;-)	-DaveM
  */
+#define FASTREUSEPORT_ANY	1
+#define FASTREUSEPORT_STRICT	2
+
 struct inet_bind_bucket {
 	possible_net_t		ib_net;
 	unsigned short		port;
 	signed char		fastreuse;
 	signed char		fastreuseport;
 	kuid_t			fastuid;
+	struct sock_common	fastsock;
 	int			num_owners;
 	struct hlist_node	node;
 	struct hlist_head	owners;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d3ccf62..9e29fad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -164,6 +164,32 @@ success:
 	return head;
 }
 
+static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
+				     struct sock *sk)
+{
+	struct sock *sk2 = (struct sock *)&tb->fastsock;
+	kuid_t uid = sock_i_uid(sk);
+
+	if (tb->fastreuseport <= 0)
+		return 0;
+	if (!sk->sk_reuseport)
+		return 0;
+	if (rcu_access_pointer(sk->sk_reuseport_cb))
+		return 0;
+	if (!uid_eq(tb->fastuid, uid))
+		return 0;
+	/* We only need to check the rcv_saddr if this tb was once marked
+	 * without fastreuseport and then was reset, as we can only know that
+	 * the fastsock has no potential bind conflicts with the rest of the
+	 * possible socks on the owners list.
+	 */
+	if (tb->fastreuseport == FASTREUSEPORT_ANY)
+		return 1;
+	if (!inet_csk(sk)->icsk_af_ops->rcv_saddr_equal(sk, sk2, true))
+		return 0;
+	return 1;
+}
+
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
  * We try to allocate an odd port (and leave even ports for connect())
@@ -206,9 +232,7 @@ tb_found:
 			goto success;
 
 		if ((tb->fastreuse > 0 && reuse) ||
-		     (tb->fastreuseport > 0 &&
-		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-		      sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
+		    sk_reuseport_match(tb, sk))
 			goto success;
 		if (inet_csk_bind_conflict(sk, tb, true, true))
 			goto fail_unlock;
@@ -220,14 +244,35 @@ success:
 		if (sk->sk_reuseport) {
 			tb->fastreuseport = 1;
 			tb->fastuid = uid;
+			memcpy(&tb->fastsock, &sk->__sk_common,
+			       sizeof(struct sock_common));
 		} else {
 			tb->fastreuseport = 0;
 		}
 	} else {
 		if (!reuse)
 			tb->fastreuse = 0;
-		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+		if (sk->sk_reuseport) {
+			/* We didn't match or we don't have fastreuseport set on
+			 * the tb, but we have sk_reuseport set on this socket
+			 * and we know that there are no bind conflicts with
+			 * this socket in this tb, so reset our tb's reuseport
+			 * settings so that any subsequent sockets that match
+			 * our current socket will be put on the fast path.
+			 *
+			 * If we reset we need to set FASTREUSEPORT_STRICT so we
+			 * do extra checking for all subsequent sk_reuseport
+			 * socks.
+			 */
+			if (!sk_reuseport_match(tb, sk)) {
+				tb->fastreuseport = FASTREUSEPORT_STRICT;
+				tb->fastuid = uid;
+				memcpy(&tb->fastsock, &sk->__sk_common,
+				       sizeof(struct sock_common));
+			}
+		} else {
 			tb->fastreuseport = 0;
+		}
 	}
 	if (!inet_csk(sk)->icsk_bind_hash)
 		inet_bind_hash(sk, tb, port);
-- 
2.9.3

^ permalink raw reply related

* [PATCH 4/5 net-next] inet: split inet_csk_get_port into two functions
From: Josef Bacik @ 2016-12-20 20:07 UTC (permalink / raw)
  To: davem, hannes, kraigatgoog, eric.dumazet, tom, netdev,
	kernel-team
In-Reply-To: <1482264424-15439-1-git-send-email-jbacik@fb.com>

inet_csk_get_port does two different things, it either scans for an open port,
or it tries to see if the specified port is available for use.  Since these two
operations have different rules and are basically independent lets split them
into two different functions to make them both more readable.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 net/ipv4/inet_connection_sock.c | 72 +++++++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 25 deletions(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fc9bfe1..d3ccf62 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -84,34 +84,21 @@ static int inet_csk_bind_conflict(const struct sock *sk,
 	return sk2 != NULL;
 }
 
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- * We try to allocate an odd port (and leave even ports for connect())
+/*
+ * Find an open port number for the socket.  Returns with the
+ * inet_bind_hashbucket lock held.
  */
-int inet_csk_get_port(struct sock *sk, unsigned short snum)
+static struct inet_bind_hashbucket *
+inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
 {
-	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
-	int ret = 1, port = snum;
+	int port = 0;
 	struct inet_bind_hashbucket *head;
 	struct net *net = sock_net(sk);
 	int i, low, high, attempt_half;
 	struct inet_bind_bucket *tb;
-	kuid_t uid = sock_i_uid(sk);
 	u32 remaining, offset;
-	bool reuseport_ok = !!snum;
-	bool empty_tb = true;
 
-	if (port) {
-		head = &hinfo->bhash[inet_bhashfn(net, port,
-						  hinfo->bhash_size)];
-		spin_lock_bh(&head->lock);
-		inet_bind_bucket_for_each(tb, &head->chain)
-			if (net_eq(ib_net(tb), net) && tb->port == port)
-				goto tb_found;
-
-		goto tb_not_found;
-	}
 	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
 other_half_scan:
 	inet_get_local_port_range(net, &low, &high);
@@ -150,13 +137,12 @@ other_parity_scan:
 			if (net_eq(ib_net(tb), net) && tb->port == port) {
 				if (hlist_empty(&tb->owners))
 					goto success;
-				if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok)) {
-					empty_tb = false;
+				if (!inet_csk_bind_conflict(sk, tb, false, false))
 					goto success;
-				}
 				goto next_port;
 			}
-		goto tb_not_found;
+		tb = NULL;
+		goto success;
 next_port:
 		spin_unlock_bh(&head->lock);
 		cond_resched();
@@ -171,8 +157,44 @@ next_port:
 		attempt_half = 2;
 		goto other_half_scan;
 	}
-	return ret;
+	return NULL;
+success:
+	*port_ret = port;
+	*tb_ret = tb;
+	return head;
+}
 
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	int ret = 1, port = snum;
+	struct inet_bind_hashbucket *head;
+	struct net *net = sock_net(sk);
+	struct inet_bind_bucket *tb = NULL;
+	kuid_t uid = sock_i_uid(sk);
+	bool empty_tb = true;
+
+	if (!port) {
+		head = inet_csk_find_open_port(sk, &tb, &port);
+		if (!head)
+			return 1;
+		if (!tb)
+			goto tb_not_found;
+		if (!hlist_empty(&tb->owners))
+			empty_tb = false;
+		goto success;
+	}
+	head = &hinfo->bhash[inet_bhashfn(net, port,
+					  hinfo->bhash_size)];
+	spin_lock_bh(&head->lock);
+	inet_bind_bucket_for_each(tb, &head->chain)
+		if (net_eq(ib_net(tb), net) && tb->port == port)
+			goto tb_found;
 tb_not_found:
 	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
 				     net, head, port);
@@ -188,7 +210,7 @@ tb_found:
 		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
 		      sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
 			goto success;
-		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok))
+		if (inet_csk_bind_conflict(sk, tb, true, true))
 			goto fail_unlock;
 		empty_tb = false;
 	}
-- 
2.9.3

^ permalink raw reply related

* [PATCH 3/5 net-next] inet: don't check for bind conflicts twice when searching for a port
From: Josef Bacik @ 2016-12-20 20:07 UTC (permalink / raw)
  To: davem, hannes, kraigatgoog, eric.dumazet, tom, netdev,
	kernel-team
In-Reply-To: <1482264424-15439-1-git-send-email-jbacik@fb.com>

This is just wasted time, we've already found a tb that doesn't have a bind
conflict, and we don't drop the head lock so scanning again isn't going to give
us a different answer.  Instead move the tb->reuse setting logic outside of the
found_tb path and put it in the success: path.  Then make it so that we don't
goto again if we find a bind conflict in the found_tb path as we won't reach
this anymore when we are scanning for an ephemeral port.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 net/ipv4/inet_connection_sock.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1a1a94bd..fc9bfe1 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -92,7 +92,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
 	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
-	int ret = 1, attempts = 5, port = snum;
+	int ret = 1, port = snum;
 	struct inet_bind_hashbucket *head;
 	struct net *net = sock_net(sk);
 	int i, low, high, attempt_half;
@@ -100,6 +100,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 	kuid_t uid = sock_i_uid(sk);
 	u32 remaining, offset;
 	bool reuseport_ok = !!snum;
+	bool empty_tb = true;
 
 	if (port) {
 		head = &hinfo->bhash[inet_bhashfn(net, port,
@@ -111,7 +112,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 
 		goto tb_not_found;
 	}
-again:
 	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
 other_half_scan:
 	inet_get_local_port_range(net, &low, &high);
@@ -148,8 +148,12 @@ other_parity_scan:
 		spin_lock_bh(&head->lock);
 		inet_bind_bucket_for_each(tb, &head->chain)
 			if (net_eq(ib_net(tb), net) && tb->port == port) {
-				if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok))
-					goto tb_found;
+				if (hlist_empty(&tb->owners))
+					goto success;
+				if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok)) {
+					empty_tb = false;
+					goto success;
+				}
 				goto next_port;
 			}
 		goto tb_not_found;
@@ -184,23 +188,12 @@ tb_found:
 		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
 		      sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
 			goto success;
-		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) {
-			if ((reuse ||
-			     (tb->fastreuseport > 0 &&
-			      sk->sk_reuseport &&
-			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-			      uid_eq(tb->fastuid, uid))) && !snum &&
-			    --attempts >= 0) {
-				spin_unlock_bh(&head->lock);
-				goto again;
-			}
+		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok))
 			goto fail_unlock;
-		}
-		if (!reuse)
-			tb->fastreuse = 0;
-		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
-			tb->fastreuseport = 0;
-	} else {
+		empty_tb = false;
+	}
+success:
+	if (empty_tb) {
 		tb->fastreuse = reuse;
 		if (sk->sk_reuseport) {
 			tb->fastreuseport = 1;
@@ -208,8 +201,12 @@ tb_found:
 		} else {
 			tb->fastreuseport = 0;
 		}
+	} else {
+		if (!reuse)
+			tb->fastreuse = 0;
+		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+			tb->fastreuseport = 0;
 	}
-success:
 	if (!inet_csk(sk)->icsk_bind_hash)
 		inet_bind_hash(sk, tb, port);
 	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
-- 
2.9.3

^ permalink raw reply related

* [PATCH 1/5 net-next] inet: replace ->bind_conflict with ->rcv_saddr_equal
From: Josef Bacik @ 2016-12-20 20:07 UTC (permalink / raw)
  To: davem, hannes, kraigatgoog, eric.dumazet, tom, netdev,
	kernel-team
In-Reply-To: <1482264424-15439-1-git-send-email-jbacik@fb.com>

The only difference between inet6_csk_bind_conflict and inet_csk_bind_conflict
is how they check the rcv_saddr.  Since we want to be able to check the saddr in
other places just drop the protocol specific ->bind_conflict and replace it with
->rcv_saddr_equal, then make inet_csk_bind_conflict the one true bind conflict
function.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 include/net/inet6_connection_sock.h |  5 -----
 include/net/inet_connection_sock.h  |  9 +++------
 net/dccp/ipv4.c                     |  3 ++-
 net/dccp/ipv6.c                     |  2 +-
 net/ipv4/inet_connection_sock.c     | 22 +++++++-------------
 net/ipv4/tcp_ipv4.c                 |  3 ++-
 net/ipv4/udp.c                      |  1 +
 net/ipv6/inet6_connection_sock.c    | 40 -------------------------------------
 net/ipv6/tcp_ipv6.c                 |  4 ++--
 9 files changed, 18 insertions(+), 71 deletions(-)

diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
index 3212b39..8ec87b6 100644
--- a/include/net/inet6_connection_sock.h
+++ b/include/net/inet6_connection_sock.h
@@ -15,16 +15,11 @@
 
 #include <linux/types.h>
 
-struct inet_bind_bucket;
 struct request_sock;
 struct sk_buff;
 struct sock;
 struct sockaddr;
 
-int inet6_csk_bind_conflict(const struct sock *sk,
-			    const struct inet_bind_bucket *tb, bool relax,
-			    bool soreuseport_ok);
-
 struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6,
 				      const struct request_sock *req, u8 proto);
 
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index ec0479a..9cd43c5 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -62,9 +62,9 @@ struct inet_connection_sock_af_ops {
 				char __user *optval, int __user *optlen);
 #endif
 	void	    (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
-	int	    (*bind_conflict)(const struct sock *sk,
-				     const struct inet_bind_bucket *tb,
-				     bool relax, bool soreuseport_ok);
+	int         (*rcv_saddr_equal)(const struct sock *sk1,
+				       const struct sock *sk2,
+				       bool match_wildcard);
 	void	    (*mtu_reduced)(struct sock *sk);
 };
 
@@ -261,9 +261,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
 
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
 
-int inet_csk_bind_conflict(const struct sock *sk,
-			   const struct inet_bind_bucket *tb, bool relax,
-			   bool soreuseport_ok);
 int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
 struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9c67a96..1931324 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -17,6 +17,7 @@
 #include <linux/skbuff.h>
 #include <linux/random.h>
 
+#include <net/addrconf.h>
 #include <net/icmp.h>
 #include <net/inet_common.h>
 #include <net/inet_hashtables.h>
@@ -901,7 +902,7 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
 	.getsockopt	   = ip_getsockopt,
 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in),
-	.bind_conflict	   = inet_csk_bind_conflict,
+	.rcv_saddr_equal   = ipv4_rcv_saddr_equal,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ip_setsockopt,
 	.compat_getsockopt = compat_ip_getsockopt,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 4663a01..45242b8 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -926,7 +926,7 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
+	.rcv_saddr_equal   = ipv6_rcv_saddr_equal,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 5f44fa1..74f6a57 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -44,9 +44,9 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
 }
 EXPORT_SYMBOL(inet_get_local_port_range);
 
-int inet_csk_bind_conflict(const struct sock *sk,
-			   const struct inet_bind_bucket *tb, bool relax,
-			   bool reuseport_ok)
+static int inet_csk_bind_conflict(const struct sock *sk,
+				  const struct inet_bind_bucket *tb,
+				  bool relax, bool reuseport_ok)
 {
 	struct sock *sk2;
 	bool reuse = sk->sk_reuse;
@@ -62,7 +62,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
 
 	sk_for_each_bound(sk2, &tb->owners) {
 		if (sk != sk2 &&
-		    !inet_v6_ipv6only(sk2) &&
 		    (!sk->sk_bound_dev_if ||
 		     !sk2->sk_bound_dev_if ||
 		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
@@ -72,23 +71,18 @@ int inet_csk_bind_conflict(const struct sock *sk,
 			     rcu_access_pointer(sk->sk_reuseport_cb) ||
 			     (sk2->sk_state != TCP_TIME_WAIT &&
 			     !uid_eq(uid, sock_i_uid(sk2))))) {
-
-				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
-				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+				if (inet_csk(sk)->icsk_af_ops->rcv_saddr_equal(sk, sk2, true))
 					break;
 			}
 			if (!relax && reuse && sk2->sk_reuse &&
 			    sk2->sk_state != TCP_LISTEN) {
-
-				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
-				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+				if (inet_csk(sk)->icsk_af_ops->rcv_saddr_equal(sk, sk2, true))
 					break;
 			}
 		}
 	}
 	return sk2 != NULL;
 }
-EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
@@ -167,8 +161,7 @@ other_parity_scan:
 					smallest_size = tb->num_owners;
 					smallest_port = port;
 				}
-				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false,
-									      reuseport_ok))
+				if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok))
 					goto tb_found;
 				goto next_port;
 			}
@@ -209,8 +202,7 @@ tb_found:
 		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
 		    smallest_size == -1)
 			goto success;
-		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true,
-							     reuseport_ok)) {
+		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) {
 			if ((reuse ||
 			     (tb->fastreuseport > 0 &&
 			      sk->sk_reuseport &&
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 029708f..7608012 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -63,6 +63,7 @@
 #include <linux/times.h>
 #include <linux/slab.h>
 
+#include <net/addrconf.h>
 #include <net/net_namespace.h>
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -1781,7 +1782,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 	.getsockopt	   = ip_getsockopt,
 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in),
-	.bind_conflict	   = inet_csk_bind_conflict,
+	.rcv_saddr_equal   = ipv4_rcv_saddr_equal,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ip_setsockopt,
 	.compat_getsockopt = compat_ip_getsockopt,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2a70c05..6089ea8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -374,6 +374,7 @@ int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
 	}
 	return 0;
 }
+EXPORT_SYMBOL(ipv4_rcv_saddr_equal);
 
 static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
 			      unsigned int port)
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 71939a2..7538715 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -28,46 +28,6 @@
 #include <net/inet6_connection_sock.h>
 #include <net/sock_reuseport.h>
 
-int inet6_csk_bind_conflict(const struct sock *sk,
-			    const struct inet_bind_bucket *tb, bool relax,
-			    bool reuseport_ok)
-{
-	const struct sock *sk2;
-	bool reuse = !!sk->sk_reuse;
-	bool reuseport = !!sk->sk_reuseport && reuseport_ok;
-	kuid_t uid = sock_i_uid((struct sock *)sk);
-
-	/* We must walk the whole port owner list in this case. -DaveM */
-	/*
-	 * See comment in inet_csk_bind_conflict about sock lookup
-	 * vs net namespaces issues.
-	 */
-	sk_for_each_bound(sk2, &tb->owners) {
-		if (sk != sk2 &&
-		    (!sk->sk_bound_dev_if ||
-		     !sk2->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
-			if ((!reuse || !sk2->sk_reuse ||
-			     sk2->sk_state == TCP_LISTEN) &&
-			    (!reuseport || !sk2->sk_reuseport ||
-			     rcu_access_pointer(sk->sk_reuseport_cb) ||
-			     (sk2->sk_state != TCP_TIME_WAIT &&
-			      !uid_eq(uid,
-				      sock_i_uid((struct sock *)sk2))))) {
-				if (ipv6_rcv_saddr_equal(sk, sk2, true))
-					break;
-			}
-			if (!relax && reuse && sk2->sk_reuse &&
-			    sk2->sk_state != TCP_LISTEN &&
-			    ipv6_rcv_saddr_equal(sk, sk2, true))
-				break;
-		}
-	}
-
-	return sk2 != NULL;
-}
-EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
-
 struct dst_entry *inet6_csk_route_req(const struct sock *sk,
 				      struct flowi6 *fl6,
 				      const struct request_sock *req,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index bee59a6..2f40b98 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1603,7 +1603,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
+	.rcv_saddr_equal   = ipv6_rcv_saddr_equal,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
@@ -1634,7 +1634,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
 	.getsockopt	   = ipv6_getsockopt,
 	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in6),
-	.bind_conflict	   = inet6_csk_bind_conflict,
+	.rcv_saddr_equal   = ipv6_rcv_saddr_equal,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
-- 
2.9.3

^ permalink raw reply related

* [PATCH 2/5 net-next] inet: kill smallest_size and smallest_port
From: Josef Bacik @ 2016-12-20 20:07 UTC (permalink / raw)
  To: davem, hannes, kraigatgoog, eric.dumazet, tom, netdev,
	kernel-team
In-Reply-To: <1482264424-15439-1-git-send-email-jbacik@fb.com>

In inet_csk_get_port we seem to be using smallest_port to figure out where the
best place to look for a SO_REUSEPORT sk that matches with an existing set of
SO_REUSEPORT's.  However if we get to the logic

if (smallest_size != -1) {
	port = smallest_port;
	goto have_port;
}

we will do a useless search, because we would have already done the
inet_csk_bind_conflict for that port and it would have returned 1, otherwise we
would have gone to found_tb and succeeded.  Since this logic makes us do yet
another trip through inet_csk_bind_conflict for a port we know won't work just
delete this code and save us the time.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 net/ipv4/inet_connection_sock.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 74f6a57..1a1a94bd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -93,7 +93,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
 	int ret = 1, attempts = 5, port = snum;
-	int smallest_size = -1, smallest_port;
 	struct inet_bind_hashbucket *head;
 	struct net *net = sock_net(sk);
 	int i, low, high, attempt_half;
@@ -103,7 +102,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 	bool reuseport_ok = !!snum;
 
 	if (port) {
-have_port:
 		head = &hinfo->bhash[inet_bhashfn(net, port,
 						  hinfo->bhash_size)];
 		spin_lock_bh(&head->lock);
@@ -137,8 +135,6 @@ other_half_scan:
 	 * We do the opposite to not pollute connect() users.
 	 */
 	offset |= 1U;
-	smallest_size = -1;
-	smallest_port = low; /* avoid compiler warning */
 
 other_parity_scan:
 	port = low + offset;
@@ -152,15 +148,6 @@ other_parity_scan:
 		spin_lock_bh(&head->lock);
 		inet_bind_bucket_for_each(tb, &head->chain)
 			if (net_eq(ib_net(tb), net) && tb->port == port) {
-				if (((tb->fastreuse > 0 && reuse) ||
-				     (tb->fastreuseport > 0 &&
-				      sk->sk_reuseport &&
-				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-				      uid_eq(tb->fastuid, uid))) &&
-				    (tb->num_owners < smallest_size || smallest_size == -1)) {
-					smallest_size = tb->num_owners;
-					smallest_port = port;
-				}
 				if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok))
 					goto tb_found;
 				goto next_port;
@@ -171,10 +158,6 @@ next_port:
 		cond_resched();
 	}
 
-	if (smallest_size != -1) {
-		port = smallest_port;
-		goto have_port;
-	}
 	offset--;
 	if (!(offset & 1))
 		goto other_parity_scan;
@@ -196,19 +179,18 @@ tb_found:
 		if (sk->sk_reuse == SK_FORCE_REUSE)
 			goto success;
 
-		if (((tb->fastreuse > 0 && reuse) ||
+		if ((tb->fastreuse > 0 && reuse) ||
 		     (tb->fastreuseport > 0 &&
 		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-		    smallest_size == -1)
+		      sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
 			goto success;
 		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) {
 			if ((reuse ||
 			     (tb->fastreuseport > 0 &&
 			      sk->sk_reuseport &&
 			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-			      uid_eq(tb->fastuid, uid))) &&
-			    !snum && smallest_size != -1 && --attempts >= 0) {
+			      uid_eq(tb->fastuid, uid))) && !snum &&
+			    --attempts >= 0) {
 				spin_unlock_bh(&head->lock);
 				goto again;
 			}
-- 
2.9.3

^ permalink raw reply related

* [RFC][PATCH 0/5 net-next] Rework inet_csk_get_port
From: Josef Bacik @ 2016-12-20 20:06 UTC (permalink / raw)
  To: davem, hannes, kraigatgoog, eric.dumazet, tom, netdev,
	kernel-team

At some point recently the guys working on our load balancer added the ability
to use SO_REUSEPORT.  When they restarted their app with this option enabled
they immediately hit a softlockup on what appeared to be the
inet_bind_bucket->lock.  Eventually what all of our debugging and discussion led
us to was the fact that the application comes up without SO_REUSEPORT, shuts
down which creates around 100k twsk's, and then comes up and tries to open a
bunch of sockets using SO_REUSEPORT, which meant traversing the inet_bind_bucket
owners list under the lock.  Since this lock is needed for dealing with the
twsk's and basically anything else related to connections we would softlockup,
and sometimes not ever recover.

To solve this problem I did what you see in Path 5/5.  Once we have a
SO_REUSEPORT socket on the tb->owners list we know that the socket has no
conflicts with any of the other sockets on that list.  So we can add a copy of
the sock_common (really all we need is the recv_saddr but it seemed ugly to copy
just the ipv6, ipv4, and flag to indicate if we were ipv6 only in there so I've
copied the whole common) in order to check subsequent SO_REUSEPORT sockets.  If
they match the previous one then we can skip the expensive
inet_csk_bind_conflict check.  This is what eliminated the soft lockup that we
were seeing.

Patches 1-4 are cleanups and re-workings.  For instance when we specify port ==
0 we need to find an open port, but we would do two passes through
inet_csk_bind_conflict every time we found a possible port.  We would also keep
track of the smallest_port value in order to try and use it if we found no
port our first run through.  This however made no sense as it would have had to
fail the first pass through inet_csk_bind_conflict, so would not actually pass
the second pass through either.  Finally I split the function into two functions
in order to make it easier to read and to distinguish between the two behaviors.

I have tested this on one of our load balancing boxes during peak traffic and it
hasn't fallen over.  But this is not my area, so obviously feel free to point
out where I'm being stupid and I'll get it fixed up and retested.  Thanks,

Josef

^ permalink raw reply

* Re: nfc: trf7970a: Prevent repeated polling from crashing the kernel
From: Mark Greer @ 2016-12-20 19:56 UTC (permalink / raw)
  To: Justin Bronder
  Cc: Geoff Lansberry, linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	lauro.venancio-430g2QfJUUCGglJvpFV4uA,
	aloisio.almeida-430g2QfJUUCGglJvpFV4uA,
	sameo-VuQAYsv1563Yd54FQh9/CA, robh+dt-DgEjT+Ai2ygdnm+yROfE0A,
	mark.rutland-5wv7dgnIgG8, netdev-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Jaret Cantu
In-Reply-To: <20161220191352.GB23496-WrO9gjaJVAZ9BIO3fBtL+FdjMaeQq/Z1QQ4Iyu8u01E@public.gmane.org>

On Tue, Dec 20, 2016 at 02:13:52PM -0500, Justin Bronder wrote:
> On 20/12/16 11:59 -0700, Mark Greer wrote:
> > On Tue, Dec 20, 2016 at 11:16:32AM -0500, Geoff Lansberry wrote:
> > > From: Jaret Cantu <jaret.cantu-jEh4hwF5bVhBDgjK7y7TUQ@public.gmane.org>
> > > 
> > > Repeated polling attempts cause a NULL dereference error to occur.
> > > This is because the state of the trf7970a is currently reading but
> > > another request has been made to send a command before it has finished.
> > 
> > How is this happening?  Was trf7970a_abort_cmd() called and it didn't
> > work right?  Was it not called at all and there is a bug in the digital
> > layer?  More details please.
> > 
> > > The solution is to properly kill the waiting reading (workqueue)
> > > before failing on the send.
> > 
> > If the bug is in the calling code, then that is what should get fixed.
> > This seems to be a hack to work-around a digital layer bug.
> 
> One of our uses of NFC is to begin polling to read a tag and then stop polling
> (in order to save power) until we know via user interaction that we need to poll
> again.  This is typically many minutes later so the power saving is pretty
> significant.  However, it's possible that a user will remove the tag before
> reading has completed.  We also detect this case and stop polling.  I can go
> more into this if necessary but that is what exposed a panic.
> 
> You can reproduce using neard and python, in our testing it was very likely to
> occur in 10-100 iterations of the following.:
> 
>     #!/usr/bin/python
>     import time
> 
>     import dbus
> 
>     bus = dbus.SystemBus()
>     nfc0 = bus.get_object('org.neard', '/org/neard/nfc0')
>     props = dbus.Interface(nfc0, 'org.freedesktop.DBus.Properties')
> 
>     try:
>         props.Set('org.neard.Adapter', 'Powered', dbus.Boolean(1))
>     except:
>         pass
> 
>     adapter = dbus.Interface(nfc0, 'org.neard.Adapter')
> 
>     for i in range(1000):
>         adapter.StartPollLoop('Initiator')
>         time.sleep(0.1)
>         adapter.StopPollLoop()
>         print(i)
> 
> I believe the last time we tested this was around the 4.1 release.

Thanks for the info, Justin, but I was also seeking more information
at the kernel NFC subsystem and trf7970a driver level.  This patch
adds code inside an 'if' in the driver whose condition should never
be evaluate to true but apparently it did.  How?

Thanks,

Mark
--
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: ipv6: handle -EFAULT from skb_copy_bits
From: Dave Jones @ 2016-12-20 19:34 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20161220.132813.435056880928769245.davem@davemloft.net>

On Tue, Dec 20, 2016 at 01:28:13PM -0500, David Miller wrote:
 
 > This has to do with the SKB buffer layout and geometry, not whether
 > the packet is "fragmented" in the protocol sense.
 > 
 > So no, this isn't a criteria for packets being filtered out by this
 > point.
 > 
 > Can you try to capture what sk->sk_socket->type and
 > inet_sk(sk)->hdrincl are set to at the time of the crash?
 > 

type:3 hdrincl:0

	Dave

^ permalink raw reply

* Re: ipv6: handle -EFAULT from skb_copy_bits
From: Cong Wang @ 2016-12-20 19:31 UTC (permalink / raw)
  To: Dave Jones; +Cc: David Miller, Linux Kernel Network Developers
In-Reply-To: <20161220181728.dd2cynjwrceruwcu@codemonkey.org.uk>

On Tue, Dec 20, 2016 at 10:17 AM, Dave Jones <davej@codemonkey.org.uk> wrote:
> On Mon, Dec 19, 2016 at 08:36:23PM -0500, David Miller wrote:
>  > From: Dave Jones <davej@codemonkey.org.uk>
>  > Date: Mon, 19 Dec 2016 19:40:13 -0500
>  >
>  > > On Mon, Dec 19, 2016 at 07:31:44PM -0500, Dave Jones wrote:
>  > >
>  > >  > Unfortunately, this made no difference.  I spent some time today trying
>  > >  > to make a better reproducer, but failed. I'll revisit again tomorrow.
>  > >  >
>  > >  > Maybe I need >1 process/thread to trigger this.  That would explain why
>  > >  > I can trigger it with Trinity.
>  > >
>  > > scratch that last part, I finally just repro'd it with a single process.
>  >
>  > Thanks for the info, I'll try to think about this some more.
>
> I threw in some debug printks right before that BUG_ON.
> it's always this:
>
> skb->len=31 skb->data_len=0 offset:30 total_len:9

Clearly we fail because 30 > 31 - 2, seems 'offset' is not correct here,
off-by-one?

^ permalink raw reply

* Re: [PATCH net] be2net: Increase skb headroom size to 256 bytes
From: David Miller @ 2016-12-20 19:30 UTC (permalink / raw)
  To: suresh.reddy; +Cc: netdev, kalesh-anakkur.purayil
In-Reply-To: <20161220151430.11115-1-suresh.reddy@broadcom.com>

From: Suresh Reddy <suresh.reddy@broadcom.com>
Date: Tue, 20 Dec 2016 10:14:30 -0500

> From: Kalesh A P <kalesh-anakkur.purayil@broadcom.com>
> 
> The driver currently allocates 128 bytes of skb headroom.
> This was found to be insufficient with some configurations
> like Geneve tunnels, which resulted in skb head reallocations.
> 
> Increase the headroom to 256 bytes to fix this.
> 
> Signed-off-by: Kalesh A P <kalesh-anakkur.purayil@broadcom.com>
> Signed-off-by: Suresh Reddy <suresh.reddy@broadcom.com>

Adding 128 bytes of headroom just for geneve seems excessive.

Do you really need to add that much?

^ permalink raw reply

* Re: [PATCH] net/mlx5: use rb_entry()
From: David Miller @ 2016-12-20 19:23 UTC (permalink / raw)
  To: geliangtang; +Cc: saeedm, matanb, leonro, netdev, linux-rdma, linux-kernel
In-Reply-To: <8443fa3fa03d82c2b829375d8020762e5236dc6d.1482203930.git.geliangtang@gmail.com>

From: Geliang Tang <geliangtang@gmail.com>
Date: Tue, 20 Dec 2016 22:02:14 +0800

> To make the code clearer, use rb_entry() instead of container_of() to
> deal with rbtree.
> 
> Signed-off-by: Geliang Tang <geliangtang@gmail.com>

Applied.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox