Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v3 11/12] net: stmmac: Add support for VLAN Insertion Offload
From: Jose Abreu @ 2019-08-17 18:54 UTC (permalink / raw)
  To: netdev
  Cc: Joao Pinto, Jakub Kicinski, Jose Abreu, Giuseppe Cavallaro,
	Alexandre Torgue, David S. Miller, Maxime Coquelin, linux-stm32,
	linux-arm-kernel, linux-kernel
In-Reply-To: <cover.1566067802.git.joabreu@synopsys.com>

Adds the logic to insert a given VLAN ID in a packet. This is offloaded
to HW and its descriptor based. For now, only XGMAC implements the
necessary callbacks.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>

---
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Jose Abreu <joabreu@synopsys.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Maxime Coquelin <mcoquelin.stm32@gmail.com>
Cc: netdev@vger.kernel.org
Cc: linux-stm32@st-md-mailman.stormreply.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/ethernet/stmicro/stmmac/common.h       |  8 ++++
 drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h     | 15 +++++++
 .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c    | 14 ++++++
 .../net/ethernet/stmicro/stmmac/dwxgmac2_descs.c   | 35 +++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c |  2 +
 drivers/net/ethernet/stmicro/stmmac/hwif.h         | 10 +++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 52 +++++++++++++++++++++-
 7 files changed, 135 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 1303ec81fd3d..49aa56ca09cc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -358,6 +358,8 @@ struct dma_features {
 	unsigned int rssen;
 	unsigned int vlhash;
 	unsigned int sphen;
+	unsigned int vlins;
+	unsigned int dvlan;
 };
 
 /* GMAC TX FIFO is 8K, Rx FIFO is 16K */
@@ -389,6 +391,12 @@ struct dma_features {
 #define STMMAC_RSS_HASH_KEY_SIZE	40
 #define STMMAC_RSS_MAX_TABLE_SIZE	256
 
+/* VLAN */
+#define STMMAC_VLAN_NONE	0x0
+#define STMMAC_VLAN_REMOVE	0x1
+#define STMMAC_VLAN_INSERT	0x2
+#define STMMAC_VLAN_REPLACE	0x3
+
 extern const struct stmmac_desc_ops enh_desc_ops;
 extern const struct stmmac_desc_ops ndesc_ops;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
index 79c145ba25a8..7357b8bdc128 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
@@ -63,6 +63,11 @@
 #define XGMAC_VLAN_ETV			BIT(16)
 #define XGMAC_VLAN_VID			GENMASK(15, 0)
 #define XGMAC_VLAN_HASH_TABLE		0x00000058
+#define XGMAC_VLAN_INCL			0x00000060
+#define XGMAC_VLAN_VLTI			BIT(20)
+#define XGMAC_VLAN_CSVL			BIT(19)
+#define XGMAC_VLAN_VLC			GENMASK(17, 16)
+#define XGMAC_VLAN_VLC_SHIFT		16
 #define XGMAC_RXQ_CTRL0			0x000000a0
 #define XGMAC_RXQEN(x)			GENMASK((x) * 2 + 1, (x) * 2)
 #define XGMAC_RXQEN_SHIFT(x)		((x) * 2)
@@ -128,6 +133,7 @@
 #define XGMAC_HWFEAT_RXQCNT		GENMASK(3, 0)
 #define XGMAC_HW_FEATURE3		0x00000128
 #define XGMAC_HWFEAT_ASP		GENMASK(15, 14)
+#define XGMAC_HWFEAT_DVLAN		BIT(13)
 #define XGMAC_HWFEAT_FRPES		GENMASK(12, 11)
 #define XGMAC_HWFEAT_FRPPB		GENMASK(10, 9)
 #define XGMAC_HWFEAT_FRPSEL		BIT(3)
@@ -337,10 +343,14 @@
 #define XGMAC_REGSIZE			((0x0000317c + (0x80 * 15)) / 4)
 
 /* Descriptors */
+#define XGMAC_TDES2_IVT			GENMASK(31, 16)
+#define XGMAC_TDES2_IVT_SHIFT		16
 #define XGMAC_TDES2_IOC			BIT(31)
 #define XGMAC_TDES2_TTSE		BIT(30)
 #define XGMAC_TDES2_B2L			GENMASK(29, 16)
 #define XGMAC_TDES2_B2L_SHIFT		16
+#define XGMAC_TDES2_VTIR		GENMASK(15, 14)
+#define XGMAC_TDES2_VTIR_SHIFT		14
 #define XGMAC_TDES2_B1L			GENMASK(13, 0)
 #define XGMAC_TDES3_OWN			BIT(31)
 #define XGMAC_TDES3_CTXT		BIT(30)
@@ -353,10 +363,15 @@
 #define XGMAC_TDES3_SAIC_SHIFT		23
 #define XGMAC_TDES3_THL			GENMASK(22, 19)
 #define XGMAC_TDES3_THL_SHIFT		19
+#define XGMAC_TDES3_IVTIR		GENMASK(19, 18)
+#define XGMAC_TDES3_IVTIR_SHIFT		18
 #define XGMAC_TDES3_TSE			BIT(18)
+#define XGMAC_TDES3_IVLTV		BIT(17)
 #define XGMAC_TDES3_CIC			GENMASK(17, 16)
 #define XGMAC_TDES3_CIC_SHIFT		16
 #define XGMAC_TDES3_TPL			GENMASK(17, 0)
+#define XGMAC_TDES3_VLTV		BIT(16)
+#define XGMAC_TDES3_VT			GENMASK(15, 0)
 #define XGMAC_TDES3_FL			GENMASK(14, 0)
 #define XGMAC_RDES2_HL			GENMASK(9, 0)
 #define XGMAC_RDES3_OWN			BIT(31)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
index d8483d088711..e534a3aaf4a3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
@@ -1150,6 +1150,19 @@ static void dwxgmac2_sarc_configure(void __iomem *ioaddr, int val)
 	writel(value, ioaddr + XGMAC_TX_CONFIG);
 }
 
+static void dwxgmac2_enable_vlan(struct mac_device_info *hw, u32 type)
+{
+	void __iomem *ioaddr = hw->pcsr;
+	u32 value;
+
+	value = readl(ioaddr + XGMAC_VLAN_INCL);
+	value |= XGMAC_VLAN_VLTI;
+	value |= XGMAC_VLAN_CSVL; /* Only use SVLAN */
+	value &= ~XGMAC_VLAN_VLC;
+	value |= (type << XGMAC_VLAN_VLC_SHIFT) & XGMAC_VLAN_VLC;
+	writel(value, ioaddr + XGMAC_VLAN_INCL);
+}
+
 const struct stmmac_ops dwxgmac210_ops = {
 	.core_init = dwxgmac2_core_init,
 	.set_mac = dwxgmac2_set_mac,
@@ -1189,6 +1202,7 @@ const struct stmmac_ops dwxgmac210_ops = {
 	.get_mac_tx_timestamp = dwxgmac2_get_mac_tx_timestamp,
 	.flex_pps_config = dwxgmac2_flex_pps_config,
 	.sarc_configure = dwxgmac2_sarc_configure,
+	.enable_vlan = dwxgmac2_enable_vlan,
 };
 
 int dwxgmac2_setup(struct stmmac_priv *priv)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c
index ab11e73ac6b1..ae48154f933c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c
@@ -305,6 +305,39 @@ static void dwxgmac2_set_sarc(struct dma_desc *p, u32 sarc_type)
 	p->des3 |= cpu_to_le32(sarc_type & XGMAC_TDES3_SAIC);
 }
 
+static void dwxgmac2_set_vlan_tag(struct dma_desc *p, u16 tag, u16 inner_tag,
+				  u32 inner_type)
+{
+	p->des0 = 0;
+	p->des1 = 0;
+	p->des2 = 0;
+	p->des3 = 0;
+
+	/* Inner VLAN */
+	if (inner_type) {
+		u32 des = inner_tag << XGMAC_TDES2_IVT_SHIFT;
+
+		des &= XGMAC_TDES2_IVT;
+		p->des2 = cpu_to_le32(des);
+
+		des = inner_type << XGMAC_TDES3_IVTIR_SHIFT;
+		des &= XGMAC_TDES3_IVTIR;
+		p->des3 = cpu_to_le32(des | XGMAC_TDES3_IVLTV);
+	}
+
+	/* Outer VLAN */
+	p->des3 |= cpu_to_le32(tag & XGMAC_TDES3_VT);
+	p->des3 |= cpu_to_le32(XGMAC_TDES3_VLTV);
+
+	p->des3 |= cpu_to_le32(XGMAC_TDES3_CTXT);
+}
+
+static void dwxgmac2_set_vlan(struct dma_desc *p, u32 type)
+{
+	type <<= XGMAC_TDES2_VTIR_SHIFT;
+	p->des2 |= cpu_to_le32(type & XGMAC_TDES2_VTIR);
+}
+
 const struct stmmac_desc_ops dwxgmac210_desc_ops = {
 	.tx_status = dwxgmac2_get_tx_status,
 	.rx_status = dwxgmac2_get_rx_status,
@@ -332,4 +365,6 @@ const struct stmmac_desc_ops dwxgmac210_desc_ops = {
 	.get_rx_header_len = dwxgmac2_get_rx_header_len,
 	.set_sec_addr = dwxgmac2_set_sec_addr,
 	.set_sarc = dwxgmac2_set_sarc,
+	.set_vlan_tag = dwxgmac2_set_vlan_tag,
+	.set_vlan = dwxgmac2_set_vlan,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
index f2d5901fbaff..64956465c030 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
@@ -359,6 +359,7 @@ static void dwxgmac2_get_hw_feature(void __iomem *ioaddr,
 
 	/*  MAC HW feature 0 */
 	hw_cap = readl(ioaddr + XGMAC_HW_FEATURE0);
+	dma_cap->vlins = (hw_cap & XGMAC_HWFEAT_SAVLANINS) >> 27;
 	dma_cap->rx_coe = (hw_cap & XGMAC_HWFEAT_RXCOESEL) >> 16;
 	dma_cap->tx_coe = (hw_cap & XGMAC_HWFEAT_TXCOESEL) >> 14;
 	dma_cap->eee = (hw_cap & XGMAC_HWFEAT_EEESEL) >> 13;
@@ -413,6 +414,7 @@ static void dwxgmac2_get_hw_feature(void __iomem *ioaddr,
 	/* MAC HW feature 3 */
 	hw_cap = readl(ioaddr + XGMAC_HW_FEATURE3);
 	dma_cap->asp = (hw_cap & XGMAC_HWFEAT_ASP) >> 14;
+	dma_cap->dvlan = (hw_cap & XGMAC_HWFEAT_DVLAN) >> 13;
 	dma_cap->frpes = (hw_cap & XGMAC_HWFEAT_FRPES) >> 11;
 	dma_cap->frpbs = (hw_cap & XGMAC_HWFEAT_FRPPB) >> 9;
 	dma_cap->frpsel = (hw_cap & XGMAC_HWFEAT_FRPSEL) >> 3;
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index e54864cde01b..9435b312495d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -92,6 +92,9 @@ struct stmmac_desc_ops {
 	int (*get_rx_header_len)(struct dma_desc *p, unsigned int *len);
 	void (*set_sec_addr)(struct dma_desc *p, dma_addr_t addr);
 	void (*set_sarc)(struct dma_desc *p, u32 sarc_type);
+	void (*set_vlan_tag)(struct dma_desc *p, u16 tag, u16 inner_tag,
+			     u32 inner_type);
+	void (*set_vlan)(struct dma_desc *p, u32 type);
 };
 
 #define stmmac_init_rx_desc(__priv, __args...) \
@@ -150,6 +153,10 @@ struct stmmac_desc_ops {
 	stmmac_do_void_callback(__priv, desc, set_sec_addr, __args)
 #define stmmac_set_desc_sarc(__priv, __args...) \
 	stmmac_do_void_callback(__priv, desc, set_sarc, __args)
+#define stmmac_set_desc_vlan_tag(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_vlan_tag, __args)
+#define stmmac_set_desc_vlan(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_vlan, __args)
 
 struct stmmac_dma_cfg;
 struct dma_features;
@@ -351,6 +358,7 @@ struct stmmac_ops {
 	/* VLAN */
 	void (*update_vlan_hash)(struct mac_device_info *hw, u32 hash,
 				 bool is_double);
+	void (*enable_vlan)(struct mac_device_info *hw, u32 type);
 	/* TX Timestamp */
 	int (*get_mac_tx_timestamp)(struct mac_device_info *hw, u64 *ts);
 	/* Source Address Insertion / Replacement */
@@ -429,6 +437,8 @@ struct stmmac_ops {
 	stmmac_do_callback(__priv, mac, rss_configure, __args)
 #define stmmac_update_vlan_hash(__priv, __args...) \
 	stmmac_do_void_callback(__priv, mac, update_vlan_hash, __args)
+#define stmmac_enable_vlan(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, enable_vlan, __args)
 #define stmmac_get_mac_tx_timestamp(__priv, __args...) \
 	stmmac_do_callback(__priv, mac, get_mac_tx_timestamp, __args)
 #define stmmac_sarc_configure(__priv, __args...) \
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 8e38b053d9ab..bd1078433448 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2617,6 +2617,10 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 			stmmac_enable_sph(priv, priv->ioaddr, 1, chan);
 	}
 
+	/* VLAN Tag Insertion */
+	if (priv->dma_cap.vlins)
+		stmmac_enable_vlan(priv, priv->hw, STMMAC_VLAN_INSERT);
+
 	/* Start the ball rolling... */
 	stmmac_start_all_dma(priv);
 
@@ -2794,6 +2798,33 @@ static int stmmac_release(struct net_device *dev)
 	return 0;
 }
 
+static bool stmmac_vlan_insert(struct stmmac_priv *priv, struct sk_buff *skb,
+			       struct stmmac_tx_queue *tx_q)
+{
+	u16 tag = 0x0, inner_tag = 0x0;
+	u32 inner_type = 0x0;
+	struct dma_desc *p;
+
+	if (!priv->dma_cap.vlins)
+		return false;
+	if (!skb_vlan_tag_present(skb))
+		return false;
+	if (skb->vlan_proto == htons(ETH_P_8021AD)) {
+		inner_tag = skb_vlan_tag_get(skb);
+		inner_type = STMMAC_VLAN_INSERT;
+	}
+
+	tag = skb_vlan_tag_get(skb);
+
+	p = tx_q->dma_tx + tx_q->cur_tx;
+	if (stmmac_set_desc_vlan_tag(priv, p, tag, inner_tag, inner_type))
+		return false;
+
+	stmmac_set_tx_owner(priv, p);
+	tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, DMA_TX_SIZE);
+	return true;
+}
+
 /**
  *  stmmac_tso_allocator - close entry point of the driver
  *  @priv: driver private structure
@@ -2873,12 +2904,13 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct stmmac_priv *priv = netdev_priv(dev);
 	int nfrags = skb_shinfo(skb)->nr_frags;
 	u32 queue = skb_get_queue_mapping(skb);
-	unsigned int first_entry;
 	struct stmmac_tx_queue *tx_q;
+	unsigned int first_entry;
 	int tmp_pay_len = 0;
 	u32 pay_len, mss;
 	u8 proto_hdr_len;
 	dma_addr_t des;
+	bool has_vlan;
 	int i;
 
 	tx_q = &priv->tx_queue[queue];
@@ -2920,12 +2952,18 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 			skb->data_len);
 	}
 
+	/* Check if VLAN can be inserted by HW */
+	has_vlan = stmmac_vlan_insert(priv, skb, tx_q);
+
 	first_entry = tx_q->cur_tx;
 	WARN_ON(tx_q->tx_skbuff[first_entry]);
 
 	desc = tx_q->dma_tx + first_entry;
 	first = desc;
 
+	if (has_vlan)
+		stmmac_set_desc_vlan(priv, first, STMMAC_VLAN_INSERT);
+
 	/* first descriptor: fill Headers on Buf1 */
 	des = dma_map_single(priv->device, skb->data, skb_headlen(skb),
 			     DMA_TO_DEVICE);
@@ -3085,6 +3123,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	unsigned int first_entry;
 	unsigned int enh_desc;
 	dma_addr_t des;
+	bool has_vlan;
 	int entry;
 
 	tx_q = &priv->tx_queue[queue];
@@ -3110,6 +3149,9 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 		return NETDEV_TX_BUSY;
 	}
 
+	/* Check if VLAN can be inserted by HW */
+	has_vlan = stmmac_vlan_insert(priv, skb, tx_q);
+
 	entry = tx_q->cur_tx;
 	first_entry = entry;
 	WARN_ON(tx_q->tx_skbuff[first_entry]);
@@ -3123,6 +3165,9 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	first = desc;
 
+	if (has_vlan)
+		stmmac_set_desc_vlan(priv, first, STMMAC_VLAN_INSERT);
+
 	enh_desc = priv->plat->enh_desc;
 	/* To program the descriptors according to the size of the frame */
 	if (enh_desc)
@@ -4473,6 +4518,11 @@ int stmmac_dvr_probe(struct device *device,
 		ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
 		ndev->features |= NETIF_F_HW_VLAN_STAG_FILTER;
 	}
+	if (priv->dma_cap.vlins) {
+		ndev->features |= NETIF_F_HW_VLAN_CTAG_TX;
+		if (priv->dma_cap.dvlan)
+			ndev->features |= NETIF_F_HW_VLAN_STAG_TX;
+	}
 #endif
 	priv->msg_enable = netif_msg_init(debug, default_msg_level);
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH net-next v3 03/12] net: stmmac: xgmac: Correctly return that RX descriptor is not last one
From: Jose Abreu @ 2019-08-17 18:54 UTC (permalink / raw)
  To: netdev
  Cc: Joao Pinto, Jakub Kicinski, Jose Abreu, Giuseppe Cavallaro,
	Alexandre Torgue, David S. Miller, Maxime Coquelin, linux-stm32,
	linux-arm-kernel, linux-kernel
In-Reply-To: <cover.1566067802.git.joabreu@synopsys.com>

Return the correct value when RX descriptor is not the last one.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>

---
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Jose Abreu <joabreu@synopsys.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Maxime Coquelin <mcoquelin.stm32@gmail.com>
Cc: netdev@vger.kernel.org
Cc: linux-stm32@st-md-mailman.stormreply.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c
index 58b69fa97837..2c1ed8c2a9d3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_descs.c
@@ -26,16 +26,15 @@ static int dwxgmac2_get_rx_status(void *data, struct stmmac_extra_stats *x,
 				  struct dma_desc *p)
 {
 	unsigned int rdes3 = le32_to_cpu(p->des3);
-	int ret = good_frame;
 
 	if (unlikely(rdes3 & XGMAC_RDES3_OWN))
 		return dma_own;
 	if (likely(!(rdes3 & XGMAC_RDES3_LD)))
+		return rx_not_ls;
+	if (unlikely((rdes3 & XGMAC_RDES3_ES) && (rdes3 & XGMAC_RDES3_LD)))
 		return discard_frame;
-	if (unlikely(rdes3 & XGMAC_RDES3_ES))
-		ret = discard_frame;
 
-	return ret;
+	return good_frame;
 }
 
 static int dwxgmac2_get_tx_len(struct dma_desc *p)
-- 
2.7.4


^ permalink raw reply related

* [PATCH net-next v3 00/12] net: stmmac: Improvements for -next
From: Jose Abreu @ 2019-08-17 18:54 UTC (permalink / raw)
  To: netdev
  Cc: Joao Pinto, Jakub Kicinski, Jose Abreu, Giuseppe Cavallaro,
	Alexandre Torgue, David S. Miller, Maxime Coquelin, linux-stm32,
	linux-arm-kernel, linux-kernel

Couple of improvements for -next tree. More info in commit logs.

---
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Jose Abreu <joabreu@synopsys.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Maxime Coquelin <mcoquelin.stm32@gmail.com>
Cc: netdev@vger.kernel.org
Cc: linux-stm32@st-md-mailman.stormreply.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---

Jose Abreu (12):
  net: stmmac: Get correct timestamp values from XGMAC
  net: stmmac: Prepare to add Split Header support
  net: stmmac: xgmac: Correctly return that RX descriptor is not last
    one
  net: stmmac: Add Split Header support and enable it in XGMAC cores
  net: stmmac: Add a counter for Split Header packets
  net: stmmac: dwxgmac: Add Flexible PPS support
  net: stmmac: Add ethtool register dump for XGMAC cores
  net: stmmac: Add support for SA Insertion/Replacement in XGMAC cores
  net: stmmac: selftests: Add tests for SA Insertion/Replacement
  net: stmmac: xgmac: Add EEE support
  net: stmmac: Add support for VLAN Insertion Offload
  net: stmmac: selftests: Add selftest for VLAN TX Offload

 drivers/net/ethernet/stmicro/stmmac/common.h       |  10 +
 drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h     |  56 ++++
 .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c    | 182 ++++++++++++-
 .../net/ethernet/stmicro/stmmac/dwxgmac2_descs.c   |  85 +++++-
 drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c |  31 ++-
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |  30 +++
 drivers/net/ethernet/stmicro/stmmac/stmmac.h       |  10 +
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  24 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 286 ++++++++++++++++-----
 .../net/ethernet/stmicro/stmmac/stmmac_selftests.c | 194 +++++++++++++-
 10 files changed, 817 insertions(+), 91 deletions(-)

-- 
2.7.4


^ permalink raw reply

* [PATCH net-next v3 06/12] net: stmmac: dwxgmac: Add Flexible PPS support
From: Jose Abreu @ 2019-08-17 18:54 UTC (permalink / raw)
  To: netdev
  Cc: Joao Pinto, Jakub Kicinski, Jose Abreu, Giuseppe Cavallaro,
	Alexandre Torgue, David S. Miller, Maxime Coquelin, linux-stm32,
	linux-arm-kernel, linux-kernel
In-Reply-To: <cover.1566067802.git.joabreu@synopsys.com>

Add the support for Flexible PPS in XGMAC cores.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>

---
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Jose Abreu <joabreu@synopsys.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Maxime Coquelin <mcoquelin.stm32@gmail.com>
Cc: netdev@vger.kernel.org
Cc: linux-stm32@st-md-mailman.stormreply.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h     | 19 ++++++++
 .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c    | 56 ++++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
index 995d533b9316..dbac63972faf 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
@@ -149,6 +149,25 @@
 #define XGMAC_TXTIMESTAMP_NSEC		0x00000d30
 #define XGMAC_TXTSSTSLO			GENMASK(30, 0)
 #define XGMAC_TXTIMESTAMP_SEC		0x00000d34
+#define XGMAC_PPS_CONTROL		0x00000d70
+#define XGMAC_PPS_MAXIDX(x)		((((x) + 1) * 8) - 1)
+#define XGMAC_PPS_MINIDX(x)		((x) * 8)
+#define XGMAC_PPSx_MASK(x)		\
+	GENMASK(XGMAC_PPS_MAXIDX(x), XGMAC_PPS_MINIDX(x))
+#define XGMAC_TRGTMODSELx(x, val)	\
+	GENMASK(XGMAC_PPS_MAXIDX(x) - 1, XGMAC_PPS_MAXIDX(x) - 2) & \
+	((val) << (XGMAC_PPS_MAXIDX(x) - 2))
+#define XGMAC_PPSCMDx(x, val)		\
+	GENMASK(XGMAC_PPS_MINIDX(x) + 3, XGMAC_PPS_MINIDX(x)) & \
+	((val) << XGMAC_PPS_MINIDX(x))
+#define XGMAC_PPSCMD_START		0x2
+#define XGMAC_PPSCMD_STOP		0x5
+#define XGMAC_PPSEN0			BIT(4)
+#define XGMAC_PPSx_TARGET_TIME_SEC(x)	(0x00000d80 + (x) * 0x10)
+#define XGMAC_PPSx_TARGET_TIME_NSEC(x)	(0x00000d84 + (x) * 0x10)
+#define XGMAC_TRGTBUSY0			BIT(31)
+#define XGMAC_PPSx_INTERVAL(x)		(0x00000d88 + (x) * 0x10)
+#define XGMAC_PPSx_WIDTH(x)		(0x00000d8c + (x) * 0x10)
 
 /* MTL Registers */
 #define XGMAC_MTL_OPMODE		0x00001000
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
index ba5183f38f84..f843e3640f50 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
@@ -8,6 +8,7 @@
 #include <linux/crc32.h>
 #include <linux/iopoll.h>
 #include "stmmac.h"
+#include "stmmac_ptp.h"
 #include "dwxgmac2.h"
 
 static void dwxgmac2_core_init(struct mac_device_info *hw,
@@ -1011,6 +1012,60 @@ static int dwxgmac2_get_mac_tx_timestamp(struct mac_device_info *hw, u64 *ts)
 	return 0;
 }
 
+static int dwxgmac2_flex_pps_config(void __iomem *ioaddr, int index,
+				    struct stmmac_pps_cfg *cfg, bool enable,
+				    u32 sub_second_inc, u32 systime_flags)
+{
+	u32 tnsec = readl(ioaddr + XGMAC_PPSx_TARGET_TIME_NSEC(index));
+	u32 val = readl(ioaddr + XGMAC_PPS_CONTROL);
+	u64 period;
+
+	if (!cfg->available)
+		return -EINVAL;
+	if (tnsec & XGMAC_TRGTBUSY0)
+		return -EBUSY;
+	if (!sub_second_inc || !systime_flags)
+		return -EINVAL;
+
+	val &= ~XGMAC_PPSx_MASK(index);
+
+	if (!enable) {
+		val |= XGMAC_PPSCMDx(index, XGMAC_PPSCMD_STOP);
+		writel(val, ioaddr + XGMAC_PPS_CONTROL);
+		return 0;
+	}
+
+	val |= XGMAC_PPSCMDx(index, XGMAC_PPSCMD_START);
+	val |= XGMAC_TRGTMODSELx(index, XGMAC_PPSCMD_START);
+	val |= XGMAC_PPSEN0;
+
+	writel(cfg->start.tv_sec, ioaddr + XGMAC_PPSx_TARGET_TIME_SEC(index));
+
+	if (!(systime_flags & PTP_TCR_TSCTRLSSR))
+		cfg->start.tv_nsec = (cfg->start.tv_nsec * 1000) / 465;
+	writel(cfg->start.tv_nsec, ioaddr + XGMAC_PPSx_TARGET_TIME_NSEC(index));
+
+	period = cfg->period.tv_sec * 1000000000;
+	period += cfg->period.tv_nsec;
+
+	do_div(period, sub_second_inc);
+
+	if (period <= 1)
+		return -EINVAL;
+
+	writel(period - 1, ioaddr + XGMAC_PPSx_INTERVAL(index));
+
+	period >>= 1;
+	if (period <= 1)
+		return -EINVAL;
+
+	writel(period - 1, ioaddr + XGMAC_PPSx_WIDTH(index));
+
+	/* Finally, activate it */
+	writel(val, ioaddr + XGMAC_PPS_CONTROL);
+	return 0;
+}
+
 const struct stmmac_ops dwxgmac210_ops = {
 	.core_init = dwxgmac2_core_init,
 	.set_mac = dwxgmac2_set_mac,
@@ -1048,6 +1103,7 @@ const struct stmmac_ops dwxgmac210_ops = {
 	.update_vlan_hash = dwxgmac2_update_vlan_hash,
 	.rxp_config = dwxgmac3_rxp_config,
 	.get_mac_tx_timestamp = dwxgmac2_get_mac_tx_timestamp,
+	.flex_pps_config = dwxgmac2_flex_pps_config,
 };
 
 int dwxgmac2_setup(struct stmmac_priv *priv)
-- 
2.7.4


^ permalink raw reply related

* Re: Unable to create htb tc classes more than 64K
From: Cong Wang @ 2019-08-17 18:24 UTC (permalink / raw)
  To: Akshat Kakkar; +Cc: NetFilter, lartc, netdev
In-Reply-To: <CAA5aLPiqyhnWjY7A3xsaNJ71sDOf=Rqej8d+7=_PyJPmV9uApA@mail.gmail.com>

On Sat, Aug 17, 2019 at 5:46 AM Akshat Kakkar <akshat.1984@gmail.com> wrote:
>
> I agree that it is because of 16bit of minor I'd of class which
> restricts it to 64K.
> Point is, can we use multilevel qdisc and classes to extend it to more
> no. of classes i.e. to more than 64K classes

If your goal is merely having as many classes as you can, then yes.


>
> One scheme can be like
>                                       100: root qdisc
>                                          |
>                                        / | \
>                                      /   |   \
>                                    /     |     \
>                                  /       |       \
>                           100:1   100:2   100:3        child classes
>                             |              |           |
>                             |              |           |
>                             |              |           |
>                            1:            2:          3:     qdisc
>                            / \           / \           / \
>                          /     \                     /     \
>                       1:1    1:2             3:1      3:2 leaf classes
>
> with all qdisc and classes defined as htb.
>
> Is this correct approach? Any alternative??

Again, depends on what your goal is.


>
> Besides, in order to direct traffic to leaf classes 1:1, 1:2, 2:1,
> 2:2, 3:1, 3:2 .... , instead of using filters I am using ipset with
> skbprio and iptables map-set match rule.
> But even after all this it don't work. Why?

Again, the filters you use to classify the packets could only
work for the classes on the same level, no the next level.


Thanks.

^ permalink raw reply

* Re: [PATCH RFC net-next 3/3] net: dsa: mv88e6xxx: setup SERDES irq also for CPU/DSA ports
From: Marek Behun @ 2019-08-17 18:15 UTC (permalink / raw)
  To: Vivien Didelot; +Cc: netdev, Andrew Lunn, Vladimir Oltean, Florian Fainelli
In-Reply-To: <20190817200342.567c13c4@nic.cz>

On Sat, 17 Aug 2019 20:03:42 +0200
Marek Behun <marek.behun@nic.cz> wrote:

> One way would be to rename the mv88e6xxx_setup_port function to
> mv88e6xxx_setup_port_regs, or mv88e6xxx_port_pre_setup, or something
> like that. Would the names mv88e6xxx_port_setup and
> mv88e6xxx_setup_port_regs still be very confusing and error prone?
> I think maybe yes...
> 
> Other solution would be to, instead of the .port_setup()
> and .port_teardown() DSA ops, create the .after_setup()
> and .before_teardown() ops I mentioned in the previous mail.
> 
> And yet another (in my opinion very improper) solution could be that
> the .setup() method could call dsa_port_setup() from within itself, to
> ensure that the needed structres exist.

I thought of another solution, one that does not need new DSA
operations. What if dsa_port_enable was called for CPU/DSA ports after
in dsa_port_setup_switches, after all ports are setup, and
dsa_port_disable called for CPU/DSA ports in dsa_port_teardown_switches?

This seems to me as cleaner solution.

Marek

^ permalink raw reply

* Re: [PATCH RFC net-next 3/3] net: dsa: mv88e6xxx: setup SERDES irq also for CPU/DSA ports
From: Marek Behun @ 2019-08-17 18:03 UTC (permalink / raw)
  To: Vivien Didelot; +Cc: netdev, Andrew Lunn, Vladimir Oltean, Florian Fainelli
In-Reply-To: <20190816190537.GB14714@t480s.localdomain>

Hi Vivien,

On Fri, 16 Aug 2019 19:05:37 -0400
Vivien Didelot <vivien.didelot@gmail.com> wrote:

> I think the DSA switch port_setup/port_teardown operations are fine, but the
> idea would be that the drivers must no longer setup their ports directly
> in their .setup function. So for mv88e6xxx precisely, we should rename
> mv88e6xxx_setup_port to mv88e6xxx_port_setup, and move all the port-related
> code from mv88e6xxx_setup into mv88e6xxx_port_setup.

I looked into the driver, and found out that mv88e6xxx_setup calls many
other setup functions after the calls to mv88e6xxx_setup_port for each
port:
   1. setup errata
   2. cache cmode
   3. for each port setup_port
   4. irl setup
   5. mac setup
   6. phy setup
   7. vtu setup
   8. pvt setup
   9. atu setup
  10. broadcast setuo
  11. pot setup
  12. rmu setup
  13. rsvd2cpu setup
  14. trunk setup
  15. devmap setup
  16. pri setup
  17. ptp setup
  18. hwtstamp setup
  19. stats setup

The problem is that some of these steps (after step 3) may depend on
some of the work done by step 3. Some of these functions iterate again
over the port array (mv88e6xxx_hwtstamp_setup, for example).
We cannot simply move step 3 to be called from DSA after
mv88e6xxx_setup.

I now do not know exactly what to do about the error prone naming of
setup_port vs port_setup.

One way would be to rename the mv88e6xxx_setup_port function to
mv88e6xxx_setup_port_regs, or mv88e6xxx_port_pre_setup, or something
like that. Would the names mv88e6xxx_port_setup and
mv88e6xxx_setup_port_regs still be very confusing and error prone?
I think maybe yes...

Other solution would be to, instead of the .port_setup()
and .port_teardown() DSA ops, create the .after_setup()
and .before_teardown() ops I mentioned in the previous mail.

And yet another (in my opinion very improper) solution could be that
the .setup() method could call dsa_port_setup() from within itself, to
ensure that the needed structres exist.

Please let me know what you think about this.

The first solution to me currently seems as the easiest.

Marek

^ permalink raw reply

* Re: [PATCH net] tcp: make sure EPOLLOUT wont be missed
From: Neal Cardwell @ 2019-08-17 17:10 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, netdev, Soheil Hassas Yeganeh, Eric Dumazet,
	Jason Baron, Vladimir Rutsky
In-Reply-To: <20190817042622.91497-1-edumazet@google.com>

On Sat, Aug 17, 2019 at 12:26 AM Eric Dumazet <edumazet@google.com> wrote:
>
> As Jason Baron explained in commit 790ba4566c1a ("tcp: set SOCK_NOSPACE
> under memory pressure"), it is crucial we properly set SOCK_NOSPACE
> when needed.
>
> However, Jason patch had a bug, because the 'nonblocking' status
> as far as sk_stream_wait_memory() is concerned is governed
> by MSG_DONTWAIT flag passed at sendmsg() time :
>
>     long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
>
> So it is very possible that tcp sendmsg() calls sk_stream_wait_memory(),
> and that sk_stream_wait_memory() returns -EAGAIN with SOCK_NOSPACE
> cleared, if sk->sk_sndtimeo has been set to a small (but not zero)
> value.
>
> This patch removes the 'noblock' variable since we must always
> set SOCK_NOSPACE if -EAGAIN is returned.
>
> It also renames the do_nonblock label since we might reach this
> code path even if we were in blocking mode.
>
> Fixes: 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Jason Baron <jbaron@akamai.com>
> Reported-by: Vladimir Rutsky  <rutsky@google.com>
> ---
>  net/core/stream.c | 16 +++++++++-------
>  1 file changed, 9 insertions(+), 7 deletions(-)

Acked-by: Neal Cardwell <ncardwell@google.com>

Thanks, Eric!

neal

^ permalink raw reply

* Re: 5.3-rc3-ish VM crash: RIP: 0010:tcp_trim_head+0x20/0xe0
From: Eric Dumazet @ 2019-08-17 16:35 UTC (permalink / raw)
  To: Sander Eikelenboom, Eric Dumazet, netdev, linux-kernel
In-Reply-To: <674de4ab-c37f-7787-f95a-3ae0f52bc196@eikelenboom.it>



On 8/17/19 10:24 AM, Sander Eikelenboom wrote:
> On 12/08/2019 19:56, Eric Dumazet wrote:
>>
>>
>> On 8/12/19 2:50 PM, Sander Eikelenboom wrote:
>>> L.S.,
>>>
>>> While testing a somewhere-after-5.3-rc3 kernel (which included the latest net merge (33920f1ec5bf47c5c0a1d2113989bdd9dfb3fae9),
>>> one of my Xen VM's (which gets quite some network load) crashed.
>>> See below for the stacktrace.
>>>
>>> Unfortunately I haven't got a clear trigger, so bisection doesn't seem to be an option at the moment. 
>>> I haven't encountered this on 5.2, so it seems to be an regression against 5.2.
>>>
>>> Any ideas ?
>>>
>>> --
>>> Sander
>>>
>>>
>>> [16930.653595] general protection fault: 0000 [#1] SMP NOPTI
>>> [16930.653624] CPU: 0 PID: 3275 Comm: rsync Not tainted 5.3.0-rc3-20190809-doflr+ #1
>>> [16930.653657] RIP: 0010:tcp_trim_head+0x20/0xe0
>>> [16930.653677] Code: 2e 0f 1f 84 00 00 00 00 00 90 41 54 41 89 d4 55 48 89 fd 53 48 89 f3 f6 46 7e 01 74 2f 8b 86 bc 00 00 00 48 03 86 c0 00 00 00 <8b> 40 20 66 83 f8 01 74 19 31 d2 31 f6 b9 20 0a 00 00 48 89 df e8
>>> [16930.653741] RSP: 0000:ffffc90000003ad8 EFLAGS: 00010286
>>> [16930.653762] RAX: fffe888005bf62c0 RBX: ffff8880115fb800 RCX: 000000008010000b
>>
>> crash in " mov    0x20(%rax),%eax"   and RAX=fffe888005bf62c0 (not a valid kernel address)
>>
>> Look like one bit corruption maybe.
>>
>> Nothing comes to mind really between 5.2 and 53 that could explain this.
>>
>>> [16930.653791] RDX: 00000000000005a0 RSI: ffff8880115fb800 RDI: ffff888016b00880
>>> [16930.653819] RBP: ffff888016b00880 R08: 0000000000000001 R09: 0000000000000000
>>> [16930.653848] R10: ffff88800ae00800 R11: 00000000bfe632e6 R12: 00000000000005a0
>>> [16930.653875] R13: 0000000000000001 R14: 00000000bfe62d46 R15: 0000000000000004
>>> [16930.653913] FS:  00007fe71fe2cb80(0000) GS:ffff88801f200000(0000) knlGS:0000000000000000
>>> [16930.653943] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> [16930.653965] CR2: 000055de0f3e7000 CR3: 0000000011f32000 CR4: 00000000000006f0
>>> [16930.653993] Call Trace:
>>> [16930.654005]  <IRQ>
>>> [16930.654018]  tcp_ack+0xbb0/0x1230
>>> [16930.654033]  tcp_rcv_established+0x2e8/0x630
>>> [16930.654053]  tcp_v4_do_rcv+0x129/0x1d0
>>> [16930.654070]  tcp_v4_rcv+0xac9/0xcb0
>>> [16930.654088]  ip_protocol_deliver_rcu+0x27/0x1b0
>>> [16930.654109]  ip_local_deliver_finish+0x3f/0x50
>>> [16930.654128]  ip_local_deliver+0x4d/0xe0
>>> [16930.654145]  ? ip_protocol_deliver_rcu+0x1b0/0x1b0
>>> [16930.654163]  ip_rcv+0x4c/0xd0
>>> [16930.654179]  __netif_receive_skb_one_core+0x79/0x90
>>> [16930.654200]  netif_receive_skb_internal+0x2a/0xa0
>>> [16930.654219]  napi_gro_receive+0xe7/0x140
>>> [16930.654237]  xennet_poll+0x9be/0xae0
>>> [16930.654254]  net_rx_action+0x136/0x340
>>> [16930.654271]  __do_softirq+0xdd/0x2cf
>>> [16930.654287]  irq_exit+0x7a/0xa0
>>> [16930.654304]  xen_evtchn_do_upcall+0x27/0x40
>>> [16930.654320]  xen_hvm_callback_vector+0xf/0x20
>>> [16930.654339]  </IRQ>
>>> [16930.654349] RIP: 0033:0x55de0d87db99
>>> [16930.654364] Code: 00 00 48 89 7c 24 f8 45 39 fe 45 0f 42 fe 44 89 7c 24 f4 eb 09 0f 1f 40 00 83 e9 01 74 3e 89 f2 48 63 f8 4c 01 d2 44 38 1c 3a <75> 25 44 38 6c 3a ff 75 1e 41 0f b6 3c 24 40 38 3a 75 14 41 0f b6
>>> [16930.654432] RSP: 002b:00007ffd5531eec8 EFLAGS: 00000a87 ORIG_RAX: ffffffffffffff0c
>>> [16930.655004] RAX: 0000000000000002 RBX: 000055de0f3e8e50 RCX: 000000000000007f
>>> [16930.655034] RDX: 000055de0f3dc2d2 RSI: 0000000000003492 RDI: 0000000000000002
>>> [16930.655062] RBP: 0000000000007fff R08: 00000000000080ea R09: 00000000000001f0
>>> [16930.655089] R10: 000055de0f3d8e40 R11: 0000000000000094 R12: 000055de0f3e0f2a
>>> [16930.655116] R13: 0000000000000010 R14: 0000000000007f16 R15: 0000000000000080
>>> [16930.655144] Modules linked in:
>>> [16930.655200] ---[ end trace 533367c95501b645 ]---
>>> [16930.655223] RIP: 0010:tcp_trim_head+0x20/0xe0
>>> [16930.655243] Code: 2e 0f 1f 84 00 00 00 00 00 90 41 54 41 89 d4 55 48 89 fd 53 48 89 f3 f6 46 7e 01 74 2f 8b 86 bc 00 00 00 48 03 86 c0 00 00 00 <8b> 40 20 66 83 f8 01 74 19 31 d2 31 f6 b9 20 0a 00 00 48 89 df e8
>>> [16930.655312] RSP: 0000:ffffc90000003ad8 EFLAGS: 00010286
>>> [16930.655331] RAX: fffe888005bf62c0 RBX: ffff8880115fb800 RCX: 000000008010000b
>>> [16930.655360] RDX: 00000000000005a0 RSI: ffff8880115fb800 RDI: ffff888016b00880
>>> [16930.655387] RBP: ffff888016b00880 R08: 0000000000000001 R09: 0000000000000000
>>> [16930.655414] R10: ffff88800ae00800 R11: 00000000bfe632e6 R12: 00000000000005a0
>>> [16930.655441] R13: 0000000000000001 R14: 00000000bfe62d46 R15: 0000000000000004
>>> [16930.655475] FS:  00007fe71fe2cb80(0000) GS:ffff88801f200000(0000) knlGS:0000000000000000
>>> [16930.655502] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> [16930.655525] CR2: 000055de0f3e7000 CR3: 0000000011f32000 CR4: 00000000000006f0
>>> [16930.655553] Kernel panic - not syncing: Fatal exception in interrupt
>>> [16930.655789] Kernel Offset: disabled
>>>
> 
> Hi Eric,
> 
> Got another VM crash, with a slightly different stacktrace this time around.
> Still networking though.
> 
> --
> Sander
> 
> [112522.697498] general protection fault: 0000 [#1] SMP NOPTI
> [112522.697555] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-rc4-20190812-doflr+ #1
> [112522.697592] RIP: 0010:skb_shift+0x63/0x430
> [112522.697608] Code: bc 00 00 00 48 03 8f c0 00 00 00 f6 41 03 08 74 07 48 83 79 28 00 75 d0 8b 8e bc 00 00 00 48 03 8e c0 00 00 00 48 85 f6 74 0a <f6> 41 03 08 0f 85 09 03 00 00 49 89 fd 8b bf bc 00 00 00 41 89 



crash in "testb  $0x8,0x3(%rcx)"  with RCX==fffe8880117da6c0

Same strange looking address on x86_64

I have no idea.

> [112522.697673] RSP: 0018:ffffc900000039b0 EFLAGS: 00010286
> [112522.697693] RAX: 00000000000005a0 RBX: ffff8880117fb800 RCX: fffe8880117da6c0
> [112522.697721] RDX: 00000000000005a0 RSI: ffff8880117fb800 RDI: ffff88800ae58000
> [112522.697748] RBP: ffffc900000039e8 R08: 000000000004cfe0 R09: 00000000000005a0
> [112522.697775] R10: 00000000000005a0 R11: ffff8880117fb800 R12: 0000000000000000
> [112522.697803] R13: 00000000c95a98c2 R14: 0000000000000000 R15: ffff88800ae58000
> [112522.697839] FS:  0000000000000000(0000) GS:ffff88801f200000(0000) knlGS:0000000000000000
> [112522.697869] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [112522.697895] CR2: 00007f9210d8e078 CR3: 000000000b660000 CR4: 00000000000006f0
> [112522.697925] Call Trace:
> [112522.697938]  <IRQ>
> [112522.697951]  tcp_sacktag_walk+0x2af/0x480
> [112522.697967]  tcp_sacktag_write_queue+0x34d/0x820
> [112522.697986]  ? ip_forward_options.cold.0+0x1c/0x1c
> [112522.698007]  tcp_ack+0xb8c/0x1230
> [112522.698023]  ? tcp_event_new_data_sent+0x4a/0x90
> [112522.698043]  tcp_rcv_established+0x14c/0x630
> [112522.698064]  tcp_v4_do_rcv+0x129/0x1d0
> [112522.698081]  tcp_v4_rcv+0xac9/0xcb0
> [112522.698099]  ip_protocol_deliver_rcu+0x27/0x1b0
> [112522.698119]  ip_local_deliver_finish+0x3f/0x50
> [112522.698139]  ip_local_deliver+0x4d/0xe0
> [112522.698155]  ? ip_protocol_deliver_rcu+0x1b0/0x1b0
> [112522.698177]  ip_rcv+0x4c/0xd0
> [112522.698194]  __netif_receive_skb_one_core+0x79/0x90
> [112522.698215]  netif_receive_skb_internal+0x2a/0xa0
> [112522.698237]  napi_gro_receive+0xe7/0x140
> [112522.698255]  xennet_poll+0x9be/0xae0
> [112522.698271]  net_rx_action+0x136/0x340
> [112522.698288]  __do_softirq+0xdd/0x2cf
> [112522.698304]  irq_exit+0x7a/0xa0
> [112522.698321]  xen_evtchn_do_upcall+0x27/0x40
> [112522.698340]  xen_hvm_callback_vector+0xf/0x20
> [112522.698359]  </IRQ>
> [112522.698373] RIP: 0010:native_safe_halt+0xe/0x10
> [112522.698392] Code: 48 8b 04 25 c0 6b 01 00 f0 80 48 02 20 48 8b 00 a8 08 75 c4 eb 80 90 90 90 90 90 90 e9 07 00 00 00 0f 00 2d 54 fb 41 00 fb f4 <c3> 90 e9 07 00 00 00 0f 00 2d 44 fb 41 00 f4 c3 90 90 41 55 41 54
> [112522.699522] RSP: 0018:ffffffff82a03e90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff0c
> [112522.699552] RAX: 0001a54800000000 RBX: 0000000000000000 RCX: 0000000000000001
> [112522.699580] RDX: 0000000002b9f9b6 RSI: 0000000000000087 RDI: 0000000000000000
> [112522.699608] RBP: 0000000000000000 R08: 000000001eb5c3cb R09: ffffffff82a08460
> [112522.699634] R10: 000000000002e46e R11: 0000000000000000 R12: 0000000000000000
> [112522.699662] R13: 0000000000000000 R14: ffffffff8326e0a0 R15: 0000000000000000
> [112522.699692]  default_idle+0x17/0x140
> [112522.699709]  do_idle+0x1ee/0x210
> [112522.699726]  cpu_startup_entry+0x14/0x20
> [112522.699743]  start_kernel+0x4e9/0x50b
> [112522.699760]  secondary_startup_64+0xa4/0xb0
> [112522.699780] Modules linked in:
> [112522.699829] ---[ end trace 3b8db3603485e952 ]---
> [112522.699850] RIP: 0010:skb_shift+0x63/0x430
> [112522.699866] Code: bc 00 00 00 48 03 8f c0 00 00 00 f6 41 03 08 74 07 48 83 79 28 00 75 d0 8b 8e bc 00 00 00 48 03 8e c0 00 00 00 48 85 f6 74 0a <f6> 41 03 08 0f 85 09 03 00 00 49 89 fd 8b bf bc 00 00 00 41 89 d4
> [112522.699938] RSP: 0018:ffffc900000039b0 EFLAGS: 00010286
> [112522.699959] RAX: 00000000000005a0 RBX: ffff8880117fb800 RCX: fffe8880117da6c0
> [112522.699986] RDX: 00000000000005a0 RSI: ffff8880117fb800 RDI: ffff88800ae58000
> [112522.700013] RBP: ffffc900000039e8 R08: 000000000004cfe0 R09: 00000000000005a0
> [112522.700041] R10: 00000000000005a0 R11: ffff8880117fb800 R12: 0000000000000000
> [112522.700067] R13: 00000000c95a98c2 R14: 0000000000000000 R15: ffff88800ae58000
> [112522.700111] FS:  0000000000000000(0000) GS:ffff88801f200000(0000) knlGS:0000000000000000
> [112522.700140] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [112522.700165] CR2: 00007f9210d8e078 CR3: 000000000b660000 CR4: 00000000000006f0
> [112522.700201] Kernel panic - not syncing: Fatal exception in interrupt
> [112522.702992] Kernel Offset: disabled
> 
> 

^ permalink raw reply

* Re: [PATCH net] tcp: make sure EPOLLOUT wont be missed
From: Eric Dumazet @ 2019-08-17 16:26 UTC (permalink / raw)
  To: Jason Baron, Eric Dumazet, David S . Miller
  Cc: netdev, Soheil Hassas Yeganeh, Neal Cardwell, Eric Dumazet,
	Vladimir Rutsky
In-Reply-To: <b9ab6b03-664c-eb81-0fbd-6f696276d9aa@akamai.com>



On 8/17/19 4:19 PM, Jason Baron wrote:
> 
> 
> On 8/17/19 12:26 AM, Eric Dumazet wrote:
>> As Jason Baron explained in commit 790ba4566c1a ("tcp: set SOCK_NOSPACE
>> under memory pressure"), it is crucial we properly set SOCK_NOSPACE
>> when needed.
>>
>> However, Jason patch had a bug, because the 'nonblocking' status
>> as far as sk_stream_wait_memory() is concerned is governed
>> by MSG_DONTWAIT flag passed at sendmsg() time :
>>
>>     long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
>>
>> So it is very possible that tcp sendmsg() calls sk_stream_wait_memory(),
>> and that sk_stream_wait_memory() returns -EAGAIN with SOCK_NOSPACE
>> cleared, if sk->sk_sndtimeo has been set to a small (but not zero)
>> value.
> 
> Is MSG_DONTWAIT not set in this case? The original patch was intended
> only for the explicit non-blocking case. The epoll manpage says:
> "EPOLLET flag should use nonblocking file descriptors". So the original
> intention was not to impact the blocking case. This seems to me like
> a different use-case.
>

I guess the problem is how we define 'non-blocking' ...

SO_SNDTIMEO can be used by application to implement a variation of non-blocking,
by waiting for a socket event with a short timeout, to maybe recover
from memory pressure conditions in a more efficient way than simply looping.

Note that the man page for epoll() only _suggests_ to use nonblocking file descriptors.

<quote>
       The  suggested  way  to use epoll as an edge-triggered (EPOLLET)
       interface is as follows:

              i   with nonblocking file descriptors; and

              ii  by  waiting  for  an  event  only  after  read(2)  or
                  write(2) return EAGAIN.
</quote>









^ permalink raw reply

* Re: [PATCH 1/2] PTP: introduce new versions of IOCTLs
From: Joe Perches @ 2019-08-17 16:17 UTC (permalink / raw)
  To: Richard Cochran, Felipe Balbi; +Cc: Christopher S Hall, netdev, linux-kernel
In-Reply-To: <20190817155927.GA1540@localhost>

On Sat, 2019-08-17 at 08:59 -0700, Richard Cochran wrote:
> On Wed, Aug 14, 2019 at 10:47:11AM +0300, Felipe Balbi wrote:
> > The current version of the IOCTL have a small problem which prevents us
> > from extending the API by making use of reserved fields. In these new
> > IOCTLs, we are now making sure that flags and rsv fields are zero which
> > will allow us to extend the API in the future.
> > 
> > Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
> > ---
> >  drivers/ptp/ptp_chardev.c      | 58 ++++++++++++++++++++++++++++++++--
> >  include/uapi/linux/ptp_clock.h | 12 +++++++
> >  2 files changed, 68 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
[]
> > @@ -123,9 +123,11 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
> >  	struct timespec64 ts;
> >  	int enable, err = 0;
> >  
> > +	memset(&req, 0, sizeof(req));
> 
> Nit: please leave a blank line between memset() and switch/case.

or just initialize the declaration of req with = {}

Is there a case where this initialization is
unnecessary such that it impacts performance
given the use in ptp_ioctl?

caps for instance is memset to zero only in
PTP_CLOCK_GETCAP

req is used in only 3 of the case blocks.

	case PTP_EXTTS_REQUEST:
	case PTP_PEROUT_REQUEST:
	case PTP_ENABLE_PPS:

Maybe it would be better to move the memset(&req...)
into each of the case blocks.

> >  	switch (cmd) {
> >  
> >  	case PTP_CLOCK_GETCAPS:
> > +	case PTP_CLOCK_GETCAPS2:
> >  		memset(&caps, 0, sizeof(caps));
> >  		caps.max_adj = ptp->info->max_adj;
> >  		caps.n_alarm = ptp->info->n_alarm;
> 
> Reviewed-by: Richard Cochran <richardcochran@gmail.com>
> 


^ permalink raw reply

* Re: [PATCH 2/2] PTP: add support for one-shot output
From: Richard Cochran @ 2019-08-17 16:03 UTC (permalink / raw)
  To: Felipe Balbi; +Cc: Christopher S Hall, netdev, linux-kernel
In-Reply-To: <20190814074712.10684-2-felipe.balbi@linux.intel.com>

On Wed, Aug 14, 2019 at 10:47:12AM +0300, Felipe Balbi wrote:
> diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
> index 039cd62ec706..9412b16cc8ed 100644
> --- a/include/uapi/linux/ptp_clock.h
> +++ b/include/uapi/linux/ptp_clock.h
> @@ -67,7 +67,9 @@ struct ptp_perout_request {
>  	struct ptp_clock_time start;  /* Absolute start time. */
>  	struct ptp_clock_time period; /* Desired period, zero means disable. */
>  	unsigned int index;           /* Which channel to configure. */
> -	unsigned int flags;           /* Reserved for future use. */
> +
> +#define PTP_PEROUT_ONE_SHOT BIT(0)
> +	unsigned int flags;           /* Bit 0 -> oneshot output. */

The .flags field doesn't need this comment.  The individual BIT macro
names should be clear enough, and if not, then comment the macros.

>  	unsigned int rsv[4];          /* Reserved for future use. */
>  };
>  
> -- 
> 2.22.0
> 

Reviewed-by: Richard Cochran <richardcochran@gmail.com>

^ permalink raw reply

* Re: [PATCH 1/2] PTP: introduce new versions of IOCTLs
From: Richard Cochran @ 2019-08-17 15:59 UTC (permalink / raw)
  To: Felipe Balbi; +Cc: Christopher S Hall, netdev, linux-kernel
In-Reply-To: <20190814074712.10684-1-felipe.balbi@linux.intel.com>

On Wed, Aug 14, 2019 at 10:47:11AM +0300, Felipe Balbi wrote:
> The current version of the IOCTL have a small problem which prevents us
> from extending the API by making use of reserved fields. In these new
> IOCTLs, we are now making sure that flags and rsv fields are zero which
> will allow us to extend the API in the future.
> 
> Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
> ---
>  drivers/ptp/ptp_chardev.c      | 58 ++++++++++++++++++++++++++++++++--
>  include/uapi/linux/ptp_clock.h | 12 +++++++
>  2 files changed, 68 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
> index 18ffe449efdf..204212fc3f8c 100644
> --- a/drivers/ptp/ptp_chardev.c
> +++ b/drivers/ptp/ptp_chardev.c
> @@ -123,9 +123,11 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
>  	struct timespec64 ts;
>  	int enable, err = 0;
>  
> +	memset(&req, 0, sizeof(req));

Nit: please leave a blank line between memset() and switch/case.

>  	switch (cmd) {
>  
>  	case PTP_CLOCK_GETCAPS:
> +	case PTP_CLOCK_GETCAPS2:
>  		memset(&caps, 0, sizeof(caps));
>  		caps.max_adj = ptp->info->max_adj;
>  		caps.n_alarm = ptp->info->n_alarm;

Reviewed-by: Richard Cochran <richardcochran@gmail.com>


^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Andy Lutomirski @ 2019-08-17 15:44 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Thomas Gleixner, Jordan Glover, Andy Lutomirski,
	Daniel Colascione, Song Liu, Kees Cook, Networking, bpf,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team, Lorenz Bauer,
	Jann Horn, Greg KH, Linux API, LSM List
In-Reply-To: <20190817150245.xxzxqjpvgqsxmloe@ast-mbp>



> On Aug 17, 2019, at 8:02 AM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> 
> Can any of the mechanisms 1/2/3 address the concern in mds.rst?
> 

seccomp() can. It’s straightforward to use seccomp to disable bpf() outright for a process tree.  In this regard, bpf() isn’t particularly unique — it’s a system call that exposes some attack surface and that isn’t required by most programs for basic functionality.

At LPC this year, there will be a discussion about seccomp improvements that will, among other things, offer fiber-grained control. It’s quite likely, for example, that seccomp will soon be able to enable and disable specific map types or attach types.  The exact mechanism isn’t decided yet,  but I think everyone expects that this is mostly a design problem, not an implementation problem.

This is off topic for the current thread, but it could be useful to allow bpf programs to be loaded from files directly (i.e. pass an fd to a file into bpf() to load the program), which would enable LSMs to check that the file is appropriately labeled. This would dramatically raise the bar for exploitation of verifier bugs or speculation attacks, since anyone trying to exploit it would need to get the bpf payload through LSM policy first.

> I believe Andy wants to expand the attack surface when
> kernel.unprivileged_bpf_disabled=0
> Before that happens I'd like the community to work on addressing the text above.
> 

Not by much. BPF maps are already largely exposed to unprivileged code (when unprivileged_bpf_disabled=0).  The attack surface is there, and they’re arguably even more exposed than they should be.  My patch 1 earlier was about locking these interfaces down.

Similarly, my suggestions about reworking cgroup attach and program load don’t actually allow fully unprivileged users to run arbitrary bpf() programs [0] — under my proposal, to attach a bpf cgroup program, you need a delegated cgroup. The mechanism could be extended by a requirement that a privileged cgroup manager explicitly enable certain attach types for a delegated subtree.

A cgroup knob to turn unprivileged bpf on and off for tasks in the cgroup might actually be quite useful.

[0] on some thought, the test run mechanism should probably remain root-only.


^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Christian Brauner @ 2019-08-17 15:42 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Kees Cook, Andy Lutomirski, Song Liu, Networking,
	bpf, Alexei Starovoitov, Daniel Borkmann, Kernel Team,
	Lorenz Bauer, Jann Horn, Greg KH, Linux API, LSM List
In-Reply-To: <20190817153652.zfcsklt474j72dzm@ast-mbp.dhcp.thefacebook.com>

On August 17, 2019 5:36:54 PM GMT+02:00, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>On Sat, Aug 17, 2019 at 05:16:53PM +0200, Christian Brauner wrote:
>> On August 17, 2019 5:08:45 PM GMT+02:00, Alexei Starovoitov
><alexei.starovoitov@gmail.com> wrote:
>> >On Sat, Aug 17, 2019 at 12:22:53AM +0200, Christian Brauner wrote:
>> >> 
>> >> (The one usecase I'd care about is to extend seccomp to do
>> >pointer-based
>> >> syscall filtering. Whether or not that'd require (unprivileged)
>ebpf
>> >is
>> >> up for discussion at KSummit.)
>> >
>> >Kees have been always against using ebpf in seccomp. I believe he
>still
>> >holds this opinion. Until he changes his mind let's stop bringing
>> >seccomp
>> >as a use case for unpriv bpf.
>> 
>> That's why I said "whether or not".
>> For the record, I do prefer a non-unpriv-ebpf way.
>> It's still something that will most surely come up in the discussion
>though.
>
>It's very un-kernely way to defer to in-person meetings.
>If there is anything to discuss please discuss it on the public mailing
>list.

https://lists.linuxfoundation.org/pipermail/ksummit-discuss/2019-July/006699.html

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Alexei Starovoitov @ 2019-08-17 15:36 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Andy Lutomirski, Kees Cook, Andy Lutomirski, Song Liu, Networking,
	bpf, Alexei Starovoitov, Daniel Borkmann, Kernel Team,
	Lorenz Bauer, Jann Horn, Greg KH, Linux API, LSM List
In-Reply-To: <61B88085-9FBB-41E6-9783-324E445E428D@ubuntu.com>

On Sat, Aug 17, 2019 at 05:16:53PM +0200, Christian Brauner wrote:
> On August 17, 2019 5:08:45 PM GMT+02:00, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >On Sat, Aug 17, 2019 at 12:22:53AM +0200, Christian Brauner wrote:
> >> 
> >> (The one usecase I'd care about is to extend seccomp to do
> >pointer-based
> >> syscall filtering. Whether or not that'd require (unprivileged) ebpf
> >is
> >> up for discussion at KSummit.)
> >
> >Kees have been always against using ebpf in seccomp. I believe he still
> >holds this opinion. Until he changes his mind let's stop bringing
> >seccomp
> >as a use case for unpriv bpf.
> 
> That's why I said "whether or not".
> For the record, I do prefer a non-unpriv-ebpf way.
> It's still something that will most surely come up in the discussion though.

It's very un-kernely way to defer to in-person meetings.
If there is anything to discuss please discuss it on the public mailing list.


^ permalink raw reply

* Re: kernel BUG at include/linux/skbuff.h:LINE! (2)
From: syzbot @ 2019-08-17 15:28 UTC (permalink / raw)
  To: andriy.shevchenko, davem, edumazet, f.fainelli, idosch,
	kimbrownkd, linux-kernel, linux-sctp, marcelo.leitner, netdev,
	nhorman, syzkaller-bugs, tglx, vyasevich, wanghai26, yuehaibing
In-Reply-To: <0000000000008182a50590404a02@google.com>

syzbot has bisected this bug to:

commit bc389fd101e57b36aacfaec2df8fe479eabb44ea
Author: David S. Miller <davem@davemloft.net>
Date:   Tue Jul 2 21:12:30 2019 +0000

     Merge branch 'macsec-fix-some-bugs-in-the-receive-path'

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=125c5c4c600000
start commit:   459c5fb4 Merge branch 'mscc-PTP-support'
git tree:       net-next
final crash:    https://syzkaller.appspot.com/x/report.txt?x=115c5c4c600000
console output: https://syzkaller.appspot.com/x/log.txt?x=165c5c4c600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=d4cf1ffb87d590d7
dashboard link: https://syzkaller.appspot.com/bug?extid=eb349eeee854e389c36d
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=111849e2600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1442c25a600000

Reported-by: syzbot+eb349eeee854e389c36d@syzkaller.appspotmail.com
Fixes: bc389fd101e5 ("Merge  
branch 'macsec-fix-some-bugs-in-the-receive-path'")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Christian Brauner @ 2019-08-17 15:16 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Kees Cook, Andy Lutomirski, Song Liu, Networking,
	bpf, Alexei Starovoitov, Daniel Borkmann, Kernel Team,
	Lorenz Bauer, Jann Horn, Greg KH, Linux API, LSM List
In-Reply-To: <20190817150843.4vsmzpwpcvzndjld@ast-mbp>

On August 17, 2019 5:08:45 PM GMT+02:00, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>On Sat, Aug 17, 2019 at 12:22:53AM +0200, Christian Brauner wrote:
>> 
>> (The one usecase I'd care about is to extend seccomp to do
>pointer-based
>> syscall filtering. Whether or not that'd require (unprivileged) ebpf
>is
>> up for discussion at KSummit.)
>
>Kees have been always against using ebpf in seccomp. I believe he still
>holds this opinion. Until he changes his mind let's stop bringing
>seccomp
>as a use case for unpriv bpf.

That's why I said "whether or not".
For the record, I do prefer a non-unpriv-ebpf way.
It's still something that will most surely come up in the discussion though.

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Alexei Starovoitov @ 2019-08-17 15:08 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Andy Lutomirski, Kees Cook, Andy Lutomirski, Song Liu, Networking,
	bpf, Alexei Starovoitov, Daniel Borkmann, Kernel Team,
	Lorenz Bauer, Jann Horn, Greg KH, Linux API, LSM List
In-Reply-To: <20190816222252.a7zizw7azkxnv3ot@wittgenstein>

On Sat, Aug 17, 2019 at 12:22:53AM +0200, Christian Brauner wrote:
> 
> (The one usecase I'd care about is to extend seccomp to do pointer-based
> syscall filtering. Whether or not that'd require (unprivileged) ebpf is
> up for discussion at KSummit.)

Kees have been always against using ebpf in seccomp. I believe he still
holds this opinion. Until he changes his mind let's stop bringing seccomp
as a use case for unpriv bpf.


^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Alexei Starovoitov @ 2019-08-17 15:02 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jordan Glover, Andy Lutomirski, Daniel Colascione, Song Liu,
	Kees Cook, Networking, bpf, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team, Lorenz Bauer, Jann Horn, Greg KH, Linux API,
	LSM List
In-Reply-To: <alpine.DEB.2.21.1908162211270.1923@nanos.tec.linutronix.de>

On Fri, Aug 16, 2019 at 10:28:29PM +0200, Thomas Gleixner wrote:
> Alexei,
> 
> On Fri, 16 Aug 2019, Alexei Starovoitov wrote:
> > It's both of the above when 'systemd' is not taken literally.
> > To earlier Thomas's point: the use case is not only about systemd.
> > There are other containers management systems.
> 
> <SNIP>
> 
> > These daemons need to drop privileges to make the system safer == less
> > prone to corruption due to bugs in themselves. Not necessary security
> > bugs.
> 
> Let's take a step back.
> 
> While real usecases are helpful to understand a design decision, the design
> needs to be usecase independent.
> 
> The kernel provides mechanisms, not policies. My impression of this whole
> discussion is that it is policy driven. That's the wrong approach.

not sure what you mean by 'policy driven'.
Proposed CAP_BPF is a policy?

My desire to do kernel.unprivileged_bpf_disabled=1 is driven by
text in Documentation/x86/mds.rst which says:
"There is one exception, which is untrusted BPF. The functionality of
untrusted BPF is limited, but it needs to be thoroughly investigated
whether it can be used to create such a construct."

commit 6a9e52927251 ("x86/speculation/mds: Add mds_clear_cpu_buffers()")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Jon Masters <jcm@redhat.com>
Tested-by: Jon Masters <jcm@redhat.com>

The way I read this text:
- there is a concern that mds is exploitable via bpf
- there is a desire to investigate to address this concern

I'm committed to help with the investigation.

In the mean time I propose a path to do
kernel.unprivileged_bpf_disabled=1 which is CAP_BPF.

Can kernel.unprivileged_bpf_disabled=1 be used now?
Yes, but it will weaken overall system security because things that
use unpriv to load bpf and CAP_NET_ADMIN to attach bpf would need
to move to stronger CAP_SYS_ADMIN.

With CAP_BPF both load and attach would happen under CAP_BPF
instead of CAP_SYS_ADMIN.

> So let's look at the mechanisms which we have at hand:
> 
>  1) Capabilities
>  
>  2) SUID and dropping priviledges
> 
>  3) Seccomp and LSM
> 
> Now the real interesting questions are:
> 
>  A) What kind of restrictions does BPF allow? Is it a binary on/off or is
>     there a more finegrained control of BPF functionality?
> 
>     TBH, I can't tell.
> 
>  B) Depending on the answer to #A what is the control possibility for
>     #1/#2/#3 ?

Can any of the mechanisms 1/2/3 address the concern in mds.rst?

I believe Andy wants to expand the attack surface when
kernel.unprivileged_bpf_disabled=0
Before that happens I'd like the community to work on addressing the text above.


^ permalink raw reply

* Re: [PATCH net] tcp: make sure EPOLLOUT wont be missed
From: Jason Baron @ 2019-08-17 14:19 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller
  Cc: netdev, Soheil Hassas Yeganeh, Neal Cardwell, Eric Dumazet,
	Vladimir Rutsky
In-Reply-To: <20190817042622.91497-1-edumazet@google.com>



On 8/17/19 12:26 AM, Eric Dumazet wrote:
> As Jason Baron explained in commit 790ba4566c1a ("tcp: set SOCK_NOSPACE
> under memory pressure"), it is crucial we properly set SOCK_NOSPACE
> when needed.
> 
> However, Jason patch had a bug, because the 'nonblocking' status
> as far as sk_stream_wait_memory() is concerned is governed
> by MSG_DONTWAIT flag passed at sendmsg() time :
> 
>     long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
> 
> So it is very possible that tcp sendmsg() calls sk_stream_wait_memory(),
> and that sk_stream_wait_memory() returns -EAGAIN with SOCK_NOSPACE
> cleared, if sk->sk_sndtimeo has been set to a small (but not zero)
> value.

Is MSG_DONTWAIT not set in this case? The original patch was intended
only for the explicit non-blocking case. The epoll manpage says:
"EPOLLET flag should use nonblocking file descriptors". So the original
intention was not to impact the blocking case. This seems to me like
a different use-case.

Thanks,

-Jason


> This patch removes the 'noblock' variable since we must always
> set SOCK_NOSPACE if -EAGAIN is returned.
> 
> It also renames the do_nonblock label since we might reach this
> code path even if we were in blocking mode.
> 
> Fixes: 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Jason Baron <jbaron@akamai.com>
> Reported-by: Vladimir Rutsky  <rutsky@google.com>
> ---
>  net/core/stream.c | 16 +++++++++-------
>  1 file changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/net/core/stream.c b/net/core/stream.c
> index e94bb02a56295ec2db34ab423a8c7c890df0a696..4f1d4aa5fb38d989a9c81f32dfce3f31bbc1fa47 100644
> --- a/net/core/stream.c
> +++ b/net/core/stream.c
> @@ -120,7 +120,6 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
>  	int err = 0;
>  	long vm_wait = 0;
>  	long current_timeo = *timeo_p;
> -	bool noblock = (*timeo_p ? false : true);
>  	DEFINE_WAIT_FUNC(wait, woken_wake_function);
>  
>  	if (sk_stream_memory_free(sk))
> @@ -133,11 +132,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
>  
>  		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
>  			goto do_error;
> -		if (!*timeo_p) {
> -			if (noblock)
> -				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
> -			goto do_nonblock;
> -		}
> +		if (!*timeo_p)
> +			goto do_eagain;
>  		if (signal_pending(current))
>  			goto do_interrupted;
>  		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
> @@ -169,7 +165,13 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
>  do_error:
>  	err = -EPIPE;
>  	goto out;
> -do_nonblock:
> +do_eagain:
> +	/* Make sure that whenever EAGAIN is returned, EPOLLOUT event can
> +	 * be generated later.
> +	 * When TCP receives ACK packets that make room, tcp_check_space()
> +	 * only calls tcp_new_space() if SOCK_NOSPACE is set.
> +	 */
> +	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
>  	err = -EAGAIN;
>  	goto out;
>  do_interrupted:
> 

^ permalink raw reply

* Re: [RFC PATCH bpf-next 00/14] xdp_flow: Flow offload to XDP
From: Toshiaki Makita @ 2019-08-17 14:10 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, David S. Miller, Jakub Kicinski,
	Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
	Cong Wang, Jiri Pirko, netdev, bpf, William Tu
In-Reply-To: <20190816153550.GO2820@mini-arch>

On 19/08/17 (土) 0:35:50, Stanislav Fomichev wrote:
> On 08/16, Toshiaki Makita wrote:
>> On 2019/08/16 0:21, Stanislav Fomichev wrote:
>>> On 08/15, Toshiaki Makita wrote:
>>>> On 2019/08/15 2:07, Stanislav Fomichev wrote:
>>>>> On 08/13, Toshiaki Makita wrote:
>>>>>> * Implementation
>>>>>>
>>>>>> xdp_flow makes use of UMH to load an eBPF program for XDP, similar to
>>>>>> bpfilter. The difference is that xdp_flow does not generate the eBPF
>>>>>> program dynamically but a prebuilt program is embedded in UMH. This is
>>>>>> mainly because flow insertion is considerably frequent. If we generate
>>>>>> and load an eBPF program on each insertion of a flow, the latency of the
>>>>>> first packet of ping in above test will incease, which I want to avoid.
>>>>> Can this be instead implemented with a new hook that will be called
>>>>> for TC events? This hook can write to perf event buffer and control
>>>>> plane will insert/remove/modify flow tables in the BPF maps (contol
>>>>> plane will also install xdp program).
>>>>>
>>>>> Why do we need UMH? What am I missing?
>>>>
>>>> So you suggest doing everything in xdp_flow kmod?
>>> You probably don't even need xdp_flow kmod. Add new tc "offload" mode
>>> (bypass) that dumps every command via netlink (or calls the BPF hook
>>> where you can dump it into perf event buffer) and then read that info
>>> from userspace and install xdp programs and modify flow tables.
>>> I don't think you need any kernel changes besides that stream
>>> of data from the kernel about qdisc/tc flow creation/removal/etc.
>>
>> My intention is to make more people who want high speed network easily use XDP,
>> so making transparent XDP offload with current TC interface.
>>
>> What userspace program would monitor TC events with your suggestion?
> Have a new system daemon (xdpflowerd) that is independently
> packaged/shipped/installed. Anybody who wants accelerated TC can
> download/install it. OVS can be completely unaware of this.

Thanks, but that's what I called an unreliable solution...

>> ovs-vswitchd? If so, it even does not need to monitor TC. It can
>> implement XDP offload directly.
>> (However I prefer kernel solution. Please refer to "About alternative
>> userland (ovs-vswitchd etc.) implementation" section in the cover letter.)
>>
>> Also such a TC monitoring solution easily can be out-of-sync with real TC
>> behavior as TC filter/flower is being heavily developed and changed,
>> e.g. introduction of TC block, support multiple masks with the same pref, etc.
>> I'm not sure such an unreliable solution have much value.
> This same issue applies to the in-kernel implementation, isn't it?
> What happens if somebody sends patches for a new flower feature but
> doesn't add appropriate xdp support? Do we reject them?

Why can we accept a patch which breaks other in-kernel subsystem...
Such patches can be applied accidentally but we are supposed to fix such 
problems in -rc phase, aren't we?

Toshiaki Makita

^ permalink raw reply

* Re: [PATCH v6 4/4] net: phy: realtek: Add LED configuration support for RTL8211E
From: Pavel Machek @ 2019-08-17 14:05 UTC (permalink / raw)
  To: Matthias Kaehlcke, jacek.anaszewski, linux-leds, dmurphy
  Cc: David S . Miller, Rob Herring, Mark Rutland, Andrew Lunn,
	Florian Fainelli, Heiner Kallweit, netdev, devicetree,
	linux-kernel, Douglas Anderson
In-Reply-To: <20190816212728.GW250418@google.com>

[-- Attachment #1: Type: text/plain, Size: 1409 bytes --]

On Fri 2019-08-16 14:27:28, Matthias Kaehlcke wrote:
> On Fri, Aug 16, 2019 at 10:13:42PM +0200, Pavel Machek wrote:
> > On Tue 2019-08-13 12:11:47, Matthias Kaehlcke wrote:
> > > Add a .config_led hook which is called by the PHY core when
> > > configuration data for a PHY LED is available. Each LED can be
> > > configured to be solid 'off, solid 'on' for certain (or all)
> > > link speeds or to blink on RX/TX activity.
> > > 
> > > Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
> > 
> > THis really needs to go through the LED subsystem,
> 
> Sorry, I used what get_maintainers.pl threw at me, I should have
> manually cc-ed the LED list.
> 
> > and use the same userland interfaces as the rest of the system.
> 
> With the PHY maintainers we discussed to define a binding that is
> compatible with that of the LED one, to have the option to integrate
> it with the LED subsystem later. The integration itself is beyond the
> scope of this patchset.

Yes, I believe the integration is neccessary. Using same binding is
neccessary for that, but not sufficient. For example, we need
compatible trigger names, too.

So... I'd really like to see proper integration is possible before we
merge this.

Best regards,

									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: [RFC PATCH bpf-next 00/14] xdp_flow: Flow offload to XDP
From: Toshiaki Makita @ 2019-08-17 14:01 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Stanislav Fomichev, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, David S. Miller,
	Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
	Cong Wang, Jiri Pirko, netdev, bpf, William Tu
In-Reply-To: <20190816115224.6aafd4ee@cakuba.netronome.com>

On 19/08/17 (土) 3:52:24, Jakub Kicinski wrote:
> On Fri, 16 Aug 2019 10:28:10 +0900, Toshiaki Makita wrote:
>> On 2019/08/16 4:22, Jakub Kicinski wrote:
>>> There's a certain allure in bringing the in-kernel BPF translation
>>> infrastructure forward. OTOH from system architecture perspective IMHO
>>> it does seem like a task best handed in user space. bpfilter can replace
>>> iptables completely, here we're looking at an acceleration relatively
>>> loosely coupled with flower.
>>
>> I don't think it's loosely coupled. Emulating TC behavior in userspace
>> is not so easy.
>>
>> Think about recent multi-mask support in flower. Previously userspace could
>> assume there is one mask and hash table for each preference in TC. After the
>> change TC accepts different masks with the same pref. Such a change tends to
>> break userspace emulation. It may ignore masks passed from flow insertion
>> and use the mask remembered when the first flow of the pref is inserted. It
>> may override the mask of all existing flows with the pref. It may fail to
>> insert such flows. Any of them would result in unexpected wrong datapath
>> handling which is critical.
>> I think such an emulation layer needs to be updated in sync with TC.
> 
> Oh, so you're saying that if xdp_flow is merged all patches to
> cls_flower and netfilter which affect flow offload will be required
> to update xdp_flow as well?

Hmm... you are saying that we are allowed to break other in-kernel 
subsystem by some change? Sounds strange...

> That's a question of policy. Technically the implementation in user
> space is equivalent.
 >
> The advantage of user space implementation is that you can add more
> to it and explore use cases which do not fit in the flow offload API,
> but are trivial for BPF. Not to mention the obvious advantage of
> decoupling the upgrade path.

I understand the advantage, but I can't trust such a third-party kernel 
emulation solution for this kind of thing which handles critical data path.

> Personally I'm not happy with the way this patch set messes with the
> flow infrastructure. You should use the indirect callback
> infrastructure instead, and that way you can build the whole thing
> touching none of the flow offload core.

I don't want to mess up the core flow infrastructure either. I'm all 
ears about less invasive ways. Using indirect callback sounds like a 
good idea. Will give it a try. Many thanks.

Toshiaki Makita

^ permalink raw reply

* [PATCH net-next v3 16/16] Documentation: Add a section for devlink-trap testing
From: Ido Schimmel @ 2019-08-17 13:28 UTC (permalink / raw)
  To: netdev
  Cc: davem, nhorman, jiri, toke, dsahern, roopa, nikolay,
	jakub.kicinski, andy, f.fainelli, andrew, vivien.didelot, mlxsw,
	Ido Schimmel
In-Reply-To: <20190817132825.29790-1-idosch@idosch.org>

From: Ido Schimmel <idosch@mellanox.com>

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 Documentation/networking/devlink-trap.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/Documentation/networking/devlink-trap.rst b/Documentation/networking/devlink-trap.rst
index fe4f6e149623..c20c7c483664 100644
--- a/Documentation/networking/devlink-trap.rst
+++ b/Documentation/networking/devlink-trap.rst
@@ -196,3 +196,13 @@ narrow. The description of these groups must be added to the following table:
    * - ``buffer_drops``
      - Contains packet traps for packets that were dropped by the device due to
        an enqueue decision
+
+Testing
+=======
+
+See ``tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh`` for a
+test covering the core infrastructure. Test cases should be added for any new
+functionality.
+
+Device drivers should focus their tests on device-specific functionality, such
+as the triggering of supported packet traps.
-- 
2.21.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox