Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC PATCH net-next v4 1/2] macb: Add 1588 support in Cadence GEM.
From: Andrei Pistirica @ 2016-12-14 12:56 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-arm-kernel, davem, nicolas.ferre,
	harinikatakamlinux, harini.katakam
  Cc: boris.brezillon, rafalo, alexandre.belloni, michals,
	Andrei Pistirica, tbultel, anirudh, punnaia, richardcochran

Cadence GEM provides a 102 bit time counter with 48 bits for seconds,
30 bits for nsecs and 24 bits for sub-nsecs to control 1588 timestamping.

This patch does the following:
- Registers to ptp clock framework
- Timer initialization is done by writing time of day to the timer counter.
- ns increment register is programmed as NSEC_PER_SEC/tsu-clock-rate.
  For a 16 bit subns precision, the subns increment equals
  remainder of (NS_PER_SEC/TSU_CLK) * (2^16).
- Timestamps are obtained from the TX/RX PTP event/PEER registers.
  The timestamp obtained thus is updated in skb for upper layers to access.
- The drivers register functions with ptp to perform time and frequency
  adjustment.
- Time adjustment is done by writing to the 1588_ADJUST register.
  The controller will read the delta in this register and update the timer
  counter register. Alternatively, for large time offset adjustments,
  the driver reads the secs and nsecs counter values, adds/subtracts the
  delta and updates the timer counter.
- Frequency is adjusted by adjusting addend (8bit nanosecond increment) and
  addendsub (16bit increment nanosecond fractions).
  The 102bit counter is incremented at nominal frequency with addend and
  addendsub values. Each period addend and addendsub values are adjusted
  based on ppm drift.

Signed-off-by: Andrei Pistirica <andrei.pistirica@microchip.com>
Signed-off-by: Harini Katakam <harinik@xilinx.com>
---
Patch history:

Version 1:
This patch is based on original Harini's patch, implemented in a
separate file to ease the review/maintenance and integration with
other platforms (e.g. Zynq Ultrascale+ MPSoC).
Feature was tested on SAMA5D2 platform using ptp4l v1.6 from linuxptp
project and also with ptpd2 version 2.3.1. PTP was tested over
IPv4,IPv6 and 802.3 protocols.

In case that macb is compiled as a module, it has been renamed to
cadence-macb.ko to avoid naming confusion in Makefile.

Version 2 modifications:
- bitfields for TSU are named according to SAMA5D2 data sheet
- identify GEM-PTP support based on platform capability
- add spinlock for TSU access
- change macb_ptp_adjfreq and use fewer 64bit divisions

Version 3 modifications:
- new adjfine api with one 64-bit division for frequency adjustment
  (based on Richard's input)
- add maximum adjustment frequency (ppb) based on nominal frequency
- per platform PTP configuration
- cosmetic changes
Note 1: Kbuild uses "select" instead of "imply", and the macb maintainer agreed
        to make the change when it will be available in net-next.

Version 4 modifications:
- update adjfine for a better approximation
- add maximum adjustment frequency callback to PTP platform configuration

Note 1: This driver does not support GEM-GXL!
Note 2: Patch on net-next, on December 14th. 

 drivers/net/ethernet/cadence/Kconfig    |  10 +-
 drivers/net/ethernet/cadence/Makefile   |   8 +-
 drivers/net/ethernet/cadence/macb.h     | 118 ++++++++++
 drivers/net/ethernet/cadence/macb_ptp.c | 366 ++++++++++++++++++++++++++++++++
 4 files changed, 500 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/cadence/macb_ptp.c

diff --git a/drivers/net/ethernet/cadence/Kconfig b/drivers/net/ethernet/cadence/Kconfig
index f0bcb15..ebbc65f 100644
--- a/drivers/net/ethernet/cadence/Kconfig
+++ b/drivers/net/ethernet/cadence/Kconfig
@@ -29,6 +29,14 @@ config MACB
 	  support for the MACB/GEM chip.
 
 	  To compile this driver as a module, choose M here: the module
-	  will be called macb.
+	  will be called cadence-macb.
+
+config MACB_USE_HWSTAMP
+	bool "Use IEEE 1588 hwstamp"
+	depends on MACB
+	default y
+	select PTP_1588_CLOCK
+	---help---
+	  Enable IEEE 1588 Precision Time Protocol (PTP) support for MACB.
 
 endif # NET_CADENCE
diff --git a/drivers/net/ethernet/cadence/Makefile b/drivers/net/ethernet/cadence/Makefile
index 91f79b1..4402d42 100644
--- a/drivers/net/ethernet/cadence/Makefile
+++ b/drivers/net/ethernet/cadence/Makefile
@@ -2,4 +2,10 @@
 # Makefile for the Atmel network device drivers.
 #
 
-obj-$(CONFIG_MACB) += macb.o
+cadence-macb-y	:= macb.o
+
+ifeq ($(CONFIG_MACB_USE_HWSTAMP),y)
+cadence-macb-y	+= macb_ptp.o
+endif
+
+obj-$(CONFIG_MACB) += cadence-macb.o
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index d67adad..e65e985 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -10,6 +10,9 @@
 #ifndef _MACB_H
 #define _MACB_H
 
+#include <linux/ptp_clock.h>
+#include <linux/ptp_clock_kernel.h>
+
 #define MACB_GREGS_NBR 16
 #define MACB_GREGS_VERSION 2
 #define MACB_MAX_QUEUES 8
@@ -131,6 +134,20 @@
 #define GEM_RXIPCCNT		0x01a8 /* IP header Checksum Error Counter */
 #define GEM_RXTCPCCNT		0x01ac /* TCP Checksum Error Counter */
 #define GEM_RXUDPCCNT		0x01b0 /* UDP Checksum Error Counter */
+#define GEM_TISUBN		0x01bc /* 1588 Timer Increment Sub-ns */
+#define GEM_TSH			0x01c0 /* 1588 Timer Seconds High */
+#define GEM_TSL			0x01d0 /* 1588 Timer Seconds Low */
+#define GEM_TN			0x01d4 /* 1588 Timer Nanoseconds */
+#define GEM_TA			0x01d8 /* 1588 Timer Adjust */
+#define GEM_TI			0x01dc /* 1588 Timer Increment */
+#define GEM_EFTSL		0x01e0 /* PTP Event Frame Tx Seconds Low */
+#define GEM_EFTN		0x01e4 /* PTP Event Frame Tx Nanoseconds */
+#define GEM_EFRSL		0x01e8 /* PTP Event Frame Rx Seconds Low */
+#define GEM_EFRN		0x01ec /* PTP Event Frame Rx Nanoseconds */
+#define GEM_PEFTSL		0x01f0 /* PTP Peer Event Frame Tx Secs Low */
+#define GEM_PEFTN		0x01f4 /* PTP Peer Event Frame Tx Ns */
+#define GEM_PEFRSL		0x01f8 /* PTP Peer Event Frame Rx Sec Low */
+#define GEM_PEFRN		0x01fc /* PTP Peer Event Frame Rx Ns */
 #define GEM_DCFG1		0x0280 /* Design Config 1 */
 #define GEM_DCFG2		0x0284 /* Design Config 2 */
 #define GEM_DCFG3		0x0288 /* Design Config 3 */
@@ -174,6 +191,7 @@
 #define MACB_NCR_TPF_SIZE	1
 #define MACB_TZQ_OFFSET		12 /* Transmit zero quantum pause frame */
 #define MACB_TZQ_SIZE		1
+#define MACB_SRTSM_OFFSET	15
 
 /* Bitfields in NCFGR */
 #define MACB_SPD_OFFSET		0 /* Speed */
@@ -319,6 +337,32 @@
 #define MACB_PTZ_SIZE		1
 #define MACB_WOL_OFFSET		14 /* Enable wake-on-lan interrupt */
 #define MACB_WOL_SIZE		1
+#define MACB_DRQFR_OFFSET	18 /* PTP Delay Request Frame Received */
+#define MACB_DRQFR_SIZE		1
+#define MACB_SFR_OFFSET		19 /* PTP Sync Frame Received */
+#define MACB_SFR_SIZE		1
+#define MACB_DRQFT_OFFSET	20 /* PTP Delay Request Frame Transmitted */
+#define MACB_DRQFT_SIZE		1
+#define MACB_SFT_OFFSET		21 /* PTP Sync Frame Transmitted */
+#define MACB_SFT_SIZE		1
+#define MACB_PDRQFR_OFFSET	22 /* PDelay Request Frame Received */
+#define MACB_PDRQFR_SIZE	1
+#define MACB_PDRSFR_OFFSET	23 /* PDelay Response Frame Received */
+#define MACB_PDRSFR_SIZE	1
+#define MACB_PDRQFT_OFFSET	24 /* PDelay Request Frame Transmitted */
+#define MACB_PDRQFT_SIZE	1
+#define MACB_PDRSFT_OFFSET	25 /* PDelay Response Frame Transmitted */
+#define MACB_PDRSFT_SIZE	1
+#define MACB_SRI_OFFSET		26 /* TSU Seconds Register Increment */
+#define MACB_SRI_SIZE		1
+
+/* Timer increment fields */
+#define MACB_TI_CNS_OFFSET	0
+#define MACB_TI_CNS_SIZE	8
+#define MACB_TI_ACNS_OFFSET	8
+#define MACB_TI_ACNS_SIZE	8
+#define MACB_TI_NIT_OFFSET	16
+#define MACB_TI_NIT_SIZE	8
 
 /* Bitfields in MAN */
 #define MACB_DATA_OFFSET	0 /* data */
@@ -386,6 +430,17 @@
 #define GEM_PBUF_LSO_OFFSET			27
 #define GEM_PBUF_LSO_SIZE			1
 
+/* Bitfields in TISUBN */
+#define GEM_SUBNSINCR_OFFSET			0
+#define GEM_SUBNSINCR_SIZE			16
+
+/* Bitfields in TI */
+#define GEM_NSINCR_OFFSET			0
+#define GEM_NSINCR_SIZE				8
+
+/* Bitfields in ADJ */
+#define GEM_ADDSUB_OFFSET			31
+#define GEM_ADDSUB_SIZE				1
 /* Constants for CLK */
 #define MACB_CLK_DIV8				0
 #define MACB_CLK_DIV16				1
@@ -417,6 +472,7 @@
 #define MACB_CAPS_GIGABIT_MODE_AVAILABLE	0x20000000
 #define MACB_CAPS_SG_DISABLED			0x40000000
 #define MACB_CAPS_MACB_IS_GEM			0x80000000
+#define MACB_CAPS_GEM_HAS_PTP			0x00000020
 
 /* LSO settings */
 #define MACB_LSO_UFO_ENABLE			0x01
@@ -782,6 +838,20 @@ struct macb_or_gem_ops {
 	int	(*mog_rx)(struct macb *bp, int budget);
 };
 
+/* MACB-PTP interface: adapt to platform needs and GEM (e.g. GXL). */
+struct macb_ptp_info {
+	void (*ptp_init)(struct net_device *ndev);
+	void (*ptp_remove)(struct net_device *ndev);
+	s32 (*get_ptp_max_adj)(void);
+	unsigned int (*get_tsu_rate)(struct macb *bp);
+	int (*get_ts_info)(struct net_device *dev,
+			   struct ethtool_ts_info *info);
+	int (*get_hwtst)(struct net_device *netdev,
+			 struct ifreq *ifr);
+	int (*set_hwtst)(struct net_device *netdev,
+			 struct ifreq *ifr, int cmd);
+};
+
 struct macb_config {
 	u32			caps;
 	unsigned int		dma_burst_length;
@@ -874,11 +944,59 @@ struct macb {
 	unsigned int		jumbo_max_len;
 
 	u32			wol;
+
+	struct macb_ptp_info	*ptp_info;
+#ifdef CONFIG_MACB_USE_HWSTAMP
+	bool			hwts_tx_en;
+	bool			hwts_rx_en;
+	spinlock_t		tsu_clk_lock; /* gem tsu clock locking */
+	unsigned int		tsu_rate;
+
+	struct ptp_clock	*ptp_clock;
+	struct ptp_clock_info	ptp_caps;
+	u32			ns_incr;
+	u32			subns_incr;
+#endif
 };
 
+#ifdef CONFIG_MACB_USE_HWSTAMP
+void gem_ptp_init(struct net_device *ndev);
+void gem_ptp_remove(struct net_device *ndev);
+void gem_ptp_txstamp(struct macb *bp, struct sk_buff *skb);
+void gem_ptp_rxstamp(struct macb *bp, struct sk_buff *skb);
+
+static inline void gem_ptp_do_txstamp(struct macb *bp, struct sk_buff *skb)
+{
+	if (!bp->hwts_tx_en)
+		return;
+
+	return gem_ptp_txstamp(bp, skb);
+}
+
+static inline void gem_ptp_do_rxstamp(struct macb *bp, struct sk_buff *skb)
+{
+	if (!bp->hwts_rx_en)
+		return;
+
+	return gem_ptp_rxstamp(bp, skb);
+}
+
+#else
+static inline void gem_ptp_init(struct net_device *ndev) { }
+static inline void gem_ptp_remove(struct net_device *ndev) { }
+
+static inline void gem_ptp_do_txstamp(struct macb *bp, struct sk_buff *skb) { }
+static inline void gem_ptp_do_rxstamp(struct macb *bp, struct sk_buff *skb) { }
+#endif
+
 static inline bool macb_is_gem(struct macb *bp)
 {
 	return !!(bp->caps & MACB_CAPS_MACB_IS_GEM);
 }
 
+static inline bool gem_has_ptp(struct macb *bp)
+{
+	return !!(bp->caps & MACB_CAPS_GEM_HAS_PTP);
+}
+
 #endif /* _MACB_H */
diff --git a/drivers/net/ethernet/cadence/macb_ptp.c b/drivers/net/ethernet/cadence/macb_ptp.c
new file mode 100644
index 0000000..6121b2a
--- /dev/null
+++ b/drivers/net/ethernet/cadence/macb_ptp.c
@@ -0,0 +1,366 @@
+/*
+ * 1588 PTP support for GEM device.
+ *
+ * Copyright (C) 2016 Microchip Technology
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/etherdevice.h>
+#include <linux/platform_device.h>
+#include <linux/time64.h>
+#include <linux/ptp_classify.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/net_tstamp.h>
+
+#include "macb.h"
+
+#define  GEM_PTP_TIMER_NAME "gem-ptp-timer"
+
+static inline void gem_tsu_get_time(struct macb *bp,
+				    struct timespec64 *ts)
+{
+	u64 sec, sech, secl;
+
+	spin_lock(&bp->tsu_clk_lock);
+
+	/* GEM's internal time */
+	sech = gem_readl(bp, TSH);
+	secl = gem_readl(bp, TSL);
+	ts->tv_nsec = gem_readl(bp, TN);
+	ts->tv_sec = (sech << 32) | secl;
+
+	/* minimize error */
+	sech = gem_readl(bp, TSH);
+	secl = gem_readl(bp, TSL);
+	sec = (sech << 32) | secl;
+	if (ts->tv_sec != sec) {
+		ts->tv_sec = sec;
+		ts->tv_nsec = gem_readl(bp, TN);
+	}
+
+	spin_unlock(&bp->tsu_clk_lock);
+}
+
+static inline void gem_tsu_set_time(struct macb *bp,
+				    const struct timespec64 *ts)
+{
+	u32 ns, sech, secl;
+
+	/* TSH takes the seconds above bit 31 and TSL the low 32 bits, so
+	 * TSH must be 0 when tv_sec fits in 32 bits, not a copy of TSL.
+	 */
+	secl = (u32)ts->tv_sec;
+	sech = (u32)(ts->tv_sec >> 32);
+	ns = ts->tv_nsec;
+
+	spin_lock(&bp->tsu_clk_lock);
+
+	/* TSH doesn't latch the time and no atomicity! */
+	gem_writel(bp, TN, 0); /* clear to avoid overflow */
+	gem_writel(bp, TSH, sech);
+	gem_writel(bp, TSL, secl);
+	gem_writel(bp, TN, ns);
+
+	spin_unlock(&bp->tsu_clk_lock);
+}
+
+static int gem_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
+{
+	struct macb *bp = container_of(ptp, struct macb, ptp_caps);
+	u32 word, diff;
+	u64 adj, rate;
+	int neg_adj = 0;
+
+	if (scaled_ppm < 0) {
+		neg_adj = 1;
+		scaled_ppm = -scaled_ppm;
+	}
+	rate = scaled_ppm;
+
+	/* word: unused(8bit) | ns(8bit) | fractions(16bit) */
+	word = (bp->ns_incr << 16) + bp->subns_incr;
+
+	adj = word;
+	adj *= rate;
+	adj += 500000UL << 16;
+	adj >>= 16; /* remove fractions */
+	diff = div_u64(adj, 1000000UL);
+	word = neg_adj ? word - diff : word + diff;
+
+	spin_lock(&bp->tsu_clk_lock);
+
+	gem_writel(bp, TISUBN, GEM_BF(SUBNSINCR, (word & 0xffff)));
+	gem_writel(bp, TI, GEM_BF(NSINCR, (word >> 16)));
+
+	spin_unlock(&bp->tsu_clk_lock);
+	return 0;
+}
+
+static int gem_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct macb *bp = container_of(ptp, struct macb, ptp_caps);
+	struct timespec64 now, then = ns_to_timespec64(delta);
+	u32 adj, sign = 0;
+
+	if (delta < 0) {
+		delta = -delta;
+		sign = 1;
+	}
+	/* large offsets rewrite the counter, small ones go through TA */
+	if (delta > 0x3FFFFFFF) {
+		gem_tsu_get_time(bp, &now);
+
+		/* 'then' keeps delta's sign, so a plain add steps in the
+		 * requested direction (subtracting it inverted the offset).
+		 */
+		now = timespec64_add(now, then);
+
+		gem_tsu_set_time(bp, (const struct timespec64 *)&now);
+	} else {
+		adj = delta;
+		if (sign)
+			adj |= GEM_BIT(ADDSUB);
+
+		gem_writel(bp, TA, adj);
+	}
+
+	return 0;
+}
+
+static int gem_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
+{
+	struct macb *bp = container_of(ptp, struct macb, ptp_caps);
+
+	gem_tsu_get_time(bp, ts);
+
+	return 0;
+}
+
+static int gem_ptp_settime(struct ptp_clock_info *ptp,
+			   const struct timespec64 *ts)
+{
+	struct macb *bp = container_of(ptp, struct macb, ptp_caps);
+
+	gem_tsu_set_time(bp, ts);
+
+	return 0;
+}
+
+static int gem_ptp_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *rq, int on)
+{
+	return -EOPNOTSUPP;
+}
+
+static struct ptp_clock_info gem_ptp_caps_template = {
+	.owner		= THIS_MODULE,
+	.name		= GEM_PTP_TIMER_NAME,
+	.max_adj	= 0,
+	.n_alarm	= 0,
+	.n_ext_ts	= 0,
+	.n_per_out	= 0,
+	.n_pins		= 0,
+	.pps		= 0,
+	.adjfine	= gem_ptp_adjfine,
+	.adjtime	= gem_ptp_adjtime,
+	.gettime64	= gem_ptp_gettime,
+	.settime64	= gem_ptp_settime,
+	.enable		= gem_ptp_enable,
+};
+
+static void gem_ptp_init_timer(struct macb *bp)
+{
+	struct timespec64 now;
+	u32 rem = 0;
+
+	getnstimeofday64(&now);
+	gem_tsu_set_time(bp, (const struct timespec64 *)&now);
+
+	bp->ns_incr = div_u64_rem(NSEC_PER_SEC, bp->tsu_rate, &rem);
+	if (rem) {
+		u64 adj = rem;
+
+		adj <<= 16; /* 16 bits nsec fragments */
+		bp->subns_incr = div_u64(adj, bp->tsu_rate);
+	} else {
+		bp->subns_incr = 0;
+	}
+
+	gem_writel(bp, TISUBN, GEM_BF(SUBNSINCR, bp->subns_incr));
+	gem_writel(bp, TI, GEM_BF(NSINCR, bp->ns_incr));
+	gem_writel(bp, TA, 0);
+}
+
+static void gem_ptp_clear_timer(struct macb *bp)
+{
+	bp->ns_incr = 0;
+	bp->subns_incr = 0;
+
+	gem_writel(bp, TISUBN, GEM_BF(SUBNSINCR, 0));
+	gem_writel(bp, TI, GEM_BF(NSINCR, 0));
+	gem_writel(bp, TA, 0);
+}
+
+/* While GEM can timestamp PTP packets, it does not mark the RX descriptor
+ * to identify them. UDP packets must be parsed to identify PTP packets.
+ *
+ * Note: Inspired from drivers/net/ethernet/ti/cpts.c
+ */
+static int gem_get_ptp_peer(struct sk_buff *skb, int ptp_class)
+{
+	unsigned int offset = 0;
+	u8 *msgtype, *data = skb->data;
+
+	/* PTP frames are rare! */
+	if (likely(ptp_class == PTP_CLASS_NONE))
+		return -1;
+
+	if (ptp_class & PTP_CLASS_VLAN)
+		offset += VLAN_HLEN;
+
+	switch (ptp_class & PTP_CLASS_PMASK) {
+	case PTP_CLASS_IPV4:
+		offset += ETH_HLEN + IPV4_HLEN(data + offset) + UDP_HLEN;
+	break;
+	case PTP_CLASS_IPV6:
+		offset += ETH_HLEN + IP6_HLEN + UDP_HLEN;
+	break;
+	case PTP_CLASS_L2:
+		offset += ETH_HLEN;
+		break;
+
+	/* something went wrong! */
+	default:
+		return -1;
+	}
+
+	if (skb->len + ETH_HLEN < offset + OFF_PTP_SEQUENCE_ID)
+		return -1;
+
+	if (unlikely(ptp_class & PTP_CLASS_V1))
+		msgtype = data + offset + OFF_PTP_CONTROL;
+	else
+		msgtype = data + offset;
+
+	return (*msgtype) & 0x2;
+}
+
+static void gem_ptp_tx_hwtstamp(struct macb *bp, struct sk_buff *skb,
+				int peer_ev)
+{
+	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+	struct timespec64 ts;
+	u64 ns;
+
+	/* PTP Peer Event Frame packets */
+	if (peer_ev) {
+		ts.tv_sec = gem_readl(bp, PEFTSL);
+		ts.tv_nsec = gem_readl(bp, PEFTN);
+
+	/* PTP Event Frame packets */
+	} else {
+		ts.tv_sec = gem_readl(bp, EFTSL);
+		ts.tv_nsec = gem_readl(bp, EFTN);
+	}
+	ns = timespec64_to_ns(&ts);
+
+	memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
+	shhwtstamps->hwtstamp = ns_to_ktime(ns);
+	skb_tstamp_tx(skb, skb_hwtstamps(skb));
+}
+
+static void gem_ptp_rx_hwtstamp(struct macb *bp, struct sk_buff *skb,
+				int peer_ev)
+{
+	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+	struct timespec64 ts;
+	u64 ns;
+
+	if (peer_ev) {
+		/* PTP Peer Event Frame packets */
+		ts.tv_sec = gem_readl(bp, PEFRSL);
+		ts.tv_nsec = gem_readl(bp, PEFRN);
+	} else {
+		/* PTP Event Frame packets */
+		ts.tv_sec = gem_readl(bp, EFRSL);
+		ts.tv_nsec = gem_readl(bp, EFRN);
+	}
+	ns = timespec64_to_ns(&ts);
+
+	memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
+	shhwtstamps->hwtstamp = ns_to_ktime(ns);
+}
+
+/* no static, GEM PTP interface functions */
+void gem_ptp_txstamp(struct macb *bp, struct sk_buff *skb)
+{
+	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) {
+		int class = ptp_classify_raw(skb);
+		int peer;
+
+		peer = gem_get_ptp_peer(skb, class);
+		if (peer < 0)
+			return;
+
+		/* Timestamp this packet */
+		gem_ptp_tx_hwtstamp(bp, skb, peer);
+	}
+}
+
+void gem_ptp_rxstamp(struct macb *bp, struct sk_buff *skb)
+{
+	int class, peer;
+
+	__skb_push(skb, ETH_HLEN);
+	class = ptp_classify_raw(skb);
+	__skb_pull(skb, ETH_HLEN);
+
+	peer = gem_get_ptp_peer(skb, class);
+	if (peer < 0)
+		return;
+
+	gem_ptp_rx_hwtstamp(bp, skb, peer);
+}
+
+void gem_ptp_init(struct net_device *ndev)
+{
+	struct macb *bp = netdev_priv(ndev);
+
+	spin_lock_init(&bp->tsu_clk_lock);
+	bp->ptp_caps = gem_ptp_caps_template;
+
+	/* nominal frequency and maximum adjustment in ppb */
+	bp->tsu_rate = bp->ptp_info->get_tsu_rate(bp);
+	bp->ptp_caps.max_adj = bp->ptp_info->get_ptp_max_adj();
+
+	gem_ptp_init_timer(bp);
+	/* check the returned pointer itself, not the address of the field */
+	bp->ptp_clock = ptp_clock_register(&bp->ptp_caps, NULL);
+	if (IS_ERR(bp->ptp_clock)) {
+		bp->ptp_clock = NULL;
+		pr_err("ptp clock register failed\n");
+		return;
+	}
+
+	dev_info(&bp->pdev->dev, "%s ptp clock registered.\n",
+		 GEM_PTP_TIMER_NAME);
+}
+
+void gem_ptp_remove(struct net_device *ndev)
+{
+	struct macb *bp = netdev_priv(ndev);
+
+	if (bp->ptp_clock)
+		ptp_clock_unregister(bp->ptp_clock);
+
+	gem_ptp_clear_timer(bp);
+
+	dev_info(&bp->pdev->dev, "%s ptp clock unregistered.\n",
+		 GEM_PTP_TIMER_NAME);
+}
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH net-next v4 2/2] macb: Enable 1588 support in SAMA5Dx platforms.
From: Andrei Pistirica @ 2016-12-14 12:56 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-arm-kernel, davem, nicolas.ferre,
	harinikatakamlinux, harini.katakam
  Cc: punnaia, michals, anirudh, boris.brezillon, alexandre.belloni,
	tbultel, richardcochran, rafalo, Andrei Pistirica
In-Reply-To: <1481720175-12703-1-git-send-email-andrei.pistirica@microchip.com>

This patch does the following:
- Enable HW time stamp for the following platforms: SAMA5D2, SAMA5D3 and
  SAMA5D4.
- HW time stamp capabilities are advertised via ethtool and macb ioctl is
  updated accordingly.
- HW time stamps for PTP Ethernet packets are delivered through the
  SO_TIMESTAMPING API; the timestamps are read from the PTP event/peer
  registers.

Note: Patch on net-next, on December 7th.

Signed-off-by: Andrei Pistirica <andrei.pistirica@microchip.com>
---
Patch history:

Version 1:
Integration with SAMA5D2 only. This feature wasn't tested on any
other platform that might use cadence/gem.

Patch is not completely ported to the very latest version of net-next,
and it will be after review.

Version 2 modifications:
- add PTP caps for SAMA5D2/3/4 platforms
- and cosmetic changes

Version 3 modifications:
- add support for sama5D2/3/4 platforms using GEM-PTP interface.

Version 4 modifications:
- time stamp only PTP_V2 events
- maximum adjustment value is set based on Richard's input

Note: Patch on net-next, on December 14th. 

 drivers/net/ethernet/cadence/macb.c | 168 ++++++++++++++++++++++++++++++++++--
 1 file changed, 163 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 538544a..8d5c976 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -714,6 +714,8 @@ static void macb_tx_interrupt(struct macb_queue *queue)
 
 			/* First, update TX stats if needed */
 			if (skb) {
+				gem_ptp_do_txstamp(bp, skb);
+
 				netdev_vdbg(bp->dev, "skb %u (data %p) TX complete\n",
 					    macb_tx_ring_wrap(bp, tail),
 					    skb->data);
@@ -878,6 +880,8 @@ static int gem_rx(struct macb *bp, int budget)
 		    GEM_BFEXT(RX_CSUM, ctrl) & GEM_RX_CSUM_CHECKED_MASK)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 
+		gem_ptp_do_rxstamp(bp, skb);
+
 		bp->stats.rx_packets++;
 		bp->stats.rx_bytes += skb->len;
 
@@ -2080,6 +2084,9 @@ static int macb_open(struct net_device *dev)
 
 	netif_tx_start_all_queues(dev);
 
+	if (bp->ptp_info)
+		bp->ptp_info->ptp_init(dev);
+
 	return 0;
 }
 
@@ -2101,6 +2108,9 @@ static int macb_close(struct net_device *dev)
 
 	macb_free_consistent(bp);
 
+	if (bp->ptp_info)
+		bp->ptp_info->ptp_remove(dev);
+
 	return 0;
 }
 
@@ -2374,6 +2384,133 @@ static int macb_set_ringparam(struct net_device *netdev,
 	return 0;
 }
 
+#ifdef CONFIG_MACB_USE_HWSTAMP
+static unsigned int gem_get_tsu_rate(struct macb *bp)
+{
+	/* Note: TSU rate is hardwired to PCLK. */
+	return clk_get_rate(bp->pclk);
+}
+
+static s32 gem_get_ptp_max_adj(void)
+{
+	return 3921508;
+}
+
+static int gem_get_ts_info(struct net_device *dev,
+			   struct ethtool_ts_info *info)
+{
+	struct macb *bp = netdev_priv(dev);
+
+	ethtool_op_get_ts_info(dev, info);
+	info->so_timestamping =
+		SOF_TIMESTAMPING_TX_SOFTWARE |
+		SOF_TIMESTAMPING_RX_SOFTWARE |
+		SOF_TIMESTAMPING_SOFTWARE |
+		SOF_TIMESTAMPING_TX_HARDWARE |
+		SOF_TIMESTAMPING_RX_HARDWARE |
+		SOF_TIMESTAMPING_RAW_HARDWARE;
+	info->phc_index = -1;
+
+	if (bp->ptp_clock)
+		info->phc_index = ptp_clock_index(bp->ptp_clock);
+
+	return 0;
+}
+
+static int gem_set_hwtst(struct net_device *netdev,
+			 struct ifreq *ifr, int cmd)
+{
+	struct hwtstamp_config config;
+	struct macb *priv = netdev_priv(netdev);
+	u32 regval;
+
+	netdev_vdbg(netdev, "macb_hwtstamp_ioctl\n");
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	/* reserved for future extensions */
+	if (config.flags)
+		return -EINVAL;
+
+	switch (config.tx_type) {
+	case HWTSTAMP_TX_OFF:
+		priv->hwts_tx_en = false;
+		break;
+	case HWTSTAMP_TX_ON:
+		priv->hwts_tx_en = true;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	switch (config.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		if (priv->hwts_rx_en)
+			priv->hwts_rx_en = false;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+		regval = macb_readl(priv, NCR);
+		macb_writel(priv, NCR, (regval | MACB_BIT(SRTSM)));
+
+		if (!priv->hwts_rx_en)
+			priv->hwts_rx_en = true;
+		break;
+	default:
+		config.rx_filter = HWTSTAMP_FILTER_NONE;
+		return -ERANGE;
+	}
+
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
+		-EFAULT : 0;
+}
+
+static int gem_get_hwtst(struct net_device *netdev,
+			 struct ifreq *ifr)
+{
+	struct hwtstamp_config config;
+	struct macb *priv = netdev_priv(netdev);
+	/* report the same rx filter that gem_set_hwtst() hands back */
+	config.flags = 0;
+	config.tx_type = priv->hwts_tx_en ? HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
+	config.rx_filter = priv->hwts_rx_en ?
+		HWTSTAMP_FILTER_PTP_V2_EVENT : HWTSTAMP_FILTER_NONE;
+
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
+		-EFAULT : 0;
+}
+
+static struct macb_ptp_info gem_ptp_info = {
+	.ptp_init	 = gem_ptp_init,
+	.ptp_remove	 = gem_ptp_remove,
+	.get_ptp_max_adj = gem_get_ptp_max_adj,
+	.get_tsu_rate	 = gem_get_tsu_rate,
+	.get_ts_info	 = gem_get_ts_info,
+	.get_hwtst	 = gem_get_hwtst,
+	.set_hwtst	 = gem_set_hwtst,
+};
+#endif
+
+static int macb_get_ts_info(struct net_device *netdev,
+			    struct ethtool_ts_info *info)
+{
+	struct macb *bp = netdev_priv(netdev);
+
+	if (bp->ptp_info)
+		return bp->ptp_info->get_ts_info(netdev, info);
+
+	return ethtool_op_get_ts_info(netdev, info);
+}
+
 static const struct ethtool_ops macb_ethtool_ops = {
 	.get_regs_len		= macb_get_regs_len,
 	.get_regs		= macb_get_regs,
@@ -2391,7 +2528,7 @@ static const struct ethtool_ops gem_ethtool_ops = {
 	.get_regs_len		= macb_get_regs_len,
 	.get_regs		= macb_get_regs,
 	.get_link		= ethtool_op_get_link,
-	.get_ts_info		= ethtool_op_get_ts_info,
+	.get_ts_info		= macb_get_ts_info,
 	.get_ethtool_stats	= gem_get_ethtool_stats,
 	.get_strings		= gem_get_ethtool_strings,
 	.get_sset_count		= gem_get_sset_count,
@@ -2404,6 +2541,7 @@ static const struct ethtool_ops gem_ethtool_ops = {
 static int macb_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
 	struct phy_device *phydev = dev->phydev;
+	struct macb *bp = netdev_priv(dev);
 
 	if (!netif_running(dev))
 		return -EINVAL;
@@ -2411,7 +2549,20 @@ static int macb_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	if (!phydev)
 		return -ENODEV;
 
-	return phy_mii_ioctl(phydev, rq, cmd);
+	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		if (bp->ptp_info)
+			return bp->ptp_info->set_hwtst(dev, rq, cmd);
+
+		return -EOPNOTSUPP;
+	case SIOCGHWTSTAMP:
+		if (bp->ptp_info)
+			return bp->ptp_info->get_hwtst(dev, rq);
+
+		return -EOPNOTSUPP;
+	default:
+		return phy_mii_ioctl(phydev, rq, cmd);
+	}
 }
 
 static int macb_set_features(struct net_device *netdev,
@@ -2485,6 +2636,12 @@ static void macb_configure_caps(struct macb *bp,
 		dcfg = gem_readl(bp, DCFG2);
 		if ((dcfg & (GEM_BIT(RX_PKT_BUFF) | GEM_BIT(TX_PKT_BUFF))) == 0)
 			bp->caps |= MACB_CAPS_FIFO_MODE;
+
+		/* iff HWSTAMP is configure and gem has the capability */
+#ifdef CONFIG_MACB_USE_HWSTAMP
+		if (gem_has_ptp(bp))
+			bp->ptp_info = &gem_ptp_info;
+#endif
 	}
 
 	dev_dbg(&bp->pdev->dev, "Cadence caps 0x%08x\n", bp->caps);
@@ -3041,7 +3198,7 @@ static const struct macb_config pc302gem_config = {
 };
 
 static const struct macb_config sama5d2_config = {
-	.caps = MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII,
+	.caps = MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII | MACB_CAPS_GEM_HAS_PTP,
 	.dma_burst_length = 16,
 	.clk_init = macb_clk_init,
 	.init = macb_init,
@@ -3049,14 +3206,15 @@ static const struct macb_config sama5d2_config = {
 
 static const struct macb_config sama5d3_config = {
 	.caps = MACB_CAPS_SG_DISABLED | MACB_CAPS_GIGABIT_MODE_AVAILABLE
-	      | MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII,
+	      | MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII
+	      | MACB_CAPS_GEM_HAS_PTP,
 	.dma_burst_length = 16,
 	.clk_init = macb_clk_init,
 	.init = macb_init,
 };
 
 static const struct macb_config sama5d4_config = {
-	.caps = MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII,
+	.caps = MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII | MACB_CAPS_GEM_HAS_PTP,
 	.dma_burst_length = 4,
 	.clk_init = macb_clk_init,
 	.init = macb_init,
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH] arp: do neigh confirm based on sk arg
From: kbuild test robot @ 2016-12-14 12:56 UTC (permalink / raw)
  To: YueHaibing
  Cc: kbuild-all, Julian Anastasov, Hannes Frederic Sowa, Eric Dumazet,
	David S. Miller, netdev
In-Reply-To: <74d41c47-7091-3c52-096d-5b9af2e0e9cf@huawei.com>

[-- Attachment #1: Type: text/plain, Size: 2997 bytes --]

Hi YueHaibing,

[auto build test WARNING on v4.9-rc8]
[cannot apply to net/master net-next/master sparc-next/master next-20161214]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/YueHaibing/arp-do-neigh-confirm-based-on-sk-arg/20161214-191755
config: openrisc-or1ksim_defconfig (attached as .config)
compiler: or32-linux-gcc (GCC) 4.5.1-or32-1.0rc1
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=openrisc 

All warnings (new ones prefixed by >>):

   net/ipv4/tcp_input.c: In function 'tcp_rcv_state_process':
>> net/ipv4/tcp_input.c:6003:21: warning: unused variable 'dst'

vim +/dst +6003 net/ipv4/tcp_input.c

^1da177e4 Linus Torvalds 2005-04-16  5987  			 */
168a8f580 Jerry Chu      2012-08-31  5988  			tcp_rearm_rto(sk);
168a8f580 Jerry Chu      2012-08-31  5989  		} else
^1da177e4 Linus Torvalds 2005-04-16  5990  			tcp_init_metrics(sk);
^1da177e4 Linus Torvalds 2005-04-16  5991  
c0402760f Yuchung Cheng  2016-09-19  5992  		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
02cf4ebd8 Neal Cardwell  2013-10-21  5993  			tcp_update_pacing_rate(sk);
02cf4ebd8 Neal Cardwell  2013-10-21  5994  
61eb90035 Joe Perches    2013-05-24  5995  		/* Prevent spurious tcp_cwnd_restart() on first data packet */
^1da177e4 Linus Torvalds 2005-04-16  5996  		tp->lsndtime = tcp_time_stamp;
^1da177e4 Linus Torvalds 2005-04-16  5997  
^1da177e4 Linus Torvalds 2005-04-16  5998  		tcp_initialize_rcv_mss(sk);
^1da177e4 Linus Torvalds 2005-04-16  5999  		tcp_fast_path_on(tp);
^1da177e4 Linus Torvalds 2005-04-16  6000  		break;
^1da177e4 Linus Torvalds 2005-04-16  6001  
c48b22daa Joe Perches    2013-05-24  6002  	case TCP_FIN_WAIT1: {
c48b22daa Joe Perches    2013-05-24 @6003  		struct dst_entry *dst;
c48b22daa Joe Perches    2013-05-24  6004  		int tmo;
c48b22daa Joe Perches    2013-05-24  6005  
168a8f580 Jerry Chu      2012-08-31  6006  		/* If we enter the TCP_FIN_WAIT1 state and we are a
168a8f580 Jerry Chu      2012-08-31  6007  		 * Fast Open socket and this is the first acceptable
168a8f580 Jerry Chu      2012-08-31  6008  		 * ACK we have received, this would have acknowledged
168a8f580 Jerry Chu      2012-08-31  6009  		 * our SYNACK so stop the SYNACK timer.
168a8f580 Jerry Chu      2012-08-31  6010  		 */
00db41243 Ian Morris     2015-04-03  6011  		if (req) {

:::::: The code at line 6003 was first introduced by commit
:::::: c48b22daa6062fff9eded311b4d6974c29b40487 tcp: Remove 2 indentation levels in tcp_rcv_state_process

:::::: TO: Joe Perches <joe@perches.com>
:::::: CC: David S. Miller <davem@davemloft.net>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 7277 bytes --]

^ permalink raw reply

* Re: [PATCH 1/1] Fixed to BUG_ON to WARN_ON def
From: Tariq Toukan @ 2016-12-14 12:58 UTC (permalink / raw)
  To: Leon Romanovsky, Ozgur Karatas, Tariq Toukan
  Cc: yishaih@mellanox.com, netdev, linux-kernel
In-Reply-To: <20161212181838.GB8204@mtr-leonro.local>

Thanks Ozgur for your report.


On 12/12/2016 8:18 PM, Leon Romanovsky wrote:
> On Mon, Dec 12, 2016 at 03:04:28PM +0200, Ozgur Karatas wrote:
>> Dear Romanovsky;
> Please avoid top-posting in your replies.
> Thanks
>
>> I'm trying to learn english and I apologize for my mistake words and phrases. So, I think the code when call to "sg_set_buf" and next time set memory and buffer. For example, isn't to call "WARN_ON" function, get a error to implicit declaration, right?
>>
>> Because, you will use to "BUG_ON" get a error implicit declaration of functions.
> I'm not sure that I followed you. mem->offset is set by sg_set_buf from
> buf variable returned by dma_alloc_coherent(). HW needs to get very
> precise size of this buf, in multiple of pages and aligned to pages
> boundaries.
>
>>          sg_set_buf(mem, buf, PAGE_SIZE << order);
>>          WARN_ON(mem->offset);
> See the patch inline which removes this BUG_ON in proper and safe way.
>
>  From 7babe807affa2b27d51d3610afb75b693929ea1a Mon Sep 17 00:00:00 2001
> From: Leon Romanovsky <leonro@mellanox.com>
> Date: Mon, 12 Dec 2016 20:02:45 +0200
> Subject: [PATCH] net/mlx4: Remove BUG_ON from ICM allocation routine
>
> This patch removes BUG_ON() macro from mlx4_alloc_icm_coherent()
> by checking DMA address alignment in advance and performing proper
> folding in case of error.
>
> Fixes: 5b0bf5e25efe ("mlx4_core: Support ICM tables in coherent memory")
> Reported-by: Ozgur Karatas <okaratas@member.fsf.org>
> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> ---
>   drivers/net/ethernet/mellanox/mlx4/icm.c | 7 ++++++-
>   1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
> index 2a9dd46..e1f9e7c 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
> @@ -118,8 +118,13 @@ static int mlx4_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
>   	if (!buf)
>   		return -ENOMEM;
>
> +	if (offset_in_page(buf)) {
> +		dma_free_coherent(dev, PAGE_SIZE << order,
> +				  buf, sg_dma_address(mem));
> +		return -ENOMEM;
> +	}
> +
>   	sg_set_buf(mem, buf, PAGE_SIZE << order);
> -	BUG_ON(mem->offset);
>   	sg_dma_len(mem) = PAGE_SIZE << order;
>   	return 0;
>   }
> --
Thanks Leon for the patch. It is the right way to do so.
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>

We will submit Leon's patch in a new email.

Regards,
Tariq
> 2.10.2
>

^ permalink raw reply

* Re: Synopsys Ethernet QoS
From: Pavel Machek @ 2016-12-14 12:57 UTC (permalink / raw)
  To: Niklas Cassel
  Cc: Giuseppe CAVALLARO, Joao Pinto, Florian Fainelli, Andy Shevchenko,
	David Miller, larper, rabinv, netdev, CARLOS.PALMINHA, Jie.Deng1,
	Stephen Warren
In-Reply-To: <99424968-ad8f-fec6-ebcf-ab7b19ee5486@axis.com>

[-- Attachment #1: Type: text/plain, Size: 1102 bytes --]

Hi!

> So if there is a long time before handling interrupts,
> I guess that it makes sense that one stream could
> get an advantage in the net scheduler.
> 
> If I find the time, and if no one beats me to it, I will try to replace
> the normal timers with HR timers + a smaller default timeout.
> 

Can you try something like this? Highres timers will be needed, too,
but this fixes the logic problem.

You'll need to apply it twice as code is copy&pasted.

Best regards,
									Pavel

+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c

 	 */
 	priv->tx_count_frames += nfrags + 1;
 	if (likely(priv->tx_coal_frames > priv->tx_count_frames)) {
-		mod_timer(&priv->txtimer,
-			  STMMAC_COAL_TIMER(priv->tx_coal_timer));
+		if (priv->tx_count_frames == nfrags + 1)
+			mod_timer(&priv->txtimer,
+				  STMMAC_COAL_TIMER(priv->tx_coal_timer));
 	} else {
 		priv->tx_count_frames = 0;
 		priv->hw->desc->set_tx_ic(desc);


-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: [PATCH v2 1/4] siphash: add cryptographically secure hashtable function
From: Jason A. Donenfeld @ 2016-12-14 13:10 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Netdev, kernel-hardening, LKML, Linux Crypto Mailing List,
	Jean-Philippe Aumasson, Daniel J . Bernstein, Linus Torvalds,
	Eric Biggers
In-Reply-To: <516c5633-14c2-ee18-90e4-84d73870ba2c@stressinduktion.org>

Hi Hannes,

On Wed, Dec 14, 2016 at 12:21 PM, Hannes Frederic Sowa
<hannes@stressinduktion.org> wrote:
> Can you show or cite benchmarks in comparison with jhash? Last time I
> looked, especially for short inputs, siphash didn't beat jhash (also on
> all the 32 bit devices etc.).

I assume that jhash is likely faster than siphash, but I wouldn't be
surprised if with optimization we can make siphash at least pretty
close on 64-bit platforms. (I'll do some tests though; maybe I'm wrong
and jhash is already slower.)

With that said, siphash is here to replace uses of jhash where
hashtable poisoning vulnerabilities make it necessary. Where there's
no significant security improvement, if there's no speed improvement
either, then of course nothing's going to change.

I should have mentioned md5_transform in this first message too, as
two other patches in this series actually replace md5_transform usage
with siphash. I think in this case, siphash is a clear performance
winner (and security winner) over md5_transform. So if the push back
against replacing jhash usages is just too high, at the very least it
remains useful already for the md5_transform usage.

> This pretty much depends on the linearity of the hash function? I don't
> think a crypto secure hash function is needed for a hash table. Albeit I
> agree that siphash certainly looks good to be used here.

In order to prevent the aforementioned poisoning attacks, a PRF with
perfect linearity is required, which is what's achieved when it's a
cryptographically secure one. Check out section 7 of
https://131002.net/siphash/siphash.pdf .

> I am pretty sure that SipHash still needs a random key per hash table
> also. So far it was only the choice of hash function you are questioning.

Siphash needs a random secret key, yes. The point is that the hash
function remains secure so long as the secret key is kept secret.
Other functions can't make the same guarantee, and so nervous periodic
key rotation is necessary, but in most cases nothing is done, and so
things just leak over time.

> Hmm, I tried to follow up with all the HashDoS work and so far didn't
> see any HashDoS attacks against the Jenkins/SpookyHash family.
>
> If this is an issue we might need to also put those changes into stable.

jhash just isn't secure; it's not a cryptographically secure PRF. If
there hasn't already been an academic paper put out there about it
this year, let's make this thread 1000 messages long to garner
attention, and next year perhaps we'll see one. No doubt that
motivated government organizations, defense contractors, criminals,
and other netizens have already done research in private. Replacing
insecure functions with secure functions is usually a good thing.

Jason

^ permalink raw reply

* Re: [PATCH v2 3/4] secure_seq: use siphash24 instead of md5_transform
From: Hannes Frederic Sowa @ 2016-12-14 13:16 UTC (permalink / raw)
  To: Jason A. Donenfeld, David Laight
  Cc: Netdev, kernel-hardening, Andi Kleen, LKML,
	Linux Crypto Mailing List
In-Reply-To: <CAHmME9pEM=cDC5S=j1BU2oCF8-WdnbRfiVojcet4rXcRLcpJRw@mail.gmail.com>

On 14.12.2016 13:53, Jason A. Donenfeld wrote:
> Hi David,
> 
> On Wed, Dec 14, 2016 at 10:51 AM, David Laight <David.Laight@aculab.com> wrote:
>> From: Jason A. Donenfeld
>>> Sent: 14 December 2016 00:17
>>> This gives a clear speed and security improvement. Rather than manually
>>> filling MD5 buffers, we simply create a layout by a simple anonymous
>>> struct, for which gcc generates rather efficient code.
>> ...
>>> +     const struct {
>>> +             struct in6_addr saddr;
>>> +             struct in6_addr daddr;
>>> +             __be16 sport;
>>> +             __be16 dport;
>>> +     } __packed combined = {
>>> +             .saddr = *(struct in6_addr *)saddr,
>>> +             .daddr = *(struct in6_addr *)daddr,
>>> +             .sport = sport,
>>> +             .dport = dport
>>> +     };
>>
>> You need to look at the effect of marking this (and the other)
>> structures 'packed' on architectures like sparc64.
> 
> In all current uses of __packed in the code, I think the impact is
> precisely zero, because all structures have members in descending
> order of size, with each member being a perfect multiple of the one
> below it. The __packed is therefore just there for safety, in case
> somebody comes in and screws everything up by sticking a u8 in
> between. In that case, it wouldn't be desirable to hash the structure
> padding bits. In the worst case, I don't believe the impact would be
> worse than a byte-by-byte memcpy, which is what the old code did. But
> anyway, these structures are already naturally packed anyway, so the
> present impact is nil.

__packed not only removes all padding of the struct but also changes the
alignment assumptions for the whole struct itself. The rule, the struct
is aligned by its maximum alignment of a member is no longer true. That
said, the code accessing this struct will change (not on archs that can
deal efficiently with unaligned access, but on others).

A proper test for not introducing padding is to use something with
BUILD_BUG_ON. Also, gcc clears the padding of the struct, so padding
shouldn't be that bad in there (it is better than byte access on mips).

Btw. I think gcc7 gets support for store merging optimization.

Bye,
Hannes

^ permalink raw reply

* Re: [PATCH 3/3] netns: fix net_generic() "id - 1" bloat
From: Alexey Dobriyan @ 2016-12-14 13:19 UTC (permalink / raw)
  To: David Laight
  Cc: davem@davemloft.net, netdev@vger.kernel.org, xemul@openvz.org
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DB023DBE9@AcuExch.aculab.com>

On Tue, Dec 13, 2016 at 5:42 PM, David Laight <David.Laight@aculab.com> wrote:
> From: Alexey Dobriyan
>> Sent: 13 December 2016 14:23
> ...
>> Well, the point of the patch is to save .text, so might as well save
>> as much as possible. Any form other than "ptr[id]" is going
>> to be either bigger or bigger and slower and "ptr" should be the first field.
>
> You've not read and understood the next bit:
>
>> > However if you offset the 'id' values so that only
>> > values 2 up are valid the code becomes:
>> >         return net->gen2->ptr[id - 2];
>> > which will be exactly the same code as:
>> >         return net->gen1->ptr[id];
>> > but it is much more obvious that 'id' values must be >= 2.
>> >
>> > The '2' should be generated from the structure offset, but with my method
>> > it doesn't actually matter if it is wrong.
>
> If you have foo->bar[id - const] then the compiler has to add the
> offset of 'bar' and subtract for 'const'.
> If the numbers match no add or subtract is needed.
>
> It is much cleaner to do this by explicitly removing the offset on the
> accesses than using a union.

Surprisingly, the trick only works if array index is cast to "unsigned long"
before subtracting.

Code becomes

    ...
    ptr = ng->ptr[(unsigned long)id - 3];
    ...

I'll post a patch when net-next reopens.

^ permalink raw reply

* Re: Synopsys Ethernet QoS
From: Joao Pinto @ 2016-12-14 13:14 UTC (permalink / raw)
  To: Pavel Machek, Niklas Cassel
  Cc: Giuseppe CAVALLARO, Joao Pinto, Florian Fainelli, Andy Shevchenko,
	David Miller, larper, rabinv, netdev, CARLOS.PALMINHA, Jie.Deng1,
	Stephen Warren
In-Reply-To: <20161214125735.GA19542@amd>


Hi,

Às 12:57 PM de 12/14/2016, Pavel Machek escreveu:
> Hi!
> 
>> So if there is a long time before handling interrupts,
>> I guess that it makes sense that one stream could
>> get an advantage in the net scheduler.
>>
>> If I find the time, and if no one beats me to it, I will try to replace
>> the normal timers with HR timers + a smaller default timeout.
>>
> 
> Can you try something like this? Highres timers will be needed, too,
> but this fixes the logic problem.
> 
> You'll need to apply it twice as code is copy&pasted.
> 
> Best regards,
> 									Pavel
> 
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> 
>  	 */
>  	priv->tx_count_frames += nfrags + 1;
>  	if (likely(priv->tx_coal_frames > priv->tx_count_frames)) {
> -		mod_timer(&priv->txtimer,
> -			  STMMAC_COAL_TIMER(priv->tx_coal_timer));
> +		if (priv->tx_count_frames == nfrags + 1)
> +			mod_timer(&priv->txtimer,
> +				  STMMAC_COAL_TIMER(priv->tx_coal_timer));
>  	} else {
>  		priv->tx_count_frames = 0;
>  		priv->hw->desc->set_tx_ic(desc);
> 
> 

I know that this is completely off topic, but I am facing a difficulty with
stmmac. I have interrupts, the MAC is well configured, and rx packets are being
received successfully, but TX is not working, resulting in Tx errors = Total TX
packets. I have done a lot of debugging and my conclusion is that, for some
reason, when using stmmac after starting tx dma, the hw state machine enters a
dead-end state resulting in those errors. Has anyone faced this trouble?

Thanks.

^ permalink raw reply

* Re: [PATCHv3 perf/core 0/7] Reuse libbpf from samples/bpf
From: Arnaldo Carvalho de Melo @ 2016-12-14 13:25 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Joe Stringer, linux-kernel, netdev, wangnan0, ast
In-Reply-To: <584ACE2E.2090108@iogearbox.net>

Em Fri, Dec 09, 2016 at 04:30:54PM +0100, Daniel Borkmann escreveu:
> Hi Arnaldo,
> 
> On 12/09/2016 04:09 PM, Arnaldo Carvalho de Melo wrote:
> > Em Thu, Dec 08, 2016 at 06:46:13PM -0800, Joe Stringer escreveu:
> > > (Was "libbpf: Synchronize implementations")
> > > 
> > > Update tools/lib/bpf to provide the remaining bpf wrapper pieces needed by the
> > > samples/bpf/ code, then get rid of all of the duplicate BPF libraries in
> > > samples/bpf/libbpf.[ch].
> > > 
> > > ---
> > > v3: Add ack for first patch.
> > >      Split out second patch from v2 into separate changes for remaining diff.
> > >      Add patches to switch samples/bpf over to using tools/lib/.
> > > v2: https://www.mail-archive.com/netdev@vger.kernel.org/msg135088.html
> > >      Don't shift non-bpf code into libbpf.
> > >      Drop the patch to synchronize ELF definitions with tc.
> > > v1: https://www.mail-archive.com/netdev@vger.kernel.org/msg135088.html
> > >      First post.
> > 
> > Thanks, applied after addressing the -I$(objtree) issue raised by Wang,
> 
> [ Sorry for late reply. ]
> 
> First of all, glad to see us getting rid of the duplicate lib eventually! :)
> 
> Please note that this might result in hopefully just a minor merge issue
> with net-next. Looks like patch 4/7 touches test_maps.c and test_verifier.c,
> which moved to a new bpf selftest suite [1] this net-next cycle. Seems it's
> just log buffer and some renames there, which can be discarded for both
> files sitting in selftests.

Yeah, I've got to this point, and the merge has a little bit more than
that, including BPF_PROG_ATTACH/BPF_PROG_DETACH, etc, working on it...

- Arnaldo

^ permalink raw reply

* Re: [net-next PATCH v5 1/6] net: virtio dynamically disable/enable LRO
From: Michael S. Tsirkin @ 2016-12-14 13:31 UTC (permalink / raw)
  To: John Fastabend
  Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
	netdev, brouer
In-Reply-To: <5849F52A.7050105@gmail.com>

On Thu, Dec 08, 2016 at 04:04:58PM -0800, John Fastabend wrote:
> On 16-12-08 01:36 PM, Michael S. Tsirkin wrote:
> > On Wed, Dec 07, 2016 at 12:11:11PM -0800, John Fastabend wrote:
> >> This adds support for dynamically setting the LRO feature flag. The
> >> message to control guest features in the backend uses the
> >> CTRL_GUEST_OFFLOADS msg type.
> >>
> >> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> >> ---
> >>  drivers/net/virtio_net.c |   40 +++++++++++++++++++++++++++++++++++++++-
> >>  1 file changed, 39 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >> index a21d93a..a5c47b1 100644
> >> --- a/drivers/net/virtio_net.c
> >> +++ b/drivers/net/virtio_net.c
> >> @@ -1419,6 +1419,36 @@ static void virtnet_init_settings(struct net_device *dev)
> >>  	.set_settings = virtnet_set_settings,
> >>  };
> >>  
> >> +static int virtnet_set_features(struct net_device *netdev,
> >> +				netdev_features_t features)
> >> +{
> >> +	struct virtnet_info *vi = netdev_priv(netdev);
> >> +	struct virtio_device *vdev = vi->vdev;
> >> +	struct scatterlist sg;
> >> +	u64 offloads = 0;
> >> +
> >> +	if (features & NETIF_F_LRO)
> >> +		offloads |= (1 << VIRTIO_NET_F_GUEST_TSO4) |
> >> +			    (1 << VIRTIO_NET_F_GUEST_TSO6);
> >> +
> >> +	if (features & NETIF_F_RXCSUM)
> >> +		offloads |= (1 << VIRTIO_NET_F_GUEST_CSUM);
> >> +
> >> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
> >> +		sg_init_one(&sg, &offloads, sizeof(uint64_t));
> >> +		if (!virtnet_send_command(vi,
> >> +					  VIRTIO_NET_CTRL_GUEST_OFFLOADS,
> >> +					  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
> >> +					  &sg)) {
> > 
> > Hmm I just realised that this will slow down setups that bridge
> > virtio net interfaces since bridge calls this if provided.
> > See below.
> 
> 
> Really? What code is trying to turn off GRO via the GUEST_OFFLOADS LRO
> command. My qemu/Linux setup has a set of tap/vhost devices attached to
> a bridge and all of them have LRO enabled even with this patch series.
> 
> I must be missing a setup handler somewhere?
> 
> > 
> >> +			dev_warn(&netdev->dev,
> >> +				 "Failed to set guest offloads by virtnet command.\n");
> >> +			return -EINVAL;
> >> +		}
> >> +	}
> > 
> > Hmm if VIRTIO_NET_F_CTRL_GUEST_OFFLOADS is off, this fails
> > silently. It might actually be a good idea to avoid
> > breaking setups.
> > 
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>  static const struct net_device_ops virtnet_netdev = {
> >>  	.ndo_open            = virtnet_open,
> >>  	.ndo_stop   	     = virtnet_close,
> >> @@ -1435,6 +1465,7 @@ static void virtnet_init_settings(struct net_device *dev)
> >>  #ifdef CONFIG_NET_RX_BUSY_POLL
> >>  	.ndo_busy_poll		= virtnet_busy_poll,
> >>  #endif
> >> +	.ndo_set_features	= virtnet_set_features,
> >>  };
> >>  
> >>  static void virtnet_config_changed_work(struct work_struct *work)
> >> @@ -1815,6 +1846,12 @@ static int virtnet_probe(struct virtio_device *vdev)
> >>  	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
> >>  		dev->features |= NETIF_F_RXCSUM;
> >>  
> >> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) &&
> >> +	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) {
> >> +		dev->features |= NETIF_F_LRO;
> >> +		dev->hw_features |= NETIF_F_LRO;
> > 
> > So the issue is I think that the virtio "LRO" isn't really
> > LRO, it's typically just GRO forwarded to guests.
> > So these are easily re-split along MTU boundaries,
> > which makes it ok to forward these across bridges.
> > 
> > It's not nice that we don't document this in the spec,
> > but it's the reality and people rely on this.
> > 
> > For now, how about doing a custom thing and just disable/enable
> > it as XDP is attached/detached?
> 
> The annoying part about doing this is ethtool will say that it is fixed
> yet it will be changed by seemingly unrelated operation. I'm not sure I
> like the idea to start automatically configuring the link via xdp_set.

I really don't like the idea of dropping performance
by a factor of 3 for people bridging two virtio net
interfaces.

So how about a simple approach for now, just disable
XDP if GUEST_TSO is enabled?

We can discuss better approaches in next version.


> > 
> >> +	}
> >> +
> >>  	dev->vlan_features = dev->features;
> >>  
> >>  	/* MTU range: 68 - 65535 */
> >> @@ -2057,7 +2094,8 @@ static int virtnet_restore(struct virtio_device *vdev)
> >>  	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
> >>  	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
> >>  	VIRTIO_NET_F_CTRL_MAC_ADDR, \
> >> -	VIRTIO_NET_F_MTU
> >> +	VIRTIO_NET_F_MTU, \
> >> +	VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
> >>  
> >>  static unsigned int features[] = {
> >>  	VIRTNET_FEATURES,

^ permalink raw reply

* Re: [PATCH] arp: do neigh confirm based on sk arg
From: kbuild test robot @ 2016-12-14 13:37 UTC (permalink / raw)
  To: YueHaibing
  Cc: kbuild-all, Julian Anastasov, Hannes Frederic Sowa, Eric Dumazet,
	David S. Miller, netdev
In-Reply-To: <74d41c47-7091-3c52-096d-5b9af2e0e9cf@huawei.com>

[-- Attachment #1: Type: text/plain, Size: 2920 bytes --]

Hi YueHaibing,

[auto build test WARNING on v4.9-rc8]
[cannot apply to net/master net-next/master sparc-next/master next-20161214]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/YueHaibing/arp-do-neigh-confirm-based-on-sk-arg/20161214-191755
reproduce: make htmldocs

All warnings (new ones prefixed by >>):

>> include/net/sock.h:452: warning: No description found for parameter 'sk_dst_pending_confirm'

vim +/sk_dst_pending_confirm +452 include/net/sock.h

^1da177e4 Linus Torvalds  2005-04-16  436  	int			sk_write_pending;
4ea59a6cc YueHaibing      2016-12-14  437  	unsigned short          sk_dst_pending_confirm;
d5f642384 Alexey Dobriyan 2008-11-04  438  #ifdef CONFIG_SECURITY
^1da177e4 Linus Torvalds  2005-04-16  439  	void			*sk_security;
d5f642384 Alexey Dobriyan 2008-11-04  440  #endif
2a56a1fec Tejun Heo       2015-12-07  441  	struct sock_cgroup_data	sk_cgrp_data;
baac50bbc Johannes Weiner 2016-01-14  442  	struct mem_cgroup	*sk_memcg;
^1da177e4 Linus Torvalds  2005-04-16  443  	void			(*sk_state_change)(struct sock *sk);
676d23690 David S. Miller 2014-04-11  444  	void			(*sk_data_ready)(struct sock *sk);
^1da177e4 Linus Torvalds  2005-04-16  445  	void			(*sk_write_space)(struct sock *sk);
^1da177e4 Linus Torvalds  2005-04-16  446  	void			(*sk_error_report)(struct sock *sk);
^1da177e4 Linus Torvalds  2005-04-16  447  	int			(*sk_backlog_rcv)(struct sock *sk,
^1da177e4 Linus Torvalds  2005-04-16  448  						  struct sk_buff *skb);
^1da177e4 Linus Torvalds  2005-04-16  449  	void                    (*sk_destruct)(struct sock *sk);
ef456144d Craig Gallek    2016-01-04  450  	struct sock_reuseport __rcu	*sk_reuseport_cb;
a4298e452 Eric Dumazet    2016-04-01  451  	struct rcu_head		sk_rcu;
^1da177e4 Linus Torvalds  2005-04-16 @452  };
^1da177e4 Linus Torvalds  2005-04-16  453  
559835ea7 Pravin B Shelar 2013-09-24  454  #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
559835ea7 Pravin B Shelar 2013-09-24  455  
559835ea7 Pravin B Shelar 2013-09-24  456  #define rcu_dereference_sk_user_data(sk)	rcu_dereference(__sk_user_data((sk)))
559835ea7 Pravin B Shelar 2013-09-24  457  #define rcu_assign_sk_user_data(sk, ptr)	rcu_assign_pointer(__sk_user_data((sk)), ptr)
559835ea7 Pravin B Shelar 2013-09-24  458  
4a17fd522 Pavel Emelyanov 2012-04-19  459  /*
4a17fd522 Pavel Emelyanov 2012-04-19  460   * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK

:::::: The code at line 452 was first introduced by commit
:::::: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Linux-2.6.12-rc2

:::::: TO: Linus Torvalds <torvalds@ppc970.osdl.org>
:::::: CC: Linus Torvalds <torvalds@ppc970.osdl.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 6425 bytes --]

^ permalink raw reply

* Re: [PATCHv2 2/5] sh_eth: enable wake-on-lan for Gen2 devices
From: Sergei Shtylyov @ 2016-12-14 13:37 UTC (permalink / raw)
  To: Niklas Söderlund, Simon Horman, netdev, linux-renesas-soc
  Cc: Geert Uytterhoeven
In-Reply-To: <20161212160931.6478-3-niklas.soderlund+renesas@ragnatech.se>

Hello!

    You forgot "R-Car" before "Gen2" in the subject.

On 12/12/2016 07:09 PM, Niklas Söderlund wrote:

> Tested on Gen2 r8a7791/Koelsch.
>
> Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
> ---
>  drivers/net/ethernet/renesas/sh_eth.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
> index 87640b9..348ed22 100644
> --- a/drivers/net/ethernet/renesas/sh_eth.c
> +++ b/drivers/net/ethernet/renesas/sh_eth.c
> @@ -624,8 +624,9 @@ static struct sh_eth_cpu_data r8a779x_data = {
>
>  	.register_type	= SH_ETH_REG_FAST_RCAR,
>
> -	.ecsr_value	= ECSR_PSRTO | ECSR_LCHNG | ECSR_ICD,
> -	.ecsipr_value	= ECSIPR_PSRTOIP | ECSIPR_LCHNGIP | ECSIPR_ICDIP,
> +	.ecsr_value	= ECSR_PSRTO | ECSR_LCHNG | ECSR_ICD | ECSR_MPD,
> +	.ecsipr_value	= ECSIPR_PSRTOIP | ECSIPR_LCHNGIP | ECSIPR_ICDIP |
> +			  ECSIPR_MPDIP,

   These expressions seem to have been sorted by the bit # before your patch, 
now they aren't... care to fix? :-)

[...]

MBR, Sergei

^ permalink raw reply

* Re: [PATCH v2 3/4] secure_seq: use siphash24 instead of md5_transform
From: Jason A. Donenfeld @ 2016-12-14 13:44 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: David Laight, Netdev, kernel-hardening, Andi Kleen, LKML,
	Linux Crypto Mailing List
In-Reply-To: <1e502c6b-cda3-c46d-2535-fcfb58f443a9@stressinduktion.org>

Hi Hannes,

Thanks for the feedback.

> __packed not only removes all padding of the struct but also changes the
> alignment assumptions for the whole struct itself. The rule, the struct
> is aligned by its maximum alignment of a member is no longer true. That
> said, the code accessing this struct will change (not on archs that can
> deal efficiently with unaligned access, but on others).

That's interesting. There currently aren't any alignment requirements
in siphash because we use the unaligned helper functions, but as David
pointed out in another thread, maybe that too should change. In that
case, we'd have an aligned-only version of the function that requires
8-byte aligned input. Perhaps the best way to go about that would be
to just mark the struct as __packed __aligned(8). Or, I guess, since
64-bit accesses get split into two on 32-bit, that'd be best described
as __packed __aligned(sizeof(long)). Would that be an acceptable
solution?

Jason

^ permalink raw reply

* Re: [PATCH 2/3] selftests: do not require bash to run bpf tests
From: Shuah Khan @ 2016-12-14 14:22 UTC (permalink / raw)
  To: Daniel Borkmann, Rolf Eike Beer, linux-kselftest
  Cc: David S. Miller, netdev, Alexei Starovoitov, linux-kernel,
	Shuah Khan, Shuah Khan
In-Reply-To: <5851270F.4090709@iogearbox.net>

On 12/14/2016 04:03 AM, Daniel Borkmann wrote:
> On 12/14/2016 11:58 AM, Rolf Eike Beer wrote:
>>  From b9d6c1b7427d708ef2d4d57aac17b700b3694d71 Mon Sep 17 00:00:00 2001
>> From: Rolf Eike Beer <eike-kernel@sf-tec.de>
>> Date: Wed, 14 Dec 2016 09:58:12 +0100
>> Subject: [PATCH 2/3] selftests: do not require bash to run bpf tests
>>
>> Nothing in this minimal script seems to require bash. We often run these tests
>> on embedded devices where the only shell available is the busybox ash.
>>
>> Signed-off-by: Rolf Eike Beer <eb@emlix.com>
> 
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>

Thanks. I will get these into 4.10-rc1 or rc2

-- Shuah

^ permalink raw reply

* Re: [PATCH net-next] net: remove abuse of VLAN DEI/CFI bit
From: Michał Mirosław @ 2016-12-14 14:28 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: open list:OPENVSWITCH, netdev-u79uwXL29TY76Z2rM5mHXA,
	moderated list:ETHERNET BRIDGE
In-Reply-To: <20161213172118.2f55c503@xeon-e3>

On Tue, Dec 13, 2016 at 05:21:18PM -0800, Stephen Hemminger wrote:
> On Sat,  3 Dec 2016 10:22:28 +0100 (CET)
> Michał Mirosław <mirq-linux-CoA6ZxLDdyEEUmgCuDUIdw@public.gmane.org> wrote:
> 
> > This All-in-one patch removes abuse of VLAN CFI bit, so it can be passed
> > intact through linux networking stack.
> > 
> > Signed-off-by: Michał Mirosław <michal.miroslaw-sjE0K2xrq/hHxbwTTUZ4aWZHpeb/A1Y/@public.gmane.org>
> > ---
> > 
> > Dear NetDevs
> > 
> > I guess this needs to be split to the prep..convert[]..finish sequence,
> > but if you like it as is, then it's ready.
> > 
> > The biggest question is if the modified interface and vlan_present
> > is the way to go. This can be changed to use vlan_proto != 0 instead
> > of an extra flag bit.
> > 
> > As I can't test most of the driver changes, please look at them carefully.
> > OVS and bridge eyes are especially welcome.
> > 
> > Best Regards,
> > Michał Mirosław
> 
> Is the motivation to support 802.1ad Drop Eligible Indicator (DEI)?
> 
> If so then you need to be more verbose in the commit log, and lots more
> work is needed. You need to rename fields and validate every place a
> driver is using DEI bit to make sure it really does the right thing
> on that hardware. It is not just a mechanical change.

There are not many mentions of CFI bit in the Linux tree. Places that
used it as VLAN_TAG_PRESENT are fixed with this patchset. Other uses are:

 - VLAN code: ignored
 - ebt_vlan: ignored
 - OVS: cleared because of netlink API assumptions
 - DSA: transferred to/from (E)DSA tag
 - drivers: gianfar: uses properly in filtering rules
 - drivers: cnic: false-positive (uses only VLAN ID, CFI bit marks the field 'valid')
 - drivers: qedr: false-positive (like cnic)

So unless there is something hidden in the hardware, no driver does anything
special with the CFI bit.

After this patchset only OVS will need further modifications to be able to
support handling of DEI bit.

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [v1] net:ethernet:cavium:octeon:octeon_mgmt: Handle return NULL error from devm_ioremap
From: kbuild test robot @ 2016-12-14 14:40 UTC (permalink / raw)
  To: Arvind Yadav; +Cc: kbuild-all, peter.chen, fw, netdev, linux-kernel
In-Reply-To: <1481639670-17888-1-git-send-email-arvind.yadav.cs@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1852 bytes --]

Hi Arvind,

[auto build test ERROR on net-next/master]
[also build test ERROR on v4.9 next-20161214]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Arvind-Yadav/net-ethernet-cavium-octeon-octeon_mgmt-Handle-return-NULL-error-from-devm_ioremap/20161213-224624
config: mips-cavium_octeon_defconfig (attached as .config)
compiler: mips64-linux-gnuabi64-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=mips 

All errors (new ones prefixed by >>):

   drivers/net/ethernet/cavium/octeon/octeon_mgmt.c: In function 'octeon_mgmt_probe':
>> drivers/net/ethernet/cavium/octeon/octeon_mgmt.c:1473:11: error: 'dev' undeclared (first use in this function)
      dev_err(dev, "failed to map I/O memory\n");
              ^~~
   drivers/net/ethernet/cavium/octeon/octeon_mgmt.c:1473:11: note: each undeclared identifier is reported only once for each function it appears in

vim +/dev +1473 drivers/net/ethernet/cavium/octeon/octeon_mgmt.c

  1467	
  1468		p->mix = (u64)devm_ioremap(&pdev->dev, p->mix_phys, p->mix_size);
  1469		p->agl = (u64)devm_ioremap(&pdev->dev, p->agl_phys, p->agl_size);
  1470		p->agl_prt_ctl = (u64)devm_ioremap(&pdev->dev, p->agl_prt_ctl_phys,
  1471						   p->agl_prt_ctl_size);
  1472		if (!p->mix || !p->agl || !p->agl_prt_ctl) {
> 1473			dev_err(dev, "failed to map I/O memory\n");
  1474			result = -ENOMEM;
  1475			goto err;
  1476		}

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 15718 bytes --]

^ permalink raw reply

* RE: [PATCH 3/3] netns: fix net_generic() "id - 1" bloat
From: David Laight @ 2016-12-14 14:41 UTC (permalink / raw)
  To: 'Alexey Dobriyan'
  Cc: davem@davemloft.net, netdev@vger.kernel.org, xemul@openvz.org
In-Reply-To: <CACVxJT9HAMex1Pa+VbTVJwDPuY_dnfgfQgAZFSjMuUmgCiZD1w@mail.gmail.com>

From: Alexey Dobriyan 
> Sent: 14 December 2016 13:20
...
> > If you have foo->bar[id - const] then the compiler has to add the
> > offset of 'bar' and subtract for 'const'.
> > If the numbers match no add or subtract is needed.
> >
> > It is much cleaner to do this by explicitly removing the offset on the
> > accesses than using a union.
> 
> Surprisingly, the trick only works if array index is cast to "unsigned long"
> before subtracting.
> 
> Code becomes
> 
>     ...
>     ptr = ng->ptr[(unsigned long)id - 3];
>     ...

The compiler may also be able to optimise it away if 'id' is 'int'
rather than 'unsigned int'.

Oh, if you need casts like that use an accessor function.

	David



^ permalink raw reply

* RE: [PATCH v2 3/4] secure_seq: use siphash24 instead of md5_transform
From: David Laight @ 2016-12-14 14:47 UTC (permalink / raw)
  To: 'Jason A. Donenfeld', Hannes Frederic Sowa
  Cc: Netdev, kernel-hardening@lists.openwall.com, Andi Kleen, LKML,
	Linux Crypto Mailing List
In-Reply-To: <CAHmME9o4NVi-MPeURio1Ga58rnW6JAGQdTg6scd+K3EZEf3RNA@mail.gmail.com>

From: Jason A. Donenfeld
> Sent: 14 December 2016 13:44
> To: Hannes Frederic Sowa
> > __packed not only removes all padding of the struct but also changes the
> > alignment assumptions for the whole struct itself. The rule, the struct
> > is aligned by its maximum alignment of a member is no longer true. That
> > said, the code accessing this struct will change (not on archs that can
> > deal efficiently with unaligned access, but on others).
> 
> That's interesting. There currently aren't any alignment requirements
> in siphash because we use the unaligned helper functions, but as David
> pointed out in another thread, maybe that too should change. In that
> case, we'd have an aligned-only version of the function that requires
> 8-byte aligned input. Perhaps the best way to go about that would be
> to just mark the struct as __packed __aligned(8). Or, I guess, since
> 64-bit accesses get split into two on 32-bit, that'd be best described
> as __packed __aligned(sizeof(long)). Would that be an acceptable
> solution?

Just remove the __packed and ensure that the structure is 'nice'.
This includes ensuring there is no 'tail padding'.
In some cases you'll need to put the port number into a 32bit field.

I'd also require that the key be aligned.
It probably ought to be a named structure type with two 64bit members
(or with an array member that has two elements).

	David


^ permalink raw reply

* Re: [PATCHv3 perf/core 0/7] Reuse libbpf from samples/bpf
From: Arnaldo Carvalho de Melo @ 2016-12-14 14:55 UTC (permalink / raw)
  To: Daniel Borkmann, Joe Stringer; +Cc: linux-kernel, netdev, wangnan0, ast
In-Reply-To: <20161214132501.GP5482@kernel.org>

Em Wed, Dec 14, 2016 at 10:25:01AM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Fri, Dec 09, 2016 at 04:30:54PM +0100, Daniel Borkmann escreveu:
> > On 12/09/2016 04:09 PM, Arnaldo Carvalho de Melo wrote:
> > > > v3: Add ack for first patch.
> > > >      Split out second patch from v2 into separate changes for remaining diff.
> > > >      Add patches to switch samples/bpf over to using tools/lib/.
> > > > v2: https://www.mail-archive.com/netdev@vger.kernel.org/msg135088.html
> > > >      Don't shift non-bpf code into libbpf.
> > > >      Drop the patch to synchronize ELF definitions with tc.
> > > > v1: https://www.mail-archive.com/netdev@vger.kernel.org/msg135088.html
> > > >      First post.

> > > Thanks, applied after addressing the -I$(objtree) issue raised by Wang,

> > [ Sorry for late reply. ]

> > First of all, glad to see us getting rid of the duplicate lib eventually! :)
> > 
> > Please note that this might result in hopefully just a minor merge issue
> > with net-next. Looks like patch 4/7 touches test_maps.c and test_verifier.c,
> > which moved to a new bpf selftest suite [1] this net-next cycle. Seems it's
> > just log buffer and some renames there, which can be discarded for both
> > files sitting in selftests.
> 
> Yeah, I've got to this point, and the merge has a little bit more than
> that, including BPF_PROG_ATTACH/BPF_PROG_DETACH, etc, working on it...

So, Joe, can you try refreshing this work, starting from what I have in
perf/core? It has the changes coming from net-next that Daniel warned us about
and some more.

[acme@jouet linux]$ git log --oneline -5
1f125a4aa4d8 tools lib bpf: Add flags to bpf_create_map()
5adf5614f72d tools lib bpf: use __u32 from linux/types.h
ff687c38d803 tools lib bpf: Sync {tools,}/include/uapi/linux/bpf.h
53452c69b4c3 perf annotate: Fix jump target outside of function address range
2f41ae602b57 perf annotate: Support jump instruction with target as second operand
[acme@jouet linux]$

I tried refreshing it, but it seems samples/bpf/ needs some love and
care first, as I can't get it to build before these patches, to make
sure nothing gets broken.

Trying to bisect it I get to what seems multiple bisect breakages, last
tag I got it to build, with lots of warnings, was v4.8, after that I get
things like the ones below.

I could try fixing it, but may be missing something, and want to push the other
stuff in this branch...

[acme@jouet linux]$ egrep SAMPLES\|BPF .config
CONFIG_BPF=y
CONFIG_BPF_SYSCALL=y
CONFIG_NETFILTER_XT_MATCH_BPF=m
CONFIG_NET_CLS_BPF=m
CONFIG_NET_ACT_BPF=m
CONFIG_BPF_JIT=y
CONFIG_HAVE_EBPF_JIT=y
CONFIG_BPF_EVENTS=y
# CONFIG_TEST_BPF is not set
CONFIG_SAMPLES=y
[acme@jouet linux]$ 

[acme@jouet linux]$ make -C samples/bpf
make: Entering directory '/home/acme/git/linux/samples/bpf'
make -C ../../ $PWD/
make[1]: Entering directory '/home/acme/git/linux'
  CHK     include/config/kernel.release
  CHK     include/generated/uapi/linux/version.h
  CHK     include/generated/utsrelease.h
  CHK     include/generated/timeconst.h
  CHK     include/generated/bounds.h
  CHK     include/generated/asm-offsets.h
  CALL    scripts/checksyscalls.sh
  HOSTCC  /home/acme/git/linux/samples/bpf/bpf_load.o
In file included from /home/acme/git/linux/samples/bpf/bpf_load.c:21:0:
/home/acme/git/linux/samples/bpf/bpf_helpers.h:76:11: error: ‘BPF_FUNC_skb_in_cgroup’ undeclared here (not in a function)
  (void *) BPF_FUNC_skb_in_cgroup;
           ^~~~~~~~~~~~~~~~~~~~~~
scripts/Makefile.host:124: recipe for target '/home/acme/git/linux/samples/bpf/bpf_load.o' failed
make[2]: *** [/home/acme/git/linux/samples/bpf/bpf_load.o] Error 1
Makefile:1646: recipe for target '/home/acme/git/linux/samples/bpf/' failed

[acme@jouet linux]$ make -C samples/bpf
make: Entering directory '/home/acme/git/linux/samples/bpf'
make -C ../../ $PWD/
make[1]: Entering directory '/home/acme/git/linux'
scripts/kconfig/conf  --silentoldconfig Kconfig
#
# configuration written to .config
#
  SYSTBL  arch/x86/entry/syscalls/../../include/generated/asm/syscalls_32.h
  SYSHDR  arch/x86/entry/syscalls/../../include/generated/asm/unistd_32_ia32.h
  SYSHDR  arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_32.h
  CHK     include/config/kernel.release
  UPD     include/config/kernel.release
  CHK     include/generated/uapi/linux/version.h
  UPD     include/generated/uapi/linux/version.h
  CHK     include/generated/utsrelease.h
  UPD     include/generated/utsrelease.h
  CHK     include/generated/timeconst.h
  CC      kernel/bounds.s
  CHK     include/generated/bounds.h
  GEN     scripts/gdb/linux/constants.py
  CC      arch/x86/kernel/asm-offsets.s
  CHK     include/generated/asm-offsets.h
  CALL    scripts/checksyscalls.sh
  HOSTCC  /home/acme/git/linux/samples/bpf/bpf_load.o
In file included from /home/acme/git/linux/samples/bpf/bpf_load.c:21:0:
/home/acme/git/linux/samples/bpf/bpf_helpers.h:49:11: error: ‘BPF_FUNC_current_task_under_cgroup’ undeclared here (not in a function)
  (void *) BPF_FUNC_current_task_under_cgroup;
           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/acme/git/linux/samples/bpf/bpf_helpers.h:80:11: error: ‘BPF_FUNC_skb_in_cgroup’ undeclared here (not in a function)
  (void *) BPF_FUNC_skb_in_cgroup;
           ^~~~~~~~~~~~~~~~~~~~~~
scripts/Makefile.host:124: recipe for target '/home/acme/git/linux/samples/bpf/bpf_load.o' failed

^ permalink raw reply

* [PATCH net-next 1/1] driver: ipvlan: Define common functions to decrease duplicated codes used to add or del IP address
From: fgao @ 2016-12-14 14:52 UTC (permalink / raw)
  To: davem, maheshb, edumazet, netdev, gfree.wind

From: Gao Feng <gfree.wind@gmail.com>

There are some duplicated codes in ipvlan_add_addr6/4 and
ipvlan_del_addr6/4. Now define two common functions ipvlan_add_addr
and ipvlan_del_addr to decrease the duplicated codes.
It could be helpful to maintain the codes.

Signed-off-by: Gao Feng <gfree.wind@gmail.com>
---
 drivers/net/ipvlan/ipvlan_main.c | 68 +++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 39 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 693ec5b..5874d30 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -669,23 +669,22 @@ static int ipvlan_device_event(struct notifier_block *unused,
 	return NOTIFY_DONE;
 }
 
-static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
+static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6)
 {
 	struct ipvl_addr *addr;
 
-	if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) {
-		netif_err(ipvlan, ifup, ipvlan->dev,
-			  "Failed to add IPv6=%pI6c addr for %s intf\n",
-			  ip6_addr, ipvlan->dev->name);
-		return -EINVAL;
-	}
 	addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC);
 	if (!addr)
 		return -ENOMEM;
 
 	addr->master = ipvlan;
-	memcpy(&addr->ip6addr, ip6_addr, sizeof(struct in6_addr));
-	addr->atype = IPVL_IPV6;
+	if (is_v6) {
+		memcpy(&addr->ip6addr, iaddr, sizeof(struct in6_addr));
+		addr->atype = IPVL_IPV6;
+	} else {
+		memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr));
+		addr->atype = IPVL_IPV4;
+	}
 	list_add_tail(&addr->anode, &ipvlan->addrs);
 
 	/* If the interface is not up, the address will be added to the hash
@@ -697,11 +696,11 @@ static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
 	return 0;
 }
 
-static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
+static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6)
 {
 	struct ipvl_addr *addr;
 
-	addr = ipvlan_find_addr(ipvlan, ip6_addr, true);
+	addr = ipvlan_find_addr(ipvlan, iaddr, is_v6);
 	if (!addr)
 		return;
 
@@ -712,6 +711,23 @@ static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
 	return;
 }
 
+static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
+{
+	if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) {
+		netif_err(ipvlan, ifup, ipvlan->dev,
+			  "Failed to add IPv6=%pI6c addr for %s intf\n",
+			  ip6_addr, ipvlan->dev->name);
+		return -EINVAL;
+	}
+
+	return ipvlan_add_addr(ipvlan, ip6_addr, true);
+}
+
+static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
+{
+	return ipvlan_del_addr(ipvlan, ip6_addr, true);
+}
+
 static int ipvlan_addr6_event(struct notifier_block *unused,
 			      unsigned long event, void *ptr)
 {
@@ -745,45 +761,19 @@ static int ipvlan_addr6_event(struct notifier_block *unused,
 
 static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
 {
-	struct ipvl_addr *addr;
-
 	if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) {
 		netif_err(ipvlan, ifup, ipvlan->dev,
 			  "Failed to add IPv4=%pI4 on %s intf.\n",
 			  ip4_addr, ipvlan->dev->name);
 		return -EINVAL;
 	}
-	addr = kzalloc(sizeof(struct ipvl_addr), GFP_KERNEL);
-	if (!addr)
-		return -ENOMEM;
-
-	addr->master = ipvlan;
-	memcpy(&addr->ip4addr, ip4_addr, sizeof(struct in_addr));
-	addr->atype = IPVL_IPV4;
-	list_add_tail(&addr->anode, &ipvlan->addrs);
-
-	/* If the interface is not up, the address will be added to the hash
-	 * list by ipvlan_open.
-	 */
-	if (netif_running(ipvlan->dev))
-		ipvlan_ht_addr_add(ipvlan, addr);
 
-	return 0;
+	return ipvlan_add_addr(ipvlan, ip4_addr, false);
 }
 
 static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
 {
-	struct ipvl_addr *addr;
-
-	addr = ipvlan_find_addr(ipvlan, ip4_addr, false);
-	if (!addr)
-		return;
-
-	ipvlan_ht_addr_del(addr);
-	list_del(&addr->anode);
-	kfree_rcu(addr, rcu);
-
-	return;
+	return ipvlan_del_addr(ipvlan, ip4_addr, false);
 }
 
 static int ipvlan_addr4_event(struct notifier_block *unused,
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v2 1/4] siphash: add cryptographically secure hashtable function
From: Hannes Frederic Sowa @ 2016-12-14 15:09 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Netdev, kernel-hardening, LKML, Linux Crypto Mailing List,
	Jean-Philippe Aumasson, Daniel J . Bernstein, Linus Torvalds,
	Eric Biggers
In-Reply-To: <CAHmME9qA6qKdp+qoih2Je4fxU+4E6=Gp7CVfhYU7VbOr6HJ=0Q@mail.gmail.com>

Hello,

On 14.12.2016 14:10, Jason A. Donenfeld wrote:
> On Wed, Dec 14, 2016 at 12:21 PM, Hannes Frederic Sowa
> <hannes@stressinduktion.org> wrote:
>> Can you show or cite benchmarks in comparison with jhash? Last time I
>> looked, especially for short inputs, siphash didn't beat jhash (also on
>> all the 32 bit devices etc.).
> 
> I assume that jhash is likely faster than siphash, but I wouldn't be
> surprised if with optimization we can make siphash at least pretty
> close on 64-bit platforms. (I'll do some tests though; maybe I'm wrong
> and jhash is already slower.)

Yes, numbers would be very usable here. I am mostly concerned about
small plastic router cases. E.g. assume you double packet processing
time with a change of the hashing function at what point is the actual
packet processing more of an attack vector than the hashtable?

> With that said, siphash is here to replace uses of jhash where
> hashtable poisoning vulnerabilities make it necessary. Where there's
> no significant security improvement, if there's no speed improvement
> either, then of course nothing's going to change.

It still changes currently well working source. ;-)

> I should have mentioned md5_transform in this first message too, as
> two other patches in this series actually replace md5_transform usage
> with siphash. I think in this case, siphash is a clear performance
> winner (and security winner) over md5_transform. So if the push back
> against replacing jhash usages is just too high, at the very least it
> remains useful already for the md5_transform usage.

MD5 is considered broken because its collision resistance is broken?
SipHash doesn't even claim to have collision resistance (which we don't
need here)?

But I agree, certainly it could be a nice speed-up!

>> This pretty much depends on the linearity of the hash function? I don't
>> think a crypto secure hash function is needed for a hash table. Albeit I
>> agree that siphash certainly looks good to be used here.
> 
> In order to prevent the aforementioned poisoning attacks, a PRF with
> perfect linearity is required, which is what's achieved when it's a
> cryptographically secure one. Check out section 7 of
> https://131002.net/siphash/siphash.pdf .

I think you mean non-linearity. Otherwise I agree that siphash is
certainly a better suited hashing algorithm as far as I know. But it
would be really interesting to compare some performance numbers. Hard to
say anything without them.

>> I am pretty sure that SipHash still needs a random key per hash table
>> also. So far it was only the choice of hash function you are questioning.
> 
> Siphash needs a random secret key, yes. The point is that the hash
> function remains secure so long as the secret key is kept secret.
> Other functions can't make the same guarantee, and so nervous periodic
> key rotation is necessary, but in most cases nothing is done, and so
> things just leak over time.
> 
> 
>> Hmm, I tried to follow up with all the HashDoS work and so far didn't
>> see any HashDoS attacks against the Jenkins/SpookyHash family.
>>
>> If this is an issue we might need to also put those changes into stable.
> 
> jhash just isn't secure; it's not a cryptographically secure PRF. If
> there hasn't already been an academic paper put out there about it
> this year, let's make this thread 1000 messages long to garner
> attention, and next year perhaps we'll see one. No doubt that
> motivated government organizations, defense contractors, criminals,
> and other netizens have already done research in private. Replacing
> insecure functions with secure functions is usually a good thing.

I think this is a weak argument.

In general I am in favor of switching to siphash, but it would be nice to
see some benchmarks with the specific kernel implementation also on some
smaller 32 bit CPUs and especially without using any SIMD instructions
(which might have been used in paper comparison).

Bye,
Hannes

^ permalink raw reply

* RE: [PATCH v2 net-next 1/2] phy: add phy fixup unregister functions
From: Woojung.Huh @ 2016-12-14 15:34 UTC (permalink / raw)
  To: lidongpo, davem, f.fainelli; +Cc: andrew, netdev, UNGLinuxDriver
In-Reply-To: <58510539.1030803@hisilicon.com>

> I just want to commit the unregister patch and found this patch. Good job!
> But I consider this patch may miss something.
> If one SoC has 2 MAC ports and each port uses the different network driver,
> the 2 drivers may register fixup for the same PHY chip with different
> "run" function because the PHY chip works in different mode.
> In such a case, this patch doesn't consider "run" function and may cause
> problem.
> When removing the driver which register fixup at last, it will remove another
> driver's fixup.
> Should this condition be considered and fixed?
Good point.
The current phy fixup is a list independent of the phydev structure,
and the fixup runs in two places: phy_device_register() and phy_init_hw().
It's not clear that two separate fixups are needed, but it may be a good idea to
pass the phy fixup when calling phy_attach() or phy_attach_direct() and
put it under the phydev structure.
That way, the fixup can be called at phy_init_hw() per phy device and removed
when the phy is detached.
Welcome any comments.

- Woojung

^ permalink raw reply

* netfilter -stable backport request
From: Eric Desrochers @ 2016-12-14 15:35 UTC (permalink / raw)
  To: stable, netdev

Hi,

I would like to request a -stable backport for the following patchset that as we speak can be found in pablo's nf-next:

# git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git

[PATCH 1/3]
commit 2394ae21e8b652aff0db1c02e946243c1e2f5edb
netfilter: x_tables: pass xt_counters struct instead of packet counter
   
[PATCH 2/3]
commit 18b61e8161cc308cbfd06d2e2c6c0758dfd925ef
netfilter: x_tables: pass xt_counters struct to counter allocator

[PATCH 3/3]
commit 722d6785e3b29a3b9f95c4d77542a1416094786a
netfilter: x_tables: pack percpu counter allocations

Please add this to stable branches : v4.4.x, v4.8.x

The above patchset is fixing a netfilter regression which introduced a performance slowdown in binary arp/ip/ip6tables starting at commit :

#v4.2-rc1
commit 71ae0dff02d756e4d2ca710b79f2ff5390029a5f
netfilter: xtables: use percpu rule counters

Regards,

Eric

^ permalink raw reply

* [PATCH RFC 1/2] bpf: add a longest prefix match trie map implementation
From: Daniel Mack @ 2016-12-14 15:43 UTC (permalink / raw)
  To: ast; +Cc: dh.herrmann, daniel, netdev, davem, Daniel Mack
In-Reply-To: <20161214154336.17639-1-daniel@zonque.org>

This trie implements a longest prefix match algorithm that can be used
to match IP addresses to a stored set of ranges.

Internally, data is stored in an unbalanced trie of nodes that has a
maximum height of n, where n is the prefixlen the trie was created
with.

Tries may be created with prefix lengths that are multiples of 8, in
the range from 8 to 2048. The key used for lookup and update operations
is a struct bpf_lpm_trie_key, and the value is a uint64_t.

The code carries more information about the internal implementation.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
---
 include/uapi/linux/bpf.h |   7 +
 kernel/bpf/Makefile      |   2 +-
 kernel/bpf/lpm_trie.c    | 491 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 499 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/lpm_trie.c

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0eb0e87..d564277 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -63,6 +63,12 @@ struct bpf_insn {
 	__s32	imm;		/* signed immediate constant */
 };
 
+/* Key of a BPF_MAP_TYPE_LPM_TRIE entry */
+struct bpf_lpm_trie_key {
+	__u32	prefixlen;	/* up to 32 for AF_INET, 128 for AF_INET6 */
+	__u8	data[0];	/* Arbitrary size */
+};
+
 /* BPF syscall commands, see bpf(2) man-page for details. */
 enum bpf_cmd {
 	BPF_MAP_CREATE,
@@ -89,6 +95,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_CGROUP_ARRAY,
 	BPF_MAP_TYPE_LRU_HASH,
 	BPF_MAP_TYPE_LRU_PERCPU_HASH,
+	BPF_MAP_TYPE_LPM_TRIE,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1276474..e1ce4f4 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
 obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
new file mode 100644
index 0000000..cae759d
--- /dev/null
+++ b/kernel/bpf/lpm_trie.c
@@ -0,0 +1,491 @@
+/*
+ * Longest prefix match list implementation
+ *
+ * Copyright (c) 2016 Daniel Mack
+ * Copyright (c) 2016 David Herrmann
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <net/ipv6.h>
+
+/* Intermediate node */
+#define LPM_TREE_NODE_FLAG_IM BIT(0)
+
+struct lpm_trie_node;
+
+/*
+ * A single node of the trie. The key bytes are stored inline after the
+ * fixed part of the structure, so nodes are allocated with extra room for
+ * the trie's data_size.
+ */
+struct lpm_trie_node {
+	struct rcu_head rcu;	/* lets a replaced node be released via kfree_rcu() */
+	struct lpm_trie_node	*child[2];	/* indexed by the next key bit after the prefix */
+	u32			prefixlen;	/* number of significant bits in @data */
+	u32			flags;		/* LPM_TREE_NODE_FLAG_IM for intermediate nodes */
+	u64			value;		/* value handed back by the lookup function */
+	u8			data[0];	/* key bytes, most significant byte first */
+};
+
+/*
+ * Per-map state. The generic struct bpf_map is embedded so that the map
+ * callbacks can get back to the trie with container_of().
+ */
+struct lpm_trie {
+	struct bpf_map		map;		/* generic map header */
+	struct lpm_trie_node	*root;		/* top of the trie, NULL when empty */
+	size_t			n_entries;	/* compared against map.max_entries on update */
+	size_t			max_prefixlen;	/* data_size * 8 */
+	size_t			data_size;	/* key_size minus the bpf_lpm_trie_key header */
+	spinlock_t		lock;		/* taken by the update and free paths */
+};
+
+/*
+ * This trie implements a longest prefix match algorithm that can be used to
+ * match IP addresses to a stored set of ranges.
+ *
+ * Data stored in @data of struct bpf_lpm_trie_key and struct lpm_trie_node
+ * is interpreted as big endian, so data[0] stores the most significant byte.
+ *
+ * Match ranges are internally stored in instances of struct lpm_trie_node
+ * which each contain their prefix length as well as two pointers that may
+ * lead to more nodes containing more specific matches. Each node also stores
+ * a value that is defined by and returned to userspace via the update_elem
+ * and lookup functions.
+ *
+ * For instance, let's start with a trie that was created with a prefix length
+ * of 32, so it can be used for IPv4 addresses, and one single element that
+ * matches 192.168.0.0/16. The data array would hence contain
+ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
+ * stick to IP-address notation for readability though.
+ *
+ * As the trie is empty initially, the new node (1) will be placed as root
+ * node, denoted as (R) in the example below. As there are no other nodes, both
+ * child pointers are %NULL.
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *
+ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). The child index depends on the next bit
+ * that is outside of what (1) matches, and that bit is 0, so (2) will be
+ * child[0] of (1):
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |
+ *    +----------------+
+ *    |       (2)      |
+ *    | 192.168.0.0/24 |
+ *    |    value: 2    |
+ *    |   [0]    [1]   |
+ *    +----------------+
+ *
+ * The child[1] slot of (1) could be filled with another node which has bit #17
+ * (the next bit after the ones that (1) matches on) set to 1. For instance,
+ * 192.168.128.0/24:
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |      |
+ *    +----------------+  +------------------+
+ *    |       (2)      |  |        (3)       |
+ *    | 192.168.0.0/24 |  | 192.168.128.0/24 |
+ *    |    value: 2    |  |     value: 3     |
+ *    |   [0]    [1]   |  |    [0]    [1]    |
+ *    +----------------+  +------------------+
+ *
+ * Let's add another node (5) to the game for 192.168.1.0/24. In order to place
+ * it, node (1) is looked at first, and because of the semantics laid out
+ * above (bit #17 is 0), it would normally be attached to (1) as child[0].
+ * However, that slot is already allocated, so a new node (4) is needed in
+ * between. That node does not have a value attached to it and it will never
+ * be returned to users as result of a lookup. It is only there to
+ * differentiate the traversal further. It will get a prefix as wide as
+ * necessary to distinguish its two children:
+ *
+ *                      +----------------+
+ *                      |       (1)  (R) |
+ *                      | 192.168.0.0/16 |
+ *                      |    value: 1    |
+ *                      |   [0]    [1]   |
+ *                      +----------------+
+ *                           |      |
+ *            +----------------+  +------------------+
+ *            |       (4)  (I) |  |        (3)       |
+ *            | 192.168.0.0/23 |  | 192.168.128.0/24 |
+ *            |    value: ---  |  |     value: 3     |
+ *            |   [0]    [1]   |  |    [0]    [1]    |
+ *            +----------------+  +------------------+
+ *                 |      |
+ *  +----------------+  +----------------+
+ *  |       (2)      |  |       (5)      |
+ *  | 192.168.0.0/24 |  | 192.168.1.0/24 |
+ *  |    value: 2    |  |     value: 5   |
+ *  |   [0]    [1]   |  |   [0]    [1]   |
+ *  +----------------+  +----------------+
+ *
+ * 192.168.1.1/32 would be a child of (5) etc.
+ *
+ * An intermediate node will be turned into a 'real' node on demand. In the
+ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
+ *
+ * A fully populated trie would have a height of 32 nodes, as the trie was
+ * created with a prefix length of 32.
+ *
+ * The lookup starts at the root node. If the current node matches and if there
+ * is a child that can be used to become more specific, the trie is traversed
+ * downwards. The last node in the traversal that is a non-intermediate one is
+ * returned.
+ */
+
+/*
+ * Return bit @index of @data as 0 or 1. Bit 0 is the most significant bit
+ * of data[0], bit 8 the most significant bit of data[1], and so on.
+ */
+static inline int extract_bit(const u8 *data, size_t index)
+{
+	const u8 byte = data[index >> 3];
+	const unsigned int shift = 7 - (index & 7);
+
+	return (byte >> shift) & 1;
+}
+
+/**
+ * longest_prefix_match() - count the leading bits shared by a node and a key
+ * @trie:	The trie that supplies the key data size
+ * @node:	The node whose data is compared
+ * @key:	The key to compare against @node
+ *
+ * Compares @node and @key byte by byte, starting at data[0], and counts how
+ * many leading bits are equal. The result never exceeds the smaller of the
+ * two prefix lengths.
+ *
+ * Return: the number of matching prefix bits.
+ */
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+				   const struct lpm_trie_node *node,
+				   const struct bpf_lpm_trie_key *key)
+{
+	const size_t limit = min(node->prefixlen, key->prefixlen);
+	size_t matched = 0;
+	size_t pos;
+
+	for (pos = 0; pos < trie->data_size; pos++) {
+		/* fls() of the XOR gives the position of the first differing bit */
+		size_t same = 8 - fls(node->data[pos] ^ key->data[pos]);
+
+		matched += same;
+
+		/* Bits beyond the shorter prefix are not significant */
+		if (matched >= limit)
+			return limit;
+
+		/* A partially matching byte ends the common prefix */
+		if (same != 8)
+			return matched;
+	}
+
+	return matched;
+}
+
+/*
+ * Look up the most specific non-intermediate node whose prefix covers @_key
+ * and return a pointer to its value, or NULL if there is none.
+ *
+ * Called from syscall or from eBPF program.
+ */
+static void *trie_lookup_elem(struct bpf_map *map, void *_key)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct bpf_lpm_trie_key *key = _key;
+	struct lpm_trie_node *best = NULL;
+	struct lpm_trie_node *cur;
+
+	/* Walk down from the root for as long as the nodes keep matching */
+	cur = rcu_dereference(trie->root);
+	while (cur) {
+		size_t matchlen = longest_prefix_match(trie, cur, key);
+
+		/*
+		 * All bits the trie can hold are equal: this is an exact
+		 * match, nothing below can be more specific.
+		 */
+		if (matchlen == trie->max_prefixlen)
+			return &cur->value;
+
+		/*
+		 * The key does not cover this node's whole prefix, so the
+		 * best candidate is whatever was remembered further up.
+		 */
+		if (matchlen < cur->prefixlen)
+			break;
+
+		/* Intermediate nodes carry no value and are never returned */
+		if (!(cur->flags & LPM_TREE_NODE_FLAG_IM))
+			best = cur;
+
+		/*
+		 * The first key bit past this node's prefix selects the
+		 * child that may hold a more specific match.
+		 */
+		cur = rcu_dereference(cur->child[extract_bit(key->data,
+							     cur->prefixlen)]);
+	}
+
+	if (!best)
+		return NULL;
+
+	return &best->value;
+}
+
+/*
+ * Allocate a node with room for @data_size trailing key bytes. The memory is
+ * not zeroed, so the caller has to initialize every field it relies on.
+ * Returns NULL if the allocation fails.
+ */
+static struct lpm_trie_node *lpm_trie_node_alloc(size_t data_size)
+{
+	const size_t node_size = sizeof(struct lpm_trie_node) + data_size;
+
+	return kmalloc(node_size, GFP_ATOMIC | __GFP_NOWARN);
+}
+
+/**
+ * find_target_node() - locate the slot a new node for @key belongs to
+ * @trie:	The trie to walk
+ * @key:	The key to find a slot for
+ * @node_ret:	Return variable for the slot that was found
+ *
+ * Walks down from the root following the bits of @key and stores the slot
+ * where the walk stopped in @node_ret.
+ *
+ * If the walk ends at an empty slot (an unused root or an empty child
+ * pointer), that slot is reported and 0 is returned.
+ *
+ * Otherwise the slot holds an existing node that either has the same prefix
+ * length as @key, already uses the maximum prefix length, or does not fully
+ * match @key. In that case the number of prefix bits shared by that node and
+ * @key is returned, and the caller decides whether to replace the node, put
+ * the new node above it, or add an intermediate node.
+ */
+static size_t find_target_node(struct lpm_trie *trie,
+			       struct bpf_lpm_trie_key *key,
+			       struct lpm_trie_node ***node_ret)
+{
+	struct lpm_trie_node **slot = &trie->root;
+	size_t matchlen = 0;
+
+	while (*slot) {
+		struct lpm_trie_node *cur = *slot;
+
+		matchlen = longest_prefix_match(trie, cur, key);
+
+		/* The key diverges somewhere inside this node's prefix */
+		if (matchlen != cur->prefixlen)
+			break;
+
+		/* Same prefix length and all bits equal: same entry */
+		if (cur->prefixlen == key->prefixlen)
+			break;
+
+		/* A full-length node cannot have anything below it */
+		if (cur->prefixlen == trie->max_prefixlen)
+			break;
+
+		slot = &cur->child[extract_bit(key->data, cur->prefixlen)];
+	}
+
+	*node_ret = slot;
+
+	if (!*slot)
+		return 0;
+
+	return matchlen;
+}
+
+/**
+ * trie_update_elem() - insert or replace the entry for a prefix
+ * @map:	The map embedded in the trie to modify
+ * @_key:	A struct bpf_lpm_trie_key with the prefix length and key data
+ * @value:	Pointer to the u64 value to store
+ * @flags:	Update flags; not evaluated by this implementation
+ *
+ * Called from syscall or from eBPF program.
+ *
+ * Return: 0 on success, -EINVAL if the prefix length of the key exceeds what
+ * the trie supports, -ENOSPC if the trie already holds max_entries entries,
+ * or -ENOMEM if a node could not be allocated.
+ */
+static int trie_update_elem(struct bpf_map *map,
+			    void *_key, void *value, u64 flags)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	/*
+	 * Both node pointers start out as NULL so that the shared error path
+	 * at "out" can pass them to kfree() no matter where it was entered.
+	 */
+	struct lpm_trie_node **node, *im_node = NULL, *new_node = NULL;
+	struct bpf_lpm_trie_key *key = _key;
+	unsigned long irq_flags;
+	size_t matchlen;
+	int ret = 0;
+
+	if (key->prefixlen > trie->max_prefixlen)
+		return -EINVAL;
+
+	/*
+	 * This function can be entered from an eBPF program as well as from
+	 * the syscall path, so interrupts are disabled while the lock is
+	 * held to keep a program that fires on the same CPU from spinning on
+	 * a lock its own CPU already owns.
+	 */
+	spin_lock_irqsave(&trie->lock, irq_flags);
+
+	/*
+	 * Allocate and fill a new node.
+	 *
+	 * NOTE: the limit is checked before it is known whether an existing
+	 * entry gets replaced, so replacing an entry of a full trie is
+	 * rejected as well.
+	 */
+	if (trie->n_entries == trie->map.max_entries) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	new_node = lpm_trie_node_alloc(trie->data_size);
+	if (!new_node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	trie->n_entries++;
+	new_node->value = *(u64 *) value;
+	new_node->prefixlen = key->prefixlen;
+	new_node->flags = 0;
+	new_node->child[0] = NULL;
+	new_node->child[1] = NULL;
+	memcpy(new_node->data, key->data, trie->data_size);
+
+	/*
+	 * Now find a place to attach the new node. find_target_node()
+	 * either returns an empty slot (the root or an empty child pointer),
+	 * or the closest existing node, in which case that node has to be
+	 * replaced, pushed down, or joined with the new node below an
+	 * intermediate node.
+	 */
+	matchlen = find_target_node(trie, key, &node);
+	if (!*node) {
+		rcu_assign_pointer(*node, new_node);
+		goto out;
+	}
+
+	/*
+	 * The existing node describes exactly the same prefix. Replace it
+	 * with new_node, which already has the correct data and value, and
+	 * take over its children. If the replaced node was an intermediate
+	 * one it was never counted, so the entry count grows by one;
+	 * otherwise the increment from above is undone.
+	 */
+	if ((*node)->prefixlen == matchlen) {
+		struct lpm_trie_node *old = rcu_dereference(*node);
+
+		new_node->child[0] = old->child[0];
+		new_node->child[1] = old->child[1];
+
+		if (!(old->flags & LPM_TREE_NODE_FLAG_IM))
+			trie->n_entries--;
+
+		rcu_assign_pointer(*node, new_node);
+		/* Concurrent lookups may still be looking at the old node */
+		kfree_rcu(old, rcu);
+
+		goto out;
+	}
+
+	/*
+	 * The new prefix is fully contained in the existing node's prefix,
+	 * so the new node must become its ancestor. Hang the existing node
+	 * below the new one and put the new one into the slot.
+	 */
+	if (matchlen == key->prefixlen) {
+		new_node->child[extract_bit((*node)->data, matchlen)] = *node;
+		rcu_assign_pointer(*node, new_node);
+		goto out;
+	}
+
+	/* Create an intermediate node and place it in between */
+	im_node = lpm_trie_node_alloc(trie->data_size);
+	if (!im_node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * lpm_trie_node_alloc() does not zero the node, so every field is
+	 * assigned explicitly instead of OR-ing into whatever was in memory.
+	 */
+	im_node->prefixlen = matchlen;
+	im_node->flags = LPM_TREE_NODE_FLAG_IM;
+	im_node->value = 0;
+	memcpy(im_node->data, (*node)->data, trie->data_size);
+
+	/* The first bit after the common prefix decides who goes where */
+	if (extract_bit(key->data, matchlen)) {
+		im_node->child[0] = *node;
+		im_node->child[1] = new_node;
+	} else {
+		im_node->child[0] = new_node;
+		im_node->child[1] = *node;
+	}
+
+	/* Finally, assign the intermediate node to the determined spot */
+	rcu_assign_pointer(*node, im_node);
+
+out:
+	if (ret) {
+		/* Undo the accounting done for a node that was never linked */
+		if (new_node)
+			trie->n_entries--;
+
+		kfree(new_node);
+		kfree(im_node);
+	}
+
+	spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	return ret;
+}
+
+/**
+ * trie_alloc() - create an empty LPM trie map
+ * @attr:	Map creation attributes passed in by the caller
+ *
+ * The key has to consist of a struct bpf_lpm_trie_key header followed by 1
+ * to 256 bytes of data, the value has to be a u64, max_entries must not be
+ * zero and no map flags are accepted.
+ *
+ * Return: the new map, ERR_PTR(-EINVAL) if the attributes are not
+ * acceptable, or ERR_PTR(-ENOMEM) if the trie could not be allocated.
+ */
+static struct bpf_map *trie_alloc(union bpf_attr *attr)
+{
+	struct lpm_trie *trie;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->map_flags ||
+	    attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1   ||
+	    attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 ||
+	    attr->value_size != sizeof(u64))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Failures are reported as ERR_PTR() values, like the attribute
+	 * check above; a plain NULL would pass an IS_ERR() check.
+	 */
+	trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+	if (!trie)
+		return ERR_PTR(-ENOMEM);
+
+	/* copy mandatory map attributes */
+	trie->map.map_type = attr->map_type;
+	trie->map.key_size = attr->key_size;
+	trie->map.value_size = attr->value_size;
+	trie->map.max_entries = attr->max_entries;
+
+	/* Everything after the key header is prefix data */
+	trie->data_size = attr->key_size -
+				offsetof(struct bpf_lpm_trie_key, data);
+	trie->max_prefixlen = trie->data_size * 8;
+
+	spin_lock_init(&trie->lock);
+
+	return &trie->map;
+}
+
+/**
+ * trie_free() - release all nodes of a trie and the trie itself
+ * @map:	The map embedded in the trie to destroy
+ */
+static void trie_free(struct bpf_map *map)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct lpm_trie_node **node;
+
+	/*
+	 * Lookups walk the trie under RCU without taking the lock. Wait for
+	 * a grace period so that no reader is still inside the trie when the
+	 * nodes are freed below.
+	 */
+	synchronize_rcu();
+
+	spin_lock(&trie->lock);
+
+	/*
+	 * Always start at the root and walk down to a node that has no
+	 * children. Then free that node, clear the pointer that led to it
+	 * and start over.
+	 */
+
+	for (;;) {
+		node = &trie->root;
+		if (!*node)
+			break;
+
+		for (;;) {
+			if ((*node)->child[0]) {
+				node = &(*node)->child[0];
+				continue;
+			}
+
+			if ((*node)->child[1]) {
+				node = &(*node)->child[1];
+				continue;
+			}
+
+			kfree(*node);
+			*node = NULL;
+			break;
+		}
+	}
+
+	spin_unlock(&trie->lock);
+
+	/* The trie itself was allocated in trie_alloc(); release it as well */
+	kfree(trie);
+}
+
+/* Removing entries is not implemented; report that instead of leaving the
+ * callback unset.
+ */
+static int trie_delete_elem(struct bpf_map *map, void *key)
+{
+	return -ENOSYS;
+}
+
+/* Iterating over the keys is not implemented; report that instead of leaving
+ * the callback unset.
+ */
+static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	return -ENOTSUPP;
+}
+
+/*
+ * Map operations of the LPM trie. Every callback is populated so that no
+ * caller can end up jumping through a NULL function pointer.
+ */
+static const struct bpf_map_ops trie_ops = {
+	.map_alloc = trie_alloc,
+	.map_free = trie_free,
+	.map_get_next_key = trie_get_next_key,
+	.map_lookup_elem = trie_lookup_elem,
+	.map_update_elem = trie_update_elem,
+	.map_delete_elem = trie_delete_elem,
+};
+
+/* Descriptor that ties BPF_MAP_TYPE_LPM_TRIE to the operations in trie_ops */
+static struct bpf_map_type_list trie_type __read_mostly = {
+	.ops = &trie_ops,
+	.type = BPF_MAP_TYPE_LPM_TRIE,
+};
+
+/*
+ * Initcall that hands trie_type to bpf_register_map_type(). Always
+ * returns 0.
+ */
+static int __init register_trie_map(void)
+{
+	bpf_register_map_type(&trie_type);
+	return 0;
+}
+late_initcall(register_trie_map);
-- 
2.9.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox