Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH 3/4] igb: add support for extended PHC gettime
From: Miroslav Lichvar @ 2018-10-26 16:27 UTC (permalink / raw)
  To: netdev; +Cc: intel-wired-lan, Richard Cochran, Jacob Keller, Miroslav Lichvar
In-Reply-To: <20181026162742.631-1-mlichvar@redhat.com>

Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 drivers/net/ethernet/intel/igb/igb_ptp.c | 43 ++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c
index 29ced6b74d36..6294d18b5a60 100644
--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
+++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
@@ -310,6 +310,46 @@ static int igb_ptp_gettime_i210(struct ptp_clock_info *ptp,
 	return 0;
 }
 
+static int igb_ptp_gettimex(struct ptp_clock_info *ptp,
+			    struct ptp_system_timestamp *sts)
+{
+	struct igb_adapter *igb = container_of(ptp, struct igb_adapter,
+					       ptp_caps);
+	struct e1000_hw *hw = &igb->hw;
+	unsigned long flags;
+	u32 lo, hi;
+	u64 ns;
+
+	spin_lock_irqsave(&igb->tmreg_lock, flags);
+
+	/* 82576 doesn't have SYSTIMR */
+	if (igb->hw.mac.type == e1000_82576) {
+		ptp_read_system_prets(sts);
+		lo = rd32(E1000_SYSTIML);
+		ptp_read_system_postts(sts);
+		hi = rd32(E1000_SYSTIMH);
+	} else {
+		ptp_read_system_prets(sts);
+		rd32(E1000_SYSTIMR);
+		ptp_read_system_postts(sts);
+		lo = rd32(E1000_SYSTIML);
+		hi = rd32(E1000_SYSTIMH);
+	}
+
+	/* SYSTIM on I210/I211 counts time in seconds and nanoseconds */
+	if (igb->hw.mac.type == e1000_i210 || igb->hw.mac.type == e1000_i211) {
+		sts->phc_ts.tv_sec = hi;
+		sts->phc_ts.tv_nsec = lo;
+	} else {
+		ns = timecounter_cyc2time(&igb->tc, ((u64)hi << 32) | lo);
+		sts->phc_ts = ns_to_timespec64(ns);
+	}
+
+	spin_unlock_irqrestore(&igb->tmreg_lock, flags);
+
+	return 0;
+}
+
 static int igb_ptp_settime_82576(struct ptp_clock_info *ptp,
 				 const struct timespec64 *ts)
 {
@@ -1125,6 +1165,7 @@ void igb_ptp_init(struct igb_adapter *adapter)
 		adapter->ptp_caps.adjfreq = igb_ptp_adjfreq_82576;
 		adapter->ptp_caps.adjtime = igb_ptp_adjtime_82576;
 		adapter->ptp_caps.gettime64 = igb_ptp_gettime_82576;
+		adapter->ptp_caps.gettimex64 = igb_ptp_gettimex;
 		adapter->ptp_caps.settime64 = igb_ptp_settime_82576;
 		adapter->ptp_caps.enable = igb_ptp_feature_enable;
 		adapter->cc.read = igb_ptp_read_82576;
@@ -1144,6 +1185,7 @@ void igb_ptp_init(struct igb_adapter *adapter)
 		adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580;
 		adapter->ptp_caps.adjtime = igb_ptp_adjtime_82576;
 		adapter->ptp_caps.gettime64 = igb_ptp_gettime_82576;
+		adapter->ptp_caps.gettimex64 = igb_ptp_gettimex;
 		adapter->ptp_caps.settime64 = igb_ptp_settime_82576;
 		adapter->ptp_caps.enable = igb_ptp_feature_enable;
 		adapter->cc.read = igb_ptp_read_82580;
@@ -1172,6 +1214,7 @@ void igb_ptp_init(struct igb_adapter *adapter)
 		adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580;
 		adapter->ptp_caps.adjtime = igb_ptp_adjtime_i210;
 		adapter->ptp_caps.gettime64 = igb_ptp_gettime_i210;
+		adapter->ptp_caps.gettimex64 = igb_ptp_gettimex;
 		adapter->ptp_caps.settime64 = igb_ptp_settime_i210;
 		adapter->ptp_caps.enable = igb_ptp_feature_enable_i210;
 		adapter->ptp_caps.verify = igb_ptp_verify_pin;
-- 
2.17.2

^ permalink raw reply related

* Re: [PATCH net] bridge: do not add port to router list when receives query with source 0.0.0.0
From: Roopa Prabhu @ 2018-10-26 16:27 UTC (permalink / raw)
  To: Hangbin Liu
  Cc: netdev, Nikolay Aleksandrov, Jiri Pirko, Linus Lüssing,
	David Miller
In-Reply-To: <1540520923-17589-1-git-send-email-liuhangbin@gmail.com>

On Thu, Oct 25, 2018 at 7:29 PM Hangbin Liu <liuhangbin@gmail.com> wrote:
>
> Based on RFC 4541, 2.1.1.  IGMP Forwarding Rules
>
>   The switch supporting IGMP snooping must maintain a list of
>   multicast routers and the ports on which they are attached.  This
>   list can be constructed in any combination of the following ways:
>
>   a) This list should be built by the snooping switch sending
>      Multicast Router Solicitation messages as described in IGMP
>      Multicast Router Discovery [MRDISC].  It may also snoop
>      Multicast Router Advertisement messages sent by and to other
>      nodes.
>
>   b) The arrival port for IGMP Queries (sent by multicast routers)
>      where the source address is not 0.0.0.0.
>
> We should not add the port to router list when receives query with source
> 0.0.0.0.
>
> Reported-by: Ying Xu <yinxu@redhat.com>
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---

Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>

^ permalink raw reply

* [RFC PATCH 2/4] e1000e: add support for extended PHC gettime
From: Miroslav Lichvar @ 2018-10-26 16:27 UTC (permalink / raw)
  To: netdev; +Cc: intel-wired-lan, Richard Cochran, Jacob Keller, Miroslav Lichvar
In-Reply-To: <20181026162742.631-1-mlichvar@redhat.com>

Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 drivers/net/ethernet/intel/e1000e/e1000.h  |  3 ++
 drivers/net/ethernet/intel/e1000e/netdev.c | 48 +++++++++++++++++-----
 drivers/net/ethernet/intel/e1000e/ptp.c    | 21 ++++++++++
 3 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
index c760dc72c520..be13227f1697 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -505,6 +505,9 @@ extern const struct e1000_info e1000_es2_info;
 void e1000e_ptp_init(struct e1000_adapter *adapter);
 void e1000e_ptp_remove(struct e1000_adapter *adapter);
 
+u64 e1000e_read_systim(struct e1000_adapter *adapter,
+		       struct ptp_system_timestamp *sts);
+
 static inline s32 e1000_phy_hw_reset(struct e1000_hw *hw)
 {
 	return hw->phy.ops.reset(hw);
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 3ba0c90e7055..3bad1a1f36c3 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -4319,13 +4319,16 @@ void e1000e_reinit_locked(struct e1000_adapter *adapter)
 /**
  * e1000e_sanitize_systim - sanitize raw cycle counter reads
  * @hw: pointer to the HW structure
- * @systim: time value read, sanitized and returned
+ * @systim: PHC time value read, sanitized and returned
+ * @sts: structure which will contain system time before and after reading
+ * SYSTIML, may be NULL
  *
  * Errata for 82574/82583 possible bad bits read from SYSTIMH/L:
  * check to see that the time is incrementing at a reasonable
  * rate and is a multiple of incvalue.
  **/
-static u64 e1000e_sanitize_systim(struct e1000_hw *hw, u64 systim)
+static u64 e1000e_sanitize_systim(struct e1000_hw *hw, u64 systim,
+				  struct ptp_system_timestamp *sts)
 {
 	u64 time_delta, rem, temp;
 	u64 systim_next;
@@ -4335,7 +4338,11 @@ static u64 e1000e_sanitize_systim(struct e1000_hw *hw, u64 systim)
 	incvalue = er32(TIMINCA) & E1000_TIMINCA_INCVALUE_MASK;
 	for (i = 0; i < E1000_MAX_82574_SYSTIM_REREADS; i++) {
 		/* latch SYSTIMH on read of SYSTIML */
+		if (sts)
+			ptp_read_system_prets(sts);
 		systim_next = (u64)er32(SYSTIML);
+		if (sts)
+			ptp_read_system_postts(sts);
 		systim_next |= (u64)er32(SYSTIMH) << 32;
 
 		time_delta = systim_next - systim;
@@ -4353,15 +4360,16 @@ static u64 e1000e_sanitize_systim(struct e1000_hw *hw, u64 systim)
 }
 
 /**
- * e1000e_cyclecounter_read - read raw cycle counter (used by time counter)
- * @cc: cyclecounter structure
+ * e1000e_read_systim - read SYSTIM register
+ * @adapter: board private structure
+ * @sts: structure which will contain system time before and after reading
+ * SYSTIML, may be NULL
  **/
-static u64 e1000e_cyclecounter_read(const struct cyclecounter *cc)
+u64 e1000e_read_systim(struct e1000_adapter *adapter,
+		       struct ptp_system_timestamp *sts)
 {
-	struct e1000_adapter *adapter = container_of(cc, struct e1000_adapter,
-						     cc);
 	struct e1000_hw *hw = &adapter->hw;
-	u32 systimel, systimeh;
+	u32 systimel, systimel_2, systimeh;
 	u64 systim;
 	/* SYSTIMH latching upon SYSTIML read does not work well.
 	 * This means that if SYSTIML overflows after we read it but before
@@ -4369,11 +4377,19 @@ static u64 e1000e_cyclecounter_read(const struct cyclecounter *cc)
 	 * will experience a huge non linear increment in the systime value
 	 * to fix that we test for overflow and if true, we re-read systime.
 	 */
+	if (sts)
+		ptp_read_system_prets(sts);
 	systimel = er32(SYSTIML);
+	if (sts)
+		ptp_read_system_postts(sts);
 	systimeh = er32(SYSTIMH);
 	/* Is systimel is so large that overflow is possible? */
 	if (systimel >= (u32)0xffffffff - E1000_TIMINCA_INCVALUE_MASK) {
-		u32 systimel_2 = er32(SYSTIML);
+		if (sts)
+			ptp_read_system_prets(sts);
+		systimel_2 = er32(SYSTIML);
+		if (sts)
+			ptp_read_system_postts(sts);
 		if (systimel > systimel_2) {
 			/* There was an overflow, read again SYSTIMH, and use
 			 * systimel_2
@@ -4386,11 +4402,23 @@ static u64 e1000e_cyclecounter_read(const struct cyclecounter *cc)
 	systim |= (u64)systimeh << 32;
 
 	if (adapter->flags2 & FLAG2_CHECK_SYSTIM_OVERFLOW)
-		systim = e1000e_sanitize_systim(hw, systim);
+		systim = e1000e_sanitize_systim(hw, systim, sts);
 
 	return systim;
 }
 
+/**
+ * e1000e_cyclecounter_read - read raw cycle counter (used by time counter)
+ * @cc: cyclecounter structure
+ **/
+static u64 e1000e_cyclecounter_read(const struct cyclecounter *cc)
+{
+	struct e1000_adapter *adapter = container_of(cc, struct e1000_adapter,
+						     cc);
+
+	return e1000e_read_systim(adapter, NULL);
+}
+
 /**
  * e1000_sw_init - Initialize general software structures (struct e1000_adapter)
  * @adapter: board private structure to initialize
diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c
index e1f821edbc21..bf1ca7ba8c37 100644
--- a/drivers/net/ethernet/intel/e1000e/ptp.c
+++ b/drivers/net/ethernet/intel/e1000e/ptp.c
@@ -188,6 +188,26 @@ static int e1000e_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 	return 0;
 }
 
+static int e1000e_phc_gettimex(struct ptp_clock_info *ptp,
+			       struct ptp_system_timestamp *sts)
+{
+	struct e1000_adapter *adapter = container_of(ptp, struct e1000_adapter,
+						     ptp_clock_info);
+	unsigned long flags;
+	u64 cycles, ns;
+
+	spin_lock_irqsave(&adapter->systim_lock, flags);
+
+	cycles = e1000e_read_systim(adapter, sts);
+	ns = timecounter_cyc2time(&adapter->tc, cycles);
+
+	spin_unlock_irqrestore(&adapter->systim_lock, flags);
+
+	sts->phc_ts = ns_to_timespec64(ns);
+
+	return 0;
+}
+
 /**
  * e1000e_phc_settime - Set the current time on the hardware clock
  * @ptp: ptp clock structure
@@ -259,6 +279,7 @@ static const struct ptp_clock_info e1000e_ptp_clock_info = {
 	.adjfreq	= e1000e_phc_adjfreq,
 	.adjtime	= e1000e_phc_adjtime,
 	.gettime64	= e1000e_phc_gettime,
+	.gettimex64	= e1000e_phc_gettimex,
 	.settime64	= e1000e_phc_settime,
 	.enable		= e1000e_phc_enable,
 };
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH 1/4] ptp: add PTP_SYS_OFFSET_EXTENDED ioctl
From: Miroslav Lichvar @ 2018-10-26 16:27 UTC (permalink / raw)
  To: netdev; +Cc: intel-wired-lan, Richard Cochran, Jacob Keller, Miroslav Lichvar
In-Reply-To: <20181026162742.631-1-mlichvar@redhat.com>

The PTP_SYS_OFFSET ioctl, which can be used to measure the offset
between a PHC and the system clock, includes the total time that the
gettime64 function of a driver needs to read the PHC timestamp.

This typically involves reading of multiple PCI registers (sometimes in
multiple iterations) and the register that contains the lowest bits of
the timestamp is not read in the middle between the two readings of the
system clock. This asymmetry causes the measured offset to have a
significant error.

Introduce a new ioctl, driver function, and helper functions, which
allow the reading of the lowest register to be isolated from the other
readings in order to reduce the asymmetry. The ioctl and driver function
return three timestamps for each measurement:
- system time right before reading the lowest bits of the PHC timestamp
- PHC time
- system time immediately after reading the lowest bits of the PHC
  timestamp

Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
---
 drivers/ptp/ptp_chardev.c        | 39 ++++++++++++++++++++++++++++++++
 include/linux/ptp_clock_kernel.h | 26 +++++++++++++++++++++
 include/uapi/linux/ptp_clock.h   | 12 ++++++++++
 3 files changed, 77 insertions(+)

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 2012551d93e0..1a04c437fd4f 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -124,11 +124,13 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 	struct ptp_clock_caps caps;
 	struct ptp_clock_request req;
 	struct ptp_sys_offset *sysoff = NULL;
+	struct ptp_sys_offset_extended *sysoff_extended = NULL;
 	struct ptp_sys_offset_precise precise_offset;
 	struct ptp_pin_desc pd;
 	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
 	struct ptp_clock_info *ops = ptp->info;
 	struct ptp_clock_time *pct;
+	struct ptp_system_timestamp sts;
 	struct timespec64 ts;
 	struct system_device_crosststamp xtstamp;
 	int enable, err = 0;
@@ -211,6 +213,43 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 			err = -EFAULT;
 		break;
 
+	case PTP_SYS_OFFSET_EXTENDED:
+		if (!ptp->info->gettimex64) {
+			err = -EOPNOTSUPP;
+			break;
+		}
+		sysoff_extended = memdup_user((void __user *)arg,
+					      sizeof(*sysoff_extended));
+		if (IS_ERR(sysoff_extended)) {
+			err = PTR_ERR(sysoff_extended);
+			sysoff = NULL;
+			break;
+		}
+		if (sysoff_extended->n_samples > PTP_MAX_SAMPLES) {
+			err = -EINVAL;
+			break;
+		}
+
+		pct = &sysoff_extended->ts[0];
+		for (i = 0; i < sysoff_extended->n_samples; i++) {
+			err = ptp->info->gettimex64(ptp->info, &sts);
+			if (err)
+				break;
+			pct->sec = sts.sys_ts1.tv_sec;
+			pct->nsec = sts.sys_ts1.tv_nsec;
+			pct++;
+			pct->sec = sts.phc_ts.tv_sec;
+			pct->nsec = sts.phc_ts.tv_nsec;
+			pct++;
+			pct->sec = sts.sys_ts2.tv_sec;
+			pct->nsec = sts.sys_ts2.tv_nsec;
+			pct++;
+		}
+		if (copy_to_user((void __user *)arg, sysoff_extended,
+				 sizeof(*sysoff_extended)))
+			err = -EFAULT;
+		break;
+
 	case PTP_SYS_OFFSET:
 		sysoff = memdup_user((void __user *)arg, sizeof(*sysoff));
 		if (IS_ERR(sysoff)) {
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 51349d124ee5..79321d929925 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -39,6 +39,13 @@ struct ptp_clock_request {
 };
 
 struct system_device_crosststamp;
+
+struct ptp_system_timestamp {
+	struct timespec64 sys_ts1;
+	struct timespec64 phc_ts;
+	struct timespec64 sys_ts2;
+};
+
 /**
  * struct ptp_clock_info - decribes a PTP hardware clock
  *
@@ -75,6 +82,13 @@ struct system_device_crosststamp;
  * @gettime64:  Reads the current time from the hardware clock.
  *              parameter ts: Holds the result.
  *
+ * @gettimex64:  Reads the current time from the system clock, hardware clock,
+ *               and system clock again.
+ *               parameter sts:  The structure contains system time right
+ *               before reading the lowest bits of the PHC timestamp, the PHC
+ *               timestamp itself, and system time immediately after reading
+ *               the lowest bits of the PHC timestamp.
+ *
  * @getcrosststamp:  Reads the current time from the hardware clock and
  *                   system clock simultaneously.
  *                   parameter cts: Contains timestamp (device,system) pair,
@@ -124,6 +138,8 @@ struct ptp_clock_info {
 	int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta);
 	int (*adjtime)(struct ptp_clock_info *ptp, s64 delta);
 	int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts);
+	int (*gettimex64)(struct ptp_clock_info *ptp,
+			  struct ptp_system_timestamp *sts);
 	int (*getcrosststamp)(struct ptp_clock_info *ptp,
 			      struct system_device_crosststamp *cts);
 	int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts);
@@ -227,6 +243,16 @@ int ptp_find_pin(struct ptp_clock *ptp,
 
 int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay);
 
+static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts)
+{
+	ktime_get_real_ts64(&sts->sys_ts1);
+}
+
+static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts)
+{
+	ktime_get_real_ts64(&sts->sys_ts2);
+}
+
 #else
 static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 						   struct device *parent)
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index 3039bf6a742e..0cb61aed9077 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -84,6 +84,16 @@ struct ptp_sys_offset {
 	struct ptp_clock_time ts[2 * PTP_MAX_SAMPLES + 1];
 };
 
+struct ptp_sys_offset_extended {
+	unsigned int n_samples; /* Desired number of measurements. */
+	unsigned int rsv[3];    /* Reserved for future use. */
+	/*
+	 * Array of sys, phc, sys, sys, phc, sys, ... time stamps. The kernel
+	 * will provide 3*n_samples time stamps.
+	 */
+	struct ptp_clock_time ts[3 * PTP_MAX_SAMPLES];
+};
+
 struct ptp_sys_offset_precise {
 	struct ptp_clock_time device;
 	struct ptp_clock_time sys_realtime;
@@ -136,6 +146,8 @@ struct ptp_pin_desc {
 #define PTP_PIN_SETFUNC    _IOW(PTP_CLK_MAGIC, 7, struct ptp_pin_desc)
 #define PTP_SYS_OFFSET_PRECISE \
 	_IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise)
+#define PTP_SYS_OFFSET_EXTENDED \
+	_IOW(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended)
 
 struct ptp_extts_event {
 	struct ptp_clock_time t; /* Time event occured. */
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH 0/4] More accurate PHC<->system clock synchronization
From: Miroslav Lichvar @ 2018-10-26 16:27 UTC (permalink / raw)
  To: netdev; +Cc: intel-wired-lan, Richard Cochran, Jacob Keller, Miroslav Lichvar

This series adds support for a more accurate synchronization between a
PTP hardware clock and the system clock.

The first patch adds an extended version of the PTP_SYS_OFFSET ioctl,
which returns three timestamps for each measurement. The idea is to
shorten the interval between the system timestamps to contain just the
reading of the lowest register of the PHC in order to reduce the error
in the measured offset and give a better bound on the maximum error.

The other patches add support for the new ioctl to the e1000e, igb,
and ixgbe drivers. Tests with a few different NICs in different machines
(and PCIe slots) show that:
- with an I219 (e1000e) the measured delay improved from 2500 to 1300 ns
  and the error in the measured offset, when compared to cross
  timestamping, was reduced by a factor of 5
- with an I210 (igb) the delay improved from 5100 to 1700 ns
- with an I350 (igb) the delay improved from 2300 to 750 ns
- with an X550 (ixgbe) the delay improved from 1950 to 650 ns

There is some duplication of code in the igb and ixgbe drivers, which I
don't like very much, but I thought it's better than extending and
wrapping the existing functions like in the e1000e driver. Also, mixing
SYSTIM and "system time" in the code will probably be confusing.

I wasn't able to find a better name for the ioctl, the structures, and
the driver function. If anyone has suggestions, please let me know.

Miroslav Lichvar (4):
  ptp: add PTP_SYS_OFFSET_EXTENDED ioctl
  e1000e: add support for extended PHC gettime
  igb: add support for extended PHC gettime
  ixgbe: add support for extended PHC gettime

 drivers/net/ethernet/intel/e1000e/e1000.h    |  3 ++
 drivers/net/ethernet/intel/e1000e/netdev.c   | 48 +++++++++++++----
 drivers/net/ethernet/intel/e1000e/ptp.c      | 21 ++++++++
 drivers/net/ethernet/intel/igb/igb_ptp.c     | 43 +++++++++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c | 57 ++++++++++++++++++++
 drivers/ptp/ptp_chardev.c                    | 39 ++++++++++++++
 include/linux/ptp_clock_kernel.h             | 26 +++++++++
 include/uapi/linux/ptp_clock.h               | 12 +++++
 8 files changed, 239 insertions(+), 10 deletions(-)

-- 
2.17.2

^ permalink raw reply

* Re: [PATCH net] bridge: do not add port to router list when receives query with source 0.0.0.0
From: nikolay @ 2018-10-26 16:26 UTC (permalink / raw)
  To: Hangbin Liu, netdev; +Cc: Jiri Pirko, Linus Lüssing, David S. Miller
In-Reply-To: <1540520923-17589-1-git-send-email-liuhangbin@gmail.com>

On 26 October 2018 05:28:43 EEST, Hangbin Liu <liuhangbin@gmail.com> wrote:
>Based on RFC 4541, 2.1.1.  IGMP Forwarding Rules
>
>  The switch supporting IGMP snooping must maintain a list of
>  multicast routers and the ports on which they are attached.  This
>  list can be constructed in any combination of the following ways:
>
>  a) This list should be built by the snooping switch sending
>     Multicast Router Solicitation messages as described in IGMP
>     Multicast Router Discovery [MRDISC].  It may also snoop
>     Multicast Router Advertisement messages sent by and to other
>     nodes.
>
>  b) The arrival port for IGMP Queries (sent by multicast routers)
>     where the source address is not 0.0.0.0.
>
>We should not add the port to router list when receives query with
>source
>0.0.0.0.
>
>Reported-by: Ying Xu <yinxu@redhat.com>
>Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
>---

Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>

^ permalink raw reply

* Re: [PATCH v2 00/17] octeontx2-af: NPC parser and NIX blocks initialization
From: Sunil Kovvuri @ 2018-10-26 16:26 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: David S. Miller, Linux Netdev List, linux-soc, Sunil Goutham
In-Reply-To: <CAK8P3a3qL15dmeRYX7W2EC5bGO1qFCXEDRj+xTcGt8Fakwc3vw@mail.gmail.com>

On Fri, Oct 26, 2018 at 7:34 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On 10/26/18, Sunil Kovvuri <sunil.kovvuri@gmail.com> wrote:
> > On Fri, Oct 26, 2018 at 6:24 PM Arnd Bergmann <arnd@arndb.de> wrote:
> >>
> >> I see this has been applied, but I'd still like to understand better how
> >> the
> >> configuration interface is expected to work once the driver is complete.
> >>
> >> In particular, so far the interfaces all assume that configuration is
> >> done through the mailbox between PCI devices, which could be done
> >> from a virtual machine kernel with access to PCI, or through the use
> >> of VFIO from a user application.
> >>
> >> Is that the only method of configuring it that you support, or will there
> >> also be a devlink based interface or something like that to configure
> >> the aspects of a virtual device that should not be accessible to the
> >> VF itself?
> >>
> >
> >
> > As of now it's only mbox based configuration that is supported.
>
> Ok, thanks for the clarification.
>
> Does this mean that you intend to have user space tools that use
> the mbox based interface on VFIO devices to perform configuration
> for virtual network devices, or just that the configuration interface
> is something that needs to be designed later?
>

No, there is no need for any userspace tools.
It's the virtual network device's driver which will send commands
like resource allocation, configuration, stats retrieval to this
AF device via mbox interface.

E.g., when a user changes the RSS settings of the network device with
ethtool, the network device's driver receives the data, prepares a mailbox
command, and sends it to this driver to configure the same in HW.

Thanks,
Sunil.

^ permalink raw reply

* Re: [PATCH net] net: sched: Remove TCA_OPTIONS from policy
From: Jiri Pirko @ 2018-10-26 16:08 UTC (permalink / raw)
  To: David Ahern; +Cc: David Ahern, netdev, davem, pupilla
In-Reply-To: <b3d1f6a1-e1aa-1323-33fc-2dadb9602a95@gmail.com>

Fri, Oct 26, 2018 at 06:02:01PM CEST, dsahern@gmail.com wrote:
>On 10/25/18 12:31 AM, Jiri Pirko wrote:
>> Wed, Oct 24, 2018 at 05:32:49PM CEST, dsahern@kernel.org wrote:
>>> From: David Ahern <dsahern@gmail.com>
>>>
>>> Marco reported an error with hfsc:
>>> root@Calimero:~# tc qdisc add dev eth0 root handle 1:0 hfsc default 1
>>> Error: Attribute failed policy validation.
>>>
>>> Apparently a few implementations pass TCA_OPTIONS as a binary instead
>>> of nested attribute, so drop TCA_OPTIONS from the policy.
>> 
>> Yeah, this is nice example of a case, where I think it wouldn't hurt to
>> be a bit more strict. Apparently, the userspace app is buggy. It should
>> be fixed. Note that I'm aware of the bw compatibility.
>
>Kernel side for hfsc expects TCA_OPTIONS as a binary as well - a struct
>tc_hfsc_qopt. Nothing that can be done.

:(

^ permalink raw reply

* Re: [PATCH net] net: sched: Remove TCA_OPTIONS from policy
From: David Ahern @ 2018-10-26 16:02 UTC (permalink / raw)
  To: Jiri Pirko, David Ahern; +Cc: netdev, davem, pupilla
In-Reply-To: <20181025063148.GA2143@nanopsycho>

On 10/25/18 12:31 AM, Jiri Pirko wrote:
> Wed, Oct 24, 2018 at 05:32:49PM CEST, dsahern@kernel.org wrote:
>> From: David Ahern <dsahern@gmail.com>
>>
>> Marco reported an error with hfsc:
>> root@Calimero:~# tc qdisc add dev eth0 root handle 1:0 hfsc default 1
>> Error: Attribute failed policy validation.
>>
>> Apparently a few implementations pass TCA_OPTIONS as a binary instead
>> of nested attribute, so drop TCA_OPTIONS from the policy.
> 
> Yeah, this is nice example of a case, where I think it wouldn't hurt to
> be a bit more strict. Apparently, the userspace app is buggy. It should
> be fixed. Note that I'm aware of the bw compatibility.

Kernel side for hfsc expects TCA_OPTIONS as a binary as well - a struct
tc_hfsc_qopt. Nothing that can be done.

^ permalink raw reply

* Re: [PATCH v2 00/17] octeontx2-af: NPC parser and NIX blocks initialization
From: Arnd Bergmann @ 2018-10-26 15:55 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Sunil Kovvuri, David Miller, Networking, linux-soc, Sunil Goutham
In-Reply-To: <20181026154713.GE820@lunn.ch>

On Fri, Oct 26, 2018 at 5:47 PM Andrew Lunn <andrew@lunn.ch> wrote:
>
> > I fear that setting a precedent of using the mbox for user-level
> > configuration management would mean that we would have to
> > treat each of these interfaces as an ABI, which in turn requires
> > much deeper review as well as raising the fundamental question
> > on how this should be done across drivers. The mailbox interface
> > seem inherently nonportable to other hardware here, which is
> > a significant downside.
>
> Hi Arnd
>
> You might want to go look at the Freescale DPAA2. They also want to
> add an ioctl to pass binary blob commands to their firmware. The
> patches were re-posted recently.
>
> https://lkml.org/lkml/2018/10/5/873
>
> When this was first posted, I strongly argued against it.
> You also commented about this:
>
> https://lkml.org/lkml/2018/3/24/29
>
> We need to be consistent here. I think it is a bad idea.

I agree, and this is exactly why I commented here. I just wanted
to first ensure that it's not me misunderstanding the scope and
intention of the interfaces here before I say it's a mistake.

        Arnd

^ permalink raw reply

* Re: [PATCH net] net: sched: Remove TCA_OPTIONS from policy
From: David Ahern @ 2018-10-26 15:48 UTC (permalink / raw)
  To: Marco Berizzi, David Ahern; +Cc: davem, netdev
In-Reply-To: <1305358874.1795395.1540553653206@mail.libero.it>

On 10/26/18 5:34 AM, Marco Berizzi wrote:
> Apologies for bothering you again.
> I applied your patch to 4.19, but after issuing this
> command:
> 
> root@Calimero:~# tc qdisc add dev eth0 root handle 1:0 hfsc default 1
> root@Calimero:~# ping 10.81.104.1
> PING 10.81.104.1 (10.81.104.1) 56(84) bytes of data.
> ^C
> --- 10.81.104.1 ping statistics ---
> 2 packets transmitted, 0 received, 100% packet loss, time 1001ms
> 
> I'm losing ipv4 connectivity.
> If I remove the qdisc everything is going to work again:
> 
> root@Calimero:~# tc qdisc del dev eth0 root                   
> root@Calimero:~# ping 10.81.104.1
> PING 10.81.104.1 (10.81.104.1) 56(84) bytes of data.
> 64 bytes from 10.81.104.1: icmp_seq=1 ttl=255 time=0.711 ms
> ^C
> --- 10.81.104.1 ping statistics ---
> 1 packets transmitted, 1 received, 0% packet loss, time 0ms
> rtt min/avg/max/mdev = 0.711/0.711/0.711/0.000 ms
> 

I backed up to 95278ddaa15cfa23e4a06ee9ed7b6ee0197c500b which is the
commit before the validation patch and it does not work there.

Can you bisect and find out when it stopped working?

^ permalink raw reply

* RE: [PATCH] igb: shorten maximum PHC timecounter update interval
From: Keller, Jacob E @ 2018-10-26 15:47 UTC (permalink / raw)
  To: Miroslav Lichvar, Richard Cochran
  Cc: intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	Thomas Gleixner
In-Reply-To: <20181026120416.GB27139@localhost>

> -----Original Message-----
> From: Miroslav Lichvar [mailto:mlichvar@redhat.com]
> Sent: Friday, October 26, 2018 5:04 AM
> To: Richard Cochran <richardcochran@gmail.com>
> Cc: intel-wired-lan@lists.osuosl.org; netdev@vger.kernel.org; Keller, Jacob E
> <jacob.e.keller@intel.com>; Thomas Gleixner <tglx@linutronix.de>
> Subject: Re: [PATCH] igb: shorten maximum PHC timecounter update interval
> 
> On Fri, Oct 12, 2018 at 07:05:30AM -0700, Richard Cochran wrote:
> > On Fri, Oct 12, 2018 at 01:13:39PM +0200, Miroslav Lichvar wrote:
> > > Since commit 500462a9d ("timers: Switch to a non-cascading wheel"),
> > > scheduling of delayed work seems to be less accurate and a requested
> > > delay of 540 seconds may actually be longer than 550 seconds. Shorten
> > > the delay to 480 seconds to be sure the timecounter is updated in time.
> >
> > Good catch.  This timer wheel change will affect other, similar
> > drivers.  Guess I'll go through and adjust their timeouts, too.
> 
> I just realized that we need to fit there also any frequency
> adjustments of the PHC and system clock. The PHC can be set to run up
> to 6% faster and the system clock can be slowed down by up to 10%.
> 
> Those 480 seconds in the igb driver is not short enough for that.
> Should I fix and resend this patch, or send a new one?
> 
> Other drivers may have a similar problem.
> 

Hmm, good point. I'd send a v2 of this patch, unless it's already been applied to net or net-next.

Thanks,
Jake

> --
> Miroslav Lichvar

^ permalink raw reply

* Re: [PATCH v2 00/17] octeontx2-af: NPC parser and NIX blocks initialization
From: Andrew Lunn @ 2018-10-26 15:47 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Sunil Kovvuri, David S. Miller, Linux Netdev List, linux-soc,
	Sunil Goutham
In-Reply-To: <CAK8P3a3qL15dmeRYX7W2EC5bGO1qFCXEDRj+xTcGt8Fakwc3vw@mail.gmail.com>

> I fear that setting a precedent of using the mbox for user-level
> configuration management would mean that we would have to
> treat each of these interfaces as an ABI, which in turn requires
> much deeper review as well as raising the fundamental question
> on how this should be done across drivers. The mailbox interface
> seem inherently nonportable to other hardware here, which is
> a significant downside.

Hi Arnd

You might want to go look at the Freescale DPAA2. They also want to
add an ioctl to pass binary blob commands to their firmware. The
patches were re-posted recently.

https://lkml.org/lkml/2018/10/5/873

When this was first posted, I strongly argued against it.
You also commented about this:

https://lkml.org/lkml/2018/3/24/29

We need to be consistent here. I think it is a bad idea.

   Andrew

^ permalink raw reply

* Re: [PATCH net] ipv6/ndisc: Preserve IPv6 control buffer if protocol error handlers are called
From: Sabrina Dubroca @ 2018-10-26 15:00 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: David S. Miller, Hideaki YOSHIFUJI, netdev
In-Reply-To: <7acb12cb7a35c7a5f9670fc5b1373610b4d5ed67.1540384081.git.sbrivio@redhat.com>

2018-10-24, 14:37:21 +0200, Stefano Brivio wrote:
> Commit a61bbcf28a8c ("[NET]: Store skb->timestamp as offset to a base
> timestamp") introduces a neighbour control buffer and zeroes it out in
> ndisc_rcv(), as ndisc_recv_ns() uses it.
> 
> Commit f2776ff04722 ("[IPV6]: Fix address/interface handling in UDP and
> DCCP, according to the scoping architecture.") introduces the usage of the
> IPv6 control buffer in protocol error handlers (e.g. inet6_iif() in
> present-day __udp6_lib_err()).
> 
> Now, with commit b94f1c0904da ("ipv6: Use icmpv6_notify() to propagate
> redirect, instead of rt6_redirect()."), we call protocol error handlers
> from ndisc_redirect_rcv(), after the control buffer is already stolen and
> some parts are already zeroed out. This implies that inet6_iif() on this
> path will always return zero.
> 
> This gives unexpected results on UDP socket lookup in __udp6_lib_err(), as
> we might actually need to match sockets for a given interface.
> 
> Instead of always claiming the control buffer in ndisc_rcv(), do that only
> when needed.
> 
> Fixes: b94f1c0904da ("ipv6: Use icmpv6_notify() to propagate redirect, instead of rt6_redirect().")
> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>

-- 
Sabrina

^ permalink raw reply

* Re: [PATCH RFC] net: dsa: Make switches VLAN aware when enslaved into a bridge
From: Florian Fainelli @ 2018-10-26 23:16 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: netdev@vger.kernel.org, Jiri Pirko, Petr Machata,
	privat@egil-hjelmeland.no, Woojung.Huh@microchip.com,
	tristram.ha@microchip.com, Andrew Lunn, Vivien Didelot,
	David S. Miller, open list
In-Reply-To: <20181026151019.GA15354@splinter.mtl.com>

On 10/26/18 8:10 AM, Ido Schimmel wrote:
> On Wed, Oct 24, 2018 at 12:36:57PM -0700, Florian Fainelli wrote:
>> Commit 2ea7a679ca2a ("net: dsa: Don't add vlans when vlan filtering is
>> disabled") changed the behavior of DSA switches when the switch ports
>> are enslaved into the bridge and only pushed the VLAN configuration down
>> to the switch if the bridge is configured with VLAN filtering enabled.
> 
> This is what mlxsw is doing.
> 
>> This is unfortunately wrong, because what vlan_filtering configures is a
>> policy on the acceptance of VLAN tagged frames with an unknown VID.
>>
>> vlan_filtering=0 means a frame with a VLAN tag that is not part of the
>> VLAN table should be allowed to ingress the switch, and vlan_filtering=1
>> would reject that frame.
> 
> While you correctly describe the logic, this is not how VLAN-unaware
> bridges are actually used. The expectation is that packets will be
> untagged when entering the bridge. Either because they are truly
> untagged or because they were untagged by a VLAN netdev.
> 
> For a long time we rejected the enslavement of physical ports to
> VLAN-unaware bridges and only allowed VLAN netdevs to be enslaved. In
> order to support the logic you described, we would need to map all 4K
> VLANs on each port to 4K different FIDs. In addition, each FDB entry
> would need to be programmed 4K times, each time with a different FID.
> This is because FDB lookup is performed using {MAC, FID} and not only
> MAC. I can go into more details about why we cannot map different VLANs
> on a port to the same FID, but I do not think it is pertinent to our
> discussion.
> 
> Eventually, users started complaining about this constraint and we
> relaxed it in commit 65b53bfd497b ("mlxsw: spectrum_switchdev: Allow
> port enslavement to a VLAN-unaware bridge").

Thanks for providing more context, I suppose we will keep the current
logic then, if nothing else it aligns us with mlxsw.
-- 
Florian

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2018-10-26 23:12 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


What better way to start off a weekend than with some networking
bug fixes:

1) net namespace leak in dump filtering code of ipv4 and ipv6, fixed
   by David Ahern and Bjørn Mork.

2) Handle bad checksums from hardware when using CHECKSUM_COMPLETE
   properly in UDP, from Sean Tranchetti.

3) Remove TCA_OPTIONS from policy validation, it turns out we don't
   consistently use nested attributes for this across all packet
   schedulers.  From David Ahern.

4) Fix SKB corruption in cadence driver, from Tristram Ha.

5) Fix broken WoL handling in r8169 driver, from Heiner Kallweit.

6) Fix OOPS in pneigh_dump_table(), from Eric Dumazet.

Please pull, thanks!

The following changes since commit 01aa9d518eae8a4d75cd3049defc6ed0b6d0a658:

  Merge tag 'docs-4.20' of git://git.lwn.net/linux (2018-10-24 18:01:11 +0100)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 

for you to fetch changes up to aab456dfa404f3a16d6f1780e62a6a8533c4d008:

  net/neigh: fix NULL deref in pneigh_dump_table() (2018-10-26 16:03:51 -0700)

----------------------------------------------------------------
Andrew Lunn (1):
      net: phy: genphy_10g_driver: Avoid NULL pointer dereference

Anirudh Venkataramanan (7):
      ice: Make ice_msix_clean_rings static
      ice: Change device ID define names to align with branding string
      ice: Update expected FW version
      ice: Use capability count returned by the firmware
      ice: Introduce ice_dev_onetime_setup
      ice: Allocate VF interrupts and set queue map
      ice: Poll for link status change

Bjørn Mork (1):
      net/{ipv4,ipv6}: Do not put target net if input nsid is invalid

Bryan Whitehead (1):
      lan743x: Remove SPI dependency from Microchip group.

Dan Carpenter (1):
      octeontx2-af: Copy the right amount of memory

David Ahern (6):
      net/ipv4: Put target net when address dump fails due to bad attributes
      net/ipv6: Put target net when address dump fails due to bad attributes
      net: Don't return invalid table id error when dumping all families
      net: rtnl_dump_all needs to propagate error from dumpit function
      net: sched: Remove TCA_OPTIONS from policy
      net/ipv6: Allow onlink routes to have a device mismatch if it is the default route

David S. Miller (2):
      Merge branch 'route-dump-filter-fixes'
      Merge branch '100GbE' of git://git.kernel.org/.../jkirsher/net-queue

Eric Dumazet (2):
      drivers: net: remove <net/busy_poll.h> inclusion when not needed
      net/neigh: fix NULL deref in pneigh_dump_table()

Hangbin Liu (1):
      bridge: do not add port to router list when receives query with source 0.0.0.0

Heiner Kallweit (1):
      r8169: fix broken Wake-on-LAN from S5 (poweroff)

Karsten Graul (1):
      net/smc: fix smc_buf_unuse to use the lgr pointer

Mike Manning (1):
      net: allow traceroute with a specified interface in a vrf

Sean Tranchetti (1):
      net: udp: fix handling of CHECKSUM_COMPLETE packets

Shiju Jose (1):
      net: hns3: Fix for warning uninitialized symbol hw_err_lst3

Stefano Brivio (1):
      ipv6/ndisc: Preserve IPv6 control buffer if protocol error handlers are called

Tristram Ha (1):
      net: ethernet: cadence: fix socket buffer corruption problem

Wei Yongjun (1):
      octeontx2-af: Use GFP_ATOMIC under spin lock

 drivers/net/ethernet/amd/xgbe/xgbe-drv.c               |   1 -
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c        |   1 -
 drivers/net/ethernet/cadence/macb_main.c               |   2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c |  10 +++++----
 drivers/net/ethernet/intel/i40e/i40e_txrx.c            |   1 -
 drivers/net/ethernet/intel/iavf/iavf_txrx.c            |   1 -
 drivers/net/ethernet/intel/ice/ice_common.c            |  52 +++++++++++++++++++--------------------------
 drivers/net/ethernet/intel/ice/ice_common.h            |   9 +++-----
 drivers/net/ethernet/intel/ice/ice_controlq.h          |   5 ++---
 drivers/net/ethernet/intel/ice/ice_devids.h            |   6 +++---
 drivers/net/ethernet/intel/ice/ice_hw_autogen.h        |   8 +++++++
 drivers/net/ethernet/intel/ice/ice_lib.c               |   3 ++-
 drivers/net/ethernet/intel/ice/ice_lib.h               |   1 -
 drivers/net/ethernet/intel/ice/ice_main.c              | 116 ++++++++++++++++++++++++-----------------------------------------------------------------------------
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c       |  15 +++++++++----
 drivers/net/ethernet/intel/ixgbe/ixgbe.h               |   1 -
 drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c    |   4 ++--
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c         |   1 -
 drivers/net/ethernet/mellanox/mlx4/en_rx.c             |   1 -
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c        |   1 -
 drivers/net/ethernet/microchip/Kconfig                 |   1 -
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c       |   1 -
 drivers/net/ethernet/realtek/r8169.c                   |   9 ++++++--
 drivers/net/phy/phy-c45.c                              |   2 +-
 include/net/ip_fib.h                                   |   1 +
 net/bridge/br_multicast.c                              |  10 ++++++++-
 net/core/datagram.c                                    |   5 +++--
 net/core/neighbour.c                                   |   4 ++--
 net/core/rtnetlink.c                                   |   6 ++++--
 net/ipv4/devinet.c                                     |  14 ++++++++-----
 net/ipv4/fib_frontend.c                                |   4 ++++
 net/ipv4/ipmr.c                                        |   3 +++
 net/ipv4/udp.c                                         |  24 +++++++++++++++++----
 net/ipv6/addrconf.c                                    |  15 +++++++------
 net/ipv6/ip6_checksum.c                                |  20 ++++++++++++++++--
 net/ipv6/ip6_fib.c                                     |   3 +++
 net/ipv6/ip6mr.c                                       |   3 +++
 net/ipv6/ndisc.c                                       |   3 +--
 net/ipv6/route.c                                       |   2 ++
 net/ipv6/udp.c                                         |   2 +-
 net/sched/sch_api.c                                    |   1 -
 net/smc/smc_core.c                                     |  25 +++++++++++-----------
 tools/testing/selftests/net/fib-onlink-tests.sh        |  14 ++++++-------
 43 files changed, 205 insertions(+), 206 deletions(-)

^ permalink raw reply

* Re: CAKE and r8169 cause panic on upload in v4.19
From: Dave Taht @ 2018-10-26 23:08 UTC (permalink / raw)
  To: Oleksandr Natalenko
  Cc: hkallweit1, Toke Høiland-Jørgensen, David S. Miller,
	Jamal Hadi Salim, Cong Wang, Jiří Pírko,
	Linux Kernel Network Developers, linux-kernel
In-Reply-To: <b80e6819da8ea74f18b6ec0aaf9128fa@natalenko.name>

On Fri, Oct 26, 2018 at 1:54 PM Oleksandr Natalenko
<oleksandr@natalenko.name> wrote:
>
> Hi.
>
> On 26.10.2018 22:25, Dave Taht wrote:
> > Can you repeat your test, disabling gro splitting in cake?
> >
> > the option is "no-split-gso"
>
> Still panics. Takes a couple of rounds, but panics.
>
> Moreover, I've stressed my HTB setup like this too for a longer time,
> and it panics as well. So, at least, now I have a proof this is not a
> CAKE-specific thing.

Groovy. :whew:

I do look forward to more cake test results, particularly on different
network cards such as these, and at speeds higher than 10Gbit on high
end hardware, and in the 100-1Gbit range on low to mid-range. After
the last round of features added to cake before it went into linux, we
now run out of cpu on inbound shaping at those speeds on low end apu2
(x86) hardware, (atom and a15 chips are not so hot now either) and I
wish I knew what we could do to speed it up. The new "list skb" and
mirred code looked promising but we haven't got around to exploring it
yet.

Thank you for trying and I hope this gets sorted out on your chipset.

>
> Also, I've stressed it even with noqueue, and the panic is still there.
> So, this thing is not even sch-specific.
>
> Next, I've seen GRO bits in the call trace and decided to disable GRO on
> this NIC. So far, I cannot trigger a panic with GRO disabled even after
> 20 rounds of speedtest.

We tend to use flent's rrul test to *really* abuse things. :)

So cake's ok with gro disabled in hw?

>
> So, must be some generic thing indeed.
>
> --
>    Oleksandr Natalenko (post-factum)



-- 

Dave Täht
CTO, TekLibre, LLC
http://www.teklibre.com
Tel: 1-831-205-9740

^ permalink raw reply

* KMSAN: uninit-value in dev_mc_add_excl
From: syzbot @ 2018-10-26 22:48 UTC (permalink / raw)
  To: davem, edumazet, linux-kernel, netdev, sunlw.fnst, syzkaller-bugs

Hello,

syzbot found the following crash on:

HEAD commit:    4bb25354f0b0 kmsan: unpoison pt_regs in do_nmi()
git tree:       https://github.com/google/kmsan.git/master
console output: https://syzkaller.appspot.com/x/log.txt?x=11e4ae95400000
kernel config:  https://syzkaller.appspot.com/x/.config?x=36c582b1a617b1e6
dashboard link: https://syzkaller.appspot.com/bug?extid=d53ab4e92a1db04110ff
compiler:       clang version 8.0.0 (trunk 339414)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=15ac7c79400000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=15dbbf0d400000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+d53ab4e92a1db04110ff@syzkaller.appspotmail.com

sshd (6129) used greatest stack depth: 53504 bytes left
==================================================================
BUG: KMSAN: uninit-value in memcmp+0x117/0x180 lib/string.c:863
CPU: 1 PID: 6214 Comm: syz-executor693 Not tainted 4.19.0-rc8+ #70
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x306/0x460 lib/dump_stack.c:113
  kmsan_report+0x1a2/0x2e0 mm/kmsan/kmsan.c:917
  __msan_warning+0x7c/0xe0 mm/kmsan/kmsan_instr.c:500
  memcmp+0x117/0x180 lib/string.c:863
  dev_mc_add_excl+0x165/0x770 net/core/dev_addr_lists.c:648
  ndo_dflt_fdb_add net/core/rtnetlink.c:3469 [inline]
  rtnl_fdb_add+0xe83/0x12a0 net/core/rtnetlink.c:3562
  rtnetlink_rcv_msg+0xa53/0x1590 net/core/rtnetlink.c:4730
  netlink_rcv_skb+0x394/0x640 net/netlink/af_netlink.c:2454
  rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4748
  netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
  netlink_unicast+0x166d/0x1720 net/netlink/af_netlink.c:1343
  netlink_sendmsg+0x1391/0x1420 net/netlink/af_netlink.c:1908
  sock_sendmsg_nosec net/socket.c:621 [inline]
  sock_sendmsg net/socket.c:631 [inline]
  ___sys_sendmsg+0xe47/0x1200 net/socket.c:2116
  __sys_sendmsg net/socket.c:2154 [inline]
  __do_sys_sendmsg net/socket.c:2163 [inline]
  __se_sys_sendmsg+0x307/0x460 net/socket.c:2161
  __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161
  do_syscall_64+0xbe/0x100 arch/x86/entry/common.c:291
  entry_SYSCALL_64_after_hwframe+0x63/0xe7
RIP: 0033:0x440fd9
Code: e8 cc ab 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 bb 0a fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffe833197b8 EFLAGS: 00000213 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000440fd9
RDX: 0000000000000000 RSI: 0000000020000500 RDI: 0000000000000003
RBP: 0000000000000000 R08: 00000000004002c8 R09: 00000000004002c8
R10: 00000000004002c8 R11: 0000000000000213 R12: 0000000000010c7d
R13: 0000000000401fb0 R14: 0000000000000000 R15: 0000000000000000

Uninit was created at:
  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:255 [inline]
  kmsan_internal_poison_shadow+0xc8/0x1d0 mm/kmsan/kmsan.c:180
  kmsan_kmalloc+0xa4/0x120 mm/kmsan/kmsan_hooks.c:104
  kmsan_slab_alloc+0x10/0x20 mm/kmsan/kmsan_hooks.c:113
  slab_post_alloc_hook mm/slab.h:446 [inline]
  slab_alloc_node mm/slub.c:2727 [inline]
  __kmalloc_node_track_caller+0xb43/0x1400 mm/slub.c:4360
  __kmalloc_reserve net/core/skbuff.c:138 [inline]
  __alloc_skb+0x422/0xe90 net/core/skbuff.c:206
  alloc_skb include/linux/skbuff.h:996 [inline]
  netlink_alloc_large_skb net/netlink/af_netlink.c:1189 [inline]
  netlink_sendmsg+0xcaf/0x1420 net/netlink/af_netlink.c:1883
  sock_sendmsg_nosec net/socket.c:621 [inline]
  sock_sendmsg net/socket.c:631 [inline]
  ___sys_sendmsg+0xe47/0x1200 net/socket.c:2116
  __sys_sendmsg net/socket.c:2154 [inline]
  __do_sys_sendmsg net/socket.c:2163 [inline]
  __se_sys_sendmsg+0x307/0x460 net/socket.c:2161
  __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2161
  do_syscall_64+0xbe/0x100 arch/x86/entry/common.c:291
  entry_SYSCALL_64_after_hwframe+0x63/0xe7
==================================================================


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with  
syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [PATCH v2 00/17] octeontx2-af: NPC parser and NIX blocks initialization
From: Arnd Bergmann @ 2018-10-26 14:04 UTC (permalink / raw)
  To: Sunil Kovvuri
  Cc: David S. Miller, Linux Netdev List, linux-soc, Sunil Goutham
In-Reply-To: <CA+sq2CdNo4ooSSb3cwZrcbShpWto9uQQB=cfaMAeQqPUdgMv4Q@mail.gmail.com>

On 10/26/18, Sunil Kovvuri <sunil.kovvuri@gmail.com> wrote:
> On Fri, Oct 26, 2018 at 6:24 PM Arnd Bergmann <arnd@arndb.de> wrote:
>>
>> I see this has been applied, but I'd still like to understand better how
>> the
>> configuration interface is expected to work once the driver is complete.
>>
>> In particular, so far the interfaces all assume that configuration is
>> done through the mailbox between PCI devices, which could be done
>> from a virtual machine kernel with access to PCI, or through the use
>> of VFIO from a user application.
>>
>> Is that the only method of configuring it that you support, or will there
>> also be a devlink based interface or something like that to configure
>> the aspects of a virtual device that should not be accessible to the
>> VF itself?
>>
>
>
> As of now it's only mbox based configuration that is supported.

Ok, thanks for the clarification.

Does this mean that you intend to have user space tools that use
the mbox based interface on VFIO devices to perform configuration
for virtual network devices, or just that the configuration interface
is something that needs to be designed later?

I fear that setting a precedent of using the mbox for user-level
configuration management would mean that we would have to
treat each of these interfaces as an ABI, which in turn requires
much deeper review as well as raising the fundamental question
on how this should be done across drivers. The mailbox interface
seem inherently nonportable to other hardware here, which is
a significant downside.

       Arnd

^ permalink raw reply

* Re: [PATCH net-next] net/ncsi: Add NCSI Mellanox OEM command
From: Vijay Khemka @ 2018-10-26 22:40 UTC (permalink / raw)
  To: David Miller
  Cc: sam@mendozajonas.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, openbmc@lists.ozlabs.org,
	Justin.Lee1@Dell.com, joel@jms.id.au,
	linux-aspeed@lists.ozlabs.org
In-Reply-To: <20181026.103617.373827980255084988.davem@davemloft.net>

Thanks David

On 10/26/18, 10:36 AM, "David Miller" <davem@davemloft.net> wrote:

    From: Vijay Khemka <vijaykhemka@fb.com>
    Date: Fri, 26 Oct 2018 17:19:49 +0000
    
    > Do you have any timeline when it is going to open next or how do I
    > know.
    
    I always announce net-next opening and closing here on the list.
    
    There is also a web site:
    
    	http://vger.kernel.org/~davem/net-next.html
    
    Thanks.
    


^ permalink raw reply

* Re: [PATCH v2 00/17] octeontx2-af: NPC parser and NIX blocks initialization
From: Sunil Kovvuri @ 2018-10-26 13:29 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: David S. Miller, Linux Netdev List, linux-soc, Sunil Goutham
In-Reply-To: <CAK8P3a1zpGZiKDrseHs0TVBy2435PD=D5L+29Zuag62=SGjnpg@mail.gmail.com>

On Fri, Oct 26, 2018 at 6:24 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On 10/23/18, David Miller <davem@davemloft.net> wrote:
> > From: sunil.kovvuri@gmail.com
> > Date: Mon, 22 Oct 2018 23:25:47 +0530
> >
> >> From: Sunil Goutham <sgoutham@marvell.com>
> >>
> >> This patchset is a continuation to earlier submitted two patch
> >> series to add a new driver for Marvell's OcteonTX2 SOC's
> >> Resource virtualization unit (RVU) admin function driver.
> >>
> >> 1. octeontx2-af: Add RVU Admin Function driver
> >>    https://www.spinics.net/lists/netdev/msg528272.html
> >> 2. octeontx2-af: NPA and NIX blocks initialization
> >>    https://www.spinics.net/lists/netdev/msg529163.html
> >>
> >> This patch series adds more NIX block configuration logic
> >> and additionally adds NPC block parser profile configuration.
> >> In brief below is what this series adds.
> >> NIX block:
> >> - Support for PF/VF to allocate/free transmit scheduler queues,
> >>   maintenance and their configuration.
> >> - Adds support for packet replication lists, only broadcast
> >>   packets is covered for now.
> >> - Defines few RSS flow algorithms for HW to distribute packets.
> >>   This is not the hash algorithm (i.e toeplitz or crc32), here SW
> >>   defines what fields in packet should HW take and calculate the hash.
> >> - Support for PF/VF to configure VTAG strip and capture capabilities.
> >> - Reset NIXLF statistics.
> >>
> >> NPC block:
> >> This block has multiple parser engines which support packet parsing
> >> at multiple layers and generates a parse result which is further used
> >> to generate a key. Based on packet field offsets in the key, SW can
> >> install packet forwarding rules.
> >> This patch series adds
> >> - Initial parser profile to be programmed into parser engines.
> >> - Default forwarding rules to forward packets to different logical
> >>   interfaces having a NIXLF attached.
> >> - Support for promiscuous and multicast modes.
> >>
> >> Changes from v1:
> >>  1 Fixed kernel build failure when compiled with BIG_ENDIAN enabled.
> >>    - Reported by Kbuild test robot
> >>  2 Fixed a warning observed when kernel is built with
> >> -Wunused-but-set-variable
> >
> > Series applied.
>
> I see this has been applied, but I'd still like to understand better how the
> configuration interface is expected to work once the driver is complete.
>
> In particular, so far the interfaces all assume that configuration is
> done through the mailbox between PCI devices, which could be done
> from a virtual machine kernel with access to PCI, or through the use
> of VFIO from a user application.
>
> Is that the only method of configuring it that you support, or will there
> also be a devlink based interface or something like that to configure
> the aspects of a virtual device that should not be accessible to the
> VF itself?
>
>         Arnd


As of now it's only mbox based configuration that is supported.

Thanks,
Sunil.

^ permalink raw reply

* Re: Fw: [Bug 201423] New: eth0: hw csum failure
From: Andre Tomt @ 2018-10-26 13:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Eric Dumazet, Stephen Hemminger, netdev, rossi.f,
	Dimitris Michailidis
In-Reply-To: <CANn89iKHKa94DG=-PjwZx7V7zE_EYz5mraQ=VFJaA135pheHXQ@mail.gmail.com>

On 26.10.2018 14:59, Eric Dumazet wrote:
> On Fri, Oct 26, 2018 at 5:38 AM Andre Tomt <andre@tomt.net> wrote:
>> And it tripped again with that commit; however on another box with a
>> much more complicated setup (VRFs, sch_cake, ifb, conntrack/nat, 6in4
>> tunnel, VF device on mlx4)
>>
>>> [ 8197.348260] wanib: hw csum failure
>>> [ 8197.348288] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.19.0-1 #1
>>> [ 8197.348289] Hardware name: Supermicro SYS-5018D-FN8T/X10SDV-TP8F, BIOS 1.3 03/19/2018
>>> [ 8197.348290] Call Trace:
>>> [ 8197.348296]  <IRQ>
>>> [ 8197.348304]  dump_stack+0x5c/0x80
>>> [ 8197.348308]  __skb_checksum_complete+0xac/0xc0
>>> [ 8197.348318]  icmp_error+0x1c8/0x1f0 [nf_conntrack]
>>> [ 8197.348325]  ? ip_output+0x61/0xc0
>>> [ 8197.348328]  ? skb_copy_bits+0x13d/0x220
>>> [ 8197.348334]  nf_conntrack_in+0xd8/0x390 [nf_conntrack]
>>> [ 8197.348339]  ? ___pskb_trim+0x192/0x330
>>> [ 8197.348343]  nf_hook_slow+0x43/0xc0
>>> [ 8197.348346]  ip_rcv+0x90/0xb0
>>> [ 8197.348349]  ? ip_rcv_finish_core.isra.0+0x310/0x310
>>> [ 8197.348354]  __netif_receive_skb_one_core+0x42/0x50
>>> [ 8197.348357]  netif_receive_skb_internal+0x24/0xb0
>>> [ 8197.348361]  ifb_ri_tasklet+0x167/0x260 [ifb]
>>> [ 8197.348365]  tasklet_action_common.isra.3+0x49/0xb0
>>> [ 8197.348369]  __do_softirq+0xe7/0x2d3
>>> [ 8197.348372]  irq_exit+0x96/0xd0
>>> [ 8197.348375]  do_IRQ+0x85/0xd0
>>> [ 8197.348378]  common_interrupt+0xf/0xf
>>> [ 8197.348379]  </IRQ>
>>> [ 8197.348382] RIP: 0010:cpuidle_enter_state+0xb9/0x320
>>> [ 8197.348384] Code: e8 1c 16 bc ff 80 7c 24 0b 00 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 3b 02 00 00 31 ff e8 3e fb c0 ff fb 66 0f 1f 44 00 00 <48> b8 ff ff ff ff f3 01 00 00 48 2b 1c 24 ba ff ff ff 7f 48 39 c3
>>> [ 8197.348386] RSP: 0018:ffff9f0441953ea8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffd5
>>> [ 8197.348388] RAX: ffff9759efae0fc0 RBX: 000007749807d911 RCX: 000000000000001f
>>> [ 8197.348390] RDX: 000007749807d911 RSI: 000000003a2e8670 RDI: 0000000000000000
>>> [ 8197.348393] RBP: ffff9759efae98a8 R08: 0000000000000002 R09: 0000000000020840
>>> [ 8197.348396] R10: 00626b4810384abc R11: ffff9759efae01e8 R12: 0000000000000001
>>> [ 8197.348398] R13: ffffffff8d0ac638 R14: 0000000000000001 R15: 0000000000000000
>>> [ 8197.348402]  ? cpuidle_enter_state+0x94/0x320
>>> [ 8197.348407]  do_idle+0x1e4/0x220
>>> [ 8197.348411]  cpu_startup_entry+0x5f/0x70
>>> [ 8197.348415]  start_secondary+0x185/0x1a0
>>> [ 8197.348417]  secondary_startup_64+0xa4/0xb0
> 
> 
> Very different trace , yet another bug to track .
> 
> If you can, try to remove some components from this setup.
> 

Will do. Just remembered I took out the VF stuff a few days ago and that 
netdev is just a normal vlan device now. Going to eliminate VRF and 
cake/ifb as well.

^ permalink raw reply

* Re: [PATCH v3 2/2] net: qcom/emac: add phy-handle support for ACPI
From: Andrew Lunn @ 2018-10-26 13:13 UTC (permalink / raw)
  To: Wang, Dongsheng
  Cc: Timur Tabi, Zheng, Joey, f.fainelli@gmail.com,
	netdev@vger.kernel.org, robert.moore@intel.com, rjw@rjwysocki.net,
	linux-acpi@vger.kernel.org
In-Reply-To: <a1adfc5a696d40388ad9bf982f0ff64d@HXTBJIDCEMVIW02.hxtcorp.net>

On Fri, Oct 26, 2018 at 03:04:25AM +0000, Wang, Dongsheng wrote:
> On 2018/10/26 10:37, Timur Tabi wrote:
> > On 10/25/18 9:18 PM, Wang, Dongsheng wrote:
> >> But when I was reading Documentation/acpi/DSD-properties-rules.txt, my
> >> understanding is we should try to conform to DT bindings. So maybe ACPI
> >> doesn't have such a document, just DT bindings.
> > There was an attempt to document DSDs, but it was abandoned after a while.
> >
> > https://github.com/ahs3/dsd
> >
> 
> Yes, here's a database concept, and I asked some Intel guys, the answer
> I got was there is no such database or document. :(

Hi Dongsheng

If there is no clear documentation for ACPI, it becomes even more
important that the xgene code is refactored into a central location,
and you make use of it. We really need to avoid every ACPI ethernet
driver doing its own thing.

       Thanks
	Andrew

^ permalink raw reply

* RE: [PATCH net-next v2 6/6] net/ncsi: Configure multi-package, multi-channel modes with failover
From: Justin.Lee1 @ 2018-10-26 21:48 UTC (permalink / raw)
  To: sam, netdev; +Cc: davem, linux-kernel, openbmc
In-Reply-To: <20181023215201.27315-7-sam@mendozajonas.com>

Hi Samuel,

There is one place that we assume the next available TX channel is under the same package.
Please see the comment below.

Thanks,
Justin


> +/* Change the active Tx channel in a multi-channel setup */
> +int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp,
> +			   struct ncsi_package *np,
> +			   struct ncsi_channel *disable,
> +			   struct ncsi_channel *enable)
> +{
> +	struct ncsi_cmd_arg nca;
> +	struct ncsi_channel *nc;
> +	int ret = 0;
> +
> +	if (!np->multi_channel)
> +		netdev_warn(ndp->ndev.dev,
> +			    "NCSI: Trying to update Tx channel in single-channel mode\n");
> +	nca.ndp = ndp;
> +	nca.package = np->id;

If the channel may be on different package, the package ID here may not be correct
in some cases.

> +	nca.req_flags = 0;
> +
> +	/* Find current channel with Tx enabled */
> +	if (!disable) {
> +		NCSI_FOR_EACH_CHANNEL(np, nc)
> +			if (nc->modes[NCSI_MODE_TX_ENABLE].enable)
> +				disable = nc;
> +	}
> +
> +	/* Find a suitable channel for Tx */
> +	if (!enable) {
> +		if (np->preferred_channel &&
> +		    ncsi_channel_has_link(np->preferred_channel)) {
> +			enable = np->preferred_channel;
> +		} else {
> +			NCSI_FOR_EACH_CHANNEL(np, nc) {
> +				if (!(np->channel_whitelist & 0x1 << nc->id))
> +					continue;
> +				if (nc->state != NCSI_CHANNEL_ACTIVE)
> +					continue;
> +				if (ncsi_channel_has_link(nc)) {
> +					enable = nc;
> +					break;
> +				}
> +			}

When we search, we need to consider that the other available channel might be on
a different package.

> +		}
> +	}
> +
> +	if (disable == enable)
> +		return -1;
> +
> +	if (!enable)
> +		return -1;
> +
> +	if (disable) {
> +		nca.channel = disable->id;
> +		nca.type = NCSI_PKT_CMD_DCNT;
> +		ret = ncsi_xmit_cmd(&nca);
> +		if (ret)
> +			netdev_err(ndp->ndev.dev,
> +				   "Error %d sending DCNT\n",
> +				   ret);
> +	}

I remove the cable from ncsi0 and it doesn't failover to ncsi3 as ncsi0 and ncsi3 are not under
the same package.

cat /sys/kernel/debug/ncsi_protocol/ncsi_device_
IFIDX IFNAME NAME   PID CID RX TX MP MC WP WC PC CS PS LS RU CR NQ HA
======================================================================
  2   eth2   ncsi0  000 000 1  1  1  1  1  1  1  3  0  0  1  1  0  1
  2   eth2   ncsi1  000 001 0  0  1  1  1  0  0  1  0  1  1  1  0  1
  2   eth2   ncsi2  001 000 0  0  1  1  1  0  0  1  0  1  1  1  0  1
  2   eth2   ncsi3  001 001 1  0  1  1  1  1  0  2  1  1  1  1  0  1
======================================================================
MP: Multi-mode Package     WP: Whitelist Package
MC: Multi-mode Channel     WC: Whitelist Channel
PC: Primary Channel        CS: Channel State
PS: Poll Status            LS: Link Status
RU: Running                CR: Carrier OK
NQ: Queue Stopped          HA: Hardware Arbitration

^ permalink raw reply

* [BPF] "padded" structures are not supported by BPF
From: Krishna Chaitanya @ 2018-10-26 13:06 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann

Hi,

With below config BPF doesn't seem to support "padded" structures. Is
this a bug or expected?
Kernel Version: 4.15.0-34, Intel, Ubuntu. Below is the BPF JIT output.

struct info {
        u16 seq_num;
        u32 packet_num;
};

bpf: Failed to load program: Permission denied
0: (69) r1 = *(u16 *)(r1 +12)
1: (6b) *(u16 *)(r10 -8) = r1
2: (b7) r1 = 1000000
3: (63) *(u32 *)(r10 -4) = r1
4: (b7) r1 = 0
5: (63) *(u32 *)(r10 -12) = r1
6: (18) r1 = 0xffff8f86f2998e00
8: (bf) r2 = r10
9: (07) r2 += -12
10: (85) call bpf_map_lookup_elem#1
11: (bf) r6 = r0
12: (15) if r6 == 0x0 goto pc+13
 R0=map_value(id=0,off=0,ks=4,vs=4,imm=0)
R6=map_value(id=0,off=0,ks=4,vs=4,imm=0) R10=fp0
13: (61) r1 = *(u32 *)(r6 +0)
 R0=map_value(id=0,off=0,ks=4,vs=4,imm=0)
R6=map_value(id=0,off=0,ks=4,vs=4,imm=0) R10=fp0
14: (63) *(u32 *)(r10 -16) = r1
15: (18) r1 = 0xffff8f86f2999400
17: (bf) r2 = r10
18: (07) r2 += -16
19: (bf) r3 = r10
20: (07) r3 += -8
21: (b7) r4 = 0
22: (85) call bpf_map_update_elem#2
invalid indirect read from stack off -8+2 size 8

Traceback (most recent call last):
    b = BPF(text=bpf_source)
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 337, in __init__
    self._trace_autoload()
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 1038,
in _trace_autoload
    fn = self.load_func(func_name, BPF.TRACEPOINT)
  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 377,
in load_func
    (func_name, errstr))
Exception: Failed to load BPF program tracepoint__<Snip>: Permission denied

If we add "__packed" to the above struct, it compiles successfully.

-- 
Thanks,
Regards,
Chaitanya T K.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox