* [PATCH v2 net-next 3/7] net: dsa: sja1105: Switch to hardware operations for PTP
From: Vladimir Oltean @ 2019-09-10 1:34 UTC (permalink / raw)
To: f.fainelli, vivien.didelot, andrew, davem, richardcochran
Cc: netdev, Vladimir Oltean
In-Reply-To: <20190910013501.3262-1-olteanv@gmail.com>
Adjusting the hardware clock (PTPCLKVAL, PTPCLKADD, PTPCLKRATE) is a
requirement for the auxiliary PTP functionality of the switch
(TTEthernet, PPS input, PPS output).
Now that the sync precision issues have been identified (and fixed in
the spi-fsl-dspi driver), we can get rid of the timecounter/cyclecounter
implementation, which is reliant on the free-running PTPTSCLK.
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
---
drivers/net/dsa/sja1105/sja1105.h | 16 +--
drivers/net/dsa/sja1105/sja1105_main.c | 18 ++-
drivers/net/dsa/sja1105/sja1105_ptp.c | 181 ++++++++++++-------------
drivers/net/dsa/sja1105/sja1105_ptp.h | 22 +++
drivers/net/dsa/sja1105/sja1105_spi.c | 2 -
5 files changed, 122 insertions(+), 117 deletions(-)
diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index d8a92646e80a..e4955a025e46 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -32,7 +32,6 @@ struct sja1105_regs {
u64 ptp_control;
u64 ptpclk;
u64 ptpclkrate;
- u64 ptptsclk;
u64 ptpegr_ts[SJA1105_NUM_PORTS];
u64 pad_mii_tx[SJA1105_NUM_PORTS];
u64 pad_mii_id[SJA1105_NUM_PORTS];
@@ -50,8 +49,15 @@ struct sja1105_regs {
u64 qlevel[SJA1105_NUM_PORTS];
};
+enum sja1105_ptp_clk_mode {
+ PTP_ADD_MODE = 1,
+ PTP_SET_MODE = 0,
+};
+
struct sja1105_ptp_cmd {
u64 resptp; /* reset */
+ u64 corrclk4ts; /* use the corrected clock for timestamps */
+ u64 ptpclkadd; /* enum sja1105_ptp_clk_mode */
};
struct sja1105_info {
@@ -96,13 +102,7 @@ struct sja1105_private {
struct sja1105_ptp_cmd ptp_cmd;
struct ptp_clock_info ptp_caps;
struct ptp_clock *clock;
- /* The cycle counter translates the PTP timestamps (based on
- * a free-running counter) into a software time domain.
- */
- struct cyclecounter tstamp_cc;
- struct timecounter tstamp_tc;
- struct delayed_work refresh_work;
- /* Serializes all operations on the cycle counter */
+ /* Serializes all operations on the PTP hardware clock */
struct mutex ptp_lock;
/* Serializes transmission of management frames so that
* the switch doesn't confuse them with one another.
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 0b0205abc3d2..c92326871fec 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1818,7 +1818,7 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port,
struct skb_shared_hwtstamps shwt = {0};
int slot = sp->mgmt_slot;
struct sk_buff *clone;
- u64 now, ts;
+ u64 ticks, ts;
int rc;
/* The tragic fact about the switch having 4x2 slots for installing
@@ -1849,7 +1849,7 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port,
mutex_lock(&priv->ptp_lock);
- now = priv->tstamp_cc.read(&priv->tstamp_cc);
+ ticks = sja1105_ptpclkval_read(priv);
rc = sja1105_ptpegr_ts_poll(priv, slot, &ts);
if (rc < 0) {
@@ -1858,10 +1858,9 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port,
goto out_unlock_ptp;
}
- ts = sja1105_tstamp_reconstruct(priv, now, ts);
- ts = timecounter_cyc2time(&priv->tstamp_tc, ts);
+ ts = sja1105_tstamp_reconstruct(priv, ticks, ts);
- shwt.hwtstamp = ns_to_ktime(ts);
+ shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(ts));
skb_complete_tx_timestamp(clone, &shwt);
out_unlock_ptp:
@@ -1999,11 +1998,11 @@ static void sja1105_rxtstamp_work(struct work_struct *work)
struct sja1105_tagger_data *data = to_tagger(work);
struct sja1105_private *priv = to_sja1105(data);
struct sk_buff *skb;
- u64 now;
+ u64 ticks;
mutex_lock(&priv->ptp_lock);
- now = priv->tstamp_cc.read(&priv->tstamp_cc);
+ ticks = sja1105_ptpclkval_read(priv);
while ((skb = skb_dequeue(&data->skb_rxtstamp_queue)) != NULL) {
struct skb_shared_hwtstamps *shwt = skb_hwtstamps(skb);
@@ -2012,10 +2011,9 @@ static void sja1105_rxtstamp_work(struct work_struct *work)
*shwt = (struct skb_shared_hwtstamps) {0};
ts = SJA1105_SKB_CB(skb)->meta_tstamp;
- ts = sja1105_tstamp_reconstruct(priv, now, ts);
- ts = timecounter_cyc2time(&priv->tstamp_tc, ts);
+ ts = sja1105_tstamp_reconstruct(priv, ticks, ts);
- shwt->hwtstamp = ns_to_ktime(ts);
+ shwt->hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(ts));
netif_rx_ni(skb);
}
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c
index 13f9f5799e46..bcdfdda46b9c 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.c
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.c
@@ -13,24 +13,6 @@
#define SJA1105_MAX_ADJ_PPB 32000000
#define SJA1105_SIZE_PTP_CMD 4
-/* Timestamps are in units of 8 ns clock ticks (equivalent to a fixed
- * 125 MHz clock) so the scale factor (MULT / SHIFT) needs to be 8.
- * Furthermore, wisely pick SHIFT as 28 bits, which translates
- * MULT into 2^31 (0x80000000). This is the same value around which
- * the hardware PTPCLKRATE is centered, so the same ppb conversion
- * arithmetic can be reused.
- */
-#define SJA1105_CC_SHIFT 28
-#define SJA1105_CC_MULT (8 << SJA1105_CC_SHIFT)
-
-/* Having 33 bits of cycle counter left until a 64-bit overflow during delta
- * conversion, we multiply this by the 8 ns counter resolution and arrive at
- * a comfortable 68.71 second refresh interval until the delta would cause
- * an integer overflow, in absence of any other readout.
- * Approximate to 1 minute.
- */
-#define SJA1105_REFRESH_INTERVAL (HZ * 60)
-
/* This range is actually +/- SJA1105_MAX_ADJ_PPB
* divided by 1000 (ppb -> ppm) and with a 16-bit
* "fractional" part (actually fixed point).
@@ -41,7 +23,7 @@
*
* This forgoes a "ppb" numeric representation (up to NSEC_PER_SEC)
* and defines the scaling factor between scaled_ppm and the actual
- * frequency adjustments (both cycle counter and hardware).
+ * frequency adjustments of the PHC.
*
* ptpclkrate = scaled_ppm * 2^31 / (10^6 * 2^16)
* simplifies to
@@ -49,10 +31,9 @@
*/
#define SJA1105_CC_MULT_NUM (1 << 9)
#define SJA1105_CC_MULT_DEM 15625
+#define SJA1105_CC_MULT 0x80000000
#define ptp_to_sja1105(d) container_of((d), struct sja1105_private, ptp_caps)
-#define cc_to_sja1105(d) container_of((d), struct sja1105_private, tstamp_cc)
-#define dw_to_sja1105(d) container_of((d), struct sja1105_private, refresh_work)
int sja1105_get_ts_info(struct dsa_switch *ds, int port,
struct ethtool_ts_info *info)
@@ -86,6 +67,8 @@ int sja1105et_ptp_cmd(const void *ctx, const void *data)
sja1105_pack(buf, &valid, 31, 31, size);
sja1105_pack(buf, &cmd->resptp, 2, 2, size);
+ sja1105_pack(buf, &cmd->corrclk4ts, 1, 1, size);
+ sja1105_pack(buf, &cmd->ptpclkadd, 0, 0, size);
return sja1105_spi_send_packed_buf(priv, SPI_WRITE, regs->ptp_control,
buf, SJA1105_SIZE_PTP_CMD);
@@ -103,6 +86,8 @@ int sja1105pqrs_ptp_cmd(const void *ctx, const void *data)
sja1105_pack(buf, &valid, 31, 31, size);
sja1105_pack(buf, &cmd->resptp, 3, 3, size);
+ sja1105_pack(buf, &cmd->corrclk4ts, 2, 2, size);
+ sja1105_pack(buf, &cmd->ptpclkadd, 0, 0, size);
return sja1105_spi_send_packed_buf(priv, SPI_WRITE, regs->ptp_control,
buf, SJA1105_SIZE_PTP_CMD);
@@ -215,17 +200,14 @@ int sja1105_ptpegr_ts_poll(struct sja1105_private *priv, int port, u64 *ts)
int sja1105_ptp_reset(struct sja1105_private *priv)
{
struct sja1105_ptp_cmd cmd = priv->ptp_cmd;
- struct dsa_switch *ds = priv->ds;
int rc;
mutex_lock(&priv->ptp_lock);
cmd.resptp = 1;
- dev_dbg(ds->dev, "Resetting PTP clock\n");
- rc = priv->info->ptp_cmd(priv, &cmd);
- timecounter_init(&priv->tstamp_tc, &priv->tstamp_cc,
- ktime_to_ns(ktime_get_real()));
+ dev_dbg(priv->ds->dev, "Resetting PTP clock\n");
+ rc = priv->info->ptp_cmd(priv, &cmd);
mutex_unlock(&priv->ptp_lock);
@@ -236,124 +218,130 @@ static int sja1105_ptp_gettime(struct ptp_clock_info *ptp,
struct timespec64 *ts)
{
struct sja1105_private *priv = ptp_to_sja1105(ptp);
- u64 ns;
+ u64 ticks;
mutex_lock(&priv->ptp_lock);
- ns = timecounter_read(&priv->tstamp_tc);
- mutex_unlock(&priv->ptp_lock);
- *ts = ns_to_timespec64(ns);
+ ticks = sja1105_ptpclkval_read(priv);
+ *ts = ns_to_timespec64(sja1105_ticks_to_ns(ticks));
+
+ mutex_unlock(&priv->ptp_lock);
return 0;
}
+/* Caller must hold priv->ptp_lock */
+static int sja1105_ptp_mode_set(struct sja1105_private *priv,
+ enum sja1105_ptp_clk_mode mode)
+{
+ if (priv->ptp_cmd.ptpclkadd == mode)
+ return 0;
+
+ priv->ptp_cmd.ptpclkadd = mode;
+
+ return priv->info->ptp_cmd(priv, &priv->ptp_cmd);
+}
+
+/* Caller must hold priv->ptp_lock */
+static int sja1105_ptpclkval_write(struct sja1105_private *priv, u64 val)
+{
+ const struct sja1105_regs *regs = priv->info->regs;
+
+ return sja1105_spi_send_int(priv, SPI_WRITE, regs->ptpclk, &val, 8);
+}
+
+/* Write to PTPCLKVAL while PTPCLKADD is 0 */
static int sja1105_ptp_settime(struct ptp_clock_info *ptp,
const struct timespec64 *ts)
{
+ u64 ticks = ns_to_sja1105_ticks(timespec64_to_ns(ts));
struct sja1105_private *priv = ptp_to_sja1105(ptp);
- u64 ns = timespec64_to_ns(ts);
+ int rc;
mutex_lock(&priv->ptp_lock);
- timecounter_init(&priv->tstamp_tc, &priv->tstamp_cc, ns);
+
+ rc = sja1105_ptp_mode_set(priv, PTP_SET_MODE);
+ if (rc < 0) {
+ dev_err(priv->ds->dev, "Failed to put PTPCLK in set mode\n");
+ goto out;
+ }
+
+ rc = sja1105_ptpclkval_write(priv, ticks);
+
+out:
mutex_unlock(&priv->ptp_lock);
- return 0;
+ return rc;
}
static int sja1105_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
{
struct sja1105_private *priv = ptp_to_sja1105(ptp);
+ const struct sja1105_regs *regs = priv->info->regs;
s64 clkrate;
+ int rc;
clkrate = (s64)scaled_ppm * SJA1105_CC_MULT_NUM;
clkrate = div_s64(clkrate, SJA1105_CC_MULT_DEM);
- mutex_lock(&priv->ptp_lock);
-
- /* Force a readout to update the timer *before* changing its frequency.
- *
- * This way, its corrected time curve can at all times be modeled
- * as a linear "A * x + B" function, where:
- *
- * - B are past frequency adjustments and offset shifts, all
- * accumulated into the cycle_last variable.
- *
- * - A is the new frequency adjustments we're just about to set.
- *
- * Reading now makes B accumulate the correct amount of time,
- * corrected at the old rate, before changing it.
- *
- * Hardware timestamps then become simple points on the curve and
- * are approximated using the above function. This is still better
- * than letting the switch take the timestamps using the hardware
- * rate-corrected clock (PTPCLKVAL) - the comparison in this case would
- * be that we're shifting the ruler at the same time as we're taking
- * measurements with it.
- *
- * The disadvantage is that it's possible to receive timestamps when
- * a frequency adjustment took place in the near past.
- * In this case they will be approximated using the new ppb value
- * instead of a compound function made of two segments (one at the old
- * and the other at the new rate) - introducing some inaccuracy.
- */
- timecounter_read(&priv->tstamp_tc);
-
- priv->tstamp_cc.mult = SJA1105_CC_MULT + clkrate;
+ /* Take a +/- value and re-center it around 2^31. */
+ clkrate = SJA1105_CC_MULT + clkrate;
+ clkrate &= GENMASK_ULL(31, 0);
- mutex_unlock(&priv->ptp_lock);
-
- return 0;
-}
+ mutex_lock(&priv->ptp_lock);
-static int sja1105_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
-{
- struct sja1105_private *priv = ptp_to_sja1105(ptp);
+ rc = sja1105_spi_send_int(priv, SPI_WRITE, regs->ptpclkrate,
+ &clkrate, 4);
- mutex_lock(&priv->ptp_lock);
- timecounter_adjtime(&priv->tstamp_tc, delta);
mutex_unlock(&priv->ptp_lock);
- return 0;
+ return rc;
}
-static u64 sja1105_ptptsclk_read(const struct cyclecounter *cc)
+/* Caller must hold priv->ptp_lock */
+u64 sja1105_ptpclkval_read(struct sja1105_private *priv)
{
- struct sja1105_private *priv = cc_to_sja1105(cc);
const struct sja1105_regs *regs = priv->info->regs;
- u64 ptptsclk = 0;
+ u64 ptpclkval = 0;
int rc;
- rc = sja1105_spi_send_int(priv, SPI_READ, regs->ptptsclk,
- &ptptsclk, 8);
+ rc = sja1105_spi_send_int(priv, SPI_READ, regs->ptpclk,
+ &ptpclkval, 8);
if (rc < 0)
dev_err_ratelimited(priv->ds->dev,
- "failed to read ptp cycle counter: %d\n",
+ "failed to read ptp time: %d\n",
rc);
- return ptptsclk;
+
+ return ptpclkval;
}
-static void sja1105_ptp_overflow_check(struct work_struct *work)
+/* Write to PTPCLKVAL while PTPCLKADD is 1 */
+static int sja1105_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
{
- struct delayed_work *dw = to_delayed_work(work);
- struct sja1105_private *priv = dw_to_sja1105(dw);
- struct timespec64 ts;
+ struct sja1105_private *priv = ptp_to_sja1105(ptp);
+ s64 ticks = ns_to_sja1105_ticks(delta);
+ int rc;
- sja1105_ptp_gettime(&priv->ptp_caps, &ts);
+ mutex_lock(&priv->ptp_lock);
- schedule_delayed_work(&priv->refresh_work, SJA1105_REFRESH_INTERVAL);
+ rc = sja1105_ptp_mode_set(priv, PTP_ADD_MODE);
+ if (rc < 0) {
+ dev_err(priv->ds->dev, "Failed to put PTPCLK in add mode\n");
+ goto out;
+ }
+
+ rc = sja1105_ptpclkval_write(priv, ticks);
+
+out:
+ mutex_unlock(&priv->ptp_lock);
+
+ return rc;
}
int sja1105_ptp_clock_register(struct sja1105_private *priv)
{
struct dsa_switch *ds = priv->ds;
- /* Set up the cycle counter */
- priv->tstamp_cc = (struct cyclecounter) {
- .read = sja1105_ptptsclk_read,
- .mask = CYCLECOUNTER_MASK(64),
- .shift = SJA1105_CC_SHIFT,
- .mult = SJA1105_CC_MULT,
- };
priv->ptp_caps = (struct ptp_clock_info) {
.owner = THIS_MODULE,
.name = "SJA1105 PHC",
@@ -370,8 +358,8 @@ int sja1105_ptp_clock_register(struct sja1105_private *priv)
if (IS_ERR_OR_NULL(priv->clock))
return PTR_ERR(priv->clock);
- INIT_DELAYED_WORK(&priv->refresh_work, sja1105_ptp_overflow_check);
- schedule_delayed_work(&priv->refresh_work, SJA1105_REFRESH_INTERVAL);
+ priv->ptp_cmd.corrclk4ts = true;
+ priv->ptp_cmd.ptpclkadd = PTP_SET_MODE;
return sja1105_ptp_reset(priv);
}
@@ -381,7 +369,6 @@ void sja1105_ptp_clock_unregister(struct sja1105_private *priv)
if (IS_ERR_OR_NULL(priv->clock))
return;
- cancel_delayed_work_sync(&priv->refresh_work);
ptp_clock_unregister(priv->clock);
priv->clock = NULL;
}
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h
index af456b0a4d27..51e21d951548 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.h
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.h
@@ -4,6 +4,21 @@
#ifndef _SJA1105_PTP_H
#define _SJA1105_PTP_H
+/* Timestamps are in units of 8 ns clock ticks (equivalent to
+ * a fixed 125 MHz clock).
+ */
+#define SJA1105_TICK_NS 8
+
+static inline s64 ns_to_sja1105_ticks(s64 ns)
+{
+ return ns / SJA1105_TICK_NS;
+}
+
+static inline s64 sja1105_ticks_to_ns(s64 ticks)
+{
+ return ticks * SJA1105_TICK_NS;
+}
+
#if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP)
int sja1105_ptp_clock_register(struct sja1105_private *priv);
@@ -24,6 +39,8 @@ u64 sja1105_tstamp_reconstruct(struct sja1105_private *priv, u64 now,
int sja1105_ptp_reset(struct sja1105_private *priv);
+u64 sja1105_ptpclkval_read(struct sja1105_private *priv);
+
#else
static inline int sja1105_ptp_clock_register(struct sja1105_private *priv)
@@ -53,6 +70,11 @@ static inline int sja1105_ptp_reset(struct sja1105_private *priv)
return 0;
}
+static inline u64 sja1105_ptpclkval_read(struct sja1105_private *priv)
+{
+ return 0;
+}
+
#define sja1105et_ptp_cmd NULL
#define sja1105pqrs_ptp_cmd NULL
diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c
index 84dc603138cf..1953d8c54af6 100644
--- a/drivers/net/dsa/sja1105/sja1105_spi.c
+++ b/drivers/net/dsa/sja1105/sja1105_spi.c
@@ -517,7 +517,6 @@ static struct sja1105_regs sja1105et_regs = {
.ptp_control = 0x17,
.ptpclk = 0x18, /* Spans 0x18 to 0x19 */
.ptpclkrate = 0x1A,
- .ptptsclk = 0x1B, /* Spans 0x1B to 0x1C */
};
static struct sja1105_regs sja1105pqrs_regs = {
@@ -548,7 +547,6 @@ static struct sja1105_regs sja1105pqrs_regs = {
.ptp_control = 0x18,
.ptpclk = 0x19,
.ptpclkrate = 0x1B,
- .ptptsclk = 0x1C,
};
struct sja1105_info sja1105e_info = {
--
2.17.1
^ permalink raw reply related
* [PATCH v2 net-next 4/7] net: dsa: sja1105: Implement the .gettimex64 system call for PTP
From: Vladimir Oltean @ 2019-09-10 1:34 UTC (permalink / raw)
To: f.fainelli, vivien.didelot, andrew, davem, richardcochran
Cc: netdev, Vladimir Oltean
In-Reply-To: <20190910013501.3262-1-olteanv@gmail.com>
Through the PTP_SYS_OFFSET_EXTENDED ioctl, it is possible for userspace
applications (i.e. phc2sys) to compensate for the delays incurred while
reading the PHC's time.
For now implement this ioctl in the driver, although the performance
improvements are minimal. The goal with this patch is to rework the
infrastructure in the driver for SPI transfers to be timestamped. Other
patches depend on this change.
The "performance" implementation of this ioctl will come later, once the
API in the SPI subsystem is agreed upon. The change in the sja1105
driver will be minimal then.
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
---
drivers/net/dsa/sja1105/sja1105.h | 3 ++-
drivers/net/dsa/sja1105/sja1105_main.c | 8 +++---
drivers/net/dsa/sja1105/sja1105_ptp.c | 20 ++++++++------
drivers/net/dsa/sja1105/sja1105_ptp.h | 6 +++--
drivers/net/dsa/sja1105/sja1105_spi.c | 36 +++++++++++++++++++-------
5 files changed, 48 insertions(+), 25 deletions(-)
diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index e4955a025e46..c80be59dafbd 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -131,7 +131,8 @@ int sja1105_spi_send_packed_buf(const struct sja1105_private *priv,
void *packed_buf, size_t size_bytes);
int sja1105_spi_send_int(const struct sja1105_private *priv,
sja1105_spi_rw_mode_t rw, u64 reg_addr,
- u64 *value, u64 size_bytes);
+ u64 *value, u64 size_bytes,
+ struct ptp_system_timestamp *ptp_sts);
int sja1105_spi_send_long_packed_buf(const struct sja1105_private *priv,
sja1105_spi_rw_mode_t rw, u64 base_addr,
void *packed_buf, u64 buf_len);
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index c92326871fec..b886e1a09dfb 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1849,7 +1849,7 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port,
mutex_lock(&priv->ptp_lock);
- ticks = sja1105_ptpclkval_read(priv);
+ ticks = sja1105_ptpclkval_read(priv, NULL);
rc = sja1105_ptpegr_ts_poll(priv, slot, &ts);
if (rc < 0) {
@@ -2002,7 +2002,7 @@ static void sja1105_rxtstamp_work(struct work_struct *work)
mutex_lock(&priv->ptp_lock);
- ticks = sja1105_ptpclkval_read(priv);
+ ticks = sja1105_ptpclkval_read(priv, NULL);
while ((skb = skb_dequeue(&data->skb_rxtstamp_queue)) != NULL) {
struct skb_shared_hwtstamps *shwt = skb_hwtstamps(skb);
@@ -2097,8 +2097,8 @@ static int sja1105_check_device_id(struct sja1105_private *priv)
u64 part_no;
int rc;
- rc = sja1105_spi_send_int(priv, SPI_READ, regs->device_id,
- &device_id, SJA1105_SIZE_DEVICE_ID);
+ rc = sja1105_spi_send_int(priv, SPI_READ, regs->device_id, &device_id,
+ SJA1105_SIZE_DEVICE_ID, NULL);
if (rc < 0)
return rc;
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c
index bcdfdda46b9c..04693c702b09 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.c
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019, Vladimir Oltean <olteanv@gmail.com>
*/
+#include <linux/spi/spi.h>
#include "sja1105.h"
/* The adjfine API clamps ppb between [-32,768,000, 32,768,000], and
@@ -214,15 +215,16 @@ int sja1105_ptp_reset(struct sja1105_private *priv)
return rc;
}
-static int sja1105_ptp_gettime(struct ptp_clock_info *ptp,
- struct timespec64 *ts)
+static int sja1105_ptp_gettimex(struct ptp_clock_info *ptp,
+ struct timespec64 *ts,
+ struct ptp_system_timestamp *sts)
{
struct sja1105_private *priv = ptp_to_sja1105(ptp);
u64 ticks;
mutex_lock(&priv->ptp_lock);
- ticks = sja1105_ptpclkval_read(priv);
+ ticks = sja1105_ptpclkval_read(priv, sts);
*ts = ns_to_timespec64(sja1105_ticks_to_ns(ticks));
mutex_unlock(&priv->ptp_lock);
@@ -247,7 +249,8 @@ static int sja1105_ptpclkval_write(struct sja1105_private *priv, u64 val)
{
const struct sja1105_regs *regs = priv->info->regs;
- return sja1105_spi_send_int(priv, SPI_WRITE, regs->ptpclk, &val, 8);
+ return sja1105_spi_send_int(priv, SPI_WRITE, regs->ptpclk, &val, 8,
+ NULL);
}
/* Write to PTPCLKVAL while PTPCLKADD is 0 */
@@ -291,7 +294,7 @@ static int sja1105_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
mutex_lock(&priv->ptp_lock);
rc = sja1105_spi_send_int(priv, SPI_WRITE, regs->ptpclkrate,
- &clkrate, 4);
+ &clkrate, 4, NULL);
mutex_unlock(&priv->ptp_lock);
@@ -299,14 +302,15 @@ static int sja1105_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
}
/* Caller must hold priv->ptp_lock */
-u64 sja1105_ptpclkval_read(struct sja1105_private *priv)
+u64 sja1105_ptpclkval_read(struct sja1105_private *priv,
+ struct ptp_system_timestamp *sts)
{
const struct sja1105_regs *regs = priv->info->regs;
u64 ptpclkval = 0;
int rc;
rc = sja1105_spi_send_int(priv, SPI_READ, regs->ptpclk,
- &ptpclkval, 8);
+ &ptpclkval, 8, sts);
if (rc < 0)
dev_err_ratelimited(priv->ds->dev,
"failed to read ptp time: %d\n",
@@ -347,7 +351,7 @@ int sja1105_ptp_clock_register(struct sja1105_private *priv)
.name = "SJA1105 PHC",
.adjfine = sja1105_ptp_adjfine,
.adjtime = sja1105_ptp_adjtime,
- .gettime64 = sja1105_ptp_gettime,
+ .gettimex64 = sja1105_ptp_gettimex,
.settime64 = sja1105_ptp_settime,
.max_adj = SJA1105_MAX_ADJ_PPB,
};
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h
index 51e21d951548..80c33e5e4503 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.h
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.h
@@ -39,7 +39,8 @@ u64 sja1105_tstamp_reconstruct(struct sja1105_private *priv, u64 now,
int sja1105_ptp_reset(struct sja1105_private *priv);
-u64 sja1105_ptpclkval_read(struct sja1105_private *priv);
+u64 sja1105_ptpclkval_read(struct sja1105_private *priv,
+ struct ptp_system_timestamp *sts);
#else
@@ -70,7 +71,8 @@ static inline int sja1105_ptp_reset(struct sja1105_private *priv)
return 0;
}
-static inline u64 sja1105_ptpclkval_read(struct sja1105_private *priv)
+static inline u64 sja1105_ptpclkval_read(struct sja1105_private *priv,
+ struct ptp_system_timestamp *sts)
{
return 0;
}
diff --git a/drivers/net/dsa/sja1105/sja1105_spi.c b/drivers/net/dsa/sja1105/sja1105_spi.c
index 1953d8c54af6..26985f1209ad 100644
--- a/drivers/net/dsa/sja1105/sja1105_spi.c
+++ b/drivers/net/dsa/sja1105/sja1105_spi.c
@@ -15,7 +15,8 @@
(SJA1105_SIZE_SPI_MSG_HEADER + SJA1105_SIZE_SPI_MSG_MAXLEN)
static int sja1105_spi_transfer(const struct sja1105_private *priv,
- const void *tx, void *rx, int size)
+ const void *tx, void *rx, int size,
+ struct ptp_system_timestamp *ptp_sts)
{
struct spi_device *spi = priv->spidev;
struct spi_transfer transfer = {
@@ -35,12 +36,16 @@ static int sja1105_spi_transfer(const struct sja1105_private *priv,
spi_message_init(&msg);
spi_message_add_tail(&transfer, &msg);
+ ptp_read_system_prets(ptp_sts);
+
rc = spi_sync(spi, &msg);
if (rc < 0) {
dev_err(&spi->dev, "SPI transfer failed: %d\n", rc);
return rc;
}
+ ptp_read_system_postts(ptp_sts);
+
return rc;
}
@@ -66,9 +71,11 @@ sja1105_spi_message_pack(void *buf, const struct sja1105_spi_message *msg)
* @size_bytes is smaller than SIZE_SPI_MSG_MAXLEN. Larger packed buffers
* are chunked in smaller pieces by sja1105_spi_send_long_packed_buf below.
*/
-int sja1105_spi_send_packed_buf(const struct sja1105_private *priv,
- sja1105_spi_rw_mode_t rw, u64 reg_addr,
- void *packed_buf, size_t size_bytes)
+static int
+__sja1105_spi_send_packed_buf(const struct sja1105_private *priv,
+ sja1105_spi_rw_mode_t rw, u64 reg_addr,
+ void *packed_buf, size_t size_bytes,
+ struct ptp_system_timestamp *ptp_sts)
{
u8 tx_buf[SJA1105_SIZE_SPI_TRANSFER_MAX] = {0};
u8 rx_buf[SJA1105_SIZE_SPI_TRANSFER_MAX] = {0};
@@ -90,7 +97,7 @@ int sja1105_spi_send_packed_buf(const struct sja1105_private *priv,
memcpy(tx_buf + SJA1105_SIZE_SPI_MSG_HEADER,
packed_buf, size_bytes);
- rc = sja1105_spi_transfer(priv, tx_buf, rx_buf, msg_len);
+ rc = sja1105_spi_transfer(priv, tx_buf, rx_buf, msg_len, ptp_sts);
if (rc < 0)
return rc;
@@ -101,6 +108,14 @@ int sja1105_spi_send_packed_buf(const struct sja1105_private *priv,
return 0;
}
+int sja1105_spi_send_packed_buf(const struct sja1105_private *priv,
+ sja1105_spi_rw_mode_t rw, u64 reg_addr,
+ void *packed_buf, size_t size_bytes)
+{
+ return __sja1105_spi_send_packed_buf(priv, rw, reg_addr, packed_buf,
+ size_bytes, NULL);
+}
+
/* If @rw is:
* - SPI_WRITE: creates and sends an SPI write message at absolute
* address reg_addr, taking size_bytes from *packed_buf
@@ -114,7 +129,8 @@ int sja1105_spi_send_packed_buf(const struct sja1105_private *priv,
*/
int sja1105_spi_send_int(const struct sja1105_private *priv,
sja1105_spi_rw_mode_t rw, u64 reg_addr,
- u64 *value, u64 size_bytes)
+ u64 *value, u64 size_bytes,
+ struct ptp_system_timestamp *ptp_sts)
{
u8 packed_buf[SJA1105_SIZE_SPI_MSG_MAXLEN];
int rc;
@@ -126,8 +142,8 @@ int sja1105_spi_send_int(const struct sja1105_private *priv,
sja1105_pack(packed_buf, value, 8 * size_bytes - 1, 0,
size_bytes);
- rc = sja1105_spi_send_packed_buf(priv, rw, reg_addr, packed_buf,
- size_bytes);
+ rc = __sja1105_spi_send_packed_buf(priv, rw, reg_addr, packed_buf,
+ size_bytes, ptp_sts);
if (rw == SPI_READ)
sja1105_unpack(packed_buf, value, 8 * size_bytes - 1, 0,
@@ -291,7 +307,7 @@ int sja1105_inhibit_tx(const struct sja1105_private *priv,
int rc;
rc = sja1105_spi_send_int(priv, SPI_READ, regs->port_control,
- &inhibit_cmd, SJA1105_SIZE_PORT_CTRL);
+ &inhibit_cmd, SJA1105_SIZE_PORT_CTRL, NULL);
if (rc < 0)
return rc;
@@ -301,7 +317,7 @@ int sja1105_inhibit_tx(const struct sja1105_private *priv,
inhibit_cmd &= ~port_bitmap;
return sja1105_spi_send_int(priv, SPI_WRITE, regs->port_control,
- &inhibit_cmd, SJA1105_SIZE_PORT_CTRL);
+ &inhibit_cmd, SJA1105_SIZE_PORT_CTRL, NULL);
}
struct sja1105_status {
--
2.17.1
^ permalink raw reply related
* [PATCH v2 net-next 6/7] net: dsa: sja1105: Disallow management xmit during switch reset
From: Vladimir Oltean @ 2019-09-10 1:35 UTC (permalink / raw)
To: f.fainelli, vivien.didelot, andrew, davem, richardcochran
Cc: netdev, Vladimir Oltean
In-Reply-To: <20190910013501.3262-1-olteanv@gmail.com>
The purpose here is to avoid ptp4l fail due to this condition:
timed out while polling for tx timestamp
increasing tx_timestamp_timeout may correct this issue, but it is likely caused by a driver bug
port 1: send peer delay request failed
So either reset the switch before the management frame was sent, or
after it was timestamped as well, but not in the middle.
The condition may arise either due to a true timeout (i.e. because
re-uploading the static config takes time), or due to the TX timestamp
actually getting lost due to reset. For the former we can increase
tx_timestamp_timeout in userspace, for the latter we need this patch.
Locking all traffic during switch reset does not make sense at all,
though. Forcing all CPU-originated traffic to potentially block waiting
for a sleepable context to send > 800 bytes over SPI is not a good idea.
Flows that are autonomously forwarded by the switch will get dropped
anyway during switch reset no matter what. So just let all other
CPU-originated traffic be dropped as well.
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
---
drivers/net/dsa/sja1105/sja1105_main.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 5a110b40f5f3..68b09ef7aeb2 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1393,6 +1393,8 @@ static int sja1105_static_config_reload(struct sja1105_private *priv)
s64 t12, t34;
int rc, i;
+ mutex_lock(&priv->mgmt_lock);
+
mac = priv->static_config.tables[BLK_IDX_MAC_CONFIG].entries;
/* Back up the dynamic link speed changed by sja1105_adjust_port_config
@@ -1449,6 +1451,8 @@ static int sja1105_static_config_reload(struct sja1105_private *priv)
goto out;
}
out:
+ mutex_unlock(&priv->mgmt_lock);
+
return rc;
}
--
2.17.1
^ permalink raw reply related
* [PATCH v2 net-next 7/7] net: dsa: sja1105: Move PTP data to its own private structure
From: Vladimir Oltean @ 2019-09-10 1:35 UTC (permalink / raw)
To: f.fainelli, vivien.didelot, andrew, davem, richardcochran
Cc: netdev, Vladimir Oltean
In-Reply-To: <20190910013501.3262-1-olteanv@gmail.com>
Reduce the size of the sja1105_private structure when
CONFIG_NET_DSA_SJA1105_PTP is not enabled. Also make the PTP code a
little bit more self-contained.
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
---
drivers/net/dsa/sja1105/sja1105.h | 20 +------
drivers/net/dsa/sja1105/sja1105_main.c | 12 ++--
drivers/net/dsa/sja1105/sja1105_ptp.c | 81 +++++++++++++++-----------
drivers/net/dsa/sja1105/sja1105_ptp.h | 29 +++++++++
4 files changed, 84 insertions(+), 58 deletions(-)
diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h
index c80be59dafbd..3ca0b87aa3e4 100644
--- a/drivers/net/dsa/sja1105/sja1105.h
+++ b/drivers/net/dsa/sja1105/sja1105.h
@@ -20,6 +20,8 @@
*/
#define SJA1105_AGEING_TIME_MS(ms) ((ms) / 10)
+#include "sja1105_ptp.h"
+
/* Keeps the different addresses between E/T and P/Q/R/S */
struct sja1105_regs {
u64 device_id;
@@ -49,17 +51,6 @@ struct sja1105_regs {
u64 qlevel[SJA1105_NUM_PORTS];
};
-enum sja1105_ptp_clk_mode {
- PTP_ADD_MODE = 1,
- PTP_SET_MODE = 0,
-};
-
-struct sja1105_ptp_cmd {
- u64 resptp; /* reset */
- u64 corrclk4ts; /* use the corrected clock for timestamps */
- u64 ptpclkadd; /* enum sja1105_ptp_clk_mode */
-};
-
struct sja1105_info {
u64 device_id;
/* Needed for distinction between P and R, and between Q and S
@@ -99,20 +90,15 @@ struct sja1105_private {
struct spi_device *spidev;
struct dsa_switch *ds;
struct sja1105_port ports[SJA1105_NUM_PORTS];
- struct sja1105_ptp_cmd ptp_cmd;
- struct ptp_clock_info ptp_caps;
- struct ptp_clock *clock;
- /* Serializes all operations on the PTP hardware clock */
- struct mutex ptp_lock;
/* Serializes transmission of management frames so that
* the switch doesn't confuse them with one another.
*/
struct mutex mgmt_lock;
struct sja1105_tagger_data tagger_data;
+ struct sja1105_ptp_data ptp_data;
};
#include "sja1105_dynamic_config.h"
-#include "sja1105_ptp.h"
struct sja1105_spi_message {
u64 access;
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 68b09ef7aeb2..a55d360ba9f5 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1408,7 +1408,7 @@ static int sja1105_static_config_reload(struct sja1105_private *priv)
}
/* No PTP operations can run right now */
- mutex_lock(&priv->ptp_lock);
+ mutex_lock(&priv->ptp_data.lock);
ptpclkval = __sja1105_ptp_gettimex(priv, &ptp_sts_before);
@@ -1435,7 +1435,7 @@ static int sja1105_static_config_reload(struct sja1105_private *priv)
__sja1105_ptp_adjtime(priv, ptpclkval);
out_unlock_ptp:
- mutex_unlock(&priv->ptp_lock);
+ mutex_unlock(&priv->ptp_data.lock);
/* Configure the CGU (PLLs) for MII and RMII PHYs.
* For these interfaces there is no dynamic configuration
@@ -1881,7 +1881,7 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port,
skb_shinfo(clone)->tx_flags |= SKBTX_IN_PROGRESS;
- mutex_lock(&priv->ptp_lock);
+ mutex_lock(&priv->ptp_data.lock);
ticks = sja1105_ptpclkval_read(priv, NULL);
@@ -1898,7 +1898,7 @@ static netdev_tx_t sja1105_port_deferred_xmit(struct dsa_switch *ds, int port,
skb_complete_tx_timestamp(clone, &shwt);
out_unlock_ptp:
- mutex_unlock(&priv->ptp_lock);
+ mutex_unlock(&priv->ptp_data.lock);
out:
mutex_unlock(&priv->mgmt_lock);
return NETDEV_TX_OK;
@@ -2034,7 +2034,7 @@ static void sja1105_rxtstamp_work(struct work_struct *work)
struct sk_buff *skb;
u64 ticks;
- mutex_lock(&priv->ptp_lock);
+ mutex_lock(&priv->ptp_data.lock);
ticks = sja1105_ptpclkval_read(priv, NULL);
@@ -2051,7 +2051,7 @@ static void sja1105_rxtstamp_work(struct work_struct *work)
netif_rx_ni(skb);
}
- mutex_unlock(&priv->ptp_lock);
+ mutex_unlock(&priv->ptp_data.lock);
}
/* Called from dsa_skb_defer_rx_timestamp */
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c
index a7722c0944fb..f85f44bdab31 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.c
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.c
@@ -34,7 +34,10 @@
#define SJA1105_CC_MULT_DEM 15625
#define SJA1105_CC_MULT 0x80000000
-#define ptp_to_sja1105(d) container_of((d), struct sja1105_private, ptp_caps)
+#define ptp_to_sja1105_data(d) \
+ container_of((d), struct sja1105_ptp_data, caps)
+#define ptp_data_to_sja1105(d) \
+ container_of((d), struct sja1105_private, ptp_data)
int sja1105_get_ts_info(struct dsa_switch *ds, int port,
struct ethtool_ts_info *info)
@@ -42,7 +45,7 @@ int sja1105_get_ts_info(struct dsa_switch *ds, int port,
struct sja1105_private *priv = ds->priv;
/* Called during cleanup */
- if (!priv->clock)
+ if (!priv->ptp_data.clock)
return -ENODEV;
info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE |
@@ -52,7 +55,7 @@ int sja1105_get_ts_info(struct dsa_switch *ds, int port,
(1 << HWTSTAMP_TX_ON);
info->rx_filters = (1 << HWTSTAMP_FILTER_NONE) |
(1 << HWTSTAMP_FILTER_PTP_V2_L2_EVENT);
- info->phc_index = ptp_clock_index(priv->clock);
+ info->phc_index = ptp_clock_index(priv->ptp_data.clock);
return 0;
}
@@ -200,22 +203,23 @@ int sja1105_ptpegr_ts_poll(struct sja1105_private *priv, int port, u64 *ts)
int sja1105_ptp_reset(struct sja1105_private *priv)
{
- struct sja1105_ptp_cmd cmd = priv->ptp_cmd;
+ struct sja1105_ptp_data *ptp_data = &priv->ptp_data;
+ struct sja1105_ptp_cmd cmd = ptp_data->cmd;
int rc;
- mutex_lock(&priv->ptp_lock);
+ mutex_lock(&ptp_data->lock);
cmd.resptp = 1;
dev_dbg(priv->ds->dev, "Resetting PTP clock\n");
rc = priv->info->ptp_cmd(priv, &cmd);
- mutex_unlock(&priv->ptp_lock);
+ mutex_unlock(&ptp_data->lock);
return rc;
}
-/* Caller must hold priv->ptp_lock */
+/* Caller must hold priv->ptp_data.lock */
u64 __sja1105_ptp_gettimex(struct sja1105_private *priv,
struct ptp_system_timestamp *sts)
{
@@ -230,30 +234,31 @@ static int sja1105_ptp_gettimex(struct ptp_clock_info *ptp,
struct timespec64 *ts,
struct ptp_system_timestamp *sts)
{
- struct sja1105_private *priv = ptp_to_sja1105(ptp);
+ struct sja1105_ptp_data *ptp_data = ptp_to_sja1105_data(ptp);
+ struct sja1105_private *priv = ptp_data_to_sja1105(ptp_data);
- mutex_lock(&priv->ptp_lock);
+ mutex_lock(&ptp_data->lock);
*ts = ns_to_timespec64(__sja1105_ptp_gettimex(priv, sts));
- mutex_unlock(&priv->ptp_lock);
+ mutex_unlock(&ptp_data->lock);
return 0;
}
-/* Caller must hold priv->ptp_lock */
+/* Caller must hold priv->ptp_data.lock */
static int sja1105_ptp_mode_set(struct sja1105_private *priv,
enum sja1105_ptp_clk_mode mode)
{
- if (priv->ptp_cmd.ptpclkadd == mode)
+ if (priv->ptp_data.cmd.ptpclkadd == mode)
return 0;
- priv->ptp_cmd.ptpclkadd = mode;
+ priv->ptp_data.cmd.ptpclkadd = mode;
- return priv->info->ptp_cmd(priv, &priv->ptp_cmd);
+ return priv->info->ptp_cmd(priv, &priv->ptp_data.cmd);
}
-/* Caller must hold priv->ptp_lock */
+/* Caller must hold priv->ptp_data.lock */
static int sja1105_ptpclkval_write(struct sja1105_private *priv, u64 val,
struct ptp_system_timestamp *ptp_sts)
{
@@ -282,22 +287,24 @@ int __sja1105_ptp_settime(struct sja1105_private *priv, u64 ns,
static int sja1105_ptp_settime(struct ptp_clock_info *ptp,
const struct timespec64 *ts)
{
- struct sja1105_private *priv = ptp_to_sja1105(ptp);
+ struct sja1105_ptp_data *ptp_data = ptp_to_sja1105_data(ptp);
+ struct sja1105_private *priv = ptp_data_to_sja1105(ptp_data);
u64 ns = timespec64_to_ns(ts);
int rc;
- mutex_lock(&priv->ptp_lock);
+ mutex_lock(&ptp_data->lock);
rc = __sja1105_ptp_settime(priv, ns, NULL);
- mutex_unlock(&priv->ptp_lock);
+ mutex_unlock(&ptp_data->lock);
return rc;
}
static int sja1105_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
{
- struct sja1105_private *priv = ptp_to_sja1105(ptp);
+ struct sja1105_ptp_data *ptp_data = ptp_to_sja1105_data(ptp);
+ struct sja1105_private *priv = ptp_data_to_sja1105(ptp_data);
const struct sja1105_regs *regs = priv->info->regs;
s64 clkrate;
int rc;
@@ -309,17 +316,17 @@ static int sja1105_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
clkrate = SJA1105_CC_MULT + clkrate;
clkrate &= GENMASK_ULL(31, 0);
- mutex_lock(&priv->ptp_lock);
+ mutex_lock(&priv->ptp_data.lock);
rc = sja1105_spi_send_int(priv, SPI_WRITE, regs->ptpclkrate,
&clkrate, 4, NULL);
- mutex_unlock(&priv->ptp_lock);
+ mutex_unlock(&priv->ptp_data.lock);
return rc;
}
-/* Caller must hold priv->ptp_lock */
+/* Caller must hold priv->ptp_data.lock */
u64 sja1105_ptpclkval_read(struct sja1105_private *priv,
struct ptp_system_timestamp *sts)
{
@@ -354,23 +361,25 @@ int __sja1105_ptp_adjtime(struct sja1105_private *priv, s64 delta)
static int sja1105_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
{
- struct sja1105_private *priv = ptp_to_sja1105(ptp);
+ struct sja1105_ptp_data *ptp_data = ptp_to_sja1105_data(ptp);
+ struct sja1105_private *priv = ptp_data_to_sja1105(ptp_data);
int rc;
- mutex_lock(&priv->ptp_lock);
+ mutex_lock(&ptp_data->lock);
rc = __sja1105_ptp_adjtime(priv, delta);
- mutex_unlock(&priv->ptp_lock);
+ mutex_unlock(&ptp_data->lock);
return rc;
}
int sja1105_ptp_clock_register(struct sja1105_private *priv)
{
+ struct sja1105_ptp_data *ptp_data = &priv->ptp_data;
struct dsa_switch *ds = priv->ds;
- priv->ptp_caps = (struct ptp_clock_info) {
+ ptp_data->caps = (struct ptp_clock_info) {
.owner = THIS_MODULE,
.name = "SJA1105 PHC",
.adjfine = sja1105_ptp_adjfine,
@@ -380,23 +389,25 @@ int sja1105_ptp_clock_register(struct sja1105_private *priv)
.max_adj = SJA1105_MAX_ADJ_PPB,
};
- mutex_init(&priv->ptp_lock);
+ mutex_init(&ptp_data->lock);
- priv->clock = ptp_clock_register(&priv->ptp_caps, ds->dev);
- if (IS_ERR_OR_NULL(priv->clock))
- return PTR_ERR(priv->clock);
+ ptp_data->clock = ptp_clock_register(&ptp_data->caps, ds->dev);
+ if (IS_ERR_OR_NULL(ptp_data->clock))
+ return PTR_ERR(ptp_data->clock);
- priv->ptp_cmd.corrclk4ts = true;
- priv->ptp_cmd.ptpclkadd = PTP_SET_MODE;
+ ptp_data->cmd.corrclk4ts = true;
+ ptp_data->cmd.ptpclkadd = PTP_SET_MODE;
return sja1105_ptp_reset(priv);
}
void sja1105_ptp_clock_unregister(struct sja1105_private *priv)
{
- if (IS_ERR_OR_NULL(priv->clock))
+ struct sja1105_ptp_data *ptp_data = &priv->ptp_data;
+
+ if (IS_ERR_OR_NULL(ptp_data->clock))
return;
- ptp_clock_unregister(priv->clock);
- priv->clock = NULL;
+ ptp_clock_unregister(ptp_data->clock);
+ ptp_data->clock = NULL;
}
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h
index c699611e585d..dfe856200394 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.h
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.h
@@ -19,8 +19,29 @@ static inline s64 sja1105_ticks_to_ns(s64 ticks)
return ticks * SJA1105_TICK_NS;
}
+struct sja1105_private;
+
#if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP)
+enum sja1105_ptp_clk_mode {
+ PTP_ADD_MODE = 1,
+ PTP_SET_MODE = 0,
+};
+
+struct sja1105_ptp_cmd {
+ u64 resptp; /* reset */
+ u64 corrclk4ts; /* use the corrected clock for timestamps */
+ u64 ptpclkadd; /* enum sja1105_ptp_clk_mode */
+};
+
+struct sja1105_ptp_data {
+ struct sja1105_ptp_cmd cmd;
+ struct ptp_clock_info caps;
+ struct ptp_clock *clock;
+ /* Serializes all operations on the PTP hardware clock */
+ struct mutex lock;
+};
+
int sja1105_ptp_clock_register(struct sja1105_private *priv);
void sja1105_ptp_clock_unregister(struct sja1105_private *priv);
@@ -52,6 +73,14 @@ int __sja1105_ptp_adjtime(struct sja1105_private *priv, s64 delta);
#else
+/* Structures cannot be empty in C. Bah!
+ * Keep the mutex as the only element, which is a bit more difficult to
+ * refactor out of sja1105_main.c anyway.
+ */
+struct sja1105_ptp_data {
+ struct mutex lock;
+};
+
static inline int sja1105_ptp_clock_register(struct sja1105_private *priv)
{
return 0;
--
2.17.1
^ permalink raw reply related
* Re: [PATCH v1 net-next 00/15] tc-taprio offload for SJA1105 DSA
From: Vladimir Oltean @ 2019-09-10 1:46 UTC (permalink / raw)
To: Joergen Andreasen
Cc: Andrew Lunn, David Miller, f.fainelli, vivien.didelot,
vinicius.gomes, vedang.patel, richardcochran, weifeng.voon, jiri,
m-karicheri2, Jose.Abreu, ilias.apalodimas, jhs, xiyou.wangcong,
kurt.kanzenbach, netdev
In-Reply-To: <20190909123632.nvlmfdtw3otyx3xh@soft-dev16>
Hi Andrew, Joergen, Richard,
On 09/09/2019, Joergen Andreasen <joergen.andreasen@microchip.com> wrote:
> The 09/08/2019 22:42, Andrew Lunn wrote:
>> On Sun, Sep 08, 2019 at 12:07:27PM +0100, Vladimir Oltean wrote:
>> > I think Richard has been there when the taprio, etf qdiscs, SO_TXTIME
>> > were first defined and developed:
>> > https://patchwork.ozlabs.org/cover/808504/
>> > I expect he is capable of delivering a competent review of the entire
>> > series, possibly way more competent than my patch set itself.
>> >
>> > The reason why I'm not splitting it up is because I lose around 10 ns
>> > of synchronization offset when using the hardware-corrected PTPCLKVAL
>> > clock for timestamping rather than the PTPTSCLK free-running counter.
>>
>> Hi Vladimir
>>
>> I'm not suggesting anything is wrong with your concept, when i say
>> split it up. It is more than when somebody sees 15 patches, they
>> decide they don't have the time at the moment, and put it off until
>> later. And often later never happens. If however they see a smaller
>> number of patches, they think that yes they have time now, and do the
>> review.
>>
>> So if you are struggling to get something reviewed, make it more
>> appealing for the reviewer. Salami tactics.
>>
>> Andrew
>
> I vote for splitting it up.
> I don't know enough about PTP and taprio/qdisc to review the entire series
> but the interface presented in patch 09/15 fits well with our future TSN
> switches.
>
> Joergen Andreasen, Microchip
>
Thanks for the feedback. I split the PTP portion that is loosely
coupled (patches 01-07) into a different series. The rest is qdisc
stuff and hardware implementation details. They belong together
because it would be otherwise strange to provide an interface with no
user. You can still review only the patches you are interested in,
however.
Thanks,
-Vladimir
^ permalink raw reply
* Re: [RFC PATCH untested] vhost: block speculation of translated descriptors
From: Jason Wang @ 2019-09-10 1:52 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: linux-kernel, kvm, virtualization, netdev
In-Reply-To: <20190909104355-mutt-send-email-mst@kernel.org>
On 2019/9/9 下午10:45, Michael S. Tsirkin wrote:
> On Mon, Sep 09, 2019 at 03:19:55PM +0800, Jason Wang wrote:
>> On 2019/9/8 下午7:05, Michael S. Tsirkin wrote:
>>> iovec addresses coming from vhost are assumed to be
>>> pre-validated, but in fact can be speculated to a value
>>> out of range.
>>>
>>> Userspace address are later validated with array_index_nospec so we can
>>> be sure kernel info does not leak through these addresses, but vhost
>>> must also not leak userspace info outside the allowed memory table to
>>> guests.
>>>
>>> Following the defence in depth principle, make sure
>>> the address is not validated out of node range.
>>>
>>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>>> ---
>>> drivers/vhost/vhost.c | 4 +++-
>>> 1 file changed, 3 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>> index 5dc174ac8cac..0ee375fb7145 100644
>>> --- a/drivers/vhost/vhost.c
>>> +++ b/drivers/vhost/vhost.c
>>> @@ -2072,7 +2072,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
>>> size = node->size - addr + node->start;
>>> _iov->iov_len = min((u64)len - s, size);
>>> _iov->iov_base = (void __user *)(unsigned long)
>>> - (node->userspace_addr + addr - node->start);
>>> + (node->userspace_addr +
>>> + array_index_nospec(addr - node->start,
>>> + node->size));
>>> s += size;
>>> addr += size;
>>> ++ret;
>>
>> I've tried this on Kaby Lake smap off metadata acceleration off using
>> testpmd (virtio-user) + vhost_net. I don't see obvious performance
>> difference with TX PPS.
>>
>> Thanks
> Should I push this to Linus right now then? It's a security thing so
> maybe we better do it ASAP ... what's your opinion?
Yes, you can.
Acked-by: Jason Wang <jasowang@redhat.com>
>
^ permalink raw reply
* NULL pointer dereference in ip6_datagram_dst_update
From: Rafael D'Halleweyn @ 2019-09-10 1:29 UTC (permalink / raw)
To: David S. Miller, Hideaki YOSHIFUJI, Alexey Kuznetsov; +Cc: netdev
Hi,
While running 5.3.0-rc8 on my laptop, I receive an NULL pointer error when trying to use IPv6 (i.e. just browsing with Firefox). I have a desktop running the same version and kernel-config without problem (but different hardware). The laptop is using wireless (Intel AX200 iwlwifi).
Please let me know if you need more detailed information.
Thanks,
Raf.
[ 31.377880] dst_release: dst:00000000751fac0a refcnt:-1
[ 31.769445] BUG: kernel NULL pointer dereference, address: 00000000000000ed
[ 31.769449] #PF: supervisor read access in kernel mode
[ 31.769451] #PF: error_code(0x0000) - not-present page
[ 31.769452] PGD 0 P4D 0
[ 31.769455] Oops: 0000 [#1] SMP PTI
[ 31.769458] CPU: 7 PID: 3103 Comm: pool-gnome-shel Tainted: G OE 5.3.0-rc8 #14
[ 31.769460] Hardware name: Dell Inc. XPS 13 9360/02PG84, BIOS 2.12.0 05/26/2019
[ 31.769465] RIP: 0010:ip6_sk_dst_store_flow+0x7b/0xb0
[ 31.769467] Code: 48 8b 42 30 48 33 47 40 48 09 c1 0f b6 4f 12 b8 01 00 00 00 4d 0f 45 e1 31 db d3 e0 a9 bf ef ff ff 74 07 48 8b 9f f0 02 00 00 <48> 8b 46 70 31 d2 48 85 c0 74 0c 48 8b 40 10 48 85 c0 74 03 8b 50
[ 31.769469] RSP: 0018:ffffbc40c2253d10 EFLAGS: 00010202
[ 31.769471] RAX: 0000000000000080 RBX: ffffa01ac2098460 RCX: 0000000000000007
[ 31.769473] RDX: ffffbc40c2253d50 RSI: 000000000000007d RDI: ffffa01ac2098000
[ 31.769474] RBP: ffffa01ac2098460 R08: 0000000000000000 R09: 0000000000000000
[ 31.769475] R10: 83b567a7afeebe71 R11: ffffa01b5e85fb80 R12: ffffa01ac2098038
[ 31.769476] R13: 0000000000000000 R14: 0000000000000001 R15: ffffa01ac2098460
[ 31.769479] FS: 00007f33320c5700(0000) GS:ffffa01b6e3c0000(0000) knlGS:0000000000000000
[ 31.769481] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 31.769482] CR2: 00000000000000ed CR3: 000000046be2e005 CR4: 00000000003606e0
[ 31.769483] Call Trace:
[ 31.769490] ip6_datagram_dst_update+0x156/0x280
[ 31.769494] __ip6_datagram_connect+0x1dc/0x380
[ 31.769497] ip6_datagram_connect+0x27/0x40
[ 31.769500] __sys_connect+0xd0/0x100
[ 31.769504] ? __sys_socket+0xb7/0xe0
[ 31.769506] __x64_sys_connect+0x16/0x20
[ 31.769509] do_syscall_64+0x4f/0x120
[ 31.769514] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 31.769517] RIP: 0033:0x7f3375ebdb57
[ 31.769519] Code: 0f 1f 00 41 54 41 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 e8 8b d2 00 00 44 89 e2 48 89 ee 89 df 41 89 c0 b8 2a 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 89 44 24 0c e8 c5 d2 00 00 8b 44
[ 31.769521] RSP: 002b:00007f33320c4270 EFLAGS: 00000293 ORIG_RAX: 000000000000002a
[ 31.769523] RAX: ffffffffffffffda RBX: 0000000000000023 RCX: 00007f3375ebdb57
[ 31.769524] RDX: 000000000000001c RSI: 00007f3308001530 RDI: 0000000000000023
[ 31.769526] RBP: 00007f3308001530 R08: 0000000000000000 R09: 0000000000000000
[ 31.769527] R10: 0000000000000009 R11: 0000000000000293 R12: 000000000000001c
[ 31.769528] R13: 0000000000000023 R14: 000000000000000a R15: 00007f3308001500
[ 31.769531] Modules linked in: rfcomm msr nf_log_ipv4 nf_log_common xt_LOG xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat wireguard(OE) nf_nat ip6_udp_tunnel udp_tunnel ccm xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter cmac bnep uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 btusb videobuf2_common btrtl btbcm btintel videodev bluetooth mc ecdh_generic ecc snd_hda_codec_hdmi binfmt_misc x86_pkg_temp_thermal intel_powerclamp coretemp joydev snd_hda_codec_realtek snd_hda_codec_generic snd_soc_skl kvm_intel snd_soc_hdac_hda snd_hda_ext_core snd_soc_skl_ipc kvm snd_soc_sst_ipc irqbypass snd_soc_sst_dsp snd_soc_acpi_intel_match snd_soc_acpi snd_soc_core snd_compress ac97_bus snd_pcm_dmaengine snd_hda_intel snd_hda_codec snd_hda_core iwlmvm snd_hwdep snd_pcm crct10dif_pclmul nls_iso8859_1 mac80211 dell_laptop ledtrig_audio
snd_seq_midi libarc4
[ 31.769567] mei_hdcp crc32_pclmul intel_rapl_msr snd_seq_midi_event ghash_clmulni_intel snd_rawmidi i915 snd_seq iwlwifi snd_seq_device aesni_intel snd_timer dell_wmi cec dell_smbios aes_x86_64 crypto_simd dcdbas drm_kms_helper cryptd snd dell_wmi_descriptor cfg80211 glue_helper wmi_bmof intel_wmi_thunderbolt soundcore input_leds rtsx_pci_ms serio_raw drm memstick ucsi_acpi intel_gtt mei_me hid_multitouch processor_thermal_device i2c_algo_bit typec_ucsi fb_sys_fops syscopyarea mei idma64 intel_rapl_common sysfillrect virt_dma intel_xhci_usb_role_switch sysimgblt roles intel_pch_thermal intel_soc_dts_iosf typec soc_button_array intel_vbtn mac_hid intel_hid int3400_thermal int3403_thermal sparse_keymap acpi_thermal_rel int340x_thermal_zone acpi_pad sch_fq_codel parport_pc ppdev lp parport efivarfs ip_tables x_tables autofs4 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear usbhid hid_generic
rtsx_pci_sdmmc rtsx_pci
[ 31.769605] i2c_i801 intel_lpss_pci i2c_hid intel_lpss wmi hid pinctrl_sunrisepoint video pinctrl_intel
[ 31.769613] CR2: 00000000000000ed
[ 31.769615] ---[ end trace 18eaad00960f2046 ]---
[ 31.769618] RIP: 0010:ip6_sk_dst_store_flow+0x7b/0xb0
[ 31.769621] Code: 48 8b 42 30 48 33 47 40 48 09 c1 0f b6 4f 12 b8 01 00 00 00 4d 0f 45 e1 31 db d3 e0 a9 bf ef ff ff 74 07 48 8b 9f f0 02 00 00 <48> 8b 46 70 31 d2 48 85 c0 74 0c 48 8b 40 10 48 85 c0 74 03 8b 50
[ 31.769622] RSP: 0018:ffffbc40c2253d10 EFLAGS: 00010202
[ 31.769624] RAX: 0000000000000080 RBX: ffffa01ac2098460 RCX: 0000000000000007
[ 31.769625] RDX: ffffbc40c2253d50 RSI: 000000000000007d RDI: ffffa01ac2098000
[ 31.769627] RBP: ffffa01ac2098460 R08: 0000000000000000 R09: 0000000000000000
[ 31.769628] R10: 83b567a7afeebe71 R11: ffffa01b5e85fb80 R12: ffffa01ac2098038
[ 31.769630] R13: 0000000000000000 R14: 0000000000000001 R15: ffffa01ac2098460
[ 31.769632] FS: 00007f33320c5700(0000) GS:ffffa01b6e3c0000(0000) knlGS:0000000000000000
[ 31.769633] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 31.769635] CR2: 00000000000000ed CR3: 000000046be2e005 CR4: 00000000003606e0
[ 43.255937] BUG: kernel NULL pointer dereference, address: 00000000000000ed
[ 43.255943] #PF: supervisor read access in kernel mode
[ 43.255946] #PF: error_code(0x0000) - not-present page
[ 43.255949] PGD 0 P4D 0
[ 43.255954] Oops: 0000 [#2] SMP PTI
[ 43.255960] CPU: 7 PID: 797 Comm: sd-resolve Tainted: G D OE 5.3.0-rc8 #14
[ 43.255963] Hardware name: Dell Inc. XPS 13 9360/02PG84, BIOS 2.12.0 05/26/2019
[ 43.255971] RIP: 0010:ip6_sk_dst_store_flow+0x7b/0xb0
[ 43.255976] Code: 48 8b 42 30 48 33 47 40 48 09 c1 0f b6 4f 12 b8 01 00 00 00 4d 0f 45 e1 31 db d3 e0 a9 bf ef ff ff 74 07 48 8b 9f f0 02 00 00 <48> 8b 46 70 31 d2 48 85 c0 74 0c 48 8b 40 10 48 85 c0 74 03 8b 50
[ 43.255979] RSP: 0018:ffffbc40c0b9bd10 EFLAGS: 00010202
[ 43.255982] RAX: 0000000000000080 RBX: ffffa01b66018960 RCX: 0000000000000007
[ 43.255984] RDX: ffffbc40c0b9bd50 RSI: 000000000000007d RDI: ffffa01b66018500
[ 43.255986] RBP: ffffa01b66018960 R08: 0000000000000000 R09: 0000000000000000
[ 43.255988] R10: 83b567a7afeebe71 R11: ffffa01b5e85fb80 R12: ffffa01b66018538
[ 43.255990] R13: 0000000000000000 R14: 0000000000000001 R15: ffffa01b66018960
[ 43.255993] FS: 00007f37b47a1700(0000) GS:ffffa01b6e3c0000(0000) knlGS:0000000000000000
[ 43.255995] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 43.255997] CR2: 00000000000000ed CR3: 0000000464eca004 CR4: 00000000003606e0
[ 43.255998] Call Trace:
[ 43.256008] ip6_datagram_dst_update+0x156/0x280
[ 43.256014] __ip6_datagram_connect+0x1dc/0x380
[ 43.256019] ip6_datagram_connect+0x27/0x40
[ 43.256025] __sys_connect+0xd0/0x100
[ 43.256034] ? syscall_trace_enter+0x131/0x2c0
[ 43.256039] __x64_sys_connect+0x16/0x20
[ 43.256044] do_syscall_64+0x4f/0x120
[ 43.256051] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 43.256056] RIP: 0033:0x7f37b543bb57
[ 43.256061] Code: 0f 1f 00 41 54 41 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 e8 8b d2 00 00 44 89 e2 48 89 ee 89 df 41 89 c0 b8 2a 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 89 44 24 0c e8 c5 d2 00 00 8b 44
[ 43.256065] RSP: 002b:00007f37b479b410 EFLAGS: 00000293 ORIG_RAX: 000000000000002a
[ 43.256069] RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007f37b543bb57
[ 43.256071] RDX: 000000000000001c RSI: 00007f37ac003010 RDI: 000000000000000c
[ 43.256073] RBP: 00007f37ac003010 R08: 0000000000000000 R09: 0000000000000000
[ 43.256075] R10: 0000000000000000 R11: 0000000000000293 R12: 000000000000001c
[ 43.256077] R13: 000000000000000c R14: 000000000000000a R15: 00007f37ac002fe0
[ 43.256080] Modules linked in: rfcomm msr nf_log_ipv4 nf_log_common xt_LOG xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat wireguard(OE) nf_nat ip6_udp_tunnel udp_tunnel ccm xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter cmac bnep uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 btusb videobuf2_common btrtl btbcm btintel videodev bluetooth mc ecdh_generic ecc snd_hda_codec_hdmi binfmt_misc x86_pkg_temp_thermal intel_powerclamp coretemp joydev snd_hda_codec_realtek snd_hda_codec_generic snd_soc_skl kvm_intel snd_soc_hdac_hda snd_hda_ext_core snd_soc_skl_ipc kvm snd_soc_sst_ipc irqbypass snd_soc_sst_dsp snd_soc_acpi_intel_match snd_soc_acpi snd_soc_core snd_compress ac97_bus snd_pcm_dmaengine snd_hda_intel snd_hda_codec snd_hda_core iwlmvm snd_hwdep snd_pcm crct10dif_pclmul nls_iso8859_1 mac80211 dell_laptop ledtrig_audio
snd_seq_midi libarc4
[ 43.256123] mei_hdcp crc32_pclmul intel_rapl_msr snd_seq_midi_event ghash_clmulni_intel snd_rawmidi i915 snd_seq iwlwifi snd_seq_device aesni_intel snd_timer dell_wmi cec dell_smbios aes_x86_64 crypto_simd dcdbas drm_kms_helper cryptd snd dell_wmi_descriptor cfg80211 glue_helper wmi_bmof intel_wmi_thunderbolt soundcore input_leds rtsx_pci_ms serio_raw drm memstick ucsi_acpi intel_gtt mei_me hid_multitouch processor_thermal_device i2c_algo_bit typec_ucsi fb_sys_fops syscopyarea mei idma64 intel_rapl_common sysfillrect virt_dma intel_xhci_usb_role_switch sysimgblt roles intel_pch_thermal intel_soc_dts_iosf typec soc_button_array intel_vbtn mac_hid intel_hid int3400_thermal int3403_thermal sparse_keymap acpi_thermal_rel int340x_thermal_zone acpi_pad sch_fq_codel parport_pc ppdev lp parport efivarfs ip_tables x_tables autofs4 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear usbhid hid_generic
rtsx_pci_sdmmc rtsx_pci
[ 43.256179] i2c_i801 intel_lpss_pci i2c_hid intel_lpss wmi hid pinctrl_sunrisepoint video pinctrl_intel
[ 43.256188] CR2: 00000000000000ed
[ 43.256192] ---[ end trace 18eaad00960f2047 ]---
[ 43.256196] RIP: 0010:ip6_sk_dst_store_flow+0x7b/0xb0
[ 43.256200] Code: 48 8b 42 30 48 33 47 40 48 09 c1 0f b6 4f 12 b8 01 00 00 00 4d 0f 45 e1 31 db d3 e0 a9 bf ef ff ff 74 07 48 8b 9f f0 02 00 00 <48> 8b 46 70 31 d2 48 85 c0 74 0c 48 8b 40 10 48 85 c0 74 03 8b 50
[ 43.256202] RSP: 0018:ffffbc40c2253d10 EFLAGS: 00010202
[ 43.256204] RAX: 0000000000000080 RBX: ffffa01ac2098460 RCX: 0000000000000007
[ 43.256206] RDX: ffffbc40c2253d50 RSI: 000000000000007d RDI: ffffa01ac2098000
[ 43.256208] RBP: ffffa01ac2098460 R08: 0000000000000000 R09: 0000000000000000
[ 43.256209] R10: 83b567a7afeebe71 R11: ffffa01b5e85fb80 R12: ffffa01ac2098038
[ 43.256211] R13: 0000000000000000 R14: 0000000000000001 R15: ffffa01ac2098460
[ 43.256213] FS: 00007f37b47a1700(0000) GS:ffffa01b6e3c0000(0000) knlGS:0000000000000000
[ 43.256216] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 43.256217] CR2: 00000000000000ed CR3: 0000000464eca004 CR4: 00000000003606e0
[ 46.137611] dst_release: dst:00000000f80c721f refcnt:-1
[ 46.137627] dst_release: dst:00000000f80c721f refcnt:-2
[ 46.137635] dst_release: dst:00000000f80c721f refcnt:-3
[ 46.137642] dst_release: dst:00000000f80c721f refcnt:-4
[ 46.137649] dst_release: dst:00000000f80c721f refcnt:-5
[ 46.137656] dst_release: dst:00000000f80c721f refcnt:-6
[ 46.137663] dst_release: dst:00000000f80c721f refcnt:-7
[ 46.137669] dst_release: dst:00000000f80c721f refcnt:-8
[ 46.137900] BUG: kernel NULL pointer dereference, address: 00000000000000d1
[ 46.137902] #PF: supervisor read access in kernel mode
[ 46.137904] #PF: error_code(0x0000) - not-present page
[ 46.137905] PGD 80000003d7b10067 P4D 80000003d7b10067 PUD 0
[ 46.137908] Oops: 0000 [#3] SMP PTI
[ 46.137910] CPU: 3 PID: 3156 Comm: Socket Thread Tainted: G D OE 5.3.0-rc8 #14
[ 46.137911] Hardware name: Dell Inc. XPS 13 9360/02PG84, BIOS 2.12.0 05/26/2019
[ 46.137915] RIP: 0010:sk_setup_caps+0x38/0x130
[ 46.137917] Code: 53 48 89 fb 66 89 47 78 c7 87 88 01 00 00 00 00 00 00 48 89 f7 48 87 bb 38 01 00 00 e8 11 d0 02 00 48 8b 45 00 b9 01 00 00 00 <48> 8b 80 d0 00 00 00 48 0b 83 f8 01 00 00 48 89 c2 48 0d 00 00 1d
[ 46.137918] RSP: 0018:ffffbc40c0ecfcf8 EFLAGS: 00010246
[ 46.137920] RAX: 0000000000000001 RBX: ffffa01b1fe60000 RCX: 0000000000000001
[ 46.137921] RDX: 0000000000000000 RSI: ffffa01b5f588100 RDI: 0000000000000000
[ 46.137922] RBP: ffffa01b5f588100 R08: 0000000000000006 R09: 020401e0033c0026
[ 46.137923] R10: 83b567a7afeebe71 R11: ffffa01b5e85fb80 R12: ffffbc40c0ecfe88
[ 46.137924] R13: 0000000000000000 R14: ffffa01b1fe60888 R15: 0000000000000000
[ 46.137925] FS: 00007fde8d541700(0000) GS:ffffa01b6e2c0000(0000) knlGS:0000000000000000
[ 46.137926] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 46.137927] CR2: 00000000000000d1 CR3: 0000000411536002 CR4: 00000000003606e0
[ 46.137928] Call Trace:
[ 46.137934] tcp_v6_connect+0x405/0x5f0
[ 46.137938] __inet_stream_connect+0xd1/0x370
[ 46.137941] ? tomoyo_socket_connect_permission+0xa2/0xf0
[ 46.137943] inet_stream_connect+0x36/0x50
[ 46.137945] __sys_connect+0xd0/0x100
[ 46.137947] ? __sys_setsockopt+0x16d/0x190
[ 46.137948] __x64_sys_connect+0x16/0x20
[ 46.137951] do_syscall_64+0x4f/0x120
[ 46.137954] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 46.137957] RIP: 0033:0x7fde9eceaf57
[ 46.137958] Code: 44 00 00 41 54 41 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 e8 9b fa ff ff 44 89 e2 48 89 ee 89 df 41 89 c0 b8 2a 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 89 44 24 0c e8 d5 fa ff ff 8b 44
[ 46.137959] RSP: 002b:00007fde8d540500 EFLAGS: 00000293 ORIG_RAX: 000000000000002a
[ 46.137961] RAX: ffffffffffffffda RBX: 000000000000007f RCX: 00007fde9eceaf57
[ 46.137962] RDX: 000000000000001c RSI: 00007fde8d540590 RDI: 000000000000007f
[ 46.137963] RBP: 00007fde8d540590 R08: 0000000000000000 R09: 0000000000000000
[ 46.137963] R10: 000000000000002e R11: 0000000000000293 R12: 000000000000001c
[ 46.137964] R13: 00007fde801e05e0 R14: 000000000000001c R15: 0000000000000014
[ 46.137966] Modules linked in: rfcomm msr nf_log_ipv4 nf_log_common xt_LOG xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat wireguard(OE) nf_nat ip6_udp_tunnel udp_tunnel ccm xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter cmac bnep uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 btusb videobuf2_common btrtl btbcm btintel videodev bluetooth mc ecdh_generic ecc snd_hda_codec_hdmi binfmt_misc x86_pkg_temp_thermal intel_powerclamp coretemp joydev snd_hda_codec_realtek snd_hda_codec_generic snd_soc_skl kvm_intel snd_soc_hdac_hda snd_hda_ext_core snd_soc_skl_ipc kvm snd_soc_sst_ipc irqbypass snd_soc_sst_dsp snd_soc_acpi_intel_match snd_soc_acpi snd_soc_core snd_compress ac97_bus snd_pcm_dmaengine snd_hda_intel snd_hda_codec snd_hda_core iwlmvm snd_hwdep snd_pcm crct10dif_pclmul nls_iso8859_1 mac80211 dell_laptop ledtrig_audio
snd_seq_midi libarc4
[ 46.137991] mei_hdcp crc32_pclmul intel_rapl_msr snd_seq_midi_event ghash_clmulni_intel snd_rawmidi i915 snd_seq iwlwifi snd_seq_device aesni_intel snd_timer dell_wmi cec dell_smbios aes_x86_64 crypto_simd dcdbas drm_kms_helper cryptd snd dell_wmi_descriptor cfg80211 glue_helper wmi_bmof intel_wmi_thunderbolt soundcore input_leds rtsx_pci_ms serio_raw drm memstick ucsi_acpi intel_gtt mei_me hid_multitouch processor_thermal_device i2c_algo_bit typec_ucsi fb_sys_fops syscopyarea mei idma64 intel_rapl_common sysfillrect virt_dma intel_xhci_usb_role_switch sysimgblt roles intel_pch_thermal intel_soc_dts_iosf typec soc_button_array intel_vbtn mac_hid intel_hid int3400_thermal int3403_thermal sparse_keymap acpi_thermal_rel int340x_thermal_zone acpi_pad sch_fq_codel parport_pc ppdev lp parport efivarfs ip_tables x_tables autofs4 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear usbhid hid_generic
rtsx_pci_sdmmc rtsx_pci
[ 46.138017] i2c_i801 intel_lpss_pci i2c_hid intel_lpss wmi hid pinctrl_sunrisepoint video pinctrl_intel
[ 46.138022] CR2: 00000000000000d1
[ 46.138024] ---[ end trace 18eaad00960f2048 ]---
[ 46.138026] RIP: 0010:ip6_sk_dst_store_flow+0x7b/0xb0
[ 46.138027] Code: 48 8b 42 30 48 33 47 40 48 09 c1 0f b6 4f 12 b8 01 00 00 00 4d 0f 45 e1 31 db d3 e0 a9 bf ef ff ff 74 07 48 8b 9f f0 02 00 00 <48> 8b 46 70 31 d2 48 85 c0 74 0c 48 8b 40 10 48 85 c0 74 03 8b 50
[ 46.138028] RSP: 0018:ffffbc40c2253d10 EFLAGS: 00010202
[ 46.138030] RAX: 0000000000000080 RBX: ffffa01ac2098460 RCX: 0000000000000007
[ 46.138030] RDX: ffffbc40c2253d50 RSI: 000000000000007d RDI: ffffa01ac2098000
[ 46.138031] RBP: ffffa01ac2098460 R08: 0000000000000000 R09: 0000000000000000
[ 46.138032] R10: 83b567a7afeebe71 R11: ffffa01b5e85fb80 R12: ffffa01ac2098038
[ 46.138033] R13: 0000000000000000 R14: 0000000000000001 R15: ffffa01ac2098460
[ 46.138034] FS: 00007fde8d541700(0000) GS:ffffa01b6e2c0000(0000) knlGS:0000000000000000
[ 46.138035] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 46.138036] CR2: 00000000000000d1 CR3: 0000000411536002 CR4: 00000000003606e0
[ 90.518474] dst_release: dst:00000000f80c721f refcnt:-9
[ 90.518495] dst_release: dst:00000000f80c721f refcnt:-10
[ 90.518504] dst_release: dst:00000000f80c721f refcnt:-11
[ 90.518511] dst_release: dst:00000000f80c721f refcnt:-12
[ 90.518588] dst_release: dst:00000000f80c721f refcnt:-13
[ 90.521844] dst_release: dst:00000000e32e020e refcnt:-1
[ 90.521891] dst_release: dst:00000000e32e020e refcnt:-2
[ 90.522907] BUG: kernel NULL pointer dereference, address: 00000000000000d1
[ 90.522911] #PF: supervisor read access in kernel mode
[ 90.522912] #PF: error_code(0x0000) - not-present page
[ 90.522914] PGD 0 P4D 0
[ 90.522918] Oops: 0000 [#4] SMP PTI
[ 90.522921] CPU: 3 PID: 3773 Comm: http Tainted: G D OE 5.3.0-rc8 #14
[ 90.522922] Hardware name: Dell Inc. XPS 13 9360/02PG84, BIOS 2.12.0 05/26/2019
[ 90.522928] RIP: 0010:sk_setup_caps+0x38/0x130
[ 90.522931] Code: 53 48 89 fb 66 89 47 78 c7 87 88 01 00 00 00 00 00 00 48 89 f7 48 87 bb 38 01 00 00 e8 11 d0 02 00 48 8b 45 00 b9 01 00 00 00 <48> 8b 80 d0 00 00 00 48 0b 83 f8 01 00 00 48 89 c2 48 0d 00 00 1d
[ 90.522932] RSP: 0018:ffffbc40c1c87cf8 EFLAGS: 00010246
[ 90.522934] RAX: 0000000000000001 RBX: ffffa01aca530000 RCX: 0000000000000001
[ 90.522936] RDX: 0000000000000000 RSI: ffffa01b5f588100 RDI: 0000000000000000
[ 90.522937] RBP: ffffa01b5f588100 R08: 0000000000000000 R09: 0000000000000000
[ 90.522939] R10: 83b567a7afeebe71 R11: ffffa01b5e85fb80 R12: ffffa01aca530038
[ 90.522940] R13: 0000000000000000 R14: 0000000000000001 R15: ffffa01aca530460
[ 90.522942] FS: 00007ff174d29a40(0000) GS:ffffa01b6e2c0000(0000) knlGS:0000000000000000
[ 90.522944] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 90.522945] CR2: 00000000000000d1 CR3: 00000003cd750006 CR4: 00000000003606e0
[ 90.522947] Call Trace:
[ 90.522953] ip6_sk_dst_store_flow+0x9a/0xb0
[ 90.522958] ip6_datagram_dst_update+0x156/0x280
[ 90.522963] __ip6_datagram_connect+0x1dc/0x380
[ 90.522966] ip6_datagram_connect+0x27/0x40
[ 90.522969] __sys_connect+0xd0/0x100
[ 90.522972] ? __sys_socket+0xb7/0xe0
[ 90.522974] __x64_sys_connect+0x16/0x20
[ 90.522978] do_syscall_64+0x4f/0x120
[ 90.522983] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 90.522987] RIP: 0033:0x7ff1758f5b24
[ 90.522989] Code: 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 8d 05 b9 b8 0c 00 8b 00 85 c0 75 13 b8 2a 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 41 54 41 89 d4 55 48 89 f5 53
[ 90.522991] RSP: 002b:00007ffc0e6a79b8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a
[ 90.522993] RAX: ffffffffffffffda RBX: 00007ffc0e6a7c30 RCX: 00007ff1758f5b24
[ 90.522994] RDX: 000000000000001c RSI: 0000556f0399faf0 RDI: 0000000000000003
[ 90.522995] RBP: 00007ffc0e6a8330 R08: 0000000000000018 R09: 0000000000000000
[ 90.522997] R10: 0000000000000009 R11: 0000000000000246 R12: 000000000000000d
[ 90.522998] R13: 0000000000000003 R14: 000000000000000a R15: 0000556f0399fac0
[ 90.523000] Modules linked in: rfcomm msr nf_log_ipv4 nf_log_common xt_LOG xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat wireguard(OE) nf_nat ip6_udp_tunnel udp_tunnel ccm xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter cmac bnep uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 btusb videobuf2_common btrtl btbcm btintel videodev bluetooth mc ecdh_generic ecc snd_hda_codec_hdmi binfmt_misc x86_pkg_temp_thermal intel_powerclamp coretemp joydev snd_hda_codec_realtek snd_hda_codec_generic snd_soc_skl kvm_intel snd_soc_hdac_hda snd_hda_ext_core snd_soc_skl_ipc kvm snd_soc_sst_ipc irqbypass snd_soc_sst_dsp snd_soc_acpi_intel_match snd_soc_acpi snd_soc_core snd_compress ac97_bus snd_pcm_dmaengine snd_hda_intel snd_hda_codec snd_hda_core iwlmvm snd_hwdep snd_pcm crct10dif_pclmul nls_iso8859_1 mac80211 dell_laptop ledtrig_audio
snd_seq_midi libarc4
[ 90.523039] mei_hdcp crc32_pclmul intel_rapl_msr snd_seq_midi_event ghash_clmulni_intel snd_rawmidi i915 snd_seq iwlwifi snd_seq_device aesni_intel snd_timer dell_wmi cec dell_smbios aes_x86_64 crypto_simd dcdbas drm_kms_helper cryptd snd dell_wmi_descriptor cfg80211 glue_helper wmi_bmof intel_wmi_thunderbolt soundcore input_leds rtsx_pci_ms serio_raw drm memstick ucsi_acpi intel_gtt mei_me hid_multitouch processor_thermal_device i2c_algo_bit typec_ucsi fb_sys_fops syscopyarea mei idma64 intel_rapl_common sysfillrect virt_dma intel_xhci_usb_role_switch sysimgblt roles intel_pch_thermal intel_soc_dts_iosf typec soc_button_array intel_vbtn mac_hid intel_hid int3400_thermal int3403_thermal sparse_keymap acpi_thermal_rel int340x_thermal_zone acpi_pad sch_fq_codel parport_pc ppdev lp parport efivarfs ip_tables x_tables autofs4 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear usbhid hid_generic
rtsx_pci_sdmmc rtsx_pci
[ 90.523071] i2c_i801 intel_lpss_pci i2c_hid intel_lpss wmi hid pinctrl_sunrisepoint video pinctrl_intel
[ 90.523076] CR2: 00000000000000d1
[ 90.523078] ---[ end trace 18eaad00960f2049 ]---
[ 90.523081] RIP: 0010:ip6_sk_dst_store_flow+0x7b/0xb0
[ 90.523082] Code: 48 8b 42 30 48 33 47 40 48 09 c1 0f b6 4f 12 b8 01 00 00 00 4d 0f 45 e1 31 db d3 e0 a9 bf ef ff ff 74 07 48 8b 9f f0 02 00 00 <48> 8b 46 70 31 d2 48 85 c0 74 0c 48 8b 40 10 48 85 c0 74 03 8b 50
[ 90.523083] RSP: 0018:ffffbc40c2253d10 EFLAGS: 00010202
[ 90.523084] RAX: 0000000000000080 RBX: ffffa01ac2098460 RCX: 0000000000000007
[ 90.523085] RDX: ffffbc40c2253d50 RSI: 000000000000007d RDI: ffffa01ac2098000
[ 90.523086] RBP: ffffa01ac2098460 R08: 0000000000000000 R09: 0000000000000000
[ 90.523087] R10: 83b567a7afeebe71 R11: ffffa01b5e85fb80 R12: ffffa01ac2098038
[ 90.523088] R13: 0000000000000000 R14: 0000000000000001 R15: ffffa01ac2098460
[ 90.523089] FS: 00007ff174d29a40(0000) GS:ffffa01b6e2c0000(0000) knlGS:0000000000000000
[ 90.523090] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 90.523091] CR2: 00000000000000d1 CR3: 00000003cd750006 CR4: 00000000003606e0
[ 90.558593] dst_release: dst:00000000c2af7d17 refcnt:-1
[ 90.558644] dst_release: dst:00000000c2af7d17 refcnt:-2
[ 90.575184] dst_release: dst:00000000c2af7d17 refcnt:-3
^ permalink raw reply
* Re: [PATCH v3] tun: fix use-after-free when register netdev failed
From: Yang Yingliang @ 2019-09-10 2:31 UTC (permalink / raw)
To: Jason Wang
Cc: David Miller, netdev, eric dumazet, xiyou wangcong, weiyongjun1
In-Reply-To: <542aa2c2-fd54-1e6c-f2f4-46fcc2e6f6ee@redhat.com>
On 2019/9/5 11:10, Jason Wang wrote:
> On 2019/9/5 上午10:03, Yang Yingliang wrote:
>>
>> On 2019/9/3 18:50, Jason Wang wrote:
>>> ----- Original Message -----
>>>> On 2019/9/3 14:06, Jason Wang wrote:
>>>>> On 2019/9/3 下午1:42, Yang Yingliang wrote:
>>>>>> On 2019/9/3 11:03, Jason Wang wrote:
>>>>>>> On 2019/9/3 上午9:45, Yang Yingliang wrote:
>>>>>>>> On 2019/9/2 13:32, Jason Wang wrote:
>>>>>>>>> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>>>>>>>>> On 2019/8/23 11:05, Jason Wang wrote:
>>>>>>>>>>> ----- Original Message -----
>>>>>>>>>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>>>>>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>>>>>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>>>>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>>>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>>>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Call tun_attach() after register_netdevice() to make sure
>>>>>>>>>>>>>>>>> tfile->tun
>>>>>>>>>>>>>>>>> is not published until the netdevice is registered. So the
>>>>>>>>>>>>>>>>> read/write
>>>>>>>>>>>>>>>>> thread can not use the tun pointer that may freed by
>>>>>>>>>>>>>>>>> free_netdev().
>>>>>>>>>>>>>>>>> (The tun and dev pointer are allocated by
>>>>>>>>>>>>>>>>> alloc_netdev_mqs(), they
>>>>>>>>>>>>>>>>> can
>>>>>>>>>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>>>>>>>>>> register_netdevice() must always be the last operation in
>>>>>>>>>>>>>>>> the order of
>>>>>>>>>>>>>>>> network device setup.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> At the point register_netdevice() is called, the device is
>>>>>>>>>>>>>>>> visible
>>>>>>>>>>>>>>>> globally
>>>>>>>>>>>>>>>> and therefore all of it's software state must be fully
>>>>>>>>>>>>>>>> initialized and
>>>>>>>>>>>>>>>> ready for us.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> You're going to have to find another solution to these
>>>>>>>>>>>>>>>> problems.
>>>>>>>>>>>>>>> The device is loosely coupled with sockets/queues. Each
>>>>>>>>>>>>>>> side is
>>>>>>>>>>>>>>> allowed to be go away without caring the other side. So
>>>>>>>>>>>>>>> in this
>>>>>>>>>>>>>>> case, there's a small window that network stack think the
>>>>>>>>>>>>>>> device has
>>>>>>>>>>>>>>> one queue but actually not, the code can then safely drop
>>>>>>>>>>>>>>> them.
>>>>>>>>>>>>>>> Maybe it's ok here with some comments?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Or if not, we can try to hold the device before tun_attach
>>>>>>>>>>>>>>> and drop
>>>>>>>>>>>>>>> it after register_netdevice().
>>>>>>>>>>>>>> Hi Yang:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I think maybe we can try to hold refcnt instead of playing
>>>>>>>>>>>>>> real num
>>>>>>>>>>>>>> queues here. Do you want to post a V4?
>>>>>>>>>>>>> I think the refcnt can prevent freeing the memory in this
>>>>>>>>>>>>> case.
>>>>>>>>>>>>> When register_netdevice() failed, free_netdev() will be called
>>>>>>>>>>>>> directly,
>>>>>>>>>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of
>>>>>>>>>>>>> dev.
>>>>>>>>>>>> How about using patch-v1 that using a flag to check whether the
>>>>>>>>>>>> device
>>>>>>>>>>>> registered successfully.
>>>>>>>>>>>>
>>>>>>>>>>> As I said, it lacks sufficient locks or barriers. To be clear, I
>>>>>>>>>>> meant
>>>>>>>>>>> something like (compile-test only):
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>>>>>>> index db16d7a13e00..e52678f9f049 100644
>>>>>>>>>>> --- a/drivers/net/tun.c
>>>>>>>>>>> +++ b/drivers/net/tun.c
>>>>>>>>>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net,
>>>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>>> (ifr->ifr_flags & TUN_FEATURES);
>>>>>>>>>>> INIT_LIST_HEAD(&tun->disabled);
>>>>>>>>>>> + dev_hold(dev);
>>>>>>>>>>> err = tun_attach(tun, file, false,
>>>>>>>>>>> ifr->ifr_flags & IFF_NAPI,
>>>>>>>>>>> ifr->ifr_flags &
>>>>>>>>>>> IFF_NAPI_FRAGS);
>>>>>>>>>>> if (err < 0)
>>>>>>>>>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net,
>>>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>>> err = register_netdevice(tun->dev);
>>>>>>>>>>> if (err < 0)
>>>>>>>>>>> goto err_detach;
>>>>>>>>>>> + dev_put(dev);
>>>>>>>>>>> }
>>>>>>>>>>> netif_carrier_on(tun->dev);
>>>>>>>>>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net,
>>>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>>> return 0;
>>>>>>>>>>> err_detach:
>>>>>>>>>>> + dev_put(dev);
>>>>>>>>>>> tun_detach_all(dev);
>>>>>>>>>>> /* register_netdevice() already called
>>>>>>>>>>> tun_free_netdev() */
>>>>>>>>>>> goto err_free_dev;
>>>>>>>>>>> err_free_flow:
>>>>>>>>>>> + dev_put(dev);
>>>>>>>>>>> tun_flow_uninit(tun);
>>>>>>>>>>> security_tun_dev_free_security(tun->security);
>>>>>>>>>>> err_free_stat:
>>>>>>>>>>>
>>>>>>>>>>> What's your thought?
>>>>>>>>>> The dev pointer are freed without checking the refcount in
>>>>>>>>>> free_netdev() called by err_free_dev
>>>>>>>>>>
>>>>>>>>>> path, so I don't understand how the refcount protects this
>>>>>>>>>> pointer.
>>>>>>>>>>
>>>>>>>>> The refcount are guaranteed to be zero there, isn't it?
>>>>>>>> No, it's not.
>>>>>>>>
>>>>>>>> err_free_dev:
>>>>>>>> free_netdev(dev);
>>>>>>>>
>>>>>>>> void free_netdev(struct net_device *dev)
>>>>>>>> {
>>>>>>>> ...
>>>>>>>> /* pcpu_refcnt can be freed without checking refcount */
>>>>>>>> free_percpu(dev->pcpu_refcnt);
>>>>>>>> dev->pcpu_refcnt = NULL;
>>>>>>>>
>>>>>>>> /* Compatibility with error handling in drivers */
>>>>>>>> if (dev->reg_state == NETREG_UNINITIALIZED) {
>>>>>>>> /* dev can be freed without checking refcount */
>>>>>>>> netdev_freemem(dev);
>>>>>>>> return;
>>>>>>>> }
>>>>>>>> ...
>>>>>>>> }
>>>>>>> Right, but what I meant is in my patch, when code reaches
>>>>>>> free_netdev() the refcnt is zero. What did I miss?
>>>>>> Yes, but it can't fix the UAF problem.
>>>>> Well, it looks to me that the dev_put() in tun_put() won't release the
>>>>> device in this case.
>>>> The device is not released in tun_put().
>>>> This is how the UAF occurs:
>>>>
>>>> CPUA CPUB
>>>> tun_set_iff()
>>>> alloc_netdev_mqs()
>>>> tun_attach()
>>>>
>>>> tun_chr_read_iter()
>>>> tun_get()
>>>> tun_do_read()
>>>>
>>>> tun_ring_recv()
>>>> register_netdevice() <-- inject error
>>>> goto err_detach
>>>> tun_detach_all() <-- set RCV_SHUTDOWN
>>>> free_netdev() <-- called from
>>>> err_free_dev path
>>>> netdev_freemem() <-- free the memory
>>>> without check refcount
>>>> (In this path, the refcount cannot prevent
>>>> freeing the memory of dev, and the memory
>>>> will be used by dev_put() called by
>>>> tun_chr_read_iter() on CPUB.)
>>>> (Break from
>>>>
>>>> tun_ring_recv(),
>>>> because
>>>> RCV_SHUTDOWN
>>>> is set)
>>>> tun_put()
>>>> dev_put() <--
>>>> use the
>>>> memory freed by
>>>> netdev_freemem()
>>>>
>>>>
>>> My bad, thanks for the patience. Since all evil come from the
>>> tfile->tun, how about delay the publishing of tfile->tun until the
>>> success of registration to make sure dev_put() and dev_hold() work.
>>> (Compile test only)
>>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index db16d7a13e00..aab0be40d443 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -787,7 +787,8 @@ static void tun_detach_all(struct net_device *dev)
>>> }
>>> static int tun_attach(struct tun_struct *tun, struct file *file,
>>> - bool skip_filter, bool napi, bool napi_frags)
>>> + bool skip_filter, bool napi, bool napi_frags,
>>> + bool publish_tun)
>>> {
>>> struct tun_file *tfile = file->private_data;
>>> struct net_device *dev = tun->dev;
>>> @@ -870,7 +871,8 @@ static int tun_attach(struct tun_struct *tun,
>>> struct file *file,
>>> * initialized tfile; otherwise we risk using half-initialized
>>> * object.
>>> */
>>> - rcu_assign_pointer(tfile->tun, tun);
>>> + if (publish_tun)
>>> + rcu_assign_pointer(tfile->tun, tun);
>>> rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>> tun->numqueues++;
>>> tun_set_real_num_queues(tun);
>>> @@ -2730,7 +2732,7 @@ static int tun_set_iff(struct net *net, struct
>>> file *file, struct ifreq *ifr)
>>> err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
>>> ifr->ifr_flags & IFF_NAPI,
>>> - ifr->ifr_flags & IFF_NAPI_FRAGS);
>>> + ifr->ifr_flags & IFF_NAPI_FRAGS, true);
>>> if (err < 0)
>>> return err;
>>> @@ -2829,13 +2831,17 @@ static int tun_set_iff(struct net *net,
>>> struct file *file, struct ifreq *ifr)
>>> INIT_LIST_HEAD(&tun->disabled);
>>> err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
>>> - ifr->ifr_flags & IFF_NAPI_FRAGS);
>>> + ifr->ifr_flags & IFF_NAPI_FRAGS, false);
>>> if (err < 0)
>>> goto err_free_flow;
>>> err = register_netdevice(tun->dev);
>>> if (err < 0)
>>> goto err_detach;
>>> + /* free_netdev() won't check refcnt, to aovid race
>>> + * with dev_put() we need publish tun after registration.
>>> + */
>>> + rcu_assign_pointer(tfile->tun, tun);
>>> }
>>> netif_carrier_on(tun->dev);
>>> @@ -2978,7 +2984,7 @@ static int tun_set_queue(struct file *file,
>>> struct ifreq *ifr)
>>> if (ret < 0)
>>> goto unlock;
>>> ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
>>> - tun->flags & IFF_NAPI_FRAGS);
>>> + tun->flags & IFF_NAPI_FRAGS, true);
>>> } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
>>> tun = rtnl_dereference(tfile->tun);
>>> if (!tun || !(tun->flags & IFF_MULTI_QUEUE) ||
>>> tfile->detached)
>> I tested this patch, it can fix this UAF.
>>
>> But as Eric replied in my patch v1, tun_get() will return NULL as long
>> as tun_set_iff() (TUNSETIFF ioctl())
>> has not yet been called.
>
> Isn't this the expected behavior. Without TUNSETIFF, it means the
> netdevice is not attached, tun_get() should return NULL here.
>
>
>> This could break some applications, since tun_get() is used from poll()
>> and other syscalls.
>>
>> I think it should return '-EAGIAN' instead of '-EBADFD' in this way. I
>> did some change in patch v1,
>> if it's OK, I will send a v4.
>>
>> drivers/net/tun.c | 34 ++++++++++++++++++++++++++++++----
>> 1 file changed, 30 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index db16d7a13e00..0abc654010e3 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -115,6 +115,7 @@ do { \
>> /* High bits in flags field are unused. */
>> #define TUN_VNET_LE 0x80000000
>> #define TUN_VNET_BE 0x40000000
>> +#define TUN_DEV_REGISTERED 0x20000000
>>
>> #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
>> IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
>> @@ -719,8 +720,10 @@ static void __tun_detach(struct tun_file *tfile,
>> bool clean)
>> netif_carrier_off(tun->dev);
>>
>> if (!(tun->flags & IFF_PERSIST) &&
>> - tun->dev->reg_state == NETREG_REGISTERED)
>> + tun->dev->reg_state == NETREG_REGISTERED) {
>> + tun->flags &= ~TUN_DEV_REGISTERED;
>
> As I said for previous versions. It's not good that try to invent new
> internal state like this, and you need carefully to deal with the
> synchronization, it could be lock or barriers. Consider the
> synchronization of tun is already complex, let's better try to avoid
> adding more but using exist mechanism, e.g pointer publishing through RCU.
OK, need I post a V4 by using the diff file you sent ?
>
> Thanks
>
>
>> unregister_netdevice(tun->dev);
>> + }
>> }
>> if (tun)
>> xdp_rxq_info_unreg(&tfile->xdp_rxq);
>> @@ -884,8 +887,12 @@ static struct tun_struct *tun_get(struct tun_file
>> *tfile)
>>
>> rcu_read_lock();
>> tun = rcu_dereference(tfile->tun);
>> - if (tun)
>> - dev_hold(tun->dev);
>> + if (tun) {
>> + if (tun->flags & TUN_DEV_REGISTERED)
>> + dev_hold(tun->dev);
>> + else
>> + tun = ERR_PTR(-EAGAIN);
>> + }
>> rcu_read_unlock();
>>
>> return tun;
>> @@ -1428,7 +1435,7 @@ static __poll_t tun_chr_poll(struct file *file,
>> poll_table *wait)
>> struct sock *sk;
>> __poll_t mask = 0;
>>
>> - if (!tun)
>> + if (IS_ERR_OR_NULL(tun))
>> return EPOLLERR;
>>
>> sk = tfile->socket.sk;
>> @@ -2017,6 +2024,9 @@ static ssize_t tun_chr_write_iter(struct kiocb
>> *iocb, struct iov_iter *from)
>> if (!tun)
>> return -EBADFD;
>>
>> + if (IS_ERR(tun))
>> + return PTR_ERR(tun);
>> +
>> result = tun_get_user(tun, tfile, NULL, from,
>> file->f_flags & O_NONBLOCK, false);
>>
>> @@ -2242,6 +2252,10 @@ static ssize_t tun_chr_read_iter(struct kiocb
>> *iocb, struct iov_iter *to)
>>
>> if (!tun)
>> return -EBADFD;
>> +
>> + if (IS_ERR(tun))
>> + return PTR_ERR(tun);
>> +
>> ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL);
>> ret = min_t(ssize_t, ret, len);
>> if (ret > 0)
>> @@ -2525,6 +2539,9 @@ static int tun_sendmsg(struct socket *sock,
>> struct msghdr *m, size_t total_len)
>> if (!tun)
>> return -EBADFD;
>>
>> + if (IS_ERR(tun))
>> + return PTR_ERR(tun);
>> +
>> if (ctl && (ctl->type == TUN_MSG_PTR)) {
>> struct tun_page tpage;
>> int n = ctl->num;
>> @@ -2573,6 +2590,11 @@ static int tun_recvmsg(struct socket *sock,
>> struct msghdr *m, size_t total_len,
>> goto out_free;
>> }
>>
>> + if (IS_ERR(tun)) {
>> + ret = PTR_ERR(tun);
>> + goto out_free;
>> + }
>> +
>> if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
>> ret = -EINVAL;
>> goto out_put_tun;
>> @@ -2622,6 +2644,9 @@ static int tun_peek_len(struct socket *sock)
>> if (!tun)
>> return 0;
>>
>> + if (IS_ERR(tun))
>> + return PTR_ERR(tun);
>> +
>> ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
>> tun_put(tun);
>>
>> @@ -2836,6 +2861,7 @@ static int tun_set_iff(struct net *net, struct
>> file *file, struct ifreq *ifr)
>> err = register_netdevice(tun->dev);
>> if (err < 0)
>> goto err_detach;
>> + tun->flags |= TUN_DEV_REGISTERED;
>> }
>>
>> netif_carrier_on(tun->dev);
^ permalink raw reply
* Re: [PATCH v3] tun: fix use-after-free when register netdev failed
From: Jason Wang @ 2019-09-10 2:36 UTC (permalink / raw)
To: Yang Yingliang
Cc: David Miller, netdev, eric dumazet, xiyou wangcong, weiyongjun1
In-Reply-To: <5D770AF6.1060902@huawei.com>
On 2019/9/10 上午10:31, Yang Yingliang wrote:
>>>
>>> if (!(tun->flags & IFF_PERSIST) &&
>>> - tun->dev->reg_state == NETREG_REGISTERED)
>>> + tun->dev->reg_state == NETREG_REGISTERED) {
>>> + tun->flags &= ~TUN_DEV_REGISTERED;
>>
>> As I said for previous versions. It's not good that try to invent new
>> internal state like this, and you need carefully to deal with the
>> synchronization, it could be lock or barriers. Consider the
>> synchronization of tun is already complex, let's better try to avoid
>> adding more but using exist mechanism, e.g pointer publishing through
>> RCU.
> OK, need I post a V4 by using the diff file you sent ?
Yes, please.
Thanks
^ permalink raw reply
* Re: [PATCH v3 0/3] kernel/notifier.c: avoid duplicate registration
From: Xiaoming Ni @ 2019-09-10 3:57 UTC (permalink / raw)
To: Vasily Averin, gregkh@linuxfoundation.org
Cc: adobriyan@gmail.com, akpm@linux-foundation.org,
anna.schumaker@netapp.com, arjan@linux.intel.com,
bfields@fieldses.org, chuck.lever@oracle.com, davem@davemloft.net,
jlayton@kernel.org, luto@kernel.org, mingo@kernel.org,
Nadia.Derbey@bull.net, paulmck@linux.vnet.ibm.com,
semen.protsenko@linaro.org, stable@kernel.org,
stern@rowland.harvard.edu, tglx@linutronix.de,
torvalds@linux-foundation.org, trond.myklebust@hammerspace.com,
viresh.kumar@linaro.org, Huangjianhui (Alex), Dailei,
linux-kernel@vger.kernel.org, linux-nfs@vger.kernel.org,
netdev@vger.kernel.org
In-Reply-To: <0d245a97-1169-9ef2-e502-043ba80eaa8c@virtuozzo.com>
On 2019/7/17 19:15, Vasily Averin wrote:
> On 7/16/19 5:07 PM, Xiaoming Ni wrote:
>> On 2019/7/16 18:20, Vasily Averin wrote:
>>> On 7/16/19 5:00 AM, Xiaoming Ni wrote:
>>>> On 2019/7/15 13:38, Vasily Averin wrote:
>>>>> On 7/14/19 5:45 AM, Xiaoming Ni wrote:
>>>>>> On 2019/7/12 22:07, gregkh@linuxfoundation.org wrote:
>>>>>>> On Fri, Jul 12, 2019 at 09:11:57PM +0800, Xiaoming Ni wrote:
>>>>>>>> On 2019/7/11 21:57, Vasily Averin wrote:
>>>>>>>>> On 7/11/19 4:55 AM, Nixiaoming wrote:
>>>>>>>>>> On Wed, July 10, 2019 1:49 PM Vasily Averin wrote:
>>>>>>>>>>> On 7/10/19 6:09 AM, Xiaoming Ni wrote:
....
...
>>>>>>>> So in these two cases, is it more reasonable to trigger BUG() directly when checking for duplicate registration ?
>>>>>>>> But why does current notifier_chain_register() just trigger WARN() without exiting ?
>>>>>>>> notifier_chain_cond_register() direct exit without triggering WARN() ?
>>>>>>>
>>>>>>> It should recover from this, if it can be detected. The main point is
>>>>>>> that not all apis have to be this "robust" when used within the kernel
>>>>>>> as we do allow for the callers to know what they are doing :)
>>>>>>>
>>>>>> In the notifier_chain_register(), the condition ( (*nl) == n) is the same registration of the same hook.
>>>>>> We can intercept this situation and avoid forming a linked list ring to make the API more rob
...
...
> Yes, I'm agree, at present there are no difference between
> notifier_chain_cond_register() and notifier_chain_register()
>
> Question is -- how to improve it.
> You propose to remove notifier_chain_cond_register() by some way.
> Another option is return an error, for some abstract callers who expect possible double registration.
>
> Frankly speaking I prefer second one,
> however because of kernel do not have any such callers right now seems you are right,
> and we can delete notifier_chain_cond_register().
>
> So let me finally accept your patch-set.
>
> Thank you,
> Vasily Averin
>
> .
>
Dear Greg Kroah-Hartman
is there any other opinion on this patch set?
can you pick this series?
thanks
Xiaoming Ni
^ permalink raw reply
* Re: [PATCH net-next 2/5] sctp: add pf_expose per netns and sock and asoc
From: Xin Long @ 2019-09-10 6:26 UTC (permalink / raw)
To: Nathan Chancellor, linux-sctp, network dev
Cc: kbuild, Nick Desaulniers, clang-built-linux, kbuild test robot
In-Reply-To: <20190910035421.GB1778@archlinux-threadripper>
On Tue, Sep 10, 2019 at 11:54 AM Nathan Chancellor
<natechancellor@gmail.com> wrote:
>
> Hi Xin,
>
> The 0day team has been doing clang builds for us and this warning popped
> up. Let me know if you have any questions.
>
> On Mon, Sep 09, 2019 at 06:44:47PM +0800, kbuild test robot wrote:
> > CC: kbuild-all@01.org
> > In-Reply-To: <00fb06e74d8eedeb033dad83de18380bf6261231.1568015756.git.lucien.xin@gmail.com>
> > References: <00fb06e74d8eedeb033dad83de18380bf6261231.1568015756.git.lucien.xin@gmail.com>
> > TO: Xin Long <lucien.xin@gmail.com>
> > CC: network dev <netdev@vger.kernel.org>, linux-sctp@vger.kernel.org
> > CC: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>, Neil Horman <nhorman@tuxdriver.com>, davem@davemloft.net
> >
> > Hi Xin,
> >
> > Thank you for the patch! Perhaps something to improve:
> >
> > [auto build test WARNING on net-next/master]
> >
> > url: https://github.com/0day-ci/linux/commits/Xin-Long/sctp-update-from-rfc7829/20190909-160115
> > config: x86_64-rhel-7.6 (attached as .config)
> > compiler: clang version 10.0.0 (git://gitmirror/llvm_project 45a3fd206fb06f77a08968c99a8172cbf2ccdd0f)
> > reproduce:
> > # save the attached .config to linux build tree
> > make ARCH=x86_64
> >
> > If you fix the issue, kindly add following tag
> > Reported-by: kbuild test robot <lkp@intel.com>
> >
> > All warnings (new ones prefixed by >>):
> >
> > >> net/sctp/associola.c:799:24: warning: use of logical '&&' with constant operand [-Wconstant-logical-operand]
> > if (transport->state && SCTP_UNCONFIRMED &&
> > ^ ~~~~~~~~~~~~~~~~
> > net/sctp/associola.c:799:24: note: use '&' for a bitwise operation
> > if (transport->state && SCTP_UNCONFIRMED &&
> > ^~
> > &
> > net/sctp/associola.c:799:24: note: remove constant to silence this warning
> > if (transport->state && SCTP_UNCONFIRMED &&
> > ~^~~~~~~~~~~~~~~~~~~
> > 1 warning generated.
> >
> > vim +799 net/sctp/associola.c
> >
> > 775
> > 776 /* Engage in transport control operations.
> > 777 * Mark the transport up or down and send a notification to the user.
> > 778 * Select and update the new active and retran paths.
> > 779 */
> > 780 void sctp_assoc_control_transport(struct sctp_association *asoc,
> > 781 struct sctp_transport *transport,
> > 782 enum sctp_transport_cmd command,
> > 783 sctp_sn_error_t error)
> > 784 {
> > 785 struct sctp_ulpevent *event;
> > 786 struct sockaddr_storage addr;
> > 787 int spc_state = 0;
> > 788 bool ulp_notify = true;
> > 789
> > 790 /* Record the transition on the transport. */
> > 791 switch (command) {
> > 792 case SCTP_TRANSPORT_UP:
> > 793 /* If we are moving from UNCONFIRMED state due
> > 794 * to heartbeat success, report the SCTP_ADDR_CONFIRMED
> > 795 * state to the user, otherwise report SCTP_ADDR_AVAILABLE.
> > 796 */
> > 797 if (transport->state == SCTP_PF && !asoc->pf_expose)
> > 798 ulp_notify = false;
> > > 799 if (transport->state && SCTP_UNCONFIRMED &&
>
> I assume this && should either be a '&' or '=='?
Right, it should have been "==". It was changed unintentionally
when I swapped the position of 'state' and 'SCTP_UNCONFIRMED'.
Thanks, will post v2 after others' review.
>
> > 800 SCTP_HEARTBEAT_SUCCESS == error)
> > 801 spc_state = SCTP_ADDR_CONFIRMED;
> > 802 else
> > 803 spc_state = SCTP_ADDR_AVAILABLE;
> > 804 transport->state = SCTP_ACTIVE;
> > 805 break;
> > 806
> > 807 case SCTP_TRANSPORT_DOWN:
> > 808 /* If the transport was never confirmed, do not transition it
> > 809 * to inactive state. Also, release the cached route since
> > 810 * there may be a better route next time.
> > 811 */
> > 812 if (transport->state != SCTP_UNCONFIRMED) {
> > 813 transport->state = SCTP_INACTIVE;
> > 814 spc_state = SCTP_ADDR_UNREACHABLE;
> > 815 } else {
> > 816 sctp_transport_dst_release(transport);
> > 817 ulp_notify = false;
> > 818 }
> > 819 break;
> > 820
> > 821 case SCTP_TRANSPORT_PF:
> > 822 transport->state = SCTP_PF;
> > 823 if (!asoc->pf_expose)
> > 824 ulp_notify = false;
> > 825 else
> > 826 spc_state = SCTP_ADDR_POTENTIALLY_FAILED;
> > 827 break;
> > 828
> > 829 default:
> > 830 return;
> > 831 }
> > 832
> > 833 /* Generate and send a SCTP_PEER_ADDR_CHANGE notification
> > 834 * to the user.
> > 835 */
> > 836 if (ulp_notify) {
> > 837 memset(&addr, 0, sizeof(struct sockaddr_storage));
> > 838 memcpy(&addr, &transport->ipaddr,
> > 839 transport->af_specific->sockaddr_len);
> > 840
> > 841 event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
> > 842 0, spc_state, error, GFP_ATOMIC);
> > 843 if (event)
> > 844 asoc->stream.si->enqueue_event(&asoc->ulpq, event);
> > 845 }
> > 846
> > 847 /* Select new active and retran paths. */
> > 848 sctp_select_active_and_retran_path(asoc);
> > 849 }
> > 850
> >
> > ---
> > 0-DAY kernel test infrastructure Open Source Technology Center
> > https://lists.01.org/pipermail/kbuild-all Intel Corporation
>
> Cheers,
> Nathan
^ permalink raw reply
* Re: [PATCH] lib/Kconfig: fix OBJAGG in lib/ menu structure
From: Ido Schimmel @ 2019-09-10 6:47 UTC (permalink / raw)
To: Randy Dunlap; +Cc: netdev@vger.kernel.org, LKML, David Miller, Jiri Pirko
In-Reply-To: <34674398-54dc-a4d1-6052-67ad1a3b2fe9@infradead.org>
On Mon, Sep 09, 2019 at 02:54:21PM -0700, Randy Dunlap wrote:
> From: Randy Dunlap <rdunlap@infradead.org>
>
> Keep the "Library routines" menu intact by moving OBJAGG into it.
> Otherwise OBJAGG is displayed/presented as an orphan in the
> various config menus.
>
> Fixes: 0a020d416d0a ("lib: introduce initial implementation of object aggregation manager")
> Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
> Cc: Jiri Pirko <jiri@mellanox.com>
> Cc: Ido Schimmel <idosch@mellanox.com>
> Cc: David S. Miller <davem@davemloft.net>
Tested-by: Ido Schimmel <idosch@mellanox.com>
Thanks!
^ permalink raw reply
* Re: [RFC PATCH untested] vhost: block speculation of translated descriptors
From: Michael S. Tsirkin @ 2019-09-10 6:48 UTC (permalink / raw)
To: Jason Wang; +Cc: linux-kernel, kvm, virtualization, netdev
In-Reply-To: <9ab48e0f-50a9-bed4-1801-73c37a7da27c@redhat.com>
On Tue, Sep 10, 2019 at 09:52:10AM +0800, Jason Wang wrote:
>
> On 2019/9/9 下午10:45, Michael S. Tsirkin wrote:
> > On Mon, Sep 09, 2019 at 03:19:55PM +0800, Jason Wang wrote:
> > > On 2019/9/8 下午7:05, Michael S. Tsirkin wrote:
> > > > iovec addresses coming from vhost are assumed to be
> > > > pre-validated, but in fact can be speculated to a value
> > > > out of range.
> > > >
> > > > Userspace address are later validated with array_index_nospec so we can
> > > > be sure kernel info does not leak through these addresses, but vhost
> > > > must also not leak userspace info outside the allowed memory table to
> > > > guests.
> > > >
> > > > Following the defence in depth principle, make sure
> > > > the address is not validated out of node range.
> > > >
> > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > ---
> > > > drivers/vhost/vhost.c | 4 +++-
> > > > 1 file changed, 3 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > index 5dc174ac8cac..0ee375fb7145 100644
> > > > --- a/drivers/vhost/vhost.c
> > > > +++ b/drivers/vhost/vhost.c
> > > > @@ -2072,7 +2072,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
> > > > size = node->size - addr + node->start;
> > > > _iov->iov_len = min((u64)len - s, size);
> > > > _iov->iov_base = (void __user *)(unsigned long)
> > > > - (node->userspace_addr + addr - node->start);
> > > > + (node->userspace_addr +
> > > > + array_index_nospec(addr - node->start,
> > > > + node->size));
> > > > s += size;
> > > > addr += size;
> > > > ++ret;
> > >
> > > I've tried this on Kaby Lake smap off metadata acceleration off using
> > > testpmd (virtio-user) + vhost_net. I don't see obvious performance
> > > difference with TX PPS.
> > >
> > > Thanks
> > Should I push this to Linus right now then? It's a security thing so
> > maybe we better do it ASAP ... what's your opinion?
>
>
> Yes, you can.
>
> Acked-by: Jason Wang <jasowang@redhat.com>
And should I include
Tested-by: Jason Wang <jasowang@redhat.com>
?
>
>
> >
^ permalink raw reply
* [PATCH net 2/2] sctp: destroy bucket if failed to bind addr
From: Mao Wenan @ 2019-09-10 7:13 UTC (permalink / raw)
To: vyasevich, nhorman, marcelo.leitner, davem
Cc: linux-sctp, netdev, linux-kernel, kernel-janitors, Mao Wenan,
Hulk Robot
In-Reply-To: <20190910071343.18808-1-maowenan@huawei.com>
There is one memory leak bug report:
BUG: memory leak
unreferenced object 0xffff8881dc4c5ec0 (size 40):
comm "syz-executor.0", pid 5673, jiffies 4298198457 (age 27.578s)
hex dump (first 32 bytes):
02 00 00 00 81 88 ff ff 00 00 00 00 00 00 00 00 ................
f8 63 3d c1 81 88 ff ff 00 00 00 00 00 00 00 00 .c=.............
backtrace:
[<0000000072006339>] sctp_get_port_local+0x2a1/0xa00 [sctp]
[<00000000c7b379ec>] sctp_do_bind+0x176/0x2c0 [sctp]
[<000000005be274a2>] sctp_bind+0x5a/0x80 [sctp]
[<00000000b66b4044>] inet6_bind+0x59/0xd0 [ipv6]
[<00000000c68c7f42>] __sys_bind+0x120/0x1f0 net/socket.c:1647
[<000000004513635b>] __do_sys_bind net/socket.c:1658 [inline]
[<000000004513635b>] __se_sys_bind net/socket.c:1656 [inline]
[<000000004513635b>] __x64_sys_bind+0x3e/0x50 net/socket.c:1656
[<0000000061f2501e>] do_syscall_64+0x72/0x2e0 arch/x86/entry/common.c:296
[<0000000003d1e05e>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
This is because in sctp_do_bind, if sctp_get_port_local is to
create hash bucket successfully, and sctp_add_bind_addr failed
to bind address, e.g return -ENOMEM, so memory leak found, it
needs to destroy allocated bucket.
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
net/sctp/socket.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 766b68b55ebe..ab37fc1f7bb6 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -412,11 +412,13 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
ret = sctp_add_bind_addr(bp, addr, af->sockaddr_len,
SCTP_ADDR_SRC, GFP_ATOMIC);
- /* Copy back into socket for getsockname() use. */
- if (!ret) {
- inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num);
- sp->pf->to_sk_saddr(addr, sk);
+ if (ret) {
+ sctp_put_port(sk);
+ return ret;
}
+ /* Copy back into socket for getsockname() use. */
+ inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num);
+ sp->pf->to_sk_saddr(addr, sk);
return ret;
}
--
2.20.1
^ permalink raw reply related
* [PATCH net 0/2] fix memory leak for sctp_do_bind
From: Mao Wenan @ 2019-09-10 7:13 UTC (permalink / raw)
To: vyasevich, nhorman, marcelo.leitner, davem
Cc: linux-sctp, netdev, linux-kernel, kernel-janitors, Mao Wenan
First patch is to do cleanup, remove redundant assignment,
second patch is to fix memory leak for sctp_do_bind if failed
to bind address.
Mao Wenan (2):
sctp: remove redundant assignment when call sctp_get_port_local
sctp: destroy bucket if failed to bind addr
net/sctp/socket.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
--
2.20.1
^ permalink raw reply
* [PATCH net 1/2] sctp: remove redundant assignment when call sctp_get_port_local
From: Mao Wenan @ 2019-09-10 7:13 UTC (permalink / raw)
To: vyasevich, nhorman, marcelo.leitner, davem
Cc: linux-sctp, netdev, linux-kernel, kernel-janitors, Mao Wenan
In-Reply-To: <20190910071343.18808-1-maowenan@huawei.com>
There are more parentheses in if clause when call sctp_get_port_local
in sctp_do_bind, and redundant assignment to 'ret'. This patch is to
do cleanup.
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
net/sctp/socket.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9d1f83b10c0a..766b68b55ebe 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -399,9 +399,8 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
* detection.
*/
addr->v4.sin_port = htons(snum);
- if ((ret = sctp_get_port_local(sk, addr))) {
+ if (sctp_get_port_local(sk, addr))
return -EADDRINUSE;
- }
/* Refresh ephemeral port. */
if (!bp->port)
--
2.20.1
^ permalink raw reply related
* Re: [PATCH net] sctp: fix the missing put_user when dumping transport thresholds
From: Neil Horman @ 2019-09-10 7:15 UTC (permalink / raw)
To: Xin Long; +Cc: network dev, linux-sctp, davem, Marcelo Ricardo Leitner
In-Reply-To: <3fa4f7700c93f06530c80bc666d1696cb7c077de.1568014409.git.lucien.xin@gmail.com>
On Mon, Sep 09, 2019 at 03:33:29PM +0800, Xin Long wrote:
> This issue causes SCTP_PEER_ADDR_THLDS sockopt not to be able to dump
> a transport thresholds info.
>
> Fix it by adding 'goto' put_user in sctp_getsockopt_paddr_thresholds.
>
> Fixes: 8add543e369d ("sctp: add SCTP_FUTURE_ASSOC for SCTP_PEER_ADDR_THLDS sockopt")
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---
> net/sctp/socket.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 9d1f83b..ad87518 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -7173,7 +7173,7 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> val.spt_pathmaxrxt = trans->pathmaxrxt;
> val.spt_pathpfthld = trans->pf_retrans;
>
> - return 0;
> + goto out;
> }
>
> asoc = sctp_id2assoc(sk, val.spt_assoc_id);
> @@ -7191,6 +7191,7 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
> val.spt_pathmaxrxt = sp->pathmaxrxt;
> }
>
> +out:
> if (put_user(len, optlen) || copy_to_user(optval, &val, len))
> return -EFAULT;
>
> --
> 2.1.0
>
>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
^ permalink raw reply
* Re: [PATCH net 0/2] fix memory leak for sctp_do_bind
From: Neil Horman @ 2019-09-10 7:16 UTC (permalink / raw)
To: Mao Wenan
Cc: vyasevich, marcelo.leitner, davem, linux-sctp, netdev,
linux-kernel, kernel-janitors
In-Reply-To: <20190910071343.18808-1-maowenan@huawei.com>
On Tue, Sep 10, 2019 at 03:13:41PM +0800, Mao Wenan wrote:
> First patch is to do cleanup, remove redundant assignment,
> second patch is to fix memory leak for sctp_do_bind if failed
> to bind address.
>
> Mao Wenan (2):
> sctp: remove redundant assignment when call sctp_get_port_local
> sctp: destroy bucket if failed to bind addr
>
> net/sctp/socket.c | 13 +++++++------
> 1 file changed, 7 insertions(+), 6 deletions(-)
>
> --
> 2.20.1
>
>
Series
Acked-by: Neil Horman <nhorman@tuxdriver.com>
^ permalink raw reply
* Re: [RFC PATCH untested] vhost: block speculation of translated descriptors
From: Jason Wang @ 2019-09-10 7:28 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: linux-kernel, kvm, virtualization, netdev
In-Reply-To: <20190910024814-mutt-send-email-mst@kernel.org>
On 2019/9/10 下午2:48, Michael S. Tsirkin wrote:
> On Tue, Sep 10, 2019 at 09:52:10AM +0800, Jason Wang wrote:
>> On 2019/9/9 下午10:45, Michael S. Tsirkin wrote:
>>> On Mon, Sep 09, 2019 at 03:19:55PM +0800, Jason Wang wrote:
>>>> On 2019/9/8 下午7:05, Michael S. Tsirkin wrote:
>>>>> iovec addresses coming from vhost are assumed to be
>>>>> pre-validated, but in fact can be speculated to a value
>>>>> out of range.
>>>>>
>>>>> Userspace address are later validated with array_index_nospec so we can
>>>>> be sure kernel info does not leak through these addresses, but vhost
>>>>> must also not leak userspace info outside the allowed memory table to
>>>>> guests.
>>>>>
>>>>> Following the defence in depth principle, make sure
>>>>> the address is not validated out of node range.
>>>>>
>>>>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>>>>> ---
>>>>> drivers/vhost/vhost.c | 4 +++-
>>>>> 1 file changed, 3 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>>>> index 5dc174ac8cac..0ee375fb7145 100644
>>>>> --- a/drivers/vhost/vhost.c
>>>>> +++ b/drivers/vhost/vhost.c
>>>>> @@ -2072,7 +2072,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
>>>>> size = node->size - addr + node->start;
>>>>> _iov->iov_len = min((u64)len - s, size);
>>>>> _iov->iov_base = (void __user *)(unsigned long)
>>>>> - (node->userspace_addr + addr - node->start);
>>>>> + (node->userspace_addr +
>>>>> + array_index_nospec(addr - node->start,
>>>>> + node->size));
>>>>> s += size;
>>>>> addr += size;
>>>>> ++ret;
>>>> I've tried this on Kaby Lake smap off metadata acceleration off using
>>>> testpmd (virtio-user) + vhost_net. I don't see obvious performance
>>>> difference with TX PPS.
>>>>
>>>> Thanks
>>> Should I push this to Linus right now then? It's a security thing so
>>> maybe we better do it ASAP ... what's your opinion?
>>
>> Yes, you can.
>>
>> Acked-by: Jason Wang <jasowang@redhat.com>
>
> And should I include
>
> Tested-by: Jason Wang <jasowang@redhat.com>
>
> ?
Yes.
Thanks
>
>>
^ permalink raw reply
* Re: [PATCH net-next 1/5] enetc: Fix if_mode extraction
From: Andrew Lunn @ 2019-09-10 7:44 UTC (permalink / raw)
To: Claudiu Manoil
Cc: David S . Miller, Alexandru Marginean, netdev@vger.kernel.org
In-Reply-To: <VI1PR04MB48803DB044AB6CF66CACB89E96B70@VI1PR04MB4880.eurprd04.prod.outlook.com>
On Mon, Sep 09, 2019 at 04:24:01PM +0000, Claudiu Manoil wrote:
> >-----Original Message-----
> >From: Andrew Lunn <andrew@lunn.ch>
> >Sent: Friday, September 6, 2019 10:58 PM
> >To: Claudiu Manoil <claudiu.manoil@nxp.com>
> >Cc: David S . Miller <davem@davemloft.net>; Alexandru Marginean
> ><alexandru.marginean@nxp.com>; netdev@vger.kernel.org
> >Subject: Re: [PATCH net-next 1/5] enetc: Fix if_mode extraction
> >
> >On Fri, Sep 06, 2019 at 05:15:40PM +0300, Claudiu Manoil wrote:
> >> Fix handling of error return code. Before this fix,
> >> the error code was handled as unsigned type.
> >> Also, on this path if if_mode not found then just handle
> >> it as fixed link (i.e mac2mac connection).
> >>
> >> Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
> >> ---
> >> drivers/net/ethernet/freescale/enetc/enetc_pf.c | 17 ++++++-----------
> >> 1 file changed, 6 insertions(+), 11 deletions(-)
> >>
> >> diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
> >b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
> >> index 7d6513ff8507..3a556646a2fb 100644
> >> --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
> >> +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
> >> @@ -751,6 +751,7 @@ static int enetc_of_get_phy(struct enetc_ndev_priv
> >*priv)
> >> struct enetc_pf *pf = enetc_si_priv(priv->si);
> >> struct device_node *np = priv->dev->of_node;
> >> struct device_node *mdio_np;
> >> + int phy_mode;
> >> int err;
> >>
> >> if (!np) {
> >> @@ -784,17 +785,11 @@ static int enetc_of_get_phy(struct enetc_ndev_priv
> >*priv)
> >> }
> >> }
> >>
> >> - priv->if_mode = of_get_phy_mode(np);
> >> - if (priv->if_mode < 0) {
> >> - dev_err(priv->dev, "missing phy type\n");
> >> - of_node_put(priv->phy_node);
> >> - if (of_phy_is_fixed_link(np))
> >> - of_phy_deregister_fixed_link(np);
> >> - else
> >> - enetc_mdio_remove(pf);
> >> -
> >> - return -EINVAL;
> >> - }
> >
> >Hi Claudiu
> >
> >It is not clear to me why it is no longer necessary to deregister the
> >fixed link, or remove the mdio bus?
> >
> >> + phy_mode = of_get_phy_mode(np);
> >> + if (phy_mode < 0)
> >> + priv->if_mode = PHY_INTERFACE_MODE_NA; /* fixed link */
> >> + else
> >> + priv->if_mode = phy_mode;
> >
>
> Hi Andrew,
>
> The MAC2MAC connections are defined as fixed-link too, but without
> phy-mode/phy-connection-type properties. We don't want to de-register
> these links. Initial code was bogus in this regard.
Hi Claudiu
This is what is not clear in the change log. That this code is removed
because it is wrong. Please could you expand the explanation to make
this clearer.
> Current proposal is:
> ethernet@0,2 { /* SoC internal, connected to switch port 4 */
> compatible = "fsl,enetc";
> reg = <0x000200 0 0 0 0>;
> fixed-link {
> speed = <1000>;
> full-duplex;
> };
> };
> switch@0,5 {
> compatible = "mscc,felix-switch";
> [...]
> ports {
> #address-cells = <1>;
> #size-cells = <0>;
>
> /* external ports */
> [...]
> /* internal SoC ports */
> port@4 { /* connected to ENETC port2 */
> reg = <4>;
> fixed-link {
> speed = <1000>;
> full-duplex;
> };
> };
So this connection between the SoC and the switch does not use tags?
Can it use tags? Does the hardware allow you to have two CPU ports,
and load balance over them?
This second half is just standard DSA. This looks good.
Andrew
> port@5 { /* CPU port, connected to ENETC port3 */
> reg = <5>;
> ethernet = <&enetc_port3>;
> fixed-link {
> speed = <1000>;
> full-duplex;
> };
> };
> };
> };
> enetc_port3: ethernet@0,6 { /* SoC internal connected to switch port 5 */
> compatible = "fsl,enetc";
> reg = <0x000600 0 0 0 0>;
> fixed-link {
> speed = <1000>;
> full-duplex;
> };
> };
> };
>
> Thanks.
>
> Claudiu
^ permalink raw reply
* Re: [RFC bpf-next 2/7] bpf: extend bpf_pcap support to tracing programs
From: Yonghong Song @ 2019-09-10 7:43 UTC (permalink / raw)
To: Alan Maguire
Cc: ast@kernel.org, daniel@iogearbox.net, Martin Lau, Song Liu,
davem@davemloft.net, jakub.kicinski@netronome.com,
hawk@kernel.org, john.fastabend@gmail.com, rostedt@goodmis.org,
mingo@redhat.com, quentin.monnet@netronome.com, Andrey Ignatov,
joe@wand.net.nz, acme@redhat.com, jolsa@kernel.org,
alexey.budankov@linux.intel.com, gregkh@linuxfoundation.org,
namhyung@kernel.org, sdf@google.com, f.fainelli@gmail.com,
shuah@kernel.org, peter@lekensteyn.nl, ivan@cloudflare.com,
Andrii Nakryiko, bhole_prashant_q7@lab.ntt.co.jp,
david.calavera@gmail.com, danieltimlee@gmail.com,
Takshak Chahande, netdev@vger.kernel.org, bpf@vger.kernel.org,
linux-kselftest@vger.kernel.org, toke@redhat.com,
jbenc@redhat.com
In-Reply-To: <alpine.LRH.2.20.1909092236490.10757@dhcp-10-175-172-139.vpn.oracle.com>
On 9/9/19 11:25 PM, Alan Maguire wrote:
> On Sun, 8 Sep 2019, Yonghong Song wrote:
>
>> For net side bpf_perf_event_output, we have
>> static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
>> unsigned long off, unsigned long len)
>> {
>> void *ptr = skb_header_pointer(skb, off, len, dst_buff);
>>
>> if (unlikely(!ptr))
>> return len;
>> if (ptr != dst_buff)
>> memcpy(dst_buff, ptr, len);
>>
>> return 0;
>> }
>>
>> BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map
>> *, map,
>> u64, flags, void *, meta, u64, meta_size)
>> {
>> u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
>>
>> if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
>> return -EINVAL;
>> if (unlikely(skb_size > skb->len))
>> return -EFAULT;
>>
>> return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
>> bpf_skb_copy);
>> }
>>
>> It does not really consider output all the frags.
>> I understand that to get truly all packet data, frags should be
>> considered, but seems we did not do it before? I am wondering
>> whether we need to do here.
>
> Thanks for the feedback! In experimenting with packet capture,
> my original hope was to keep things simple and avoid fragment parsing
> if possible. However if scatter-gather is enabled for the networking
> device, or indeed if it's running in a VM it turns out a lot of the
> interesting packet data ends up in the fragments on transmit (ssh
> headers, http headers etc). So I think it would be worth considering
> adding support for fragment traversal. It's not needed as much
> in the skb program case - we can always pullup the skb - but in
> the tracing situation we probably wouldn't want to do something
> that invasive in tracing context.
Agree that in tracing context, we should avoid push/pull skb. It is
indeed invasive.
>
> Fragment traversal might be worth breaking out as a separate patchset,
> perhaps triggered by a specific flag to bpf_skb_event_output?
This can be done for bpf_skb_event_output as the context is a sk_buff.
And you can just follow the frags to copy the whole thing without
bpf_probe_read().
>
> Feedback from folks at Linux Plumbers (I hope I'm summarizing correctly)
> seemed to agree with what you mentioned WRT the first patch in this
> series. The gist was we probably don't want to force the metadata to be a
> specific packet capture type; we'd rather use the existing perf event
> mechanisms and if we are indeed doing packet capture, simply specify that
> data in the program as metadata.
Agree, you can have whatever metadata you have for bpf_perf_event_output.
>
> I'd be happy with that approach myself if I could capture skb
> fragments in tracing programs - being able to do that would give
> equivalent functionality to what I proposed but without having a packet
> capture-specific helper.
That won't work for tracing program. Full of bpf_probe_read()
in tracing version of packet copying is not nice either.
We may still need a different helper for tracing programs.
I think we need something like below:
- vmlinux BTF at /sys/kernel/btf/kernel, is loaded into kernel.
(/sys/kernel/btf/kernel is the source of truth)
- For a tracing bpf program, if that function eventually
copy helper
bpf_skb_event_output(..., skb, ...)
the verifier needs to verify skb is indeed a valid skb
by tracing back to one of parameters.
Here, I use skb as an example, maybe it can be extended
to other data structures as well.
With this approach, you can reuse some of functions from
tracing side to deal with frag copying and no bpf_probe_read()
is needed.
Here, I use skb as an example, maybe it can be extended
to other data structures as well if needed.
>>
>> If we indeed do not need to handle frags here, I think maybe
>> bpf_probe_read() in existing bpf kprobe function should be
>> enough, we do not need this helper?
>>
>
> Certainly for many use cases, that will get you most of what you need -
> particularly if you're just looking at L2 to L4 data. For full packet
> capture however I think we may need to think about fragment traversal.
>
>>> +
>>> +/* Derive protocol for some of the easier cases. For tracing, a probe point
>>> + * may be dealing with packets in various states. Common cases are IP
>>> + * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
>>> + * (_PCAP_TYPE_ETH). For other cases the caller must specify the
>>> + * protocol they expect. Other heuristics for packet identification
>>> + * should be added here as needed, since determining the packet type
>>> + * ensures we do not capture packets that fail to match the desired
>>> + * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
>>> + */
>>> +static inline int bpf_skb_protocol_get(struct sk_buff *skb)
>>> +{
>>> + switch (htons(skb->protocol)) {
>>> + case ETH_P_IP:
>>> + case ETH_P_IPV6:
>>> + if (skb_network_header(skb) == skb->data)
>>> + return BPF_PCAP_TYPE_IP;
>>> + else
>>> + return BPF_PCAP_TYPE_ETH;
>>> + default:
>>> + return BPF_PCAP_TYPE_UNSET;
>>> + }
>>> +}
>>> +
>>> +BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
>>> + int, protocol_wanted, u64, flags)
>>
>> Up to now, for helpers, verifier has a way to verifier it is used
>> properly regarding to the context. For example, for xdp version
>> perf_event_output, the help prototype,
>> BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct
>> bpf_map *, map,
>> u64, flags, void *, meta, u64, meta_size)
>> the verifier is able to guarantee that the first parameter
>> has correct type xdp_buff, not something from type cast.
>> .arg1_type = ARG_PTR_TO_CTX,
>>
>> This helper, in the below we have
>> .arg1_type = ARG_ANYTHING,
>>
>> So it is not really enforced. Bringing BTF can help, but type
>> name matching typically bad.
>>
>>
> One thing we were discussing - and I think this is similar to what
> you're suggesting - is to investigate if there might be a way to
> leverage BTF to provide additional guarantees that the tracing
> data we are handling is indeed an skb. Specifically if we
> trace a kprobe function argument or a tracepoint function, and
> if we had that guarantee, we could perhaps invoke the skb-style
> perf event output function (trace both the skb data and the metadata).
> The challenge would be how to do that type-based matching; we'd
> need the function argument information from BTF _and_ need to
> somehow associate it at probe attach time.
>
> Thanks again for looking at the code!
>
> Alan
>
^ permalink raw reply
* Re: [net-next v2 00/15][pull request] Intel Wired LAN Driver Updates 2019-09-09
From: David Miller @ 2019-09-10 7:45 UTC (permalink / raw)
To: jeffrey.t.kirsher; +Cc: netdev, nhorman, sassmann
In-Reply-To: <20190909224802.29595-1-jeffrey.t.kirsher@intel.com>
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Mon, 9 Sep 2019 15:47:47 -0700
> This series contains a variety of cold and hot savoury changes to Intel
> drivers. Some of the fixes could be considered for stable even though
> the author did not request it.
...
Pulled.
^ permalink raw reply
* Re: [PATCH] net/ibmvnic: Fix missing { in __ibmvnic_reset
From: David Miller @ 2019-09-10 7:45 UTC (permalink / raw)
To: msuchanek
Cc: netdev, julietk, benh, paulus, mpe, tlfalcon, jallen,
linuxppc-dev, linux-kernel
In-Reply-To: <20190909204451.7929-1-msuchanek@suse.de>
From: Michal Suchanek <msuchanek@suse.de>
Date: Mon, 9 Sep 2019 22:44:51 +0200
> Commit 1c2977c09499 ("net/ibmvnic: free reset work of removed device from queue")
> adds a } without corresponding { causing build break.
>
> Fixes: 1c2977c09499 ("net/ibmvnic: free reset work of removed device from queue")
> Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Applied.
^ permalink raw reply
* Re: [PATCH REPOST 1/2] can: flexcan: fix deadlock when using self wakeup
From: Sean Nyekjaer @ 2019-09-10 7:52 UTC (permalink / raw)
To: Joakim Zhang, mkl@pengutronix.de, linux-can@vger.kernel.org
Cc: wg@grandegger.com, netdev@vger.kernel.org, dl-linux-imx,
Martin Hundebøll
In-Reply-To: <DB7PR04MB461868320DA0B25CC8255213E6BB0@DB7PR04MB4618.eurprd04.prod.outlook.com>
On 05/09/2019 09.10, Joakim Zhang wrote:
> Hi Sean,
>
> Could you update lastest flexcan driver using linux-can-next/flexcan and then merge below two patches from linux-can/testing?
> d0b53616716e (HEAD -> testing, origin/testing) can: flexcan: add LPSR mode support for i.MX7D
> 803eb6bad65b can: flexcan: fix deadlock when using self wakeup
>
> Best Regards,
> Joakim Zhang
Hi
I reverted 2 commits on thw nand driver and got the testing kernel to work.
I can confirm the issue is resolved with this patch :-)
/Sean
^ permalink raw reply
* Re: [PATCH v3 1/2] ethtool: implement Energy Detect Powerdown support via phy-tunable
From: Michal Kubecek @ 2019-09-10 8:00 UTC (permalink / raw)
To: netdev
Cc: Alexandru Ardelean, devicetree, linux-kernel, davem, robh+dt,
mark.rutland, f.fainelli, hkallweit1, andrew
In-Reply-To: <20190909131251.3634-2-alexandru.ardelean@analog.com>
On Mon, Sep 09, 2019 at 04:12:50PM +0300, Alexandru Ardelean wrote:
> The `phy_tunable_id` has been named `ETHTOOL_PHY_EDPD` since it looks like
> this feature is common across other PHYs (like EEE), and defining
> `ETHTOOL_PHY_ENERGY_DETECT_POWER_DOWN` seems too long.
>
> The way EDPD works, is that the RX block is put to a lower power mode,
> except for link-pulse detection circuits. The TX block is also put to low
> power mode, but the PHY wakes-up periodically to send link pulses, to avoid
> lock-ups in case the other side is also in EDPD mode.
>
> Currently, there are 2 PHY drivers that look like they could use this new
> PHY tunable feature: the `adin` && `micrel` PHYs.
>
> The ADIN's datasheet mentions that TX pulses are at intervals of 1 second
> default each, and they can be disabled. For the Micrel KSZ9031 PHY, the
> datasheet does not mention whether they can be disabled, but mentions that
> they can modified.
>
> The way this change is structured, is similar to the PHY tunable downshift
> control:
> * a `ETHTOOL_PHY_EDPD_DFLT_TX_INTERVAL` value is exposed to cover a default
> TX interval; some PHYs could specify a certain value that makes sense
> * `ETHTOOL_PHY_EDPD_NO_TX` would disable TX when EDPD is enabled
> * `ETHTOOL_PHY_EDPD_DISABLE` will disable EDPD
>
> This should allow PHYs to:
> * enable EDPD and not enable TX pulses (interval would be 0)
> * enable EDPD and configure TX pulse interval; note that TX interval units
> would be PHY specific; we could consider `seconds` as units, but it could
> happen that some PHYs would be prefer milliseconds as a unit;
> a maximum of 65533 units should be sufficient
Sorry for missing the discussion on previous version but I don't really
like the idea of leaving the choice of units to PHY. Both for manual
setting and system configuration, it would be IMHO much more convenient
to have the interpretation universal for all NICs.
Seconds as units seem too coarse and maximum of ~18 hours way too big.
Milliseconds would be more practical from granularity point of view,
would maximum of ~65 seconds be sufficient?
Michal Kubecek
> * disable EDPD
>
> Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox