Netdev List
 help / color / mirror / Atom feed
* [PATCH 4/4] ath9k: use timespec64 for tsf_ts
From: Arnd Bergmann @ 2018-06-18 15:11 UTC (permalink / raw)
  To: QCA ath9k Development, Kalle Valo, David S. Miller
  Cc: y2038, Arnd Bergmann, Johannes Berg,
	Toke Høiland-Jørgensen, Ville Syrjälä,
	Wojciech Dubowik, Russell Hu, linux-wireless, netdev,
	linux-kernel
In-Reply-To: <20180618151142.1214422-1-arnd@arndb.de>

ath9k is the last remaining user of the deprecated getrawmonotonic()
interface. There is nothing wrong with this usage, but migrating
to a timespec64 based interface lets us clean up the old API.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/net/wireless/ath/ath9k/ath9k.h   |  4 ++--
 drivers/net/wireless/ath/ath9k/channel.c | 14 +++++++-------
 drivers/net/wireless/ath/ath9k/hw.c      | 10 +++++-----
 drivers/net/wireless/ath/ath9k/hw.h      |  2 +-
 drivers/net/wireless/ath/ath9k/main.c    |  4 ++--
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index ef0de4f1312c..21ba20981a80 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -342,7 +342,7 @@ struct ath_chanctx {
 
 	struct ath_beacon_config beacon;
 	struct ath9k_hw_cal_data caldata;
-	struct timespec tsf_ts;
+	struct timespec64 tsf_ts;
 	u64 tsf_val;
 	u32 last_beacon;
 
@@ -1021,7 +1021,7 @@ struct ath_softc {
 	struct ath_offchannel offchannel;
 	struct ath_chanctx *next_chan;
 	struct completion go_beacon;
-	struct timespec last_event_time;
+	struct timespec64 last_event_time;
 #endif
 
 	unsigned long driver_data;
diff --git a/drivers/net/wireless/ath/ath9k/channel.c b/drivers/net/wireless/ath/ath9k/channel.c
index 1b05b5d7a038..fd61ae4782b6 100644
--- a/drivers/net/wireless/ath/ath9k/channel.c
+++ b/drivers/net/wireless/ath/ath9k/channel.c
@@ -233,9 +233,9 @@ static const char *chanctx_state_string(enum ath_chanctx_state state)
 static u32 chanctx_event_delta(struct ath_softc *sc)
 {
 	u64 ms;
-	struct timespec ts, *old;
+	struct timespec64 ts, *old;
 
-	getrawmonotonic(&ts);
+	ktime_get_raw_ts64(&ts);
 	old = &sc->last_event_time;
 	ms = ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
 	ms -= old->tv_sec * 1000 + old->tv_nsec / 1000000;
@@ -334,7 +334,7 @@ ath_chanctx_get_next(struct ath_softc *sc, struct ath_chanctx *ctx)
 static void ath_chanctx_adjust_tbtt_delta(struct ath_softc *sc)
 {
 	struct ath_chanctx *prev, *cur;
-	struct timespec ts;
+	struct timespec64 ts;
 	u32 cur_tsf, prev_tsf, beacon_int;
 	s32 offset;
 
@@ -346,7 +346,7 @@ static void ath_chanctx_adjust_tbtt_delta(struct ath_softc *sc)
 	if (!prev->switch_after_beacon)
 		return;
 
-	getrawmonotonic(&ts);
+	ktime_get_raw_ts64(&ts);
 	cur_tsf = (u32) cur->tsf_val +
 		  ath9k_hw_get_tsf_offset(&cur->tsf_ts, &ts);
 
@@ -1230,7 +1230,7 @@ void ath_chanctx_set_next(struct ath_softc *sc, bool force)
 {
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
 	struct ath_chanctx *old_ctx;
-	struct timespec ts;
+	struct timespec64 ts;
 	bool measure_time = false;
 	bool send_ps = false;
 	bool queues_stopped = false;
@@ -1260,7 +1260,7 @@ void ath_chanctx_set_next(struct ath_softc *sc, bool force)
 		spin_unlock_bh(&sc->chan_lock);
 
 		if (sc->next_chan == &sc->offchannel.chan) {
-			getrawmonotonic(&ts);
+			ktime_get_raw_ts64(&ts);
 			measure_time = true;
 		}
 
@@ -1277,7 +1277,7 @@ void ath_chanctx_set_next(struct ath_softc *sc, bool force)
 		spin_lock_bh(&sc->chan_lock);
 
 		if (sc->cur_chan != &sc->offchannel.chan) {
-			getrawmonotonic(&sc->cur_chan->tsf_ts);
+			ktime_get_raw_ts64(&sc->cur_chan->tsf_ts);
 			sc->cur_chan->tsf_val = ath9k_hw_gettsf64(sc->sc_ah);
 		}
 	}
diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index e60bea4604e4..e8e1f785bc5c 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -1835,13 +1835,13 @@ static int ath9k_hw_do_fastcc(struct ath_hw *ah, struct ath9k_channel *chan)
 	return -EINVAL;
 }
 
-u32 ath9k_hw_get_tsf_offset(struct timespec *last, struct timespec *cur)
+u32 ath9k_hw_get_tsf_offset(struct timespec64 *last, struct timespec64 *cur)
 {
-	struct timespec ts;
+	struct timespec64 ts;
 	s64 usec;
 
 	if (!cur) {
-		getrawmonotonic(&ts);
+		ktime_get_raw_ts64(&ts);
 		cur = &ts;
 	}
 
@@ -1859,7 +1859,7 @@ int ath9k_hw_reset(struct ath_hw *ah, struct ath9k_channel *chan,
 	u32 saveLedState;
 	u32 saveDefAntenna;
 	u32 macStaId1;
-	struct timespec tsf_ts;
+	struct timespec64 tsf_ts;
 	u32 tsf_offset;
 	u64 tsf = 0;
 	int r;
@@ -1905,7 +1905,7 @@ int ath9k_hw_reset(struct ath_hw *ah, struct ath9k_channel *chan,
 	macStaId1 = REG_READ(ah, AR_STA_ID1) & AR_STA_ID1_BASE_RATE_11B;
 
 	/* Save TSF before chip reset, a cold reset clears it */
-	getrawmonotonic(&tsf_ts);
+	ktime_get_raw_ts64(&tsf_ts);
 	tsf = ath9k_hw_gettsf64(ah);
 
 	saveLedState = REG_READ(ah, AR_CFG_LED) &
diff --git a/drivers/net/wireless/ath/ath9k/hw.h b/drivers/net/wireless/ath/ath9k/hw.h
index 9804a24a2dc0..68956cdc8c9a 100644
--- a/drivers/net/wireless/ath/ath9k/hw.h
+++ b/drivers/net/wireless/ath/ath9k/hw.h
@@ -1060,7 +1060,7 @@ u32 ath9k_hw_gettsf32(struct ath_hw *ah);
 u64 ath9k_hw_gettsf64(struct ath_hw *ah);
 void ath9k_hw_settsf64(struct ath_hw *ah, u64 tsf64);
 void ath9k_hw_reset_tsf(struct ath_hw *ah);
-u32 ath9k_hw_get_tsf_offset(struct timespec *last, struct timespec *cur);
+u32 ath9k_hw_get_tsf_offset(struct timespec64 *last, struct timespec64 *cur);
 void ath9k_hw_set_tsfadjust(struct ath_hw *ah, bool set);
 void ath9k_hw_init_global_settings(struct ath_hw *ah);
 u32 ar9003_get_pll_sqsum_dvc(struct ath_hw *ah);
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index b6663c80e7dd..acb681b096d9 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1865,7 +1865,7 @@ static void ath9k_set_tsf(struct ieee80211_hw *hw,
 	mutex_lock(&sc->mutex);
 	ath9k_ps_wakeup(sc);
 	tsf -= le64_to_cpu(avp->tsf_adjust);
-	getrawmonotonic(&avp->chanctx->tsf_ts);
+	ktime_get_raw_ts64(&avp->chanctx->tsf_ts);
 	if (sc->cur_chan == avp->chanctx)
 		ath9k_hw_settsf64(sc->sc_ah, tsf);
 	avp->chanctx->tsf_val = tsf;
@@ -1881,7 +1881,7 @@ static void ath9k_reset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 	mutex_lock(&sc->mutex);
 
 	ath9k_ps_wakeup(sc);
-	getrawmonotonic(&avp->chanctx->tsf_ts);
+	ktime_get_raw_ts64(&avp->chanctx->tsf_ts);
 	if (sc->cur_chan == avp->chanctx)
 		ath9k_hw_reset_tsf(sc->sc_ah);
 	avp->chanctx->tsf_val = 0;
-- 
2.9.0

^ permalink raw reply related

* [PATCH net-next] liquidio: use ktime_get_real_ts64() instead of getnstimeofday64()
From: Arnd Bergmann @ 2018-06-18 15:17 UTC (permalink / raw)
  To: Derek Chickles, Satanand Burla, Felix Manlunas, Raghu Vatsavayi,
	David S. Miller
  Cc: y2038, Arnd Bergmann, Intiyaz Basha, Rick Farrington,
	Veerasenareddy Burru, Vijaya Mohan Guvva, Denys Vlasenko,
	Weilin Chang, netdev, linux-kernel

The two do the same thing, but we want to have a consistent
naming in the kernel.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c       | 2 +-
 drivers/net/ethernet/cavium/liquidio/octeon_console.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 8a815bb57177..17a64044ac95 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -684,7 +684,7 @@ static void lio_sync_octeon_time(struct work_struct *work)
 	lt = (struct lio_time *)sc->virtdptr;
 
 	/* Get time of the day */
-	getnstimeofday64(&ts);
+	ktime_get_real_ts64(&ts);
 	lt->sec = ts.tv_sec;
 	lt->nsec = ts.tv_nsec;
 	octeon_swap_8B_data((u64 *)lt, (sizeof(struct lio_time)) / 8);
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_console.c b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
index 7f97ae48efed..0cc2338d8d2a 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_console.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
@@ -902,7 +902,7 @@ int octeon_download_firmware(struct octeon_device *oct, const u8 *data,
 	 *
 	 * Octeon always uses UTC time. so timezone information is not sent.
 	 */
-	getnstimeofday64(&ts);
+	ktime_get_real_ts64(&ts);
 	ret = snprintf(boottime, MAX_BOOTTIME_SIZE,
 		       " time_sec=%lld time_nsec=%ld",
 		       (s64)ts.tv_sec, ts.tv_nsec);
-- 
2.9.0

^ permalink raw reply related

* [PATCH net-next] nfp: avoid using getnstimeofday64()
From: Arnd Bergmann @ 2018-06-18 15:20 UTC (permalink / raw)
  To: Jakub Kicinski, David S. Miller
  Cc: y2038, Arnd Bergmann, Simon Horman, John Hurley,
	Pieter Jansen van Vuuren, Jiri Pirko, oss-drivers, netdev,
	linux-kernel

getnstimeofday64 is deprecated in favor of the ktime_get() family of
functions. The direct replacement would be ktime_get_real_ts64(),
but I'm picking the basic ktime_get() instead:

- using a ktime_t simplifies the code compared to timespec64
- using monotonic time instead of real time avoids issues caused
  by a concurrent settimeofday() or during a leap second adjustment.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/net/ethernet/netronome/nfp/flower/main.h     |  2 +-
 drivers/net/ethernet/netronome/nfp/flower/metadata.c | 12 +++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index bbe5764d26cb..ef2114d13387 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -73,7 +73,7 @@ struct nfp_app;
 
 struct nfp_fl_mask_id {
 	struct circ_buf mask_id_free_list;
-	struct timespec64 *last_used;
+	ktime_t *last_used;
 	u8 init_unallocated;
 };
 
diff --git a/drivers/net/ethernet/netronome/nfp/flower/metadata.c b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
index 93fb809f50d1..c098730544b7 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/metadata.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
@@ -158,7 +158,6 @@ static int nfp_release_mask_id(struct nfp_app *app, u8 mask_id)
 {
 	struct nfp_flower_priv *priv = app->priv;
 	struct circ_buf *ring;
-	struct timespec64 now;
 
 	ring = &priv->mask_ids.mask_id_free_list;
 	/* Checking if buffer is full. */
@@ -169,8 +168,7 @@ static int nfp_release_mask_id(struct nfp_app *app, u8 mask_id)
 	ring->head = (ring->head + NFP_FLOWER_MASK_ELEMENT_RS) %
 		     (NFP_FLOWER_MASK_ENTRY_RS * NFP_FLOWER_MASK_ELEMENT_RS);
 
-	getnstimeofday64(&now);
-	priv->mask_ids.last_used[mask_id] = now;
+	priv->mask_ids.last_used[mask_id] = ktime_get();
 
 	return 0;
 }
@@ -178,7 +176,7 @@ static int nfp_release_mask_id(struct nfp_app *app, u8 mask_id)
 static int nfp_mask_alloc(struct nfp_app *app, u8 *mask_id)
 {
 	struct nfp_flower_priv *priv = app->priv;
-	struct timespec64 delta, now;
+	ktime_t reuse_timeout;
 	struct circ_buf *ring;
 	u8 temp_id, freed_id;
 
@@ -198,10 +196,10 @@ static int nfp_mask_alloc(struct nfp_app *app, u8 *mask_id)
 	memcpy(&temp_id, &ring->buf[ring->tail], NFP_FLOWER_MASK_ELEMENT_RS);
 	*mask_id = temp_id;
 
-	getnstimeofday64(&now);
-	delta = timespec64_sub(now, priv->mask_ids.last_used[*mask_id]);
+	reuse_timeout = ktime_add_ns(priv->mask_ids.last_used[*mask_id],
+				     NFP_FL_MASK_REUSE_TIME_NS);
 
-	if (timespec64_to_ns(&delta) < NFP_FL_MASK_REUSE_TIME_NS)
+	if (ktime_before(ktime_get(), reuse_timeout))
 		goto err_not_found;
 
 	memcpy(&ring->buf[ring->tail], &freed_id, NFP_FLOWER_MASK_ELEMENT_RS);
-- 
2.9.0

^ permalink raw reply related

* [PATCH net-next 1/3] xfrm: use time64_t for in-kernel timestamps
From: Arnd Bergmann @ 2018-06-18 15:22 UTC (permalink / raw)
  To: Steffen Klassert, Herbert Xu, David S. Miller
  Cc: y2038, Arnd Bergmann, Florian Westphal, Eric Dumazet,
	Kirill Tkhai, Lorenzo Colitti, Al Viro, netdev, linux-kernel

The lifetime managment uses '__u64' timestamps on the user space
interface, but 'unsigned long' for reading the current time in the kernel
with get_seconds().

While this is probably safe beyond y2038, it will still overflow in 2106,
and the get_seconds() call is deprecated because fo that.

This changes the xfrm time handling to use time64_t consistently, along
with reading the time using the safer ktime_get_real_seconds(). It still
suffers from problems that can happen from a concurrent settimeofday()
call or (to a lesser degree) a leap second update, but since the time
stamps are part of the user API, there is nothing we can do to prevent
that.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 net/xfrm/xfrm_policy.c | 24 ++++++++++++------------
 net/xfrm/xfrm_state.c  | 10 +++++-----
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 5f48251c1319..9f4afca8b4f5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -182,8 +182,8 @@ static inline unsigned long make_jiffies(long secs)
 static void xfrm_policy_timer(struct timer_list *t)
 {
 	struct xfrm_policy *xp = from_timer(xp, t, timer);
-	unsigned long now = get_seconds();
-	long next = LONG_MAX;
+	time64_t now = ktime_get_real_seconds();
+	time64_t next = TIME64_MAX;
 	int warn = 0;
 	int dir;
 
@@ -195,7 +195,7 @@ static void xfrm_policy_timer(struct timer_list *t)
 	dir = xfrm_policy_id2dir(xp->index);
 
 	if (xp->lft.hard_add_expires_seconds) {
-		long tmo = xp->lft.hard_add_expires_seconds +
+		time64_t tmo = xp->lft.hard_add_expires_seconds +
 			xp->curlft.add_time - now;
 		if (tmo <= 0)
 			goto expired;
@@ -203,7 +203,7 @@ static void xfrm_policy_timer(struct timer_list *t)
 			next = tmo;
 	}
 	if (xp->lft.hard_use_expires_seconds) {
-		long tmo = xp->lft.hard_use_expires_seconds +
+		time64_t tmo = xp->lft.hard_use_expires_seconds +
 			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
 		if (tmo <= 0)
 			goto expired;
@@ -211,7 +211,7 @@ static void xfrm_policy_timer(struct timer_list *t)
 			next = tmo;
 	}
 	if (xp->lft.soft_add_expires_seconds) {
-		long tmo = xp->lft.soft_add_expires_seconds +
+		time64_t tmo = xp->lft.soft_add_expires_seconds +
 			xp->curlft.add_time - now;
 		if (tmo <= 0) {
 			warn = 1;
@@ -221,7 +221,7 @@ static void xfrm_policy_timer(struct timer_list *t)
 			next = tmo;
 	}
 	if (xp->lft.soft_use_expires_seconds) {
-		long tmo = xp->lft.soft_use_expires_seconds +
+		time64_t tmo = xp->lft.soft_use_expires_seconds +
 			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
 		if (tmo <= 0) {
 			warn = 1;
@@ -233,7 +233,7 @@ static void xfrm_policy_timer(struct timer_list *t)
 
 	if (warn)
 		km_policy_expired(xp, dir, 0, 0);
-	if (next != LONG_MAX &&
+	if (next != TIME64_MAX &&
 	    !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
 		xfrm_pol_hold(xp);
 
@@ -783,7 +783,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	}
 	policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
 	hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
-	policy->curlft.add_time = get_seconds();
+	policy->curlft.add_time = ktime_get_real_seconds();
 	policy->curlft.use_time = 0;
 	if (!mod_timer(&policy->timer, jiffies + HZ))
 		xfrm_pol_hold(policy);
@@ -1268,7 +1268,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 	old_pol = rcu_dereference_protected(sk->sk_policy[dir],
 				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
 	if (pol) {
-		pol->curlft.add_time = get_seconds();
+		pol->curlft.add_time = ktime_get_real_seconds();
 		pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
 		xfrm_sk_policy_link(pol, dir);
 	}
@@ -2234,7 +2234,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 	}
 
 	for (i = 0; i < num_pols; i++)
-		pols[i]->curlft.use_time = get_seconds();
+		pols[i]->curlft.use_time = ktime_get_real_seconds();
 
 	if (num_xfrms < 0) {
 		/* Prohibit the flow */
@@ -2446,7 +2446,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		return 1;
 	}
 
-	pol->curlft.use_time = get_seconds();
+	pol->curlft.use_time = ktime_get_real_seconds();
 
 	pols[0] = pol;
 	npols++;
@@ -2460,7 +2460,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 				XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
 				return 0;
 			}
-			pols[1]->curlft.use_time = get_seconds();
+			pols[1]->curlft.use_time = ktime_get_real_seconds();
 			npols++;
 		}
 	}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 8308281f3253..1c2b6fdd9518 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -475,8 +475,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
 {
 	struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
 	struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
-	unsigned long now = get_seconds();
-	long next = LONG_MAX;
+	time64_t now = ktime_get_real_seconds();
+	time64_t next = TIME64_MAX;
 	int warn = 0;
 	int err = 0;
 
@@ -537,7 +537,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
 	if (warn)
 		km_state_expired(x, 0, 0);
 resched:
-	if (next != LONG_MAX) {
+	if (next != TIME64_MAX) {
 		tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
 	}
 
@@ -577,7 +577,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 		tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
 					CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
 		timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0);
-		x->curlft.add_time = get_seconds();
+		x->curlft.add_time = ktime_get_real_seconds();
 		x->lft.soft_byte_limit = XFRM_INF;
 		x->lft.soft_packet_limit = XFRM_INF;
 		x->lft.hard_byte_limit = XFRM_INF;
@@ -1571,7 +1571,7 @@ EXPORT_SYMBOL(xfrm_state_update);
 int xfrm_state_check_expire(struct xfrm_state *x)
 {
 	if (!x->curlft.use_time)
-		x->curlft.use_time = get_seconds();
+		x->curlft.use_time = ktime_get_real_seconds();
 
 	if (x->curlft.bytes >= x->lft.hard_byte_limit ||
 	    x->curlft.packets >= x->lft.hard_packet_limit) {
-- 
2.9.0

^ permalink raw reply related

* [PATCH net-next 2/3] ipv6: xfrm: use 64-bit timestamps
From: Arnd Bergmann @ 2018-06-18 15:22 UTC (permalink / raw)
  To: Steffen Klassert, Herbert Xu, David S. Miller, Alexey Kuznetsov,
	Hideaki YOSHIFUJI
  Cc: y2038, Arnd Bergmann, Alexey Dobriyan, Hans Liljestrand,
	Kees Cook, Reshetova, Elena, Florian Westphal, netdev,
	linux-kernel
In-Reply-To: <20180618152242.1566661-1-arnd@arndb.de>

get_seconds() is deprecated because it can overflow on 32-bit
architectures.  For the xfrm_state->lastused member, we treat the data
as a 64-bit number already, so we just need to use the right accessor
that works on both 32-bit and 64-bit machines.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/net/xfrm.h       | 2 +-
 net/ipv6/xfrm6_mode_ro.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 557122846e0e..d9031415402f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -225,7 +225,7 @@ struct xfrm_state {
 	long		saved_tmo;
 
 	/* Last used time */
-	unsigned long		lastused;
+	time64_t		lastused;
 
 	struct page_frag xfrag;
 
diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c
index 07d36573f50b..da28e4407b8f 100644
--- a/net/ipv6/xfrm6_mode_ro.c
+++ b/net/ipv6/xfrm6_mode_ro.c
@@ -55,7 +55,7 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
 	__skb_pull(skb, hdr_len);
 	memmove(ipv6_hdr(skb), iph, hdr_len);
 
-	x->lastused = get_seconds();
+	x->lastused = ktime_get_real_seconds();
 
 	return 0;
 }
-- 
2.9.0

^ permalink raw reply related

* [PATCH net-next 3/3] tcp: use monotonic timestamps for PAWS
From: Arnd Bergmann @ 2018-06-18 15:22 UTC (permalink / raw)
  To: Harsh Jain, Herbert Xu, David S. Miller, Eric Dumazet,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, Atul Gupta, Arnd Bergmann,
	Gustavo A. R. Silva
  Cc: y2038, Michael Werner, Neal Cardwell, Yuchung Cheng,
	Soheil Hassas Yeganeh, Florian Westphal, Christoph Paasch,
	Lawrence Brakmo, Ursula Braun, Priyaranjan Jha,
	Alexei Starovoitov, David Ahern, Ivan Delalande,
	Maciej Żenczykowski, linux-crypto, linux-kernel, netdev
In-Reply-To: <20180618152242.1566661-1-arnd@arndb.de>

Using get_seconds() for timestamps is deprecated since it can lead
to overflows on 32-bit systems. While the interface generally doesn't
overflow until year 2106, the specific implementation of the TCP PAWS
algorithm breaks in 2038 when the intermediate signed 32-bit timestamps
overflow.

A related problem is that the local timestamps in CLOCK_REALTIME form
lead to unexpected behavior when settimeofday is called to set the system
clock backwards or forwards by more than 24 days.

While the first problem could be solved by using an overflow-safe method
of comparing the timestamps, a nicer solution is to use a monotonic
clocksource with ktime_get_seconds() that simply doesn't overflow (at
least not until 136 years after boot) and that doesn't change during
settimeofday().

To make 32-bit and 64-bit architectures behave the same way here, and
also save a few bytes in the tcp_options_received structure, I'm changing
the type to a 32-bit integer, which is now safe on all architectures.

Finally, the ts_recent_stamp field also (confusingly) gets used to store
a jiffies value in tcp_synq_overflow()/tcp_synq_no_recent_overflow().
This is currently safe, but changing the type to 32-bit requires
some small changes there to keep it working.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/crypto/chelsio/chtls/chtls_cm.c |  2 +-
 include/linux/tcp.h                     |  4 ++--
 include/net/tcp.h                       | 15 ++++++++-------
 net/ipv4/tcp_input.c                    |  2 +-
 net/ipv4/tcp_ipv4.c                     |  2 +-
 net/ipv4/tcp_minisocks.c                |  8 ++++----
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c
index 2bb6f0380758..0997e166ea57 100644
--- a/drivers/crypto/chelsio/chtls/chtls_cm.c
+++ b/drivers/crypto/chelsio/chtls/chtls_cm.c
@@ -1673,7 +1673,7 @@ static void chtls_timewait(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->rcv_nxt++;
-	tp->rx_opt.ts_recent_stamp = get_seconds();
+	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 	tp->srtt_us = 0;
 	tcp_time_wait(sk, TCP_TIME_WAIT, 0);
 }
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 72705eaf4b84..f911b9b09b16 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -89,7 +89,7 @@ struct tcp_sack_block {
 
 struct tcp_options_received {
 /*	PAWS/RTTM data	*/
-	long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
+	int	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
 	u32	ts_recent;	/* Time stamp to echo next		*/
 	u32	rcv_tsval;	/* Time stamp value             	*/
 	u32	rcv_tsecr;	/* Time stamp echo reply        	*/
@@ -425,7 +425,7 @@ struct tcp_timewait_sock {
 	/* The time we sent the last out-of-window ACK: */
 	u32			  tw_last_oow_ack_time;
 
-	long			  tw_ts_recent_stamp;
+	int			  tw_ts_recent_stamp;
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	  *tw_md5_key;
 #endif
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0448e7c5d2b4..f8c32dc36ea1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -471,19 +471,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
  */
 static inline void tcp_synq_overflow(const struct sock *sk)
 {
-	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
-	unsigned long now = jiffies;
+	unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+	unsigned int now = jiffies;
 
-	if (time_after(now, last_overflow + HZ))
+	if (time_after32(now, last_overflow + HZ))
 		tcp_sk(sk)->rx_opt.ts_recent_stamp = now;
 }
 
 /* syncookies: no recent synqueue overflow on this listening socket? */
 static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
 {
-	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+	unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+	unsigned int now = jiffies;
 
-	return time_after(jiffies, last_overflow + TCP_SYNCOOKIE_VALID);
+	return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID);
 }
 
 static inline u32 tcp_cookie_time(void)
@@ -1361,7 +1362,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
 {
 	if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
 		return true;
-	if (unlikely(get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))
+	if (unlikely(ktime_get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))
 		return true;
 	/*
 	 * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,
@@ -1391,7 +1392,7 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
 
 	   However, we can relax time bounds for RST segments to MSL.
 	 */
-	if (rst && get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL)
+	if (rst && ktime_get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL)
 		return false;
 	return true;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 355d3dffd021..0eb314774aec 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3449,7 +3449,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
 static void tcp_store_ts_recent(struct tcp_sock *tp)
 {
 	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
-	tp->rx_opt.ts_recent_stamp = get_seconds();
+	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 }
 
 static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bea17f1e8302..41d03153c5bf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -155,7 +155,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	   and use initial timestamp retrieved from peer table.
 	 */
 	if (tcptw->tw_ts_recent_stamp &&
-	    (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+	    (!twp || (reuse && ktime_get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 		if (tp->write_seq == 0)
 			tp->write_seq = 1;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1dda1341a223..1f652beb79ca 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 		tw->tw_substate	  = TCP_TIME_WAIT;
 		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if (tmp_opt.saw_tstamp) {
-			tcptw->tw_ts_recent_stamp = get_seconds();
+			tcptw->tw_ts_recent_stamp = ktime_get_seconds();
 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
 		}
 
@@ -189,7 +189,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 
 		if (tmp_opt.saw_tstamp) {
 			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
-			tcptw->tw_ts_recent_stamp = get_seconds();
+			tcptw->tw_ts_recent_stamp = ktime_get_seconds();
 		}
 
 		inet_twsk_put(tw);
@@ -534,7 +534,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 
 		if (newtp->rx_opt.tstamp_ok) {
 			newtp->rx_opt.ts_recent = req->ts_recent;
-			newtp->rx_opt.ts_recent_stamp = get_seconds();
+			newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
 		} else {
 			newtp->rx_opt.ts_recent_stamp = 0;
@@ -600,7 +600,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			 * it can be estimated (approximately)
 			 * from another data.
 			 */
-			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+			tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
 			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
 		}
 	}
-- 
2.9.0

^ permalink raw reply related

* Re: Link modes representation in phylib
From: Andrew Lunn @ 2018-06-18 15:40 UTC (permalink / raw)
  To: Maxime Chevallier
  Cc: davem, Russell King - ARM Linux, Florian Fainelli, netdev,
	Antoine Tenart, thomas.petazzoni@bootlin.com, Gregory CLEMENT,
	Miquel Raynal
In-Reply-To: <20180618170224.321f8264@bootlin.com>

On Mon, Jun 18, 2018 at 05:02:24PM +0200, Maxime Chevallier wrote:
> Hello everyone,
> 
> I'm currently working on adding support for 2.5GBaseT on some Marvell
> PHYs (the marvell10g family, including the 88X3310).
> 
> However, phylib doesn't quite support these modes yet. Its stores the
> different supported and advertised modes in u32 fields, which can't
> contain the relevant values for 2500BaseT mode (and all other modes that
> come after the 31st one).

Hi Maxime

Did you look at phylink? I think it already gets this right.  It could
be, any MAC which needs to use > bit 31 should use phylink, not
phylib.

That narrows the problem down to just the PHY drivers. We might be
able to mass convert those. Or maybe we can consider just doing some
conversion work on PHYs which support > 1Gbps?

	   Andrew

^ permalink raw reply

* Re: [PATCH net-next 0/10] xfrm: remove flow cache
From: Kristian Evensen @ 2018-06-18 15:46 UTC (permalink / raw)
  To: Florian Westphal
  Cc: David Miller, Network Development, Steffen Klassert, ilant
In-Reply-To: <CAKfDRXgbA1x9QS7ABiMwKxf11sYqfVggVXskdTNoL-CVDURhRA@mail.gmail.com>

Hi,

On Wed, Jun 13, 2018 at 3:43 PM, Kristian Evensen
<kristian.evensen@gmail.com> wrote:
> Thanks! I will prepare a firmware for one of my devices tonight, start
> testing tomorrow and report back when I have some results.

We have been testing this patch over the weekend and it has a
significant impact on performance. In our tests, the throughput
reduction has been reduced from around -20% to -5%. We also see that
the overall throughput is independent of the number of tunnels, while
before the throughput was reduced as the number of tunnels increased.
We are still running our tests and will report back if we observe
anything new, but so far, so good.

BR,
Kristian

^ permalink raw reply

* Re: [PATCH net] net/ipv6: respect rcu grace period before freeing fib6_info
From: David Ahern @ 2018-06-18 15:49 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller; +Cc: netdev, Eric Dumazet
In-Reply-To: <20180618122431.131265-1-edumazet@google.com>

On 6/18/18 6:24 AM, Eric Dumazet wrote:
> syzbot reported use after free that is caused by fib6_info being
> freed without a proper RCU grace period.
> 

...

> Fixes: a64efe142f5e ("net/ipv6: introduce fib6_info struct and helpers")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: David Ahern <dsahern@gmail.com>
> Reported-by: syzbot+9e6d75e3edef427ee888@syzkaller.appspotmail.com
> ---
>  include/net/ip6_fib.h | 5 +++--
>  net/ipv6/ip6_fib.c    | 5 +++--
>  2 files changed, 6 insertions(+), 4 deletions(-)
> 

I wondered if that was needed when flipping to the new data struct.
Apparently so. Thanks for the patch,

Acked-by: David Ahern <dsahern@gmail.com>
Tested-by: David Ahern <dsahern@gmail.com>

^ permalink raw reply

* Re: [PATCH net-next 3/3] tcp: use monotonic timestamps for PAWS
From: Eric Dumazet @ 2018-06-18 15:59 UTC (permalink / raw)
  To: Arnd Bergmann, Harsh Jain, Herbert Xu, David S. Miller,
	Eric Dumazet, Alexey Kuznetsov, Hideaki YOSHIFUJI, Atul Gupta,
	Gustavo A. R. Silva
  Cc: Lawrence Brakmo, Christoph Paasch, Soheil Hassas Yeganeh, y2038,
	netdev, Ursula Braun, Florian Westphal, Alexei Starovoitov,
	linux-kernel, Priyaranjan Jha, Maciej Żenczykowski,
	Yuchung Cheng, David Ahern, Ivan Delalande, Michael Werner,
	Neal Cardwell, linux-crypto
In-Reply-To: <20180618152242.1566661-3-arnd@arndb.de>



On 06/18/2018 08:22 AM, Arnd Bergmann wrote:
> Using get_seconds() for timestamps is deprecated since it can lead
> to overflows on 32-bit systems. While the interface generally doesn't
> overflow until year 2106, the specific implementation of the TCP PAWS
> algorithm breaks in 2038 when the intermediate signed 32-bit timestamps
> overflow.
>
...

>
>  
>  static inline u32 tcp_cookie_time(void)
> @@ -1361,7 +1362,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
>  {
>  	if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
>  		return true;
> -	if (unlikely(get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))
> +	if (unlikely(ktime_get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))
>  		return true;
>  	/*
>  	 * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,
> @@ -1391,7 +1392,7 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
>  
>  	   However, we can relax time bounds for RST segments to MSL.
>  	 */
> -	if (rst && get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL)
> +	if (rst && ktime_get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL)
>  		return false;
>  	return true;



Please use the time_after32(), since ktime_get_seconds() is time64_t while ts_recent_stamp is int.

Same remark for tcp_twsk_unique()

Lets clean up this stuff, thanks !

_______________________________________________
Y2038 mailing list
Y2038@lists.linaro.org
https://lists.linaro.org/mailman/listinfo/y2038

^ permalink raw reply

* Re: [PATCH 2/5] net: emaclite: Balance braces in else statement
From: Joe Perches @ 2018-06-18 16:03 UTC (permalink / raw)
  To: Radhey Shyam Pandey, davem, andrew, michal.simek
  Cc: netdev, linux-arm-kernel, linux-kernel
In-Reply-To: <1529322610-27215-3-git-send-email-radhey.shyam.pandey@xilinx.com>

On Mon, 2018-06-18 at 17:20 +0530, Radhey Shyam Pandey wrote:
> Remove else as it is not required with if doing a return.
> Fixes below checkpatch warning.

> WARNING: else is not generally useful after a break or return

checkpatch is stupid and doesn't understand code flow.
Always try to improve code flow instead of merely
following brainless instructions from a script.

So:

> diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
[]
> @@ -569,13 +569,11 @@ static void xemaclite_tx_handler(struct net_device *dev)
>  					(u8 *) lp->deferred_skb->data,
>  					lp->deferred_skb->len) != 0)
>  			return;
> -		else {
> -			dev->stats.tx_bytes += lp->deferred_skb->len;
> -			dev_kfree_skb_irq(lp->deferred_skb);
> -			lp->deferred_skb = NULL;
> -			netif_trans_update(dev); /* prevent tx timeout */
> -			netif_wake_queue(dev);
> -		}
> +		dev->stats.tx_bytes += lp->deferred_skb->len;
> +		dev_kfree_skb_irq(lp->deferred_skb);
> +		lp->deferred_skb = NULL;
> +		netif_trans_update(dev); /* prevent tx timeout */
> +		netif_wake_queue(dev);
>  	}
>  }

If you really want to redo this function, perhaps something like:

static void xemaclite_tx_handler(struct net_device *dev)
{
	struct net_local *lp = netdev_priv(dev);

	dev->stats.tx_packets++;

	if (!lp->deferred_skb)
		return;

	if (xemaclite_send_data(lp, (u8 *)lp->deferred_skb->data,
				lp->deferred_skb->len))
		return;

	dev->stats.tx_bytes += lp->deferred_skb->len;
	dev_kfree_skb_irq(lp->deferred_skb);
	lp->deferred_skb = NULL;
	netif_trans_update(dev); /* prevent tx timeout */
	netif_wake_queue(dev);
}
 
> @@ -1052,13 +1050,13 @@ static bool get_bool(struct platform_device *ofdev, const char *s)
>  {
>  	u32 *p = (u32 *)of_get_property(ofdev->dev.of_node, s, NULL);
>  
> -	if (p) {
> +	if (p)
>  		return (bool)*p;
> -	} else {
> -		dev_warn(&ofdev->dev, "Parameter %s not found,"
> +
> +	dev_warn(&ofdev->dev, "Parameter %s not found,"
>  			"defaulting to false\n", s);
> -		return false;
> -	}
> +
> +	return false;
>  }

And this function has backward logic as the failure paths
are the ones that should return early or use a goto.

Perhaps something like:

static bool get_bool(struct platform_device *ofdev, const char *s)
{
	u32 *p = (u32 *)of_get_property(ofdev->dev.of_node, s, NULL);

	if (!p) {
		dev_warn(&ofdev->dev,
			 "Parameter '%s' not found, defaulting to false\n", s);
		return false;
	}

	return *p;
}

^ permalink raw reply

* Re: [RFC v2, net-next, PATCH 0/4] Add switchdev on TI-CPSW
From: Ilias Apalodimas @ 2018-06-18 16:04 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera, f.fainelli, francois.ozog, yogeshs, spatton, Jose.Abreu
In-Reply-To: <20180618150424.GA5865@lunn.ch>

On Mon, Jun 18, 2018 at 05:04:24PM +0200, Andrew Lunn wrote:
> Hi Ilias
> 
> Thanks for removing the CPU port. That helps a lot moving forward.
> 
> > - Multicast testing client-port1(tagged on vlan 100) server-port1
> > switch-config is provided by TI (https://git.ti.com/switch-config)
> > and is used to verify correct switch configuration.
> > 1. switch-config output
> > 	- type: vlan , vid = 100, untag_force = 0x4, reg_mcast = 0x6,
> > 	unreg_mcast = 0x0, member_list = 0x6
> > Server running on sw0p2: iperf -s -u -B 239.1.1.1 -i 1
> > Client running on sw0p1: iperf -c 239.1.1.1 -u -b 990m -f m -i 5 -t 3600
> > No IGMP reaches the CPU port to add MDBs(since CPU does not receive 
> > unregistered multicast as programmed).
> 
> Is this something you can work around? Is there a TCAM you can program
> to detect IGMP and pass the packets to the CPU? Without receiving
> IGMP, multicast is pretty broken.
Yes it's described in example 2. (i'll explain in detail further down)
> 
> If i understand right, a multicast listener running on the CPU should
> work, since you can add an MDB to receive multicast traffic from the
> two ports. Multicast traffic sent from the CPU also works. What does
> not work is IGMP snooping of traffic between the two switch ports. You
> have no access to the IGMP frames, so cannot snoop. So unless you can
> work around that with a TCAM, i think you just have to blindly pass
> all multicast between the two ports.
Yes, if the CPU port is added on the VLAN then unregistered multicast traffic
(and thus IGMP joins) will reach the CPU port and everything will work as
expected. I think we should not consider this as a "problem" as long as it's
descibed properly in Documentation. This switch is excected to support this.

What you describe is essentially what happens on "example 2."
Enabling the unregistered multicat traffic to be directed to the CPU will cover
all use cases that require no user intervention for everything to work. MDBs
will automcatically be added/removed to the hardware and traffic will be
offloaded.
With the current code the user has that possibility, so it's up to him to 
decide what mode of configuration he prefers.

> 
> > Setting on/off and IFF_MULTICAST (on eth0/eth1/br0) will affect registered 
> > multicast masks programmed in the switch(for port1, port2, cpu port
> > respectively).
> 
> > This muct occur before adding VLANs on the interfaces. If you change the
> > flag after the VLAN configuration you need to re-issue the VLAN config 
> > commands. 
> 
> This you should fix. You should be able to get the stack to tell you
> about all the configured VLANs, so you can re-program the switch.
I was planning fixing these via bridge link commands which would get propagated
to the driver for port1/port2 and CPU port. I just didn't find anything
relevant to multicast on bridge commands apart from flooding (which is used 
properly). I think that the proper way to do this is follow the logic that was
introduced by VLANs i.e: 
bridge vlan add dev br0 vid 100 pvid untagged self <---- destined for CPU port
and apply it for multicast/flooding etc.
This requires iproute2 changes first though.

> 
> > - NFS:
> > The only way for NFS to work is by chrooting to a minimal environment when 
> > switch configuration that will affect connectivity is needed.
> 
> You might want to look at the commit history for DSA. Florian added a
> patch which makes NFS root work with DSA. It might give you clues as
> to what you need to add to make it just work.
I'll have a look. Chrooting is rarely needed in our case anyway. It's only
needed when "loss of connectivity" is bound to take place.
> 
>    Andrew

Thanks
Ilias

^ permalink raw reply

* Re: [RFC v2, net-next, PATCH 4/4] net/cpsw_switchdev: add switchdev mode of operation on cpsw driver
From: Andrew Lunn @ 2018-06-18 16:16 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera, f.fainelli, francois.ozog, yogeshs, spatton, Jose.Abreu
In-Reply-To: <1528974690-31600-5-git-send-email-ilias.apalodimas@linaro.org>

> @@ -2711,6 +2789,10 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
>  	if (of_property_read_bool(node, "dual_emac"))
>  		data->switch_mode = CPSW_DUAL_EMAC;
>  
> +	/* switchdev overrides DTS */
> +	if (IS_ENABLED(CONFIG_TI_CPSW_SWITCHDEV))
> +		data->switch_mode = CPSW_SWITCHDEV;
> +

I know you discussed this a bit with Jiri, but i still think if
'dual_mac" is found, you should do dual mac. The DT clearly requests
dual mac, and doing anything else is going to cause confusion.

The device tree binding is ambiguous what should happen when dual-mac
is missing. So i would only enable swithdev mode in that case.

But ideally, it should be a new driver with a new binding.

    Andrew

^ permalink raw reply

* Re: [RFC v2, net-next, PATCH 0/4] Add switchdev on TI-CPSW
From: Andrew Lunn @ 2018-06-18 16:28 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera, f.fainelli, francois.ozog, yogeshs, spatton, Jose.Abreu
In-Reply-To: <20180618160418.GA25068@apalos>

> Yes, if the CPU port is added on the VLAN then unregistered multicast traffic
> (and thus IGMP joins) will reach the CPU port and everything will work as
> expected. I think we should not consider this as a "problem" as long as it's
> descibed properly in Documentation. This switch is excected to support this.

Back to the two e1000e. What would you expect to happen with them?
Either IGMP snooping needs to work, or your don't do snooping at
all.

> What you describe is essentially what happens on "example 2."
> Enabling the unregistered multicat traffic to be directed to the CPU will cover
> all use cases that require no user intervention for everything to work. MDBs
> will automcatically be added/removed to the hardware and traffic will be
> offloaded.
> With the current code the user has that possibility, so it's up to him to 
> decide what mode of configuration he prefers.

So by default, it just needs to work. You can give the user the option
to shoot themselves in the foot, but they need to actively pull the
trigger to blow their own foot off.

> > > Setting on/off and IFF_MULTICAST (on eth0/eth1/br0) will affect registered 
> > > multicast masks programmed in the switch(for port1, port2, cpu port
> > > respectively).
> > 
> > > This muct occur before adding VLANs on the interfaces. If you change the
> > > flag after the VLAN configuration you need to re-issue the VLAN config 
> > > commands. 
> > 
> > This you should fix. You should be able to get the stack to tell you
> > about all the configured VLANs, so you can re-program the switch.
> I was planning fixing these via bridge link commands which would get propagated
> to the driver for port1/port2 and CPU port. I just didn't find anything
> relevant to multicast on bridge commands apart from flooding (which is used 
> properly). I think that the proper way to do this is follow the logic that was
> introduced by VLANs i.e: 
> bridge vlan add dev br0 vid 100 pvid untagged self <---- destined for CPU port
> and apply it for multicast/flooding etc.
> This requires iproute2 changes first though.

No, i think you can do this in the driver. The driver just needs to
ask the stack to tell it about all the VLANs again. Or you walk the
VLAN tables in the hardware and do the programming based on that. I
don't see why user space should be involved at all. What would you
expect with two e1000e's?

       Andrew

^ permalink raw reply

* Re: [RFC v2, net-next, PATCH 0/4] Add switchdev on TI-CPSW
From: Ilias Apalodimas @ 2018-06-18 16:46 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera, f.fainelli, francois.ozog, yogeshs, spatton, Jose.Abreu
In-Reply-To: <20180618162836.GD5865@lunn.ch>

On Mon, Jun 18, 2018 at 06:28:36PM +0200, Andrew Lunn wrote:
> > Yes, if the CPU port is added on the VLAN then unregistered multicast traffic
> > (and thus IGMP joins) will reach the CPU port and everything will work as
> > expected. I think we should not consider this as a "problem" as long as it's
> > descibed properly in Documentation. This switch is excected to support this.
> 
> Back to the two e1000e. What would you expect to happen with them?
> Either IGMP snooping needs to work, or your don't do snooping at
> all.
That's a different use case, you don't have a CPU port on e1000 and it 
"just works". You can't do anything to the card and drop the packet. If you
want to have the same example imagine something like "i filter and drop IGMP
messages on one of the 2 e1000e's on the bridge but i expect IGMP to work".
It's a totally different hardware here were this is an option and for TI 
usecases a valid option.

> 
> > What you describe is essentially what happens on "example 2."
> > Enabling the unregistered multicat traffic to be directed to the CPU will cover
> > all use cases that require no user intervention for everything to work. MDBs
> > will automcatically be added/removed to the hardware and traffic will be
> > offloaded.
> > With the current code the user has that possibility, so it's up to him to 
> > decide what mode of configuration he prefers.
> 
> So by default, it just needs to work. You can give the user the option
> to shoot themselves in the foot, but they need to actively pull the
> trigger to blow their own foot off.
Yes it does by default. I don't consider it "foot shooting" though. 
If we stop thinking about switches connected to user environments and think of
industrial ones, my understanding is that this is a common scenarioa that needs
to be supported.

> 
> > > > Setting on/off and IFF_MULTICAST (on eth0/eth1/br0) will affect registered 
> > > > multicast masks programmed in the switch(for port1, port2, cpu port
> > > > respectively).
> > > 
> > > > This muct occur before adding VLANs on the interfaces. If you change the
> > > > flag after the VLAN configuration you need to re-issue the VLAN config 
> > > > commands. 
> > > 
> > > This you should fix. You should be able to get the stack to tell you
> > > about all the configured VLANs, so you can re-program the switch.
> > I was planning fixing these via bridge link commands which would get propagated
> > to the driver for port1/port2 and CPU port. I just didn't find anything
> > relevant to multicast on bridge commands apart from flooding (which is used 
> > properly). I think that the proper way to do this is follow the logic that was
> > introduced by VLANs i.e: 
> > bridge vlan add dev br0 vid 100 pvid untagged self <---- destined for CPU port
> > and apply it for multicast/flooding etc.
> > This requires iproute2 changes first though.
> 
> No, i think you can do this in the driver. The driver just needs to
> ask the stack to tell it about all the VLANs again. Or you walk the
> VLAN tables in the hardware and do the programming based on that. I
> don't see why user space should be involved at all. What would you
> expect with two e1000e's?
We are pretty much describing the same thing i just thought having a bridge
command for it was more appropriate.
After the user removes the multicast flag for an interface i'll just walk VLANs
and adjust accordingly. It's doable and i'll change it for the patch.


Thanks
Ilias

^ permalink raw reply

* [PATCH v3] bpf: attach type BPF_LIRC_MODE2 should not depend on CONFIG_CGROUP_BPF
From: Sean Young @ 2018-06-18 17:12 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Y Song, Matthias Reichl, linux-media, LKML, Alexei Starovoitov,
	Mauro Carvalho Chehab, netdev, Devin Heitmueller, Quentin Monnet
In-Reply-To: <d2613314-406e-dd7d-1cf0-b5a78a155e3b@iogearbox.net>

If the kernel is compiled with CONFIG_CGROUP_BPF not enabled, it is not
possible to attach, detach or query IR BPF programs to /dev/lircN devices,
making them impossible to use. For embedded devices, it should be possible
to use IR decoding without cgroups or CONFIG_CGROUP_BPF enabled.

This change requires some refactoring, since bpf_prog_{attach,detach,query}
functions are now always compiled, but their code paths for cgroups need
moving out. Rather than a #ifdef CONFIG_CGROUP_BPF in kernel/bpf/syscall.c,
moving them to kernel/bpf/cgroup.c and kernel/bpf/sockmap.c does not
require #ifdefs since that is already conditionally compiled.

Signed-off-by: Sean Young <sean@mess.org>
---
 drivers/media/rc/bpf-lirc.c | 14 +-----
 include/linux/bpf-cgroup.h  | 26 ++++++++++
 include/linux/bpf.h         |  8 +++
 include/linux/bpf_lirc.h    |  5 +-
 kernel/bpf/cgroup.c         | 52 ++++++++++++++++++++
 kernel/bpf/sockmap.c        | 18 +++++++
 kernel/bpf/syscall.c        | 98 ++++++++-----------------------------
 7 files changed, 130 insertions(+), 91 deletions(-)

diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index 40826bba06b6..fcfab6635f9c 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -207,29 +207,19 @@ void lirc_bpf_free(struct rc_dev *rcdev)
 	bpf_prog_array_free(rcdev->raw->progs);
 }
 
-int lirc_prog_attach(const union bpf_attr *attr)
+int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
-	struct bpf_prog *prog;
 	struct rc_dev *rcdev;
 	int ret;
 
 	if (attr->attach_flags)
 		return -EINVAL;
 
-	prog = bpf_prog_get_type(attr->attach_bpf_fd,
-				 BPF_PROG_TYPE_LIRC_MODE2);
-	if (IS_ERR(prog))
-		return PTR_ERR(prog);
-
 	rcdev = rc_dev_get_from_fd(attr->target_fd);
-	if (IS_ERR(rcdev)) {
-		bpf_prog_put(prog);
+	if (IS_ERR(rcdev))
 		return PTR_ERR(rcdev);
-	}
 
 	ret = lirc_bpf_attach(rcdev, prog);
-	if (ret)
-		bpf_prog_put(prog);
 
 	put_device(&rcdev->dev);
 
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 975fb4cf1bb7..79795c5fa7c3 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -188,12 +188,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 									      \
 	__ret;								      \
 })
+int cgroup_bpf_prog_attach(const union bpf_attr *attr,
+			   enum bpf_prog_type ptype, struct bpf_prog *prog);
+int cgroup_bpf_prog_detach(const union bpf_attr *attr,
+			   enum bpf_prog_type ptype);
+int cgroup_bpf_prog_query(const union bpf_attr *attr,
+			  union bpf_attr __user *uattr);
 #else
 
+struct bpf_prog;
 struct cgroup_bpf {};
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
+static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
+					 enum bpf_prog_type ptype,
+					 struct bpf_prog *prog)
+{
+	return -EINVAL;
+}
+
+static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr,
+					 enum bpf_prog_type ptype)
+{
+	return -EINVAL;
+}
+
+static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
+					union bpf_attr __user *uattr)
+{
+	return -EINVAL;
+}
+
 #define cgroup_bpf_enabled (0)
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 995c3b1e59bf..ac4c73d29f96 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -684,6 +684,8 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
 struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
+int sockmap_get_from_fd(const union bpf_attr *attr, int type,
+			struct bpf_prog *prog);
 #else
 static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
 {
@@ -702,6 +704,12 @@ static inline int sock_map_prog(struct bpf_map *map,
 {
 	return -EOPNOTSUPP;
 }
+
+int sockmap_get_from_fd(const union bpf_attr *attr, int type,
+			struct bpf_prog *prog);
+{
+	return -EINVAL;
+}
 #endif
 
 #if defined(CONFIG_XDP_SOCKETS)
diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h
index 5f8a4283092d..9d9ff755ec29 100644
--- a/include/linux/bpf_lirc.h
+++ b/include/linux/bpf_lirc.h
@@ -5,11 +5,12 @@
 #include <uapi/linux/bpf.h>
 
 #ifdef CONFIG_BPF_LIRC_MODE2
-int lirc_prog_attach(const union bpf_attr *attr);
+int lirc_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int lirc_prog_detach(const union bpf_attr *attr);
 int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
 #else
-static inline int lirc_prog_attach(const union bpf_attr *attr)
+static inline int lirc_prog_attach(const union bpf_attr *attr,
+				   struct bpf_prog *prog)
 {
 	return -EINVAL;
 }
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index f7c00bd6f8e4..f0ae8a3d01f9 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -428,6 +428,58 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	return ret;
 }
 
+int cgroup_bpf_prog_attach(const union bpf_attr *attr,
+			   enum bpf_prog_type ptype, struct bpf_prog *prog)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+				attr->attach_flags);
+	cgroup_put(cgrp);
+
+	return ret;
+}
+
+int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+	struct bpf_prog *prog;
+	struct cgroup *cgrp;
+	int ret;
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		prog = NULL;
+
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+	if (prog)
+		bpf_prog_put(prog);
+	cgroup_put(cgrp);
+	return ret;
+}
+
+int cgroup_bpf_prog_query(const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	cgrp = cgroup_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+	ret = cgroup_bpf_query(cgrp, attr, uattr);
+	cgroup_put(cgrp);
+	return ret;
+}
+
 /**
  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socket sending or receiving traffic
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 52a91d816c0e..81d0c55a77aa 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1915,6 +1915,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
 	return 0;
 }
 
+int sockmap_get_from_fd(const union bpf_attr *attr, int type,
+			struct bpf_prog *prog)
+{
+	int ufd = attr->target_fd;
+	struct bpf_map *map;
+	struct fd f;
+	int err;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = sock_map_prog(map, prog, attr->attach_type);
+	fdput(f);
+	return err;
+}
+
 static void *sock_map_lookup(struct bpf_map *map, void *key)
 {
 	return NULL;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0fa20624707f..93993c03c9ac 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1489,8 +1489,6 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 	return err;
 }
 
-#ifdef CONFIG_CGROUP_BPF
-
 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 					     enum bpf_attach_type attach_type)
 {
@@ -1505,40 +1503,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 
 #define BPF_PROG_ATTACH_LAST_FIELD attach_flags
 
-static int sockmap_get_from_fd(const union bpf_attr *attr,
-			       int type, bool attach)
-{
-	struct bpf_prog *prog = NULL;
-	int ufd = attr->target_fd;
-	struct bpf_map *map;
-	struct fd f;
-	int err;
-
-	f = fdget(ufd);
-	map = __bpf_map_get(f);
-	if (IS_ERR(map))
-		return PTR_ERR(map);
-
-	if (attach) {
-		prog = bpf_prog_get_type(attr->attach_bpf_fd, type);
-		if (IS_ERR(prog)) {
-			fdput(f);
-			return PTR_ERR(prog);
-		}
-	}
-
-	err = sock_map_prog(map, prog, attr->attach_type);
-	if (err) {
-		fdput(f);
-		if (prog)
-			bpf_prog_put(prog);
-		return err;
-	}
-
-	fdput(f);
-	return 0;
-}
-
 #define BPF_F_ATTACH_MASK \
 	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
 
@@ -1546,7 +1510,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
 	struct bpf_prog *prog;
-	struct cgroup *cgrp;
 	int ret;
 
 	if (!capable(CAP_NET_ADMIN))
@@ -1583,12 +1546,15 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 		break;
 	case BPF_SK_MSG_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true);
+		ptype = BPF_PROG_TYPE_SK_MSG;
+		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+		ptype = BPF_PROG_TYPE_SK_SKB;
+		break;
 	case BPF_LIRC_MODE2:
-		return lirc_prog_attach(attr);
+		ptype = BPF_PROG_TYPE_LIRC_MODE2;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1602,17 +1568,20 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return -EINVAL;
 	}
 
-	cgrp = cgroup_get_from_fd(attr->target_fd);
-	if (IS_ERR(cgrp)) {
-		bpf_prog_put(prog);
-		return PTR_ERR(cgrp);
+	switch (ptype) {
+	case BPF_PROG_TYPE_SK_SKB:
+	case BPF_PROG_TYPE_SK_MSG:
+		ret = sockmap_get_from_fd(attr, ptype, prog);
+		break;
+	case BPF_PROG_TYPE_LIRC_MODE2:
+		ret = lirc_prog_attach(attr, prog);
+		break;
+	default:
+		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 	}
 
-	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
-				attr->attach_flags);
 	if (ret)
 		bpf_prog_put(prog);
-	cgroup_put(cgrp);
 
 	return ret;
 }
@@ -1622,9 +1591,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
-	struct bpf_prog *prog;
-	struct cgroup *cgrp;
-	int ret;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -1657,29 +1623,17 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 		break;
 	case BPF_SK_MSG_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false);
+		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}
 
-	cgrp = cgroup_get_from_fd(attr->target_fd);
-	if (IS_ERR(cgrp))
-		return PTR_ERR(cgrp);
-
-	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
-	if (IS_ERR(prog))
-		prog = NULL;
-
-	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
-	if (prog)
-		bpf_prog_put(prog);
-	cgroup_put(cgrp);
-	return ret;
+	return cgroup_bpf_prog_detach(attr, ptype);
 }
 
 #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
@@ -1687,9 +1641,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 static int bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr)
 {
-	struct cgroup *cgrp;
-	int ret;
-
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_QUERY))
@@ -1717,14 +1668,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	default:
 		return -EINVAL;
 	}
-	cgrp = cgroup_get_from_fd(attr->query.target_fd);
-	if (IS_ERR(cgrp))
-		return PTR_ERR(cgrp);
-	ret = cgroup_bpf_query(cgrp, attr, uattr);
-	cgroup_put(cgrp);
-	return ret;
+
+	return cgroup_bpf_prog_query(attr, uattr);
 }
-#endif /* CONFIG_CGROUP_BPF */
 
 #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
 
@@ -2371,7 +2317,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_OBJ_GET:
 		err = bpf_obj_get(&attr);
 		break;
-#ifdef CONFIG_CGROUP_BPF
 	case BPF_PROG_ATTACH:
 		err = bpf_prog_attach(&attr);
 		break;
@@ -2381,7 +2326,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_QUERY:
 		err = bpf_prog_query(&attr, uattr);
 		break;
-#endif
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr);
 		break;
-- 
2.17.1

^ permalink raw reply related

* Re: [RFC v2, net-next, PATCH 0/4] Add switchdev on TI-CPSW
From: Andrew Lunn @ 2018-06-18 17:30 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera, f.fainelli, francois.ozog, yogeshs, spatton, Jose.Abreu
In-Reply-To: <20180618164602.GA26411@apalos>

On Mon, Jun 18, 2018 at 07:46:02PM +0300, Ilias Apalodimas wrote:
> On Mon, Jun 18, 2018 at 06:28:36PM +0200, Andrew Lunn wrote:
> > > Yes, if the CPU port is added on the VLAN then unregistered multicast traffic
> > > (and thus IGMP joins) will reach the CPU port and everything will work as
> > > expected. I think we should not consider this as a "problem" as long as it's
> > > descibed properly in Documentation. This switch is excected to support this.
> > 
> > Back to the two e1000e. What would you expect to happen with them?
> > Either IGMP snooping needs to work, or your don't do snooping at
> > all.
> That's a different use case

I disagree. That is the exact same use case. I add ports to a bridge
and i expect the bridge to either do IGMP snooping, or just forward
all multicast. That is the users expectations. That is how the Linux
network stack works. If the hardware has limitations you want to try
to hide them from the user.

> > So by default, it just needs to work. You can give the user the option
> > to shoot themselves in the foot, but they need to actively pull the
> > trigger to blow their own foot off.

> Yes it does by default. I don't consider it "foot shooting" though. 
> If we stop thinking about switches connected to user environments 

I never think about switches. I think about a block of acceleration
hardware, which i try to offload Linux networking to. And if the
hardware cannot accelerate Linux network functions properly, i don't
try to offload it. That way it always operates in the same way, and
the user expectations are clear.

    Andrew

^ permalink raw reply

* Re: [RFC v2, net-next, PATCH 0/4] Add switchdev on TI-CPSW
From: Ilias Apalodimas @ 2018-06-18 17:49 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera, f.fainelli, francois.ozog, yogeshs, spatton, Jose.Abreu
In-Reply-To: <20180618173025.GF5865@lunn.ch>

On Mon, Jun 18, 2018 at 07:30:25PM +0200, Andrew Lunn wrote:
> On Mon, Jun 18, 2018 at 07:46:02PM +0300, Ilias Apalodimas wrote:
> > On Mon, Jun 18, 2018 at 06:28:36PM +0200, Andrew Lunn wrote:
> > > > Yes, if the CPU port is added on the VLAN then unregistered multicast traffic
> > > > (and thus IGMP joins) will reach the CPU port and everything will work as
> > > > expected. I think we should not consider this as a "problem" as long as it's
> > > > descibed properly in Documentation. This switch is excected to support this.
> > > 
> > > Back to the two e1000e. What would you expect to happen with them?
> > > Either IGMP snooping needs to work, or your don't do snooping at
> > > all.
> > That's a different use case
> 
> I disagree. That is the exact same use case. I add ports to a bridge
> and i expect the bridge to either do IGMP snooping, or just forward
> all multicast. That is the users expectations. That is how the Linux
> network stack works. If the hardware has limitations you want to try
> to hide them from the user.
Why is this a limitation? Isn't it proper functionality?
If you configure the CPU port as "passthrough" (which again is
the default) then everything works just like e1000e. The user doesn't have to do
anything at all, MDBs are added/deleted based on proper timers/joins etc.
If the user chooses to use the cpu port as a kind of internal L-2 filter, 
that's up to him, but having hardware do the filtering for you isn't something 
i'd call a limitation.

I am not sure what's the case here though. The hardware operates as you want
by defaulti. As added functionality the user can, if he chooses to, add the 
MDBs manually instead of having some piece of code do that for him. 
This was clearly described in the first RFC since it was the only reason to add
a CPU port. Is there a problem with the user controlling these capabilities of
the hardware?
> > > So by default, it just needs to work. You can give the user the option
> > > to shoot themselves in the foot, but they need to actively pull the
> > > trigger to blow their own foot off.
> 
> > Yes it does by default. I don't consider it "foot shooting" though. 
> > If we stop thinking about switches connected to user environments 
> 
> I never think about switches. I think about a block of acceleration
> hardware, which i try to offload Linux networking to.  And if the
> hardware cannot accelerate Linux network functions properly, i don't
> try to offload it. That way it always operates in the same way, and
> the user expectations are clear.
> 
>     Andrew
The acceleration block is working properly here. The user has the ability to
instruct the acceleration block not to accelerate all the traffic but specific
cases he chooses to. Isn't that a valid use case since the hardware supports
that ?

Regards
Ilias

^ permalink raw reply

* Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"
From: Andreas Schwab @ 2018-06-18 17:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mathieu Malaterre, David S. Miller, Eric Dumazet, LKML,
	Christophe LEROY, Meelis Roos, netdev, linuxppc-dev
In-Reply-To: <e1a9b126-3226-72d0-82b2-b69ca5f59ccb@gmail.com>

On Jun 17 2018, Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Oh this is silly, please try :
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index c642304f178ce0a4e1358d59e45032a39f76fb3f..54dd9c18ecad817812898d6f335e1794a07dabbe 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1845,10 +1845,9 @@ EXPORT_SYMBOL(___pskb_trim);
>  int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
>  {
>         if (skb->ip_summed == CHECKSUM_COMPLETE) {
> -               int delta = skb->len - len;
> +               __wsum csumdiff = skb_checksum(skb, len, skb->len - len, 0);
>  
> -               skb->csum = csum_sub(skb->csum,
> -                                    skb_checksum(skb, len, delta, 0));
> +               skb->csum = csum_block_sub(skb->csum, csumdiff, len);
>         }
>         return __pskb_trim(skb, len);
>  }

That doesn't help either.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply

* array bounds warning in xfrm_output_resume
From: David Ahern @ 2018-06-18 18:10 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netdev@vger.kernel.org

Florian:

I am seeing this warning:

$ make O=kbuild/perf -j 24 -s
In file included from /home/dsa/kernel-3.git/include/linux/kernel.h:10:0,
                 from /home/dsa/kernel-3.git/include/linux/list.h:9,
                 from /home/dsa/kernel-3.git/include/linux/module.h:9,
                 from /home/dsa/kernel-3.git/net/xfrm/xfrm_output.c:13:
/home/dsa/kernel-3.git/net/xfrm/xfrm_output.c: In function
‘xfrm_output_resume’:
/home/dsa/kernel-3.git/include/linux/compiler.h:252:20: warning: array
subscript is above array bounds [-Warray-bounds]
   __read_once_size(&(x), __u.__c, sizeof(x));  \
                    ^~~~
/home/dsa/kernel-3.git/include/linux/compiler.h:258:22: note: in
expansion of macro ‘__READ_ONCE’
 #define READ_ONCE(x) __READ_ONCE(x, 1)
                      ^~~~~~~~~~~
/home/dsa/kernel-3.git/include/linux/rcupdate.h:350:48: note: in
expansion of macro ‘READ_ONCE’
  typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \
                                                ^~~~~~~~~
/home/dsa/kernel-3.git/include/linux/rcupdate.h:487:2: note: in
expansion of macro ‘__rcu_dereference_check’
  __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu)
  ^~~~~~~~~~~~~~~~~~~~~~~
/home/dsa/kernel-3.git/include/linux/rcupdate.h:545:28: note: in
expansion of macro ‘rcu_dereference_check’
 #define rcu_dereference(p) rcu_dereference_check(p, 0)
                            ^~~~~~~~~~~~~~~~~~~~~
/home/dsa/kernel-3.git/include/linux/netfilter.h:218:15: note: in
expansion of macro ‘rcu_dereference’
   hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
               ^~~~~~~~~~~~~~~

Line in question is the nf_hook in xfrm_output_resume.
NF_INET_POST_ROUTING = 4 which is greater than NF_ARP_NUMHOOKS = 3

I believe ef57170bbfdd6 is the commit that introduced the warning

^ permalink raw reply

* Re: [PATCH bpf-net] bpf: Change bpf_fib_lookup to return lookup status
From: Martin KaFai Lau @ 2018-06-18 18:11 UTC (permalink / raw)
  To: dsahern; +Cc: netdev, borkmann, ast, davem, David Ahern
In-Reply-To: <20180617151819.6824-1-dsahern@kernel.org>

On Sun, Jun 17, 2018 at 08:18:19AM -0700, dsahern@kernel.org wrote:
> From: David Ahern <dsahern@gmail.com>
> 
> For ACLs implemented using either FIB rules or FIB entries, the BPF
> program needs the FIB lookup status to be able to drop the packet.
Except BPF_FIB_LKUP_RET_SUCCESS and BPF_FIB_LKUP_RET_NO_NEIGH,  can you
give an example on how the xdp_prog may decide XDP_PASS vs XDP_DROP based
on other BPF_FIB_LKUP_RET_*?

> Since the bpf_fib_lookup API has not reached a released kernel yet,
> change the return code to contain an encoding of the FIB lookup
> result and return the nexthop device index in the params struct.
> 
> In addition, inform the BPF program of any post FIB lookup reason as
> to why the packet needs to go up the stack.
> 
> Update the sample program per the change in API.
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>
> ---
>  include/uapi/linux/bpf.h   | 28 ++++++++++++++----
>  net/core/filter.c          | 74 ++++++++++++++++++++++++++++++++--------------
>  samples/bpf/xdp_fwd_kern.c |  8 ++---
>  3 files changed, 78 insertions(+), 32 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 59b19b6a40d7..ceb80071c341 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1857,7 +1857,8 @@ union bpf_attr {
>   *		is resolved), the nexthop address is returned in ipv4_dst
>   *		or ipv6_dst based on family, smac is set to mac address of
>   *		egress device, dmac is set to nexthop mac address, rt_metric
> - *		is set to metric from route (IPv4/IPv6 only).
> + *		is set to metric from route (IPv4/IPv6 only), and ifindex
> + *		is set to the device index of the nexthop from the FIB lookup.
>   *
>   *             *plen* argument is the size of the passed in struct.
>   *             *flags* argument can be a combination of one or more of the
> @@ -1873,9 +1874,9 @@ union bpf_attr {
>   *             *ctx* is either **struct xdp_md** for XDP programs or
>   *             **struct sk_buff** tc cls_act programs.
>   *     Return
> - *             Egress device index on success, 0 if packet needs to continue
> - *             up the stack for further processing or a negative error in case
> - *             of failure.
> + *		< 0 if any input argument is invalid
> + *		  0 on success (packet is forwarded and nexthop neighbor exists)
> + *		> 0 one of BPF_FIB_LKUP_RET_ codes on FIB lookup response
>   *
>   * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
>   *	Description
> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
>  
> +enum {
> +	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
> +	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed */
> +	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
> +	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed */
> +	BPF_FIB_LKUP_RET_NOT_FWDED,    /* pkt is not forwardded */
BPF_FIB_LKUP_RET_NOT_FWDED is a catch all?

> +	BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
> +	BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires unsupported encap */
> +	BPF_FIB_LKUP_RET_NO_NHDEV,     /* nh device does not exist */
> +	BPF_FIB_LKUP_RET_NO_NEIGH,     /* no neigh entry for nh */
> +	BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* pkt too big to fwd */
> +};
> +
>  struct bpf_fib_lookup {
>  	/* input:  network family for lookup (AF_INET, AF_INET6)
>  	 * output: network family of egress nexthop
> @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup {
>  
>  	/* total length of packet from network header - used for MTU check */
>  	__u16	tot_len;
> -	__u32	ifindex;  /* L3 device index for lookup */
> +
> +	/* input: L3 device index for lookup
> +	 * output: nexthop device index from FIB lookup
> +	 */
> +	__u32	ifindex;
>  
>  	union {
>  		/* inputs to lookup */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e7f12e9f598c..e758ca487878 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
>  	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
>  	params->h_vlan_TCI = 0;
>  	params->h_vlan_proto = 0;
> +	params->ifindex = dev->ifindex;
>  
> -	return dev->ifindex;
> +	return 0;
>  }
>  #endif
>  
> @@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	/* verify forwarding is enabled on this interface */
>  	in_dev = __in_dev_get_rcu(dev);
>  	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
> -		return 0;
> +		return BPF_FIB_LKUP_RET_FWD_DISABLED;
>  
>  	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
>  		fl4.flowi4_iif = 1;
> @@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  
>  		tb = fib_get_table(net, tbid);
>  		if (unlikely(!tb))
> -			return 0;
> +			return BPF_FIB_LKUP_RET_NOT_FWDED;
>  
>  		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
>  	} else {
> @@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
>  	}
>  
> -	if (err || res.type != RTN_UNICAST)
> -		return 0;
> +	if (err) {
> +		/* map fib lookup errors to RTN_ type */
> +		if (err == -EINVAL)
> +			return BPF_FIB_LKUP_RET_BLACKHOLE;
> +		if (err == -EHOSTUNREACH)
> +			return BPF_FIB_LKUP_RET_UNREACHABLE;
> +		if (err == -EACCES)
> +			return BPF_FIB_LKUP_RET_PROHIBIT;
> +
> +		return BPF_FIB_LKUP_RET_NOT_FWDED;
> +	}
> +
> +	if (res.type != RTN_UNICAST)
> +		return BPF_FIB_LKUP_RET_NOT_FWDED;
>  
>  	if (res.fi->fib_nhs > 1)
>  		fib_select_path(net, &res, &fl4, NULL);
> @@ -4144,18 +4157,18 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	if (check_mtu) {
>  		mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
>  		if (params->tot_len > mtu)
> -			return 0;
> +			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
>  	}
>  
>  	nh = &res.fi->fib_nh[res.nh_sel];
>  
>  	/* do not handle lwt encaps right now */
>  	if (nh->nh_lwtstate)
> -		return 0;
> +		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
>  
>  	dev = nh->nh_dev;
>  	if (unlikely(!dev))
> -		return 0;
> +		return BPF_FIB_LKUP_RET_NO_NHDEV;
>  
>  	if (nh->nh_gw)
>  		params->ipv4_dst = nh->nh_gw;
> @@ -4166,10 +4179,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	 * rcu_read_lock_bh is not needed here
>  	 */
>  	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
> -	if (neigh)
> -		return bpf_fib_set_fwd_params(params, neigh, dev);
> +	if (!neigh)
> +		return BPF_FIB_LKUP_RET_NO_NEIGH;
>  
> -	return 0;
> +	return bpf_fib_set_fwd_params(params, neigh, dev);
>  }
>  #endif
>  
> @@ -4190,7 +4203,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  
>  	/* link local addresses are never forwarded */
>  	if (rt6_need_strict(dst) || rt6_need_strict(src))
> -		return 0;
> +		return BPF_FIB_LKUP_RET_NOT_FWDED;
>  
>  	dev = dev_get_by_index_rcu(net, params->ifindex);
>  	if (unlikely(!dev))
> @@ -4198,7 +4211,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  
>  	idev = __in6_dev_get_safely(dev);
>  	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
> -		return 0;
> +		return BPF_FIB_LKUP_RET_FWD_DISABLED;
>  
>  	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
>  		fl6.flowi6_iif = 1;
> @@ -4225,7 +4238,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  
>  		tb = ipv6_stub->fib6_get_table(net, tbid);
>  		if (unlikely(!tb))
> -			return 0;
> +			return BPF_FIB_LKUP_RET_NOT_FWDED;
>  
>  		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
>  	} else {
> @@ -4238,11 +4251,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	}
>  
>  	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
> -		return 0;
> +		return BPF_FIB_LKUP_RET_NOT_FWDED;
> +
> +	if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
> +		switch (f6i->fib6_type) {
> +		case RTN_BLACKHOLE:
> +			return BPF_FIB_LKUP_RET_BLACKHOLE;
> +		case RTN_UNREACHABLE:
> +			return BPF_FIB_LKUP_RET_UNREACHABLE;
> +		case RTN_PROHIBIT:
> +			return BPF_FIB_LKUP_RET_PROHIBIT;
> +		default:
> +			return BPF_FIB_LKUP_RET_NOT_FWDED;
> +		}
> +	}
>  
> -	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
> -	    f6i->fib6_type != RTN_UNICAST))
> -		return 0;
> +	if (f6i->fib6_type != RTN_UNICAST)
> +		return BPF_FIB_LKUP_RET_NOT_FWDED;
>  
>  	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
>  		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	if (check_mtu) {
>  		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
>  		if (params->tot_len > mtu)
> -			return 0;
> +			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
>  	}
>  
>  	if (f6i->fib6_nh.nh_lwtstate)
> -		return 0;
> +		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
>  
>  	if (f6i->fib6_flags & RTF_GATEWAY)
>  		*dst = f6i->fib6_nh.nh_gw;
>  
>  	dev = f6i->fib6_nh.nh_dev;
> +	if (unlikely(!dev))
> +		return BPF_FIB_LKUP_RET_NO_NHDEV;
Is this a bug fix?

> +
>  	params->rt_metric = f6i->fib6_metric;
>  
>  	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
> @@ -4270,10 +4298,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>  	 */
>  	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
>  				      ndisc_hashfn, dst, dev);
> -	if (neigh)
> -		return bpf_fib_set_fwd_params(params, neigh, dev);
> +	if (!neigh)
> +		return BPF_FIB_LKUP_RET_NO_NEIGH;
>  
> -	return 0;
> +	return bpf_fib_set_fwd_params(params, neigh, dev);
>  }
>  #endif
>  
> diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
> index 6673cdb9f55c..a7e94e7ff87d 100644
> --- a/samples/bpf/xdp_fwd_kern.c
> +++ b/samples/bpf/xdp_fwd_kern.c
> @@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
>  	struct ethhdr *eth = data;
>  	struct ipv6hdr *ip6h;
>  	struct iphdr *iph;
> -	int out_index;
>  	u16 h_proto;
>  	u64 nh_off;
> +	int rc;
>  
>  	nh_off = sizeof(*eth);
>  	if (data + nh_off > data_end)
> @@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
>  
>  	fib_params.ifindex = ctx->ingress_ifindex;
>  
> -	out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
> +	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
>  
>  	/* verify egress index has xdp support
>  	 * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
> @@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
>  	 * NOTE: without verification that egress index supports XDP
>  	 *       forwarding packets are dropped.
>  	 */
> -	if (out_index > 0) {
> +	if (rc == 0) {
>  		if (h_proto == htons(ETH_P_IP))
>  			ip_decrease_ttl(iph);
>  		else if (h_proto == htons(ETH_P_IPV6))
> @@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
>  
>  		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
>  		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
> -		return bpf_redirect_map(&tx_port, out_index, 0);
> +		return bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
>  	}
>  
>  	return XDP_PASS;
> -- 
> 2.11.0
> 

^ permalink raw reply

* Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"
From: Eric Dumazet @ 2018-06-18 18:18 UTC (permalink / raw)
  To: Andreas Schwab, Eric Dumazet
  Cc: Mathieu Malaterre, David S. Miller, Eric Dumazet, LKML,
	Christophe LEROY, Meelis Roos, netdev, linuxppc-dev
In-Reply-To: <87o9g8geu0.fsf@igel.home>



On 06/18/2018 10:54 AM, Andreas Schwab wrote:
> On Jun 17 2018, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
>> Oh this is silly, please try :
>>
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index c642304f178ce0a4e1358d59e45032a39f76fb3f..54dd9c18ecad817812898d6f335e1794a07dabbe 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -1845,10 +1845,9 @@ EXPORT_SYMBOL(___pskb_trim);
>>  int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
>>  {
>>         if (skb->ip_summed == CHECKSUM_COMPLETE) {
>> -               int delta = skb->len - len;
>> +               __wsum csumdiff = skb_checksum(skb, len, skb->len - len, 0);
>>  
>> -               skb->csum = csum_sub(skb->csum,
>> -                                    skb_checksum(skb, len, delta, 0));
>> +               skb->csum = csum_block_sub(skb->csum, csumdiff, len);
>>         }
>>         return __pskb_trim(skb, len);
>>  }
> 
> That doesn't help either.
> 
> Andreas.
> 

Then maybe NIC provided csum is not correct.

It does not compute a checksum on all the frame, but part of it.

You could use this patch to double check.

diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c
index 7a16d40a72d13cf1d522e8a3a396c826fe76f9b9..277859ea73e35271a10b02011120fca248ec8e71 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -857,6 +857,11 @@ static int gem_rx(struct gem *gp, int work_to_do)
 
                csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 0xffff);
                skb->csum = csum_unfold(csum);
+               {
+               __wsum rsum = csum_partial(skb->data + ETH_HLEN, len - ETH_HLEN, 0);
+               if (csum != csum_fold(rsum))
+                       pr_err_ratelimited("sungem wrong csum : %x/%x, len %u bytes\n", csum, csum_fold(rsum), len);
+               }
                skb->ip_summed = CHECKSUM_COMPLETE;
                skb->protocol = eth_type_trans(skb, gp->dev);
 

^ permalink raw reply related

* Re: [PATCH] ptp: replace getnstimeofday64() with ktime_get_real_ts64()
From: Richard Cochran @ 2018-06-18 18:26 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Yangbo Lu, y2038, David S. Miller, Fabio Estevam, netdev,
	linux-kernel
In-Reply-To: <20180618142109.3445025-1-arnd@arndb.de>

On Mon, Jun 18, 2018 at 04:20:39PM +0200, Arnd Bergmann wrote:
> getnstimeofday64() is deprecated and getting replaced throughout
> the kernel with ktime_get_*() based helpers for a more consistent
> interface.
> 
> The two functions do the exact same thing, so this is just
> a cosmetic change.

Acked-by: Richard Cochran <richardcochran@gmail.com>

^ permalink raw reply

* Re: [PATCH bpf-net] bpf: Change bpf_fib_lookup to return lookup status
From: David Ahern @ 2018-06-18 18:27 UTC (permalink / raw)
  To: Martin KaFai Lau, dsahern; +Cc: netdev, borkmann, ast, davem
In-Reply-To: <20180618181123.eczjeb3axd6sao57@kafai-mbp.dhcp.thefacebook.com>

On 6/18/18 12:11 PM, Martin KaFai Lau wrote:
> On Sun, Jun 17, 2018 at 08:18:19AM -0700, dsahern@kernel.org wrote:
>> From: David Ahern <dsahern@gmail.com>
>>
>> For ACLs implemented using either FIB rules or FIB entries, the BPF
>> program needs the FIB lookup status to be able to drop the packet.
> Except BPF_FIB_LKUP_RET_SUCCESS and BPF_FIB_LKUP_RET_NO_NEIGH,  can you
> give an example on how the xdp_prog may decide XDP_PASS vs XDP_DROP based
> on other BPF_FIB_LKUP_RET_*?
> 

	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
	if (rc == 0)
		packet is forwarded, do the redirect

	/* the program is misconfigured -- wrong parameters in struct or flags */
	if (rc < 0)
		....

	/* rc > 0 case */
	switch(rc) {
	case BPF_FIB_LKUP_RET_BLACKHOLE:
	case BPF_FIB_LKUP_RET_UNREACHABLE:
	case BPF_FIB_LKUP_RET_PROHIBIT:
		return XDP_DROP;
	}

For the others it becomes a question of do we share why the stack needs
to be involved? Maybe the program wants to collect stats to show traffic
patterns that can be improved (BPF_FIB_LKUP_RET_FRAG_NEEDED) or support
in the kernel needs to be improved (BPF_FIB_LKUP_RET_UNSUPP_LWT) or an
interface is misconfigured (BPF_FIB_LKUP_RET_FWD_DISABLED).

Arguably BPF_FIB_LKUP_RET_NO_NHDEV is not needed. See below.

>> @@ -2612,6 +2613,19 @@ struct bpf_raw_tracepoint_args {
>>  #define BPF_FIB_LOOKUP_DIRECT  BIT(0)
>>  #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
>>  
>> +enum {
>> +	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
>> +	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed */
>> +	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable */
>> +	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed */
>> +	BPF_FIB_LKUP_RET_NOT_FWDED,    /* pkt is not forwardded */
> BPF_FIB_LKUP_RET_NOT_FWDED is a catch all?
> 

Destination is local. More precisely, the FIB lookup is not unicast so
not forwarded. It could be RTN_LOCAL, RTN_BROADCAST, RTN_ANYCAST, or
RTN_MULTICAST. The next ones -- blackhole, reachable, prohibit -- are
called out.

>> @@ -4252,16 +4277,19 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
>>  	if (check_mtu) {
>>  		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
>>  		if (params->tot_len > mtu)
>> -			return 0;
>> +			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
>>  	}
>>  
>>  	if (f6i->fib6_nh.nh_lwtstate)
>> -		return 0;
>> +		return BPF_FIB_LKUP_RET_UNSUPP_LWT;
>>  
>>  	if (f6i->fib6_flags & RTF_GATEWAY)
>>  		*dst = f6i->fib6_nh.nh_gw;
>>  
>>  	dev = f6i->fib6_nh.nh_dev;
>> +	if (unlikely(!dev))
>> +		return BPF_FIB_LKUP_RET_NO_NHDEV;
> Is this a bug fix?
> 

Difference between IPv4 and IPv6. Making them consistent.

It is a major BUG in the kernel to reach this point in either protocol
to have a unicast route not tied to a device. IPv4 has checks; v6 does
not. I figured this being new code, why not make bpf_ipv{4,6}_fib_lookup
as close to the same as possible.

^ permalink raw reply

* Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"
From: Mathieu Malaterre @ 2018-06-18 18:29 UTC (permalink / raw)
  To: schwab
  Cc: eric.dumazet, David S. Miller, Eric Dumazet, LKML,
	Christophe LEROY, Meelis Roos, netdev, linuxppc-dev
In-Reply-To: <87o9g8geu0.fsf@igel.home>

On Mon, Jun 18, 2018 at 7:54 PM Andreas Schwab <schwab@linux-m68k.org> wrote:
>
> On Jun 17 2018, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> > Oh this is silly, please try :
> >
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index c642304f178ce0a4e1358d59e45032a39f76fb3f..54dd9c18ecad817812898d6f335e1794a07dabbe 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -1845,10 +1845,9 @@ EXPORT_SYMBOL(___pskb_trim);
> >  int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
> >  {
> >         if (skb->ip_summed == CHECKSUM_COMPLETE) {
> > -               int delta = skb->len - len;
> > +               __wsum csumdiff = skb_checksum(skb, len, skb->len - len, 0);
> >
> > -               skb->csum = csum_sub(skb->csum,
> > -                                    skb_checksum(skb, len, delta, 0));
> > +               skb->csum = csum_block_sub(skb->csum, csumdiff, len);
> >         }
> >         return __pskb_trim(skb, len);
> >  }
>
> That doesn't help either.

seconded (setup g4+sungem):

[  100.272676] eth0: hw csum failure
[  100.272710] CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0+ #6
[  100.272716] Call Trace:
[  100.272733] [dffedbd0] [c069ddb8]
__skb_checksum_complete+0xf0/0x108 (unreliable)
[  100.272752] [dffedbf0] [c078ea28] __udp4_lib_rcv+0x238/0xf98
[  100.272767] [dffedc70] [c0731630] ip_local_deliver_finish+0xa8/0x3c4
[  100.272777] [dffedcb0] [c073243c] ip_local_deliver+0xf0/0x154
[  100.272786] [dffedcf0] [c07328e8] ip_rcv+0x448/0x774
[  100.272800] [dffedd50] [c06aeaec] __netif_receive_skb_core+0x5e8/0x1184
[  100.272811] [dffedde0] [c06bba2c] napi_gro_receive+0x160/0x22c
[  100.272828] [dffede10] [e1571590] gem_poll+0x7fc/0x1ac0 [sungem]
[  100.272837] [dffedee0] [c06bacfc] net_rx_action+0x34c/0x618
[  100.272849] [dffedf60] [c07fd28c] __do_softirq+0x16c/0x5f0
[  100.272863] [dffedfd0] [c0064c7c] irq_exit+0x110/0x1a8
[  100.272877] [dffedff0] [c0016170] call_do_irq+0x24/0x3c
[  100.272890] [c0cf7e80] [c0009a84] do_IRQ+0x98/0x1a0
[  100.272900] [c0cf7eb0] [c001b474] ret_from_except+0x0/0x14
[  100.272911] --- interrupt: 501 at arch_cpu_idle+0x30/0x78
                   LR = arch_cpu_idle+0x30/0x78
[  100.272920] [c0cf7f70] [c0cf6000] 0xc0cf6000 (unreliable)
[  100.272935] [c0cf7f80] [c00a3868] do_idle+0xc4/0x158
[  100.272944] [c0cf7fb0] [c00a3ab4] cpu_startup_entry+0x24/0x28
[  100.272955] [c0cf7fc0] [c0998820] start_kernel+0x47c/0x490
[  100.272963] [c0cf7ff0] [00003444] 0x3444


> Andreas.
>
> --
> Andreas Schwab, schwab@linux-m68k.org
> GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
> "And now for something completely different."

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox