Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 0/5] cpsw: add per channel shaper configuration
From: Ivan Khoronzhuk @ 2016-11-29 15:00 UTC (permalink / raw)
  To: netdev, linux-kernel, mugunthanvnm, grygorii.strashko
  Cc: linux-omap, Ivan Khoronzhuk

This series is intended to allow user to set rate for per channel
shapers at cpdma level. This patchset doesn't have impact on performance.
The rate can be set with:

echo 100 > /sys/class/net/ethX/queues/tx-0/tx_maxrate

Tested on am572xx
Based on net-next/master

Ivan Khoronzhuk (5):
  net: ethernet: ti: davinci_cpdma: add weight function for channels
  net: ethernet: ti: davinci_cpdma: add set rate for a channel
  net: ethernet: ti: cpsw: add .ndo to set per-queue rate
  net: ethernet: ti: cpsw: optimize end of poll cycle
  net: ethernet: ti: cpsw: split tx budget according between channels

 drivers/net/ethernet/ti/cpsw.c          | 264 +++++++++++++++----
 drivers/net/ethernet/ti/davinci_cpdma.c | 453 ++++++++++++++++++++++++++++----
 drivers/net/ethernet/ti/davinci_cpdma.h |   6 +
 3 files changed, 624 insertions(+), 99 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH 1/5] net: ethernet: ti: davinci_cpdma: add weight function for channels
From: Ivan Khoronzhuk @ 2016-11-29 15:00 UTC (permalink / raw)
  To: netdev, linux-kernel, mugunthanvnm, grygorii.strashko
  Cc: linux-omap, Ivan Khoronzhuk
In-Reply-To: <1480431651-6081-1-git-send-email-ivan.khoronzhuk@linaro.org>

The weight of a channel is needed to split descriptors between
channels. The weight can depend on maximum rate of channels, maximum
rate of an interface or other reasons. The channel weight is in
percentage and is independent for rx and tx channels.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 drivers/net/ethernet/ti/davinci_cpdma.c | 124 +++++++++++++++++++++++++++++---
 drivers/net/ethernet/ti/davinci_cpdma.h |   1 +
 2 files changed, 115 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 56708a7..87456a9 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -122,6 +122,7 @@ struct cpdma_chan {
 	struct cpdma_chan_stats		stats;
 	/* offsets into dmaregs */
 	int	int_set, int_clear, td;
+	int				weight;
 };
 
 struct cpdma_control_info {
@@ -474,29 +475,131 @@ u32 cpdma_ctrl_txchs_state(struct cpdma_ctlr *ctlr)
 }
 EXPORT_SYMBOL_GPL(cpdma_ctrl_txchs_state);
 
+static void cpdma_chan_set_descs(struct cpdma_ctlr *ctlr,
+				 int rx, int desc_num,
+				 int per_ch_desc)
+{
+	struct cpdma_chan *chan, *most_chan = NULL;
+	int desc_cnt = desc_num;
+	int most_dnum = 0;
+	int min, max, i;
+
+	if (!desc_num)
+		return;
+
+	if (rx) {
+		min = rx_chan_num(0);
+		max = rx_chan_num(CPDMA_MAX_CHANNELS);
+	} else {
+		min = tx_chan_num(0);
+		max = tx_chan_num(CPDMA_MAX_CHANNELS);
+	}
+
+	for (i = min; i < max; i++) {
+		chan = ctlr->channels[i];
+		if (!chan)
+			continue;
+
+		if (chan->weight)
+			chan->desc_num = (chan->weight * desc_num) / 100;
+		else
+			chan->desc_num = per_ch_desc;
+
+		desc_cnt -= chan->desc_num;
+
+		if (most_dnum < chan->desc_num) {
+			most_dnum = chan->desc_num;
+			most_chan = chan;
+		}
+	}
+	/* use remains */
+	most_chan->desc_num += desc_cnt;
+}
+
 /**
  * cpdma_chan_split_pool - Splits ctrl pool between all channels.
  * Has to be called under ctlr lock
  */
-static void cpdma_chan_split_pool(struct cpdma_ctlr *ctlr)
+static int cpdma_chan_split_pool(struct cpdma_ctlr *ctlr)
 {
+	int tx_per_ch_desc = 0, rx_per_ch_desc = 0;
 	struct cpdma_desc_pool *pool = ctlr->pool;
+	int free_rx_num = 0, free_tx_num = 0;
+	int rx_weight = 0, tx_weight = 0;
+	int tx_desc_num, rx_desc_num;
 	struct cpdma_chan *chan;
-	int ch_desc_num;
-	int i;
+	int i, tx_num = 0;
 
 	if (!ctlr->chan_num)
-		return;
-
-	/* calculate average size of pool slice */
-	ch_desc_num = pool->num_desc / ctlr->chan_num;
+		return 0;
 
-	/* split ctlr pool */
 	for (i = 0; i < ARRAY_SIZE(ctlr->channels); i++) {
 		chan = ctlr->channels[i];
-		if (chan)
-			chan->desc_num = ch_desc_num;
+		if (!chan)
+			continue;
+
+		if (is_rx_chan(chan)) {
+			if (!chan->weight)
+				free_rx_num++;
+			rx_weight += chan->weight;
+		} else {
+			if (!chan->weight)
+				free_tx_num++;
+			tx_weight += chan->weight;
+			tx_num++;
+		}
+	}
+
+	if (rx_weight > 100 || tx_weight > 100)
+		return -EINVAL;
+
+	tx_desc_num = (tx_num * pool->num_desc) / ctlr->chan_num;
+	rx_desc_num = pool->num_desc - tx_desc_num;
+
+	if (free_tx_num) {
+		tx_per_ch_desc = tx_desc_num - (tx_weight * tx_desc_num) / 100;
+		tx_per_ch_desc /= free_tx_num;
+	}
+	if (free_rx_num) {
+		rx_per_ch_desc = rx_desc_num - (rx_weight * rx_desc_num) / 100;
+		rx_per_ch_desc /= free_rx_num;
 	}
+
+	cpdma_chan_set_descs(ctlr, 0, tx_desc_num, tx_per_ch_desc);
+	cpdma_chan_set_descs(ctlr, 1, rx_desc_num, rx_per_ch_desc);
+
+	return 0;
+}
+
+/* cpdma_chan_set_weight - set weight of a channel in percentage.
+ * Tx and Rx channels have separate weights. That is 100% for RX
+ * and 100% for Tx. The weight is used to split cpdma resources
+ * in correct proportion required by the channels, including number
+ * of descriptors. The channel rate is not enough to know the
+ * weight of a channel as the maximum rate of an interface is needed.
+ * If weight = 0, then channel uses rest of descriptors leaved by
+ * weighted channels.
+ */
+int cpdma_chan_set_weight(struct cpdma_chan *ch, int weight)
+{
+	struct cpdma_ctlr *ctlr = ch->ctlr;
+	unsigned long flags, ch_flags;
+	int ret;
+
+	spin_lock_irqsave(&ctlr->lock, flags);
+	spin_lock_irqsave(&ch->lock, ch_flags);
+	if (ch->weight == weight) {
+		spin_unlock_irqrestore(&ch->lock, ch_flags);
+		spin_unlock_irqrestore(&ctlr->lock, flags);
+		return 0;
+	}
+	ch->weight = weight;
+	spin_unlock_irqrestore(&ch->lock, ch_flags);
+
+	/* re-split pool using new channel weight */
+	ret = cpdma_chan_split_pool(ctlr);
+	spin_unlock_irqrestore(&ctlr->lock, flags);
+	return ret;
 }
 
 struct cpdma_chan *cpdma_chan_create(struct cpdma_ctlr *ctlr, int chan_num,
@@ -527,6 +630,7 @@ struct cpdma_chan *cpdma_chan_create(struct cpdma_ctlr *ctlr, int chan_num,
 	chan->chan_num	= chan_num;
 	chan->handler	= handler;
 	chan->desc_num = ctlr->pool->num_desc / 2;
+	chan->weight	= 0;
 
 	if (is_rx_chan(chan)) {
 		chan->hdp	= ctlr->params.rxhdp + offset;
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index a07b22b..629020c 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -90,6 +90,7 @@ int cpdma_chan_int_ctrl(struct cpdma_chan *chan, bool enable);
 u32 cpdma_ctrl_rxchs_state(struct cpdma_ctlr *ctlr);
 u32 cpdma_ctrl_txchs_state(struct cpdma_ctlr *ctlr);
 bool cpdma_check_free_tx_desc(struct cpdma_chan *chan);
+int cpdma_chan_set_weight(struct cpdma_chan *ch, int weight);
 
 enum cpdma_control {
 	CPDMA_CMD_IDLE,			/* write-only */
-- 
2.7.4

^ permalink raw reply related

* [PATCH 2/5] net: ethernet: ti: davinci_cpdma: add set rate for a channel
From: Ivan Khoronzhuk @ 2016-11-29 15:00 UTC (permalink / raw)
  To: netdev, linux-kernel, mugunthanvnm, grygorii.strashko
  Cc: linux-omap, Ivan Khoronzhuk
In-Reply-To: <1480431651-6081-1-git-send-email-ivan.khoronzhuk@linaro.org>

The cpdma has 8 rate limited tx channels. This patch adds
ability for cpdma driver to use 8 tx h/w shapers. If at least one
channel is not rate limited then it must have higher number, this
is because the rate limited channels have to have higher priority
then not rate limited channels. The channel priority is set in low-hi
direction already, so that when a new channel is added with ethtool
and it doesn't have rate yet, it cannot affect on rate limited
channels. It can be useful for TSN streams and just in cases when
h/w rate limited channels are needed.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 drivers/net/ethernet/ti/davinci_cpdma.c | 329 +++++++++++++++++++++++++++-----
 drivers/net/ethernet/ti/davinci_cpdma.h |   5 +
 2 files changed, 289 insertions(+), 45 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 87456a9..c776e45 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -32,6 +32,7 @@
 #define CPDMA_RXCONTROL		0x14
 #define CPDMA_SOFTRESET		0x1c
 #define CPDMA_RXTEARDOWN	0x18
+#define CPDMA_TX_PRI0_RATE	0x30
 #define CPDMA_TXINTSTATRAW	0x80
 #define CPDMA_TXINTSTATMASKED	0x84
 #define CPDMA_TXINTMASKSET	0x88
@@ -68,6 +69,8 @@
 
 #define CPDMA_TEARDOWN_VALUE	0xfffffffc
 
+#define CPDMA_MAX_RLIM_CNT	16384
+
 struct cpdma_desc {
 	/* hardware fields */
 	u32			hw_next;
@@ -123,6 +126,8 @@ struct cpdma_chan {
 	/* offsets into dmaregs */
 	int	int_set, int_clear, td;
 	int				weight;
+	u32				rate_factor;
+	u32				rate;
 };
 
 struct cpdma_control_info {
@@ -135,6 +140,7 @@ struct cpdma_control_info {
 };
 
 static struct cpdma_control_info controls[] = {
+	[CPDMA_TX_RLIM]		  = {CPDMA_DMACONTROL,	8,  0xffff, ACCESS_RW},
 	[CPDMA_CMD_IDLE]	  = {CPDMA_DMACONTROL,	3,  1,      ACCESS_WO},
 	[CPDMA_COPY_ERROR_FRAMES] = {CPDMA_DMACONTROL,	4,  1,      ACCESS_RW},
 	[CPDMA_RX_OFF_LEN_UPDATE] = {CPDMA_DMACONTROL,	2,  1,      ACCESS_RW},
@@ -302,6 +308,186 @@ static int _cpdma_control_set(struct cpdma_ctlr *ctlr, int control, int value)
 	return 0;
 }
 
+static int _cpdma_control_get(struct cpdma_ctlr *ctlr, int control)
+{
+	struct cpdma_control_info *info = &controls[control];
+	int ret;
+
+	if (!ctlr->params.has_ext_regs)
+		return -ENOTSUPP;
+
+	if (ctlr->state != CPDMA_STATE_ACTIVE)
+		return -EINVAL;
+
+	if (control < 0 || control >= ARRAY_SIZE(controls))
+		return -ENOENT;
+
+	if ((info->access & ACCESS_RO) != ACCESS_RO)
+		return -EPERM;
+
+	ret = (dma_reg_read(ctlr, info->reg) >> info->shift) & info->mask;
+	return ret;
+}
+
+/* cpdma_chan_set_chan_shaper - set shaper for a channel
+ * Has to be called under ctlr lock
+ */
+static int cpdma_chan_set_chan_shaper(struct cpdma_chan *chan)
+{
+	struct cpdma_ctlr *ctlr = chan->ctlr;
+	u32 rate_reg;
+	u32 rmask;
+	int ret;
+
+	if (!chan->rate)
+		return 0;
+
+	rate_reg = CPDMA_TX_PRI0_RATE + 4 * chan->chan_num;
+	dma_reg_write(ctlr, rate_reg, chan->rate_factor);
+
+	rmask = _cpdma_control_get(ctlr, CPDMA_TX_RLIM);
+	rmask |= chan->mask;
+
+	ret = _cpdma_control_set(ctlr, CPDMA_TX_RLIM, rmask);
+	return ret;
+}
+
+static int cpdma_chan_on(struct cpdma_chan *chan)
+{
+	struct cpdma_ctlr *ctlr = chan->ctlr;
+	struct cpdma_desc_pool	*pool = ctlr->pool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&chan->lock, flags);
+	if (chan->state != CPDMA_STATE_IDLE) {
+		spin_unlock_irqrestore(&chan->lock, flags);
+		return -EBUSY;
+	}
+	if (ctlr->state != CPDMA_STATE_ACTIVE) {
+		spin_unlock_irqrestore(&chan->lock, flags);
+		return -EINVAL;
+	}
+	dma_reg_write(ctlr, chan->int_set, chan->mask);
+	chan->state = CPDMA_STATE_ACTIVE;
+	if (chan->head) {
+		chan_write(chan, hdp, desc_phys(pool, chan->head));
+		if (chan->rxfree)
+			chan_write(chan, rxfree, chan->count);
+	}
+
+	spin_unlock_irqrestore(&chan->lock, flags);
+	return 0;
+}
+
+/* cpdma_chan_fit_rate - set rate for a channel and check if it's possible.
+ * rmask - mask of rate limited channels
+ * Returns min rate in Kb/s
+ */
+static int cpdma_chan_fit_rate(struct cpdma_chan *ch, u32 rate,
+			       u32 *rmask, int *prio_mode)
+{
+	struct cpdma_ctlr *ctlr = ch->ctlr;
+	struct cpdma_chan *chan;
+	u32 old_rate = ch->rate;
+	u32 new_rmask = 0;
+	int rlim = 1;
+	int i;
+
+	*prio_mode = 0;
+	for (i = tx_chan_num(0); i < tx_chan_num(CPDMA_MAX_CHANNELS); i++) {
+		chan = ctlr->channels[i];
+		if (!chan) {
+			rlim = 0;
+			continue;
+		}
+
+		if (chan == ch)
+			chan->rate = rate;
+
+		if (chan->rate) {
+			if (rlim) {
+				new_rmask |= chan->mask;
+			} else {
+				ch->rate = old_rate;
+				dev_err(ctlr->dev, "Prev channel of %dch is not rate limited\n",
+					chan->chan_num);
+				return -EINVAL;
+			}
+		} else {
+			*prio_mode = 1;
+			rlim = 0;
+		}
+	}
+
+	*rmask = new_rmask;
+	return 0;
+}
+
+static u32 cpdma_chan_set_factors(struct cpdma_ctlr *ctlr,
+				  struct cpdma_chan *ch)
+{
+	u32 delta = UINT_MAX, prev_delta = UINT_MAX, best_delta = UINT_MAX;
+	u32 best_send_cnt = 0, best_idle_cnt = 0;
+	u32 new_rate, best_rate = 0, rate_reg;
+	u64 send_cnt, idle_cnt;
+	u32 min_send_cnt, freq;
+	u64 divident, divisor;
+
+	if (!ch->rate) {
+		ch->rate_factor = 0;
+		goto set_factor;
+	}
+
+	freq = ctlr->params.bus_freq_mhz * 1000 * 32;
+	if (!freq) {
+		dev_err(ctlr->dev, "The bus frequency is not set\n");
+		return -EINVAL;
+	}
+
+	min_send_cnt = freq - ch->rate;
+	send_cnt = DIV_ROUND_UP(min_send_cnt, ch->rate);
+	while (send_cnt <= CPDMA_MAX_RLIM_CNT) {
+		divident = ch->rate * send_cnt;
+		divisor = min_send_cnt;
+		idle_cnt = DIV_ROUND_CLOSEST_ULL(divident, divisor);
+
+		divident = freq * idle_cnt;
+		divisor = idle_cnt + send_cnt;
+		new_rate = DIV_ROUND_CLOSEST_ULL(divident, divisor);
+
+		delta = new_rate >= ch->rate ? new_rate - ch->rate : delta;
+		if (delta < best_delta) {
+			best_delta = delta;
+			best_send_cnt = send_cnt;
+			best_idle_cnt = idle_cnt;
+			best_rate = new_rate;
+
+			if (!delta)
+				break;
+		}
+
+		if (prev_delta >= delta) {
+			prev_delta = delta;
+			send_cnt++;
+			continue;
+		}
+
+		idle_cnt++;
+		divident = freq * idle_cnt;
+		send_cnt = DIV_ROUND_CLOSEST_ULL(divident, ch->rate);
+		send_cnt -= idle_cnt;
+		prev_delta = UINT_MAX;
+	}
+
+	ch->rate = best_rate;
+	ch->rate_factor = best_send_cnt | (best_idle_cnt << 16);
+
+set_factor:
+	rate_reg = CPDMA_TX_PRI0_RATE + 4 * ch->chan_num;
+	dma_reg_write(ctlr, rate_reg, ch->rate_factor);
+	return 0;
+}
+
 struct cpdma_ctlr *cpdma_ctlr_create(struct cpdma_params *params)
 {
 	struct cpdma_ctlr *ctlr;
@@ -332,8 +518,9 @@ EXPORT_SYMBOL_GPL(cpdma_ctlr_create);
 
 int cpdma_ctlr_start(struct cpdma_ctlr *ctlr)
 {
+	struct cpdma_chan *chan;
 	unsigned long flags;
-	int i;
+	int i, prio_mode;
 
 	spin_lock_irqsave(&ctlr->lock, flags);
 	if (ctlr->state != CPDMA_STATE_IDLE) {
@@ -369,12 +556,20 @@ int cpdma_ctlr_start(struct cpdma_ctlr *ctlr)
 
 	ctlr->state = CPDMA_STATE_ACTIVE;
 
+	prio_mode = 0;
 	for (i = 0; i < ARRAY_SIZE(ctlr->channels); i++) {
-		if (ctlr->channels[i])
-			cpdma_chan_start(ctlr->channels[i]);
+		chan = ctlr->channels[i];
+		if (chan) {
+			cpdma_chan_set_chan_shaper(chan);
+			cpdma_chan_on(chan);
+
+			/* off prio mode if all tx channels are rate limited */
+			if (is_tx_chan(chan) && !chan->rate)
+				prio_mode = 1;
+		}
 	}
 
-	_cpdma_control_set(ctlr, CPDMA_TX_PRIO_FIXED, 1);
+	_cpdma_control_set(ctlr, CPDMA_TX_PRIO_FIXED, prio_mode);
 	_cpdma_control_set(ctlr, CPDMA_RX_BUFFER_OFFSET, 0);
 
 	spin_unlock_irqrestore(&ctlr->lock, flags);
@@ -602,6 +797,75 @@ int cpdma_chan_set_weight(struct cpdma_chan *ch, int weight)
 	return ret;
 }
 
+/* cpdma_chan_get_min_rate - get minimum allowed rate for channel
+ * Should be called before cpdma_chan_set_rate.
+ * Returns min rate in Kb/s
+ */
+u32 cpdma_chan_get_min_rate(struct cpdma_ctlr *ctlr)
+{
+	unsigned int divident, divisor;
+
+	divident = ctlr->params.bus_freq_mhz * 32 * 1000;
+	divisor = 1 + CPDMA_MAX_RLIM_CNT;
+
+	return DIV_ROUND_UP(divident, divisor);
+}
+
+/* cpdma_chan_set_rate - limits bandwidth for transmit channel.
+ * The bandwidth * limited channels have to be in order beginning from lowest.
+ * ch - transmit channel the bandwidth is configured for
+ * rate - bandwidth in Kb/s, if 0 - then off shaper
+ */
+int cpdma_chan_set_rate(struct cpdma_chan *ch, u32 rate)
+{
+	struct cpdma_ctlr *ctlr = ch->ctlr;
+	unsigned long flags, ch_flags;
+	int ret, prio_mode;
+	u32 rmask;
+
+	if (!ch || !is_tx_chan(ch))
+		return -EINVAL;
+
+	if (ch->rate == rate)
+		return rate;
+
+	spin_lock_irqsave(&ctlr->lock, flags);
+	spin_lock_irqsave(&ch->lock, ch_flags);
+
+	ret = cpdma_chan_fit_rate(ch, rate, &rmask, &prio_mode);
+	if (ret)
+		goto err;
+
+	ret = cpdma_chan_set_factors(ctlr, ch);
+	if (ret)
+		goto err;
+
+	spin_unlock_irqrestore(&ch->lock, ch_flags);
+
+	/* on shapers */
+	_cpdma_control_set(ctlr, CPDMA_TX_RLIM, rmask);
+	_cpdma_control_set(ctlr, CPDMA_TX_PRIO_FIXED, prio_mode);
+	spin_unlock_irqrestore(&ctlr->lock, flags);
+	return ret;
+
+err:
+	spin_unlock_irqrestore(&ch->lock, ch_flags);
+	spin_unlock_irqrestore(&ctlr->lock, flags);
+	return ret;
+}
+
+u32 cpdma_chan_get_rate(struct cpdma_chan *ch)
+{
+	unsigned long flags;
+	u32 rate;
+
+	spin_lock_irqsave(&ch->lock, flags);
+	rate = ch->rate;
+	spin_unlock_irqrestore(&ch->lock, flags);
+
+	return rate;
+}
+
 struct cpdma_chan *cpdma_chan_create(struct cpdma_ctlr *ctlr, int chan_num,
 				     cpdma_handler_fn handler, int rx_type)
 {
@@ -629,6 +893,7 @@ struct cpdma_chan *cpdma_chan_create(struct cpdma_ctlr *ctlr, int chan_num,
 	chan->state	= CPDMA_STATE_IDLE;
 	chan->chan_num	= chan_num;
 	chan->handler	= handler;
+	chan->rate	= 0;
 	chan->desc_num = ctlr->pool->num_desc / 2;
 	chan->weight	= 0;
 
@@ -924,28 +1189,20 @@ EXPORT_SYMBOL_GPL(cpdma_chan_process);
 
 int cpdma_chan_start(struct cpdma_chan *chan)
 {
-	struct cpdma_ctlr	*ctlr = chan->ctlr;
-	struct cpdma_desc_pool	*pool = ctlr->pool;
-	unsigned long		flags;
+	struct cpdma_ctlr *ctlr = chan->ctlr;
+	unsigned long flags;
+	int ret;
 
-	spin_lock_irqsave(&chan->lock, flags);
-	if (chan->state != CPDMA_STATE_IDLE) {
-		spin_unlock_irqrestore(&chan->lock, flags);
-		return -EBUSY;
-	}
-	if (ctlr->state != CPDMA_STATE_ACTIVE) {
-		spin_unlock_irqrestore(&chan->lock, flags);
-		return -EINVAL;
-	}
-	dma_reg_write(ctlr, chan->int_set, chan->mask);
-	chan->state = CPDMA_STATE_ACTIVE;
-	if (chan->head) {
-		chan_write(chan, hdp, desc_phys(pool, chan->head));
-		if (chan->rxfree)
-			chan_write(chan, rxfree, chan->count);
-	}
+	spin_lock_irqsave(&ctlr->lock, flags);
+	ret = cpdma_chan_set_chan_shaper(chan);
+	spin_unlock_irqrestore(&ctlr->lock, flags);
+	if (ret)
+		return ret;
+
+	ret = cpdma_chan_on(chan);
+	if (ret)
+		return ret;
 
-	spin_unlock_irqrestore(&chan->lock, flags);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(cpdma_chan_start);
@@ -1033,31 +1290,12 @@ int cpdma_chan_int_ctrl(struct cpdma_chan *chan, bool enable)
 int cpdma_control_get(struct cpdma_ctlr *ctlr, int control)
 {
 	unsigned long flags;
-	struct cpdma_control_info *info = &controls[control];
 	int ret;
 
 	spin_lock_irqsave(&ctlr->lock, flags);
-
-	ret = -ENOTSUPP;
-	if (!ctlr->params.has_ext_regs)
-		goto unlock_ret;
-
-	ret = -EINVAL;
-	if (ctlr->state != CPDMA_STATE_ACTIVE)
-		goto unlock_ret;
-
-	ret = -ENOENT;
-	if (control < 0 || control >= ARRAY_SIZE(controls))
-		goto unlock_ret;
-
-	ret = -EPERM;
-	if ((info->access & ACCESS_RO) != ACCESS_RO)
-		goto unlock_ret;
-
-	ret = (dma_reg_read(ctlr, info->reg) >> info->shift) & info->mask;
-
-unlock_ret:
+	ret = _cpdma_control_get(ctlr, control);
 	spin_unlock_irqrestore(&ctlr->lock, flags);
+
 	return ret;
 }
 
@@ -1069,6 +1307,7 @@ int cpdma_control_set(struct cpdma_ctlr *ctlr, int control, int value)
 	spin_lock_irqsave(&ctlr->lock, flags);
 	ret = _cpdma_control_set(ctlr, control, value);
 	spin_unlock_irqrestore(&ctlr->lock, flags);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(cpdma_control_set);
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index 629020c..4a167db 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -36,6 +36,7 @@ struct cpdma_params {
 	u32			desc_hw_addr;
 	int			desc_mem_size;
 	int			desc_align;
+	u32			bus_freq_mhz;
 
 	/*
 	 * Some instances of embedded cpdma controllers have extra control and
@@ -91,8 +92,12 @@ u32 cpdma_ctrl_rxchs_state(struct cpdma_ctlr *ctlr);
 u32 cpdma_ctrl_txchs_state(struct cpdma_ctlr *ctlr);
 bool cpdma_check_free_tx_desc(struct cpdma_chan *chan);
 int cpdma_chan_set_weight(struct cpdma_chan *ch, int weight);
+int cpdma_chan_set_rate(struct cpdma_chan *ch, u32 rate);
+u32 cpdma_chan_get_rate(struct cpdma_chan *ch);
+u32 cpdma_chan_get_min_rate(struct cpdma_ctlr *ctlr);
 
 enum cpdma_control {
+	CPDMA_TX_RLIM,			/* read-write */
 	CPDMA_CMD_IDLE,			/* write-only */
 	CPDMA_COPY_ERROR_FRAMES,	/* read-write */
 	CPDMA_RX_OFF_LEN_UPDATE,	/* read-write */
-- 
2.7.4

^ permalink raw reply related

* [PATCH 5/5] net: ethernet: ti: cpsw: split tx budget according between channels
From: Ivan Khoronzhuk @ 2016-11-29 15:00 UTC (permalink / raw)
  To: netdev, linux-kernel, mugunthanvnm, grygorii.strashko
  Cc: linux-omap, Ivan Khoronzhuk
In-Reply-To: <1480431651-6081-1-git-send-email-ivan.khoronzhuk@linaro.org>

Split device budget between channels according to channel rate.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 drivers/net/ethernet/ti/cpsw.c | 159 +++++++++++++++++++++++++++++++++--------
 1 file changed, 130 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 8a70298..ec4873f 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -365,6 +365,11 @@ static inline void slave_write(struct cpsw_slave *slave, u32 val, u32 offset)
 	__raw_writel(val, slave->regs + offset);
 }
 
+struct cpsw_vector {
+	struct cpdma_chan *ch;
+	int budget;
+};
+
 struct cpsw_common {
 	struct device			*dev;
 	struct cpsw_platform_data	data;
@@ -380,8 +385,8 @@ struct cpsw_common {
 	int				rx_packet_max;
 	struct cpsw_slave		*slaves;
 	struct cpdma_ctlr		*dma;
-	struct cpdma_chan		*txch[CPSW_MAX_QUEUES];
-	struct cpdma_chan		*rxch[CPSW_MAX_QUEUES];
+	struct cpsw_vector		txv[CPSW_MAX_QUEUES];
+	struct cpsw_vector		rxv[CPSW_MAX_QUEUES];
 	struct cpsw_ale			*ale;
 	bool				quirk_irq;
 	bool				rx_irq_disabled;
@@ -741,7 +746,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
 		return;
 	}
 
-	ch = cpsw->rxch[skb_get_queue_mapping(new_skb)];
+	ch = cpsw->rxv[skb_get_queue_mapping(new_skb)].ch;
 	ret = cpdma_chan_submit(ch, new_skb, new_skb->data,
 				skb_tailroom(new_skb), 0);
 	if (WARN_ON(ret < 0))
@@ -783,8 +788,9 @@ static irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id)
 static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
 {
 	u32			ch_map;
-	int			num_tx, ch;
+	int			num_tx, cur_budget, ch;
 	struct cpsw_common	*cpsw = napi_to_cpsw(napi_tx);
+	struct cpsw_vector	*txv;
 
 	/* process every unprocessed channel */
 	ch_map = cpdma_ctrl_txchs_state(cpsw->dma);
@@ -792,7 +798,13 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
 		if (!(ch_map & 0x01))
 			continue;
 
-		num_tx += cpdma_chan_process(cpsw->txch[ch], budget - num_tx);
+		txv = &cpsw->txv[ch];
+		if (unlikely(txv->budget > budget - num_tx))
+			cur_budget = budget - num_tx;
+		else
+			cur_budget = txv->budget;
+
+		num_tx += cpdma_chan_process(txv->ch, cur_budget);
 		if (num_tx >= budget)
 			break;
 	}
@@ -812,8 +824,9 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
 static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
 {
 	u32			ch_map;
-	int			num_rx, ch;
+	int			num_rx, cur_budget, ch;
 	struct cpsw_common	*cpsw = napi_to_cpsw(napi_rx);
+	struct cpsw_vector	*rxv;
 
 	/* process every unprocessed channel */
 	ch_map = cpdma_ctrl_rxchs_state(cpsw->dma);
@@ -821,7 +834,13 @@ static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
 		if (!(ch_map & 0x01))
 			continue;
 
-		num_rx += cpdma_chan_process(cpsw->rxch[ch], budget - num_rx);
+		rxv = &cpsw->rxv[ch];
+		if (unlikely(rxv->budget > budget - num_rx))
+			cur_budget = budget - num_rx;
+		else
+			cur_budget = rxv->budget;
+
+		num_rx += cpdma_chan_process(rxv->ch, cur_budget);
 		if (num_rx >= budget)
 			break;
 	}
@@ -1063,7 +1082,7 @@ static void cpsw_get_ethtool_stats(struct net_device *ndev,
 				cpsw_gstrings_stats[l].stat_offset);
 
 	for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
-		cpdma_chan_get_stats(cpsw->rxch[ch], &ch_stats);
+		cpdma_chan_get_stats(cpsw->rxv[ch].ch, &ch_stats);
 		for (i = 0; i < CPSW_STATS_CH_LEN; i++, l++) {
 			p = (u8 *)&ch_stats +
 				cpsw_gstrings_ch_stats[i].stat_offset;
@@ -1072,7 +1091,7 @@ static void cpsw_get_ethtool_stats(struct net_device *ndev,
 	}
 
 	for (ch = 0; ch < cpsw->tx_ch_num; ch++) {
-		cpdma_chan_get_stats(cpsw->txch[ch], &ch_stats);
+		cpdma_chan_get_stats(cpsw->txv[ch].ch, &ch_stats);
 		for (i = 0; i < CPSW_STATS_CH_LEN; i++, l++) {
 			p = (u8 *)&ch_stats +
 				cpsw_gstrings_ch_stats[i].stat_offset;
@@ -1261,6 +1280,82 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
 	}
 }
 
+/* split budget depending on channel rates */
+static void cpsw_split_budget(struct net_device *ndev)
+{
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	struct cpsw_common *cpsw = priv->cpsw;
+	struct cpsw_vector *txv = cpsw->txv;
+	u32 consumed_rate, bigest_rate = 0;
+	int budget, bigest_rate_ch = 0;
+	struct cpsw_slave *slave;
+	int i, rlim_ch_num = 0;
+	u32 ch_rate, max_rate;
+	int ch_budget = 0;
+
+	if (cpsw->data.dual_emac)
+		slave = &cpsw->slaves[priv->emac_port];
+	else
+		slave = &cpsw->slaves[cpsw->data.active_slave];
+
+	max_rate = slave->phy->speed * 1000;
+
+	consumed_rate = 0;
+	for (i = 0; i < cpsw->tx_ch_num; i++) {
+		ch_rate = cpdma_chan_get_rate(txv[i].ch);
+		if (!ch_rate)
+			continue;
+
+		rlim_ch_num++;
+		consumed_rate += ch_rate;
+	}
+
+	if (cpsw->tx_ch_num == rlim_ch_num) {
+		max_rate = consumed_rate;
+	} else {
+		ch_budget = (consumed_rate * CPSW_POLL_WEIGHT) / max_rate;
+		ch_budget = (CPSW_POLL_WEIGHT - ch_budget) /
+			    (cpsw->tx_ch_num - rlim_ch_num);
+		bigest_rate = (max_rate - consumed_rate) /
+			      (cpsw->tx_ch_num - rlim_ch_num);
+	}
+
+	/* split tx budget */
+	budget = CPSW_POLL_WEIGHT;
+	for (i = 0; i < cpsw->tx_ch_num; i++) {
+		ch_rate = cpdma_chan_get_rate(txv[i].ch);
+		if (ch_rate) {
+			txv[i].budget = (ch_rate * CPSW_POLL_WEIGHT) / max_rate;
+			if (!txv[i].budget)
+				txv[i].budget = 1;
+			if (ch_rate > bigest_rate) {
+				bigest_rate_ch = i;
+				bigest_rate = ch_rate;
+			}
+		} else {
+			txv[i].budget = ch_budget;
+			if (!bigest_rate_ch)
+				bigest_rate_ch = i;
+		}
+
+		budget -= txv[i].budget;
+	}
+
+	if (budget)
+		txv[bigest_rate_ch].budget += budget;
+
+	/* split rx budget */
+	budget = CPSW_POLL_WEIGHT;
+	ch_budget = budget / cpsw->rx_ch_num;
+	for (i = 0; i < cpsw->rx_ch_num; i++) {
+		cpsw->rxv[i].budget = ch_budget;
+		budget -= ch_budget;
+	}
+
+	if (budget)
+		cpsw->rxv[0].budget += budget;
+}
+
 static int cpsw_fill_rx_channels(struct cpsw_priv *priv)
 {
 	struct cpsw_common *cpsw = priv->cpsw;
@@ -1269,7 +1364,7 @@ static int cpsw_fill_rx_channels(struct cpsw_priv *priv)
 	int ch, i, ret;
 
 	for (ch = 0; ch < cpsw->rx_ch_num; ch++) {
-		ch_buf_num = cpdma_chan_get_rx_buf_num(cpsw->rxch[ch]);
+		ch_buf_num = cpdma_chan_get_rx_buf_num(cpsw->rxv[ch].ch);
 		for (i = 0; i < ch_buf_num; i++) {
 			skb = __netdev_alloc_skb_ip_align(priv->ndev,
 							  cpsw->rx_packet_max,
@@ -1280,8 +1375,9 @@ static int cpsw_fill_rx_channels(struct cpsw_priv *priv)
 			}
 
 			skb_set_queue_mapping(skb, ch);
-			ret = cpdma_chan_submit(cpsw->rxch[ch], skb, skb->data,
-						skb_tailroom(skb), 0);
+			ret = cpdma_chan_submit(cpsw->rxv[ch].ch, skb,
+						skb->data, skb_tailroom(skb),
+						0);
 			if (ret < 0) {
 				cpsw_err(priv, ifup,
 					 "cannot submit skb to channel %d rx, error %d\n",
@@ -1405,6 +1501,7 @@ static int cpsw_ndo_open(struct net_device *ndev)
 		cpsw_set_coalesce(ndev, &coal);
 	}
 
+	cpsw_split_budget(ndev);
 	cpdma_ctlr_start(cpsw->dma);
 	cpsw_intr_enable(cpsw);
 
@@ -1474,7 +1571,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb,
 	if (q_idx >= cpsw->tx_ch_num)
 		q_idx = q_idx % cpsw->tx_ch_num;
 
-	txch = cpsw->txch[q_idx];
+	txch = cpsw->txv[q_idx].ch;
 	ret = cpsw_tx_packet_submit(priv, skb, txch);
 	if (unlikely(ret != 0)) {
 		cpsw_err(priv, tx_err, "desc submit failed\n");
@@ -1681,8 +1778,8 @@ static void cpsw_ndo_tx_timeout(struct net_device *ndev)
 	ndev->stats.tx_errors++;
 	cpsw_intr_disable(cpsw);
 	for (ch = 0; ch < cpsw->tx_ch_num; ch++) {
-		cpdma_chan_stop(cpsw->txch[ch]);
-		cpdma_chan_start(cpsw->txch[ch]);
+		cpdma_chan_stop(cpsw->txv[ch].ch);
+		cpdma_chan_start(cpsw->txv[ch].ch);
 	}
 
 	cpsw_intr_enable(cpsw);
@@ -1887,7 +1984,6 @@ static int cpsw_ndo_set_tx_maxrate(struct net_device *ndev, int queue, u32 rate)
 			ch_rate = rate;
 		else
 			ch_rate = netdev_get_tx_queue(ndev, i)->tx_maxrate;
-
 		if (!ch_rate)
 			continue;
 
@@ -1935,9 +2031,12 @@ static int cpsw_ndo_set_tx_maxrate(struct net_device *ndev, int queue, u32 rate)
 		max_rate = consumed_rate;
 
 	weight = (rate * 100) / (max_rate * 1000);
-	cpdma_chan_set_weight(cpsw->txch[queue], weight);
+	cpdma_chan_set_weight(cpsw->txv[queue].ch, weight);
+	ret = cpdma_chan_set_rate(cpsw->txv[queue].ch, rate);
 
-	ret = cpdma_chan_set_rate(cpsw->txch[queue], rate);
+	/* re-split budget between channels */
+	if (!rate)
+		cpsw_split_budget(ndev);
 	pm_runtime_put(cpsw->dev);
 	return ret;
 }
@@ -2172,30 +2271,30 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
 	struct cpsw_common *cpsw = priv->cpsw;
 	void (*handler)(void *, int, int);
 	struct netdev_queue *queue;
-	struct cpdma_chan **chan;
+	struct cpsw_vector *vec;
 	int ret, *ch;
 
 	if (rx) {
 		ch = &cpsw->rx_ch_num;
-		chan = cpsw->rxch;
+		vec = cpsw->rxv;
 		handler = cpsw_rx_handler;
 		poll = cpsw_rx_poll;
 	} else {
 		ch = &cpsw->tx_ch_num;
-		chan = cpsw->txch;
+		vec = cpsw->txv;
 		handler = cpsw_tx_handler;
 		poll = cpsw_tx_poll;
 	}
 
 	while (*ch < ch_num) {
-		chan[*ch] = cpdma_chan_create(cpsw->dma, *ch, handler, rx);
+		vec[*ch].ch = cpdma_chan_create(cpsw->dma, *ch, handler, rx);
 		queue = netdev_get_tx_queue(priv->ndev, *ch);
 		queue->tx_maxrate = 0;
 
-		if (IS_ERR(chan[*ch]))
-			return PTR_ERR(chan[*ch]);
+		if (IS_ERR(vec[*ch].ch))
+			return PTR_ERR(vec[*ch].ch);
 
-		if (!chan[*ch])
+		if (!vec[*ch].ch)
 			return -EINVAL;
 
 		cpsw_info(priv, ifup, "created new %d %s channel\n", *ch,
@@ -2206,7 +2305,7 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
 	while (*ch > ch_num) {
 		(*ch)--;
 
-		ret = cpdma_chan_destroy(chan[*ch]);
+		ret = cpdma_chan_destroy(vec[*ch].ch);
 		if (ret)
 			return ret;
 
@@ -2293,6 +2392,8 @@ static int cpsw_set_channels(struct net_device *ndev,
 		if (ret)
 			goto err;
 
+		cpsw_split_budget(ndev);
+
 		/* After this receive is started */
 		cpdma_ctlr_start(cpsw->dma);
 		cpsw_intr_enable(cpsw);
@@ -2835,9 +2936,9 @@ static int cpsw_probe(struct platform_device *pdev)
 		goto clean_dt_ret;
 	}
 
-	cpsw->txch[0] = cpdma_chan_create(cpsw->dma, 0, cpsw_tx_handler, 0);
-	cpsw->rxch[0] = cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
-	if (WARN_ON(!cpsw->rxch[0] || !cpsw->txch[0])) {
+	cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_tx_handler, 0);
+	cpsw->rxv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_rx_handler, 1);
+	if (WARN_ON(!cpsw->rxv[0].ch || !cpsw->txv[0].ch)) {
 		dev_err(priv->dev, "error initializing dma channels\n");
 		ret = -ENOMEM;
 		goto clean_dma_ret;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 3/5] net: ethernet: ti: cpsw: add .ndo to set per-queue rate
From: Ivan Khoronzhuk @ 2016-11-29 15:00 UTC (permalink / raw)
  To: netdev, linux-kernel, mugunthanvnm, grygorii.strashko
  Cc: linux-omap, Ivan Khoronzhuk
In-Reply-To: <1480431651-6081-1-git-send-email-ivan.khoronzhuk@linaro.org>

This patch allows to rate limit queues tx queues for cpsw interface.
The rate is set in absolute Mb/s units and cannot be more a speed
an interface is connected with.

The rate for a tx queue can be tested with:

ethtool -L eth0 rx 4 tx 4

echo 100 > /sys/class/net/eth0/queues/tx-0/tx_maxrate
echo 200 > /sys/class/net/eth0/queues/tx-1/tx_maxrate
echo 50 > /sys/class/net/eth0/queues/tx-2/tx_maxrate
echo 30 > /sys/class/net/eth0/queues/tx-3/tx_maxrate

tc qdisc add dev eth0 root handle 1: multiq

tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip\
dport 5001 0xffff action skbedit queue_mapping 0

tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip\
dport 5002 0xffff action skbedit queue_mapping 1

tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip\
dport 5003 0xffff action skbedit queue_mapping 2

tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip\
dport 5004 0xffff action skbedit queue_mapping 3

iperf -c 192.168.2.1 -b 110M -p 5001 -f m -t 60
iperf -c 192.168.2.1 -b 215M -p 5002 -f m -t 60
iperf -c 192.168.2.1 -b 55M -p 5003 -f m -t 60
iperf -c 192.168.2.1 -b 32M -p 5004 -f m -t 60

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 drivers/net/ethernet/ti/cpsw.c | 87 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index da40ea5..c102675 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1872,6 +1872,88 @@ static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev,
 	return ret;
 }
 
+static int cpsw_ndo_set_tx_maxrate(struct net_device *ndev, int queue, u32 rate)
+{
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	int tx_ch_num = ndev->real_num_tx_queues;
+	u32 consumed_rate, min_rate, max_rate;
+	struct cpsw_common *cpsw = priv->cpsw;
+	struct cpsw_slave *slave;
+	int ret, i, weight;
+	int rlim_num = 0;
+	u32 ch_rate;
+
+	ch_rate = netdev_get_tx_queue(ndev, queue)->tx_maxrate;
+	if (ch_rate == rate)
+		return 0;
+
+	if (cpsw->data.dual_emac)
+		slave = &cpsw->slaves[priv->emac_port];
+	else
+		slave = &cpsw->slaves[cpsw->data.active_slave];
+	max_rate = slave->phy->speed;
+
+	consumed_rate = 0;
+	for (i = 0; i < tx_ch_num; i++) {
+		if (i == queue)
+			ch_rate = rate;
+		else
+			ch_rate = netdev_get_tx_queue(ndev, i)->tx_maxrate;
+
+		if (!ch_rate)
+			continue;
+
+		rlim_num++;
+		consumed_rate += ch_rate;
+	}
+
+	if (consumed_rate > max_rate)
+		dev_info(priv->dev, "The common rate shouldn't be more than %dMbps",
+			 max_rate);
+
+	if (consumed_rate > max_rate) {
+		if (max_rate == 10 && consumed_rate <= 100) {
+			max_rate = 100;
+		} else if (max_rate <= 100 && consumed_rate <= 1000) {
+			max_rate = 1000;
+		} else {
+			dev_err(priv->dev, "The common rate cannot be more than %dMbps",
+				max_rate);
+			return -EINVAL;
+		}
+	}
+
+	if (consumed_rate > max_rate) {
+		dev_err(priv->dev, "The common rate cannot be more than %dMbps",
+			max_rate);
+		return -EINVAL;
+	}
+
+	rate *= 1000;
+	min_rate = cpdma_chan_get_min_rate(cpsw->dma);
+	if ((rate < min_rate && rate)) {
+		dev_err(priv->dev, "The common rate cannot be less than %dMbps",
+			min_rate);
+		return -EINVAL;
+	}
+
+	ret = pm_runtime_get_sync(cpsw->dev);
+	if (ret < 0) {
+		pm_runtime_put_noidle(cpsw->dev);
+		return ret;
+	}
+
+	if (rlim_num == tx_ch_num)
+		max_rate = consumed_rate;
+
+	weight = (rate * 100) / (max_rate * 1000);
+	cpdma_chan_set_weight(cpsw->txch[queue], weight);
+
+	ret = cpdma_chan_set_rate(cpsw->txch[queue], rate);
+	pm_runtime_put(cpsw->dev);
+	return ret;
+}
+
 static const struct net_device_ops cpsw_netdev_ops = {
 	.ndo_open		= cpsw_ndo_open,
 	.ndo_stop		= cpsw_ndo_stop,
@@ -1881,6 +1963,7 @@ static const struct net_device_ops cpsw_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_tx_timeout		= cpsw_ndo_tx_timeout,
 	.ndo_set_rx_mode	= cpsw_ndo_set_rx_mode,
+	.ndo_set_tx_maxrate	= cpsw_ndo_set_tx_maxrate,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= cpsw_ndo_poll_controller,
 #endif
@@ -2100,6 +2183,7 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
 	int (*poll)(struct napi_struct *, int);
 	struct cpsw_common *cpsw = priv->cpsw;
 	void (*handler)(void *, int, int);
+	struct netdev_queue *queue;
 	struct cpdma_chan **chan;
 	int ret, *ch;
 
@@ -2117,6 +2201,8 @@ static int cpsw_update_channels_res(struct cpsw_priv *priv, int ch_num, int rx)
 
 	while (*ch < ch_num) {
 		chan[*ch] = cpdma_chan_create(cpsw->dma, *ch, handler, rx);
+		queue = netdev_get_tx_queue(priv->ndev, *ch);
+		queue->tx_maxrate = 0;
 
 		if (IS_ERR(chan[*ch]))
 			return PTR_ERR(chan[*ch]);
@@ -2752,6 +2838,7 @@ static int cpsw_probe(struct platform_device *pdev)
 	dma_params.desc_align		= 16;
 	dma_params.has_ext_regs		= true;
 	dma_params.desc_hw_addr         = dma_params.desc_mem_phys;
+	dma_params.bus_freq_mhz		= cpsw->bus_freq_mhz;
 
 	cpsw->dma = cpdma_ctlr_create(&dma_params);
 	if (!cpsw->dma) {
-- 
2.7.4

^ permalink raw reply related

* [PATCH 4/5] net: ethernet: ti: cpsw: optimize end of poll cycle
From: Ivan Khoronzhuk @ 2016-11-29 15:00 UTC (permalink / raw)
  To: netdev, linux-kernel, mugunthanvnm, grygorii.strashko
  Cc: linux-omap, Ivan Khoronzhuk
In-Reply-To: <1480431651-6081-1-git-send-email-ivan.khoronzhuk@linaro.org>

Check budget fullness only after it's updated and update
channel mask only once to keep budget balance between channels.
It's also needed for farther changes.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 drivers/net/ethernet/ti/cpsw.c | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index c102675..8a70298 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -788,19 +788,13 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
 
 	/* process every unprocessed channel */
 	ch_map = cpdma_ctrl_txchs_state(cpsw->dma);
-	for (ch = 0, num_tx = 0; num_tx < budget; ch_map >>= 1, ch++) {
-		if (!ch_map) {
-			ch_map = cpdma_ctrl_txchs_state(cpsw->dma);
-			if (!ch_map)
-				break;
-
-			ch = 0;
-		}
-
+	for (ch = 0, num_tx = 0; ch_map; ch_map >>= 1, ch++) {
 		if (!(ch_map & 0x01))
 			continue;
 
 		num_tx += cpdma_chan_process(cpsw->txch[ch], budget - num_tx);
+		if (num_tx >= budget)
+			break;
 	}
 
 	if (num_tx < budget) {
@@ -823,19 +817,13 @@ static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
 
 	/* process every unprocessed channel */
 	ch_map = cpdma_ctrl_rxchs_state(cpsw->dma);
-	for (ch = 0, num_rx = 0; num_rx < budget; ch_map >>= 1, ch++) {
-		if (!ch_map) {
-			ch_map = cpdma_ctrl_rxchs_state(cpsw->dma);
-			if (!ch_map)
-				break;
-
-			ch = 0;
-		}
-
+	for (ch = 0, num_rx = 0; ch_map; ch_map >>= 1, ch++) {
 		if (!(ch_map & 0x01))
 			continue;
 
 		num_rx += cpdma_chan_process(cpsw->rxch[ch], budget - num_rx);
+		if (num_rx >= budget)
+			break;
 	}
 
 	if (num_rx < budget) {
-- 
2.7.4

^ permalink raw reply related

* Re: AF_VSOCK network namespace support
From: Stefan Hajnoczi @ 2016-11-29 15:09 UTC (permalink / raw)
  To: Jorgen S. Hansen; +Cc: netdev@vger.kernel.org, imbrenda@linux.vnet.ibm.com
In-Reply-To: <4DA0F029-EE07-4E51-879C-2033D1F8AFCD@vmware.com>

[-- Attachment #1: Type: text/plain, Size: 2894 bytes --]

On Mon, Nov 28, 2016 at 03:24:36PM +0000, Jorgen S. Hansen wrote:
> > On Nov 23, 2016, at 3:55 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > 
> > Hi Jorgen,
> > There are two use cases where network namespace support in AF_VSOCK
> > could be useful:
> > 
> > 1. Claudio Imbrenda pointed out that a machine cannot act as both host
> >   and guest at the same time.  This is necessary for nested
> >   virtualization.  Currently only one transport (the host side or the
> >   guest side) can be registered at a time.
> 
> VMCI based AF_VSOCK relies on the VMCI driver for nested virtualization support. The VMCI driver is a combined host/guest driver with a routing component, that will either direct traffic to VMs managed by the host “personality” of the driver, or to the outer host. So any VMCI driver driver is able to function simultaneously as both a guest and a host driver - exactly to be able to support nested virtualization.
> 
> Since, for VMCI based vSocket, the host has a fixed CID (2), any traffic generated by an application inside a VM destined for CID 2 will be routed out of the VM (to the host - either a virtual or physical one). Any traffic for a CID > 2 will be directed towards VMs managed by the host personality of the VMCI driver.
> 
> Since VMCI predates nested virtualization, the solution above was partly a result of having to support existing configurations in a transparent way.

Thanks for your reply.

It seems like a reasonable solution.  We could do something similar for
virtio.

> > 2. Users may wish to isolate the AF_VSOCK address namespace so that two
> >   VMs have completely independent CID and ports (they could even use
> >   the same CID and ports because they're in separate namespaces).  This
> >   ensures that a host service visible to VM1 is not automatically
> >   visible to VM2.
> 
> If the goal is to provide fine grained service access control, won’t this end up requiring a namespace per VM? For ESX, we have a mechanism to tag VMs that allows them to be granted access to a service offered through AF_VSOCK, but this is not part of the Linux hypervisor.

Yes it would and using one network namespace per VM is a bit clumsy
because it means IP networking (e.g. for libiscsi or GlusterFS) needs to
be configured carefully so that external network connectivity works
(using a veth interface).

I did think about netfilter support but one of the main reasons for
using AF_VSOCK vs Ethernet is to avoid user firewall rules from
disrupting hypervisor services :).

> If the intent is to be able to support multi tenancy, then this sounds like a better fit. Also, in the multi tenancy case, isolating the other AFs is probably what you want as well.

Eventually multi-tenancy might be interesting but just fine-grained
service access control would be enough for most cases.

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 455 bytes --]

^ permalink raw reply

* Re: bpf debug info
From: Daniel Borkmann @ 2016-11-29 15:11 UTC (permalink / raw)
  To: Alexei Starovoitov, netdev
  Cc: Brenden Blanco, Thomas Graf, Wangnan, He Kuang, kernel-team
In-Reply-To: <20161129064217.GA20124@ast-mbp.thefacebook.com>

On 11/29/2016 07:42 AM, Alexei Starovoitov wrote:
[...]
> The support for debug information in BPF was recently added to llvm.
>
> In order to use it recompile bpf programs with the following patch
> in samples/bpf/Makefile
> @@ -155,4 +155,4 @@ $(obj)/%.o: $(src)/%.c
>          $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
>                  -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
>                  -Wno-compare-distinct-pointer-types \
> -               -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
> +               -O2 -emit-llvm -g -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
>
> and compiled .o files can be consumed by standard llvm-objdump utility.
>
> $ llvm-objdump -S -no-show-raw-insn samples/bpf/xdp1_kern.o
> xdp1_kern.o:    file format ELF64-BPF
>
> Disassembly of section xdp1:
> xdp_prog1:
> ; {
>         0:       r2 = *(u32 *)(r1 + 4)
> ; void *data = (void *)(long)ctx->data;
>         8:       r1 = *(u32 *)(r1 + 0)
> ; if (data + nh_off > data_end)
>        10:       r3 = r1
>        18:       r3 += 14
>        20:       if r3 > r2 goto 55
> ; h_proto = eth->h_proto;
>        28:       r3 = *(u8 *)(r1 + 12)
>        30:       r4 = *(u8 *)(r1 + 13)
>        38:       r4 <<= 8
>        40:       r4 |= r3
> ; if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
>        48:       if r4 == 43144 goto 2
>        50:       r3 = 14
>        58:       if r4 != 129 goto 5
>
> LBB0_3:
> ; if (data + nh_off > data_end)
>        60:       r3 = r1
>        68:       r3 += 18
>        70:       if r3 > r2 goto 45
>        78:       r3 = 18
> ; h_proto = vhdr->h_vlan_encapsulated_proto;
>        80:       r4 = *(u16 *)(r1 + 16)
>
> LBB0_5:
>        88:       r5 = r4
>        90:       r5 &= 65535
> ; if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
>        98:       if r5 == 43144 goto 1
>        a0:       if r5 != 129 goto 9
>
> Notice that 'clang -S -o a.s' output and llvm-objdump disassembler
> were changed to use kernel verifier style, so now it should be easier
> to see what's going on.

Sounds really useful, is that scheduled for llvm 3.10 release?

That debugging info is stored in dwarf format into the obj, right?
Would be nice if also pahole could work on bpf object files.

> The main advantage of debug info is that verifier error messages
> are now easier to correlate to original C code.

Does that mean that the old -S output format is not available anymore?
Personally, I liked that one better tbh, was hoping someone would have
added asm parsing for it, so compilation from .S to .o also works.

> For example, say, in samples/bpf/parse_varlen.c I forgot
> to compare pointer into packet with data_end:
> --- a/samples/bpf/parse_varlen.c
> +++ b/samples/bpf/parse_varlen.c
> @@ -33,8 +33,8 @@ static int udp(void *data, uint64_t tp_off, void *data_end)
>   {
>          struct udphdr *udp = data + tp_off;
>
> -       if (udp + 1 > data_end)
> -               return 0;
> +//     if (udp + 1 > data_end)
> +//             return 0;
>          if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT) ||
>              udp->source == htons(DEFAULT_PKTGEN_UDP_PORT)) {
>
> If I try to run samples/bpf/test_cls_bpf.sh the verifier will complain:
> R0=imm0,min_value=0,max_value=0 R1=pkt(id=0,off=0,r=42) R2=pkt_end
> 112: (0f) r4 += r3
> 113: (0f) r1 += r4
> 114: (b7) r0 = 2
> 115: (69) r2 = *(u16 *)(r1 +2)
> invalid access to packet, off=2 size=2, R1(id=3,off=0,r=0)
>
> Now multiply 115 * 8 and convert to hex. This is address 0x398 in llvm-objdump:
> ; struct udphdr *udp = data + tp_off;
>       388:       r1 += r4
>       390:       r0 = 2
> ; if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT) ||
>       398:       r2 = *(u16 *)(r1 + 2)
>       3a0:       if r2 == 2304 goto 16
>
> Now it's clear which line of C code is causing the verifier to reject.
[...]

Could llvm-objdump switch line numbering for bpf same way as verifier
output, so mapping step is not really needed? Alternatively, on each
verifier error, there could be a hint with cmd to use regarding llvm-objdump.

> So next step is to improve verifier messages to be more human friendly.
> The step after is to introduce BPF_COMMENT pseudo instruction
> that will be ignored by the interpreter yet it will contain the text
> of original source code. Then llvm-objdump step won't be necessary.
> The bpf loader will load both instructions and pieces of C sources.
> Then verifier errors should be even easier to read and humans
> can easily understand the purpose of the program.

So the BPF_COMMENT pseudo insn will get stripped away from the insn array
after verification step, so we don't need to hold/account for this mem? I
assume in it's ->imm member it will just hold offset into text blob?

Given that the generated verifier log can already become huge nowadays up
to a point where less than 4k insns prog fails to load due to reaching max
kernel allowed verifier buffer size, is the plan to only dump C code for
last few lines or for everything?

> PS
> A year ago He Kuang reported that dwarf emitted by bpf llvm backend is broken.
> Sorry it took so long to fix. It's probably still broken on big endian,
> since I've only tested on x86.
>

^ permalink raw reply

* Re: [PATCH RFC v2] ethtool: implement helper to get flow_type value
From: Jakub Kicinski @ 2016-11-29 15:21 UTC (permalink / raw)
  To: Jacob Keller; +Cc: netdev, Intel Wired LAN, David Miller
In-Reply-To: <20161128230343.19110-1-jacob.e.keller@intel.com>

On Mon, 28 Nov 2016 15:03:43 -0800, Jacob Keller wrote:
> Often a driver wants to store the flow type and thus it must mask the
> extra fields. This is a task that could grow more complex as more flags
> are added in the future. Add a helper function that masks the flags for
> marking additional fields.
> 
> Modify drivers in drivers/net/ethernet that currently check for FLOW_EXT
> and FLOW_MAC_EXT to use the helper. Currently this is only the mellanox
> drivers.
> 
> I chose not to modify other drivers as I'm actually unsure whether we
> should always mask the flow type even for drivers which don't recognize
> the newer flags. On the one hand, today's drivers (generally)
> automatically fail when a new flag is used because they won't mask it
> and their checks against flow_type will not match. On the other hand, it
> means another place that you have to update when you begin implementing
> a flag.
> 
> An alternative is to have the driver store a set of flags that it knows
> about, and then have ethtool core do the check for us to discard frames.
> I haven't implemented this quite yet.
> 
> Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
> ---
>  drivers/net/ethernet/mellanox/mlx4/en_ethtool.c         | 4 ++--
>  drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 6 +++---
>  include/uapi/linux/ethtool.h                            | 5 +++++
>  3 files changed, 10 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> index 487a58f9c192..d8f9839ce2a3 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> @@ -1270,7 +1270,7 @@ static int mlx4_en_validate_flow(struct net_device *dev,
>  			return -EINVAL;
>  	}
>  
> -	switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
> +	switch (ethtool_get_flow_spec_type(cmd->fs.flow_type)) {
>  	case TCP_V4_FLOW:
>  	case UDP_V4_FLOW:
>  		if (cmd->fs.m_u.tcp_ip4_spec.tos)
> @@ -1493,7 +1493,7 @@ static int mlx4_en_ethtool_to_net_trans_rule(struct net_device *dev,
>  	if (err)
>  		return err;
>  
> -	switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
> +	switch (ethtool_get_flow_spec_type(cmd->fs.flow_type)) {
>  	case ETHER_FLOW:
>  		spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL);
>  		if (!spec_l2)
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
> index 3691451c728c..066e6c5cf38b 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
> @@ -63,7 +63,7 @@ static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv,
>  	int table_size;
>  	int prio;
>  
> -	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
> +	switch (ethtool_get_flow_spec_type(fs->flow_type)) {
>  	case TCP_V4_FLOW:
>  	case UDP_V4_FLOW:
>  		max_tuples = ETHTOOL_NUM_L3_L4_FTS;
> @@ -147,7 +147,7 @@ static int set_flow_attrs(u32 *match_c, u32 *match_v,
>  					     outer_headers);
>  	void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
>  					     outer_headers);
> -	u32 flow_type = fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT);
> +	u32 flow_type = ethtool_get_flow_spec_type(fs->flow_type);
>  	struct ethtool_tcpip4_spec *l4_mask;
>  	struct ethtool_tcpip4_spec *l4_val;
>  	struct ethtool_usrip4_spec *l3_mask;
> @@ -393,7 +393,7 @@ static int validate_flow(struct mlx5e_priv *priv,
>  	    fs->ring_cookie != RX_CLS_FLOW_DISC)
>  		return -EINVAL;
>  
> -	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
> +	switch (ethtool_get_flow_spec_type(fs->flow_type)) {
>  	case ETHER_FLOW:
>  		eth_mask = &fs->m_u.ether_spec;
>  		if (!is_zero_ether_addr(eth_mask->h_dest))
> diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
> index f0db7788f887..e92ad725c9d0 100644
> --- a/include/uapi/linux/ethtool.h
> +++ b/include/uapi/linux/ethtool.h
> @@ -1583,6 +1583,11 @@ static inline int ethtool_validate_duplex(__u8 duplex)
>  #define	FLOW_EXT	0x80000000
>  #define	FLOW_MAC_EXT	0x40000000
>  
> +static inline __u32 ethtool_get_flow_spec_type(__u32 flow_type)
> +{
> +	return flow_type & (FLOW_EXT | FLOW_MAC_EXT);

I don't have anything of substance to say but I think you are missing a
negation (~) here compared to the code you are replacing ;)

^ permalink raw reply

* [PATCH] tun: Use netif_receive_skb instead of netif_rx
From: Andrey Konovalov @ 2016-11-29 15:25 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Jason Wang, Eric Dumazet,
	Paolo Abeni, Michael S . Tsirkin, Soheil Hassas Yeganeh,
	Markus Elfring, Mike Rapoport, netdev, linux-kernel
  Cc: Dmitry Vyukov, Kostya Serebryany, syzkaller, Andrey Konovalov

This patch changes tun.c to call netif_receive_skb instead of netif_rx
when a packet is received. The difference between the two is that netif_rx
queues the packet into the backlog, and netif_receive_skb proccesses the
packet in the current context.

This patch is required for syzkaller [1] to collect coverage from packet
receive paths, when a packet being received through tun (syzkaller collects
coverage per process in the process context).

A similar patch was introduced back in 2010 [2, 3], but the author found
out that the patch doesn't help with the task he had in mind (for cgroups
to shape network traffic based on the original process) and decided not to
go further with it. The main concern back then was about possible stack
exhaustion with 4K stacks, but CONFIG_4KSTACKS was removed and stacks are
8K now.

[1] https://github.com/google/syzkaller

[2] https://www.spinics.net/lists/netdev/thrd440.html#130570

[3] https://www.spinics.net/lists/netdev/msg130570.html

Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
---
 drivers/net/tun.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8093e39..4b56e91 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1304,7 +1304,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	skb_probe_transport_header(skb, 0);

 	rxhash = skb_get_hash(skb);
-	netif_rx_ni(skb);
+	local_bh_disable();
+	netif_receive_skb(skb);
+	local_bh_enable();

 	stats = get_cpu_ptr(tun->pcpu_stats);
 	u64_stats_update_begin(&stats->syncp);
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* Re: [PATCH] ipv6:ip6_xmit remove unnecessary np NULL check
From: Eric Dumazet @ 2016-11-29 15:26 UTC (permalink / raw)
  To: Manjeet Pawar
  Cc: davem, kuznet, jmorris, yoshfuji, kaber, netdev, linux-kernel,
	pankaj.m, ajeet.y, Rohit Thapliyal
In-Reply-To: <1480401179-9220-1-git-send-email-manjeet.p@samsung.com>

On Tue, 2016-11-29 at 12:02 +0530, Manjeet Pawar wrote:
> From: Rohit Thapliyal <r.thapliyal@samsung.com>
> 
> np NULL check doesn't seem required here as it shall never
> be NULL anyways in inet6_sk(sk).
> 
> Signed-off-by: Rohit Thapliyal <r.thapliyal@samsung.com>
> Signed-off-by: Manjeet Pawar <manjeet.p@samsung.com>
> Signed-off-by: David Miller <davem@davemloft.net>
> Reviewed-by: Akhilesh Kumar <akhilesh.k@samsung.com>
> 
> ---
> v2->v3: Modified as per the suggestion from David Miller
>         ip6_xmit calls are made without checking NULL np
>         pointer, so no need to explicitly check NULL np in
>         ip6_xmit.
> 
>  include/linux/ipv6.h  | 2 +-
>  net/ipv6/ip6_output.c | 3 +--
>  2 files changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
> index a064997..6c9c604 100644
> --- a/include/linux/ipv6.h
> +++ b/include/linux/ipv6.h
> @@ -299,7 +299,7 @@ struct tcp6_timewait_sock {
>  
>  static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk)
>  {
> -	return sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL;
> +	return inet_sk(__sk)->pinet6;


David suggestion was about np being NULL or not in ip6_xmit()

But have you checked inet6_sk() was never called for a TCPv6 TIMEWAIT or
SYN_RECV request ?

>  }
>  
>  static inline struct raw6_sock *raw6_sk(const struct sock *sk)
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index 59eb4ed..f8c63ec 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -213,8 +213,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
>  	/*
>  	 *	Fill in the IPv6 header
>  	 */
> -	if (np)
> -		hlimit = np->hop_limit;
> +	hlimit = np->hop_limit;
>  	if (hlimit < 0)
>  		hlimit = ip6_dst_hoplimit(dst);
>  

This part is fine.

^ permalink raw reply

* Re: net: GPF in eth_header
From: Andrey Konovalov @ 2016-11-29 15:31 UTC (permalink / raw)
  To: syzkaller
  Cc: Dmitry Vyukov, David Miller, Tom Herbert, Alexander Duyck,
	Hannes Frederic Sowa, Jiri Benc, Sabrina Dubroca, netdev, LKML
In-Reply-To: <1480431501.18162.131.camel@edumazet-glaptop3.roam.corp.google.com>

On Tue, Nov 29, 2016 at 3:58 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2016-11-29 at 11:26 +0100, Andrey Konovalov wrote:
>> On Sat, Nov 26, 2016 at 9:05 PM, Eric Dumazet <erdlkml@gmail.com> wrote:
>> >> I actually see multiple places where skb_network_offset() is used as
>> >> an argument to skb_pull().
>> >> So I guess every place can potentially be buggy.
>> >
>> > Well, I think the intent is to accept a negative number.
>>
>> I'm not sure that was the intent since it results in a signedness
>> issue which leads to an out-of-bounds.
>>
>
> Hey, I already mentioned where was the bug.
>
> You missed the investigation where I pointed it to FLorian ?
>
>> A quick grep shows that the same issue can potentially happen in
>> multiple places across the kernel:
>>
>> net/ipv6/ip6_output.c:1655: __skb_pull(skb, skb_network_offset(skb));
>> net/packet/af_packet.c:2043: skb_pull(skb, skb_network_offset(skb));
>> net/packet/af_packet.c:2165: skb_pull(skb, skb_network_offset(skb));
>> net/core/neighbour.c:1301: __skb_pull(skb, skb_network_offset(skb));
>> net/core/neighbour.c:1331: __skb_pull(skb, skb_network_offset(skb));
>> net/core/dev.c:3157: __skb_pull(skb, skb_network_offset(skb));
>> net/sched/sch_teql.c:337: __skb_pull(skb, skb_network_offset(skb));
>> net/sched/sch_atm.c:479: skb_pull(skb, skb_network_offset(skb));
>> net/ipv4/ip_output.c:1385: __skb_pull(skb, skb_network_offset(skb));
>> net/ipv4/ip_fragment.c:391: if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
>> drivers/net/vxlan.c:1440: __skb_pull(reply, skb_network_offset(reply));
>> drivers/net/vxlan.c:1902: __skb_pull(skb, skb_network_offset(skb));
>> drivers/net/vrf.c:220: __skb_pull(skb, skb_network_offset(skb));
>> drivers/net/vrf.c:314: __skb_pull(skb, skb_network_offset(skb));
>>
>> A similar thing also happened to somebody else (on a receive path!):
>> https://forums.grsecurity.net/viewtopic.php?f=3&t=4550
>>
>> Does it make sense to check skb_network_offset() before passing it to
>> skb_pull() everywhere?
>
> Well, sure, we could add safety checks everywhere and slow the kernel
> when debugging is requested.
>
> But skb_network_offset() is not the problem here. Why are you focusing
> on it ?
>
> The real problem is in __skb_pull() or __skb_push() and all similar
> helpers. Lots of added checks and slowdowns.

The issue is not with skb_network_offset(), but with __skb_pull()
using skb_network_offset() as an argument.

I'm not sure what would be the beast way to fix this, to add a check
before every __skb_pull(skb_network_offset()), to fix __skb_pull() to
work with signed ints, to add BUG_ON()'s in __skb_pull, or something
else.

What I meant is that you fixed this very instance of the bug, and I'm
pointing out that a similar one might hit us again.

>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 9c535fbccf2c7dbfae04cee393460e86d588c26b..d6116e37d054fc1536114347ed3c41fc7dc7a882 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -1923,6 +1923,7 @@ static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len)
>  unsigned char *skb_push(struct sk_buff *skb, unsigned int len);
>  static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len)
>  {
> +       BUG_ON((int)len < 0);
>         skb->data -= len;
>         skb->len  += len;
>         return skb->data;
> @@ -1931,6 +1932,7 @@ static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len)
>  unsigned char *skb_pull(struct sk_buff *skb, unsigned int len);
>  static inline unsigned char *__skb_pull(struct sk_buff *skb, unsigned int len)
>  {
> +       BUG_ON((int)len < 0);
>         skb->len -= len;
>         BUG_ON(skb->len < skb->data_len);
>         return skb->data += len;
> @@ -1938,6 +1940,7 @@ static inline unsigned char *__skb_pull(struct sk_buff *skb, unsigned int len)
>
>  static inline unsigned char *skb_pull_inline(struct sk_buff *skb, unsigned int len)
>  {
> +       BUG_ON((int)len < 0);
>         return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len);
>  }
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index d1d1a5a5ad24ded523fc12ffba8c602b03bd0830..7bf098c848fd857ba5d287fc91d43f62f381bd55 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1450,6 +1450,7 @@ EXPORT_SYMBOL(skb_put);
>   */
>  unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
>  {
> +       BUG_ON((int)len < 0);
>         skb->data -= len;
>         skb->len  += len;
>         if (unlikely(skb->data<skb->head))
>
>
>
>
>
>
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller+unsubscribe@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* Re: adm80211: Removed unused 'io_addr' 'mem_addr' variables
From: Kalle Valo @ 2016-11-29 15:32 UTC (permalink / raw)
  To: Kirtika Ruchandani
  Cc: Arnd Bergmann, Johannes Berg,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20161124064045.GA8836-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Kirtika Ruchandani <kirtika.ruchandani-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> Initial commit cc0b88cf5ecf ([PATCH] Add adm8211 802.11b wireless driver)
> introduced variables mem_addr and io_addr in adm80211_probe() that are
> set but not used. Compiling with W=1 gives the following warnings,
> fix them.
> 
> drivers/net/wireless/admtek/adm8211.c: In function ‘adm8211_probe’:
> drivers/net/wireless/admtek/adm8211.c:1769:15: warning: variable ‘io_addr’ set but not used [-Wunused-but-set-variable]
>   unsigned int io_addr, io_len;
>                ^
> drivers/net/wireless/admtek/adm8211.c:1768:16: warning: variable ‘mem_addr’ set but not used [-Wunused-but-set-variable]
>   unsigned long mem_addr, mem_len;
>                 ^
> 
> These are harmless warnings and are only being fixed to reduce the
> noise with W=1 in the kernel. The calls to pci_resource_start do not
> have any side-effects and are safe to remove.
> 
> Fixes: cc0b88cf5ecf ("[PATCH] Add adm8211 802.11b wireless driver")
> Cc: Michael Wu <flamingice-R9e9/4HEdknk1uMJSBkQmQ@public.gmane.org>
> Cc: John W. Linville <linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>
> Signed-off-by: Kirtika Ruchandani <kirtika-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>

The patch failed to apply:

fatal: corrupt patch at line 14
Applying: adm80211: Removed unused 'io_addr' 'mem_addr' variables
Repository lacks necessary blobs to fall back on 3-way merge.
Cannot fall back to three-way merge.
Patch failed at 0001 adm80211: Removed unused 'io_addr' 'mem_addr' variables

Patch set to Changes Requested.

-- 
https://patchwork.kernel.org/patch/9444901/

Documentation about submitting wireless patches and checking status
from patchwork:

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH net-next 1/3] ethtool: (uapi) Add ETHTOOL_PHY_LOOPBACK to PHY tunables
From: Allan W. Nielsen @ 2016-11-29 15:32 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: Florian Fainelli, netdev, raju.lakkaraju
In-Reply-To: <20161129004653.GA24782@lunn.ch>


On 28/11/16 21:21, Andrew Lunn wrote:
> On Mon, Nov 28, 2016 at 08:23:06PM +0100, Allan W. Nielsen wrote:
> > On 28/11/16 15:14, Andrew Lunn wrote:
> > > On Mon, Nov 28, 2016 at 02:24:30PM +0100, Allan W. Nielsen wrote:
> > > > From: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
> > > > 3 types of PHY loopback are supported.
> > > > i.e. Near-End Loopback, Far-End Loopback and External Loopback.
> > > Is this integrated with ethtool --test? You only want the PHY to go
> > > into loopback mode when running ethtool --test external_lb or maybe
> > > ethtool --test offline.
> > There are other use-cases for enabling PHY loopback:
> > 1) If the PHY is connected to a switch then a loop-port is sometime
> >    used to "force/enable" one or more extra pass through the switch
> >    core. This "hack" can sometime be used to achieve new functionality
> >    with existing silicon.
> With Linux, switches are managed via switchdev, or DSA. You will have
> to teach this infrastructure that something really odd is going on
> with one of its ports before you do anything like this in the PHY
> layer. I suggest you leave this use case alone until somebody
> really-really wants it. From my knowledge of the Marvell DSA driver,
> this is not easy.
> > 2) Existing user-space application may expect to be able to do the
> >    testing on its own (generate/validate the test traffic).
> Please can you reference some existing user-space application and the
> kernel API it uses to put the PHY into loopback mode?
The application I had in mind, is the switch application that I'm normally
working on (a product of MSCC). This application was originally written for
eCos, but is now moved to Linux. The application is currently doing almost
all drivers in user-space (reaching the HW through a UIO driver). This
means that we have an existing set of APIs and associated applications which
among many other things tests the PHYs using loopback facilities. There are
also cases where the loopports are being used as I described earlier.

We are looking at moving some of the drivers into the kernel, and that
require us to find a solution to such issues (nothing have been decided,
and nothing will be decided anytime soon).

I also understand your point of view, you have presented pretty good
arguments, and I do not expect this to change your view on this topic.

On 29/11/16 01:46, Andrew Lunn wrote:
> > > If you really do what to do this, you should look at NETIF_F_LOOPBACK
> > > and consider how this concept can be applied at the PHY, not the MAC.
> > > But you need the full concept, so you see things like:
> > >
> > > 2: eth0: <LOOPBACK,BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000
> > >     link/ether 80:ee:73:83:60:27 brd ff:ff:ff:ff:ff:ff
> > >
> > > Humm, i've no idea how you actually enable the MAC loopback
> > > NETIF_F_LOOPBACK represents. I don't see anything with ip link set.
> >
> > I am afraid you lost me on this, NETIF_F_LOOPBACK is a netdev_features_t
> > bit, so it tells ethtool that this is a potential feature to be turned
> > on with ethtool -K <iface>.
> Yep, i'm talking rubbish after jumping to a wrong conclusion.
> > The semantics of this loopack feature are
> > not defined AFAICT, but a reasonable behavior from the driver is to put
> > itself in a mode where packets send by a socket-level application are
> > looped through the Ethernet adapter itself. Whether this happens at the
> > DMA level, the MII signals, or somewhere in the PHY, is driver specific
> > unfortunately.
> 
> Yes, the interesting one might be forcedeth which writes to the PHY
> BMCR_LOOPBACK | BMCR_FULLDPLX | BMCR_SPEED1000;
> when the NETIF_F_LOOPBACK feature is set.
> 
> Maybe this could be generalised and made available for all MACs which
> don't support MAC loopback?
> 
> What needs considering is the correct duplex and speed. We need to
> ensure the MAC and PHY agree on this.
> 
> > Now, there is another way to toggle a loopback for a given Ethernet
> > adapter which is to actually set IFF_LOOPBACK in dev->flags for this
> > interface. Some drivers seem to be able to properly react to that as
> > well, although I see no way this can be done looking at the iproute2 or
> > ifconfig man pages..
> 
> I prefer this one, since i expect this will set LOOPBACK in
> 
> 2: eth0: <LOOPBACK,BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000
> 
> making it lot more obvious something funny is going on.
Raju and I will need to dive deeper into this to understand the details of
what you are suggesting. But I think it points in a different direction...

The approach you are describing is to use existing flags (which will make
the loop-back state visible to the user, which is a good thing) to set the
loopback state. Where/how the frames are looped depends on the drivers. The
suggested tunable could be kept, but it will only allow the user to say,
"if the frames are looped in the PHY, then this is how I want them to be
looped".

Also, it does not make a lot of sense for the "FAR" loopback patch (which I
admit is a bit strange...).

The facility we are seeking is much more specific: "Allow the user to bring
the PHY in and out of specific loopback modes" assuming that the user have
reasons to do so.

I'm not sure if your main dislike with this feature is the lack of
transparency/visibility to the end-user, or if is the concept of allowing
the user to control where and how frames are looped (or both).

Best regards
Allan W. Nielsen

^ permalink raw reply

* Re: [PATCH] tun: Use netif_receive_skb instead of netif_rx
From: Eric Dumazet @ 2016-11-29 15:35 UTC (permalink / raw)
  To: Andrey Konovalov, Peter Klausler
  Cc: Herbert Xu, David S . Miller, Jason Wang, Eric Dumazet,
	Paolo Abeni, Michael S . Tsirkin, Soheil Hassas Yeganeh,
	Markus Elfring, Mike Rapoport, netdev, linux-kernel,
	Dmitry Vyukov, Kostya Serebryany, syzkaller
In-Reply-To: <1480433136-7922-1-git-send-email-andreyknvl@google.com>

On Tue, 2016-11-29 at 16:25 +0100, Andrey Konovalov wrote:
> This patch changes tun.c to call netif_receive_skb instead of netif_rx
> when a packet is received. The difference between the two is that netif_rx
> queues the packet into the backlog, and netif_receive_skb proccesses the
> packet in the current context.
> 
> This patch is required for syzkaller [1] to collect coverage from packet
> receive paths, when a packet being received through tun (syzkaller collects
> coverage per process in the process context).
> 
> A similar patch was introduced back in 2010 [2, 3], but the author found
> out that the patch doesn't help with the task he had in mind (for cgroups
> to shape network traffic based on the original process) and decided not to
> go further with it. The main concern back then was about possible stack
> exhaustion with 4K stacks, but CONFIG_4KSTACKS was removed and stacks are
> 8K now.

Acked-by: Eric Dumazet <edumazet@google.com>

We're using a similar patch written by Peter , let me copy here part of
his changelog, since main motivation was speed improvement at that
time :

commit 29aa09f47d43e93327a706cd835a37012ccc5b9e
Author: Peter Klausler <pmk@google.com>
Date:   Fri Mar 29 17:08:02 2013 -0400

    net-tun: Add netif_rx_ni_immediate() variant to speed up tun/tap.
    
    Speed up packet reception from (i.e., writes to) a tun/tap
    device by adding an alternative netif_rx_ni_immediate()
    interface that invokes netif_receive_skb() immediately rather
    than enqueueing the packet in the backlog queue and then driving
    its processing with do_softirq().  Forced queueing as a consequence
    of an RPS CPU mask will still work as expected.
    
    This change speeds up my closed-loop single-stream tap/OVS benchmark
    by about 23%, from 700k packets/second to 867k packets/second.
    

^ permalink raw reply

* Re: bpf debug info
From: Jakub Kicinski @ 2016-11-29 15:38 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Alexei Starovoitov, netdev, Brenden Blanco, Thomas Graf, Wangnan,
	He Kuang, kernel-team
In-Reply-To: <583D9AA4.8050601@iogearbox.net>

On Tue, 29 Nov 2016 16:11:32 +0100, Daniel Borkmann wrote:
> On 11/29/2016 07:42 AM, Alexei Starovoitov wrote:
> [...]
> > The support for debug information in BPF was recently added to llvm.
> >
> > In order to use it recompile bpf programs with the following patch
> > in samples/bpf/Makefile
> > @@ -155,4 +155,4 @@ $(obj)/%.o: $(src)/%.c
> >          $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
> >                  -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
> >                  -Wno-compare-distinct-pointer-types \
> > -               -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
> > +               -O2 -emit-llvm -g -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
> >
> > and compiled .o files can be consumed by standard llvm-objdump utility.
> >
> > $ llvm-objdump -S -no-show-raw-insn samples/bpf/xdp1_kern.o
> > xdp1_kern.o:    file format ELF64-BPF
> >
> > Disassembly of section xdp1:
> > xdp_prog1:
> > ; {
> >         0:       r2 = *(u32 *)(r1 + 4)
> > ; void *data = (void *)(long)ctx->data;
> >         8:       r1 = *(u32 *)(r1 + 0)
> > ; if (data + nh_off > data_end)
> >        10:       r3 = r1
> >        18:       r3 += 14
> >        20:       if r3 > r2 goto 55
> > ; h_proto = eth->h_proto;
> >        28:       r3 = *(u8 *)(r1 + 12)
> >        30:       r4 = *(u8 *)(r1 + 13)
> >        38:       r4 <<= 8
> >        40:       r4 |= r3
> > ; if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
> >        48:       if r4 == 43144 goto 2
> >        50:       r3 = 14
> >        58:       if r4 != 129 goto 5
> >
> > LBB0_3:
> > ; if (data + nh_off > data_end)
> >        60:       r3 = r1
> >        68:       r3 += 18
> >        70:       if r3 > r2 goto 45
> >        78:       r3 = 18
> > ; h_proto = vhdr->h_vlan_encapsulated_proto;
> >        80:       r4 = *(u16 *)(r1 + 16)
> >
> > LBB0_5:
> >        88:       r5 = r4
> >        90:       r5 &= 65535
> > ; if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
> >        98:       if r5 == 43144 goto 1
> >        a0:       if r5 != 129 goto 9
> >
> > Notice that 'clang -S -o a.s' output and llvm-objdump disassembler
> > were changed to use kernel verifier style, so now it should be easier
> > to see what's going on.  

This sounds super useful!  Thanks a lot!

>>> [...]
> > So next step is to improve verifier messages to be more human friendly.
> > The step after is to introduce BPF_COMMENT pseudo instruction
> > that will be ignored by the interpreter yet it will contain the text
> > of original source code. Then llvm-objdump step won't be necessary.
> > The bpf loader will load both instructions and pieces of C sources.
> > Then verifier errors should be even easier to read and humans
> > can easily understand the purpose of the program.  
> 
> So the BPF_COMMENT pseudo insn will get stripped away from the insn array
> after verification step, so we don't need to hold/account for this mem? I
> assume in it's ->imm member it will just hold offset into text blob?

Associating any form of opaque data with programs always makes me
worried about opening a side channel of communication with a specialized
user space implementations/compilers.  But I guess if the BPF_COMMENTs
are stripped in the verifier as Daniel assumes drivers and JITs will
never see it.

Just to clarify, however - is there any reason why pushing the source
code into the kernel is necessary?  Or is it just for convenience?
Provided the user space loader has access to the debug info it should
have no problems matching the verifier output to code lines?

^ permalink raw reply

* [PATCH net-next 2/2] tcp: allow to turn tcp timestamp randomization off
From: Florian Westphal @ 2016-11-29 15:45 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <1480434342-12367-1-git-send-email-fw@strlen.de>

Eric says: "By looking at tcpdump, and TS val of xmit packets of multiple
flows, we can deduct the relative qdisc delays (think of fq pacing).
This should work even if we have one flow per remote peer."

Having random per flow (or host) offsets doesn't allow that anymore so add
a way to turn this off.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Documentation/networking/ip-sysctl.txt |  9 +++++++--
 net/ipv4/tcp_input.c                   | 10 +++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 5af48dd7c5fc..de2448313799 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -610,8 +610,13 @@ tcp_syn_retries - INTEGER
 	with the current initial RTO of 1second. With this the final timeout
 	for an active TCP connection attempt will happen after 127seconds.
 
-tcp_timestamps - BOOLEAN
-	Enable timestamps as defined in RFC1323.
+tcp_timestamps - INTEGER
+Enable timestamps as defined in RFC1323.
+	0: Disabled.
+	1: Enable timestamps as defined in RFC1323.
+	2: Like 1, but also use a random offset for each connection
+	rather than only using the current time.
+	Default: 2
 
 tcp_min_tso_segs - INTEGER
 	Minimal number of segments per TSO frame.
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1b1921c71f7c..ebed73703198 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -76,7 +76,7 @@
 #include <asm/unaligned.h>
 #include <linux/errqueue.h>
 
-int sysctl_tcp_timestamps __read_mostly = 1;
+int sysctl_tcp_timestamps __read_mostly = 2;
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
 int sysctl_tcp_fack __read_mostly = 1;
@@ -6323,10 +6323,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
-	if (isn && tmp_opt.tstamp_ok)
+	if (isn && tmp_opt.tstamp_ok && sysctl_tcp_timestamps == 2)
 		af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
 
 	if (!want_cookie && !isn) {
+		u32 ts_off;
+
 		/* VJ's idea. We save last timestamp seen
 		 * from the destination in peer table, when entering
 		 * state TIME-WAIT, and check against it before
@@ -6366,7 +6368,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			goto drop_and_release;
 		}
 
-		isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+		isn = af_ops->init_seq(skb, &ts_off);
+		if (sysctl_tcp_timestamps == 2)
+			tcp_rsk(req)->ts_off = ts_off;
 	}
 	if (!dst) {
 		dst = af_ops->route_req(sk, &fl, req, NULL);
-- 
2.7.3

^ permalink raw reply related

* [PATCH net-next 1/2] tcp: randomize tcp timestamp offsets for each connection
From: Florian Westphal @ 2016-11-29 15:45 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal, Mirja Kühlewind

jiffies based timestamps allow for easy inference of number of devices
behind NAT translators and also makes tracking of hosts simpler.

commit ceaa1fef65a7c2e ("tcp: adding a per-socket timestamp offset")
added the main infrastructure that is needed for per-connection ts
randomization, in particular writing/reading the on-wire tcp header
format takes the offset into account so rest of stack can use normal
tcp_time_stamp (jiffies).

So only two items are left:
 - add a tsoffset for request sockets
 - extend the tcp isn generator to also return another 32bit number
   in addition to the ISN.

Re-use of ISN generator also means timestamps are still monotonically
increasing for same connection quadruple, i.e. PAWS will still work.

Includes fixes from Eric Dumazet.

Cc: Mirja Kühlewind <mirja.kuehlewind@tik.ee.ethz.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/tcp.h      |  1 +
 include/net/secure_seq.h |  8 ++++----
 include/net/tcp.h        |  2 +-
 net/core/secure_seq.c    | 10 ++++++----
 net/ipv4/syncookies.c    |  1 +
 net/ipv4/tcp_input.c     |  7 ++++++-
 net/ipv4/tcp_ipv4.c      |  9 +++++----
 net/ipv4/tcp_minisocks.c |  4 +++-
 net/ipv4/tcp_output.c    |  2 +-
 net/ipv6/syncookies.c    |  1 +
 net/ipv6/tcp_ipv6.c      | 10 ++++++----
 11 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 32a7c7e35b71..2408bcc579f1 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -123,6 +123,7 @@ struct tcp_request_sock {
 	u32				txhash;
 	u32				rcv_isn;
 	u32				snt_isn;
+	u32				ts_off;
 	u32				last_oow_ack_time; /* last SYNACK */
 	u32				rcv_nxt; /* the ack # by SYNACK. For
 						  * FastOpen it's the seq#
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index 3f36d45b714a..0caee631a836 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -6,10 +6,10 @@
 u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
 u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport);
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
-				 __be16 sport, __be16 dport);
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
-				   __be16 sport, __be16 dport);
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+			       __be16 sport, __be16 dport, u32 *tsoff);
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+				 __be16 sport, __be16 dport, u32 *tsoff);
 u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport);
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7de80739adab..1c09d909bd43 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1809,7 +1809,7 @@ struct tcp_request_sock_ops {
 	struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
 				       const struct request_sock *req,
 				       bool *strict);
-	__u32 (*init_seq)(const struct sk_buff *skb);
+	__u32 (*init_seq)(const struct sk_buff *skb, u32 *tsoff);
 	int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
 			   struct flowi *fl, struct request_sock *req,
 			   struct tcp_fastopen_cookie *foc,
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index fd3ce461fbe6..a8d6062cbb4a 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -40,8 +40,8 @@ static u32 seq_scale(u32 seq)
 #endif
 
 #if IS_ENABLED(CONFIG_IPV6)
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
-				   __be16 sport, __be16 dport)
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+				 __be16 sport, __be16 dport, u32 *tsoff)
 {
 	u32 secret[MD5_MESSAGE_BYTES / 4];
 	u32 hash[MD5_DIGEST_WORDS];
@@ -58,6 +58,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
 
 	md5_transform(hash, secret);
 
+	*tsoff = hash[1];
 	return seq_scale(hash[0]);
 }
 EXPORT_SYMBOL(secure_tcpv6_sequence_number);
@@ -86,8 +87,8 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 
 #ifdef CONFIG_INET
 
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
-				 __be16 sport, __be16 dport)
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+			       __be16 sport, __be16 dport, u32 *tsoff)
 {
 	u32 hash[MD5_DIGEST_WORDS];
 
@@ -99,6 +100,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
 
 	md5_transform(hash, net_secret);
 
+	*tsoff = hash[1];
 	return seq_scale(hash[0]);
 }
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 0dc6286272aa..3e88467d70ee 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -334,6 +334,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	treq = tcp_rsk(req);
 	treq->rcv_isn		= ntohl(th->seq) - 1;
 	treq->snt_isn		= cookie;
+	treq->ts_off		= 0;
 	req->mss		= mss;
 	ireq->ir_num		= ntohs(th->dest);
 	ireq->ir_rmt_port	= th->source;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 22e6a2097ff6..1b1921c71f7c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6301,6 +6301,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop;
 
 	tcp_rsk(req)->af_specific = af_ops;
+	tcp_rsk(req)->ts_off = 0;
 
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6322,6 +6323,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
+	if (isn && tmp_opt.tstamp_ok)
+		af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+
 	if (!want_cookie && !isn) {
 		/* VJ's idea. We save last timestamp seen
 		 * from the destination in peer table, when entering
@@ -6362,7 +6366,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			goto drop_and_release;
 		}
 
-		isn = af_ops->init_seq(skb);
+		isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
 	}
 	if (!dst) {
 		dst = af_ops->route_req(sk, &fl, req, NULL);
@@ -6374,6 +6378,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	if (want_cookie) {
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+		tcp_rsk(req)->ts_off = 0;
 		req->cookie_ts = tmp_opt.tstamp_ok;
 		if (!tmp_opt.tstamp_ok)
 			inet_rsk(req)->ecn_ok = 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5555eb86e549..b50f05905ced 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -95,12 +95,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 struct inet_hashinfo tcp_hashinfo;
 EXPORT_SYMBOL(tcp_hashinfo);
 
-static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
 {
 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 					  ip_hdr(skb)->saddr,
 					  tcp_hdr(skb)->dest,
-					  tcp_hdr(skb)->source);
+					  tcp_hdr(skb)->source, tsoff);
 }
 
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -237,7 +237,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 							   inet->inet_daddr,
 							   inet->inet_sport,
-							   usin->sin_port);
+							   usin->sin_port,
+							   &tp->tsoffset);
 
 	inet->inet_id = tp->write_seq ^ jiffies;
 
@@ -824,7 +825,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	tcp_v4_send_ack(sk, skb, seq,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
-			tcp_time_stamp,
+			tcp_time_stamp + tcp_rsk(req)->ts_off,
 			req->ts_recent,
 			0,
 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6234ebaa7db1..28ce5ee831f5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -532,7 +532,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 			newtp->rx_opt.ts_recent_stamp = 0;
 			newtp->tcp_header_len = sizeof(struct tcphdr);
 		}
-		newtp->tsoffset = 0;
+		newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
 		newtp->md5sig_info = NULL;	/*XXX*/
 		if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -581,6 +581,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
+			if (tmp_opt.rcv_tsecr)
+				tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
 			/* We do not store true stamp, but it is not required,
 			 * it can be estimated (approximately)
 			 * from another data.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 19105b46a304..1b6d5f34bf45 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -640,7 +640,7 @@ static unsigned int tcp_synack_options(struct request_sock *req,
 	}
 	if (likely(ireq->tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp(skb);
+		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
 		opts->tsecr = req->ts_recent;
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 97830a6a9cbb..a4d49760bf43 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -209,6 +209,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	treq->snt_synack.v64	= 0;
 	treq->rcv_isn = ntohl(th->seq) - 1;
 	treq->snt_isn = cookie;
+	treq->ts_off = 0;
 
 	/*
 	 * We need to lookup the dst_entry to get the correct window size.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 28ec0a2e7b72..a2185a214abc 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -101,12 +101,12 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 	}
 }
 
-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v6_init_sequence(const struct sk_buff *skb, u32 *tsoff)
 {
 	return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
 					    ipv6_hdr(skb)->saddr.s6_addr32,
 					    tcp_hdr(skb)->dest,
-					    tcp_hdr(skb)->source);
+					    tcp_hdr(skb)->source, tsoff);
 }
 
 static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
@@ -283,7 +283,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
 							     sk->sk_v6_daddr.s6_addr32,
 							     inet->inet_sport,
-							     inet->inet_dport);
+							     inet->inet_dport,
+							     &tp->tsoffset);
 
 	err = tcp_connect(sk);
 	if (err)
@@ -956,7 +957,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
-			tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
+			tcp_time_stamp + tcp_rsk(req)->ts_off,
+			req->ts_recent, sk->sk_bound_dev_if,
 			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
 			0, 0);
 }
-- 
2.7.3

^ permalink raw reply related

* Re: [PATCH v2 net-next 10/11] qede: Add basic XDP support
From: Daniel Borkmann @ 2016-11-29 15:48 UTC (permalink / raw)
  To: Yuval Mintz; +Cc: davem, netdev, alexei.starovoitov
In-Reply-To: <1480430830-17671-11-git-send-email-Yuval.Mintz@cavium.com>

On 11/29/2016 03:47 PM, Yuval Mintz wrote:
> Add support for the ndo_xdp callback. This patch would support XDP_PASS,
> XDP_DROP and XDP_ABORTED commands.
>
> This also adds a per Rx queue statistic which counts number of packets
> which didn't reach the stack [due to XDP].
>
> Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
[...]
> @@ -1560,6 +1593,7 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
>   			       struct qede_fastpath *fp,
>   			       struct qede_rx_queue *rxq)
>   {
> +	struct bpf_prog *xdp_prog = READ_ONCE(rxq->xdp_prog);
>   	struct eth_fast_path_rx_reg_cqe *fp_cqe;
>   	u16 len, pad, bd_cons_idx, parse_flag;
>   	enum eth_rx_cqe_type cqe_type;
> @@ -1596,6 +1630,11 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
>   	len = le16_to_cpu(fp_cqe->len_on_first_bd);
>   	pad = fp_cqe->placement_offset;
>
> +	/* Run eBPF program if one is attached */
> +	if (xdp_prog)
> +		if (!qede_rx_xdp(edev, fp, rxq, xdp_prog, bd, fp_cqe))
> +			return 1;
> +

You also need to wrap this under rcu_read_lock() (at least I haven't seen
it in your patches) for same reasons as stated in 326fe02d1ed6 ("net/mlx4_en:
protect ring->xdp_prog with rcu_read_lock"), as otherwise xdp_prog could
disappear underneath you. mlx4 and nfp does it correctly, looks like mlx5
doesn't.

^ permalink raw reply

* Re: [PATCH v2 07/13] net: ethernet: ti: cpts: rework initialization/deinitialization
From: Grygorii Strashko @ 2016-11-29 15:50 UTC (permalink / raw)
  To: Richard Cochran
  Cc: David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA, Mugunthan V N,
	Sekhar Nori, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-omap-u79uwXL29TY76Z2rM5mHXA, Rob Herring,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Murali Karicheri, Wingman Kwok
In-Reply-To: <20161129100737.GF3110-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>

Hi Richard,

On 11/29/2016 04:07 AM, Richard Cochran wrote:
> On Mon, Nov 28, 2016 at 05:03:31PM -0600, Grygorii Strashko wrote:
>> +int cpts_register(struct cpts *cpts)
>>  {
>>  	int err, i;
>>  
>> -	cpts->info = cpts_info;
>> -	spin_lock_init(&cpts->lock);
>> -
>> -	cpts->cc.read = cpts_systim_read;
>> -	cpts->cc.mask = CLOCKSOURCE_MASK(32);
>> -	cpts->cc_mult = mult;
>> -	cpts->cc.mult = mult;
>> -	cpts->cc.shift = shift;
>> -
>>  	INIT_LIST_HEAD(&cpts->events);
>>  	INIT_LIST_HEAD(&cpts->pool);
>>  	for (i = 0; i < CPTS_MAX_EVENTS; i++)
>>  		list_add(&cpts->pool_data[i].list, &cpts->pool);
>>  
>> -	cpts_clk_init(dev, cpts);
>> +	clk_enable(cpts->refclk);
>> +
>>  	cpts_write32(cpts, CPTS_EN, control);
>>  	cpts_write32(cpts, TS_PEND_EN, int_enable);
>>  
>> +	cpts->cc.mult = cpts->cc_mult;
> 
> It is not clear why you set cc.mult in a different place than
> cc.shift.  That isn't logical, but maybe later patches make it
> clear...

cc.mult has to be reloaded to original value each time CPTS is registered(restarted)
as it can be modified by cpts_ptp_adjfreq().

While cc.shift is static.



> 
>>  	timecounter_init(&cpts->tc, &cpts->cc, ktime_to_ns(ktime_get_real()));
>>  

[...]

>>  }
>>  EXPORT_SYMBOL_GPL(cpts_unregister);
>>  
>> +struct cpts *cpts_create(struct device *dev, void __iomem *regs,
>> +			 u32 mult, u32 shift)
>> +{
>> +	struct cpts *cpts;
>> +
>> +	if (!regs || !dev)
>> +		return ERR_PTR(-EINVAL);
> 
> There is no need for this test, as the caller will always pass valid
> pointers.  (This isn't a user space library!)
> 
ok

-- 
regards,
-grygorii
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH iproute2 net-next 0/4] tc: flower: SCTP and other port fixes
From: Simon Horman @ 2016-11-29 15:51 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Simon Horman

Hi,

this short series:

* Makes some improvements to the documentation of flower;
  A follow-up to recent work by Paul Blakey and myself.
* Corrects some errors introduced when SCTP port matching support was
  recently added.

It applies on top of the following:
* [PATCH iproute2] tc: flower: Fix usage message
* [PATCH iproute2 0/2] tc: flower: Support matching on SCTP port

Simon Horman (4):
  tc: flower: remove references to eth_type in manpage
  tc: flower: document SCTP ip_proto
  tc: flower: correct name of ip_proto parameter to flower_parse_port()
  tc: flower: make use of flower_port_attr_type() safe and silent

 man/man8/tc-flower.8 | 37 ++++++++++++++++++-------------------
 tc/f_flower.c        | 32 +++++++++++++++++---------------
 2 files changed, 35 insertions(+), 34 deletions(-)

-- 
2.7.0.rc3.207.g0ac5344

^ permalink raw reply

* [PATCH iproute2 net-next 1/4] tc: flower: remove references to eth_type in manpage
From: Simon Horman @ 2016-11-29 15:51 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Simon Horman, Paul Blakey
In-Reply-To: <1480434693-29397-1-git-send-email-simon.horman@netronome.com>

Remove references to eth_type and ether_type (spelling error) in
the tc flower manpage.

Also correct formatting of boldface text with whitespace.

Cc: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
This is a follow-up to "tc: flower: Fix usage message"
---
 man/man8/tc-flower.8 | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 16ef261797ab..56db42f983c1 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -103,8 +103,8 @@ or an unsigned 8bit value in hexadecimal format.
 Match on source or destination IP address.
 .I ADDRESS
 must be a valid IPv4 or IPv6 address, depending on
-.BR ether_type ,
-which has to be specified in beforehand.
+.BR protocol
+option of tc filter.
 .TP
 .BI dst_port " NUMBER"
 .TQ
@@ -114,16 +114,15 @@ Match on layer 4 protocol source or destination port number. Only available for
 which has to be specified in beforehand.
 .SH NOTES
 As stated above where applicable, matches of a certain layer implicitly depend
-on the matches of the next lower layer. Precisely, layer one and two matches (
-.BR indev , dst_mac , src_mac " and " eth_type )
-have no dependency, layer three matches (
-.BR ip_proto , dst_ip " and " src_ip )
-require
-.B eth_type
-being set to either
-.BR ipv4 " or " ipv6 ,
-and finally layer four matches (
-.BR dst_port " and " src_port )
+on the matches of the next lower layer. Precisely, layer one and two matches
+(\fBindev\fR,  \fBdst_mac\fR and \fBsrc_mac\fR)
+have no dependency, layer three matches
+(\fBip_proto\fR, \fBdst_ip\fR and \fBsrc_ip\fR)
+depend on the
+.B protocol
+option of tc filter
+and finally layer four matches
+(\fBdst_port\fR and \fBsrc_port\fR)
 depend on
 .B ip_proto
 being set to either
-- 
2.7.0.rc3.207.g0ac5344

^ permalink raw reply related

* [PATCH iproute2 net-next 3/4] tc: flower: correct name of ip_proto parameter to flower_parse_port()
From: Simon Horman @ 2016-11-29 15:51 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Simon Horman
In-Reply-To: <1480434693-29397-1-git-send-email-simon.horman@netronome.com>

This corrects a typo.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
This a follow-up to ("tc: flower: Support matching on SCTP ports")
---
 tc/f_flower.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tc/f_flower.c b/tc/f_flower.c
index c74a0244e244..bca910d95729 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -157,15 +157,15 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
 	return 0;
 }
 
-static int flower_port_attr_type(__u8 ip_port, bool is_src)
+static int flower_port_attr_type(__u8 ip_proto, bool is_src)
 {
-	if (ip_port == IPPROTO_TCP) {
+	if (ip_proto == IPPROTO_TCP) {
 		return is_src ? TCA_FLOWER_KEY_TCP_SRC :
 			TCA_FLOWER_KEY_TCP_DST;
-	} else if (ip_port == IPPROTO_UDP) {
+	} else if (ip_proto == IPPROTO_UDP) {
 		return is_src ? TCA_FLOWER_KEY_UDP_SRC :
 			TCA_FLOWER_KEY_UDP_DST;
-	} else if (ip_port == IPPROTO_SCTP) {
+	} else if (ip_proto == IPPROTO_SCTP) {
 		return is_src ? TCA_FLOWER_KEY_SCTP_SRC :
 			TCA_FLOWER_KEY_SCTP_DST;
 	} else {
@@ -174,14 +174,14 @@ static int flower_port_attr_type(__u8 ip_port, bool is_src)
 	}
 }
 
-static int flower_parse_port(char *str, __u8 ip_port, bool is_src,
+static int flower_parse_port(char *str, __u8 ip_proto, bool is_src,
 			     struct nlmsghdr *n)
 {
 	int ret;
 	int type;
 	__be16 port;
 
-	type = flower_port_attr_type(ip_port, is_src);
+	type = flower_port_attr_type(ip_proto, is_src);
 	if (type < 0)
 		return -1;
 
-- 
2.7.0.rc3.207.g0ac5344

^ permalink raw reply related

* [PATCH iproute2 net-next 2/4] tc: flower: document SCTP ip_proto
From: Simon Horman @ 2016-11-29 15:51 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Simon Horman
In-Reply-To: <1480434693-29397-1-git-send-email-simon.horman@netronome.com>

Add SCTP ip_proto to help text and man page.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
This a follow-up to ("tc: flower: Support matching on SCTP ports")
---
 man/man8/tc-flower.8 | 14 +++++++-------
 tc/f_flower.c        |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 56db42f983c1..a401293fed50 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -29,7 +29,7 @@ flower \- flow based traffic control filter
 .IR PRIORITY " | "
 .BR vlan_eth_type " { " ipv4 " | " ipv6 " | "
 .IR ETH_TYPE " } | "
-.BR ip_proto " { " tcp " | " udp " | "
+.BR ip_proto " { " tcp " | " udp " | " sctp " | "
 .IR IP_PROTO " } | { "
 .BR dst_ip " | " src_ip " } { "
 .IR ipv4_address " | " ipv6_address " } | { "
@@ -93,8 +93,8 @@ or an unsigned 16bit value in hexadecimal format.
 .BI ip_proto " IP_PROTO"
 Match on layer four protocol.
 .I IP_PROTO
-may be either
-.BR tcp , udp
+may be
+.BR tcp ", " udp ", " sctp
 or an unsigned 8bit value in hexadecimal format.
 .TP
 .BI dst_ip " ADDRESS"
@@ -110,8 +110,8 @@ option of tc filter.
 .TQ
 .BI src_port " NUMBER"
 Match on layer 4 protocol source or destination port number. Only available for
-.BR ip_proto " values " udp " and " tcp ,
-which has to be specified in beforehand.
+.BR ip_proto " values " udp ", " tcp  " and " sctp
+which have to be specified in beforehand.
 .SH NOTES
 As stated above where applicable, matches of a certain layer implicitly depend
 on the matches of the next lower layer. Precisely, layer one and two matches
@@ -125,8 +125,8 @@ and finally layer four matches
 (\fBdst_port\fR and \fBsrc_port\fR)
 depend on
 .B ip_proto
-being set to either
-.BR tcp " or " udp .
+being set to
+.BR tcp ", " udp " or " sctp.
 .P
 There can be only used one mask per one prio. If user needs to specify different
 mask, he has to use different prio.
diff --git a/tc/f_flower.c b/tc/f_flower.c
index d1694623863d..c74a0244e244 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -36,7 +36,7 @@ static void explain(void)
 	fprintf(stderr, "                       vlan_ethtype [ ipv4 | ipv6 | ETH-TYPE ] |\n");
 	fprintf(stderr, "                       dst_mac MAC-ADDR |\n");
 	fprintf(stderr, "                       src_mac MAC-ADDR |\n");
-	fprintf(stderr, "                       ip_proto [tcp | udp | IP-PROTO ] |\n");
+	fprintf(stderr, "                       ip_proto [ tcp | udp | sctp | IP-PROTO ] |\n");
 	fprintf(stderr, "                       dst_ip [ IPV4-ADDR | IPV6-ADDR ] |\n");
 	fprintf(stderr, "                       src_ip [ IPV4-ADDR | IPV6-ADDR ] |\n");
 	fprintf(stderr, "                       dst_port PORT-NUMBER |\n");
-- 
2.7.0.rc3.207.g0ac5344

^ permalink raw reply related

* [PATCH iproute2 net-next 4/4] tc: flower: make use of flower_port_attr_type() safe and silent
From: Simon Horman @ 2016-11-29 15:51 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Simon Horman
In-Reply-To: <1480434693-29397-1-git-send-email-simon.horman@netronome.com>

Make use of flower_port_attr_type() safe:
* flower_port_attr_type() may return a valid index into tb[] or -1.
  Only access tb[] in the case of the former.
* Do not access null entries in tb[]

Also make usage silent - it is valid for ip_proto to be invalid,
for example if it is not specified as part of the filter.

Fixes: ("tc: flower: Support matching on SCTP ports")
Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
 tc/f_flower.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/tc/f_flower.c b/tc/f_flower.c
index bca910d95729..21655f950386 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -159,19 +159,17 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
 
 static int flower_port_attr_type(__u8 ip_proto, bool is_src)
 {
-	if (ip_proto == IPPROTO_TCP) {
+	if (ip_proto == IPPROTO_TCP)
 		return is_src ? TCA_FLOWER_KEY_TCP_SRC :
 			TCA_FLOWER_KEY_TCP_DST;
-	} else if (ip_proto == IPPROTO_UDP) {
+	else if (ip_proto == IPPROTO_UDP)
 		return is_src ? TCA_FLOWER_KEY_UDP_SRC :
 			TCA_FLOWER_KEY_UDP_DST;
-	} else if (ip_proto == IPPROTO_SCTP) {
+	else if (ip_proto == IPPROTO_SCTP)
 		return is_src ? TCA_FLOWER_KEY_SCTP_SRC :
 			TCA_FLOWER_KEY_SCTP_DST;
-	} else {
-		fprintf(stderr, "Illegal \"ip_proto\" for port\n");
+	else
 		return -1;
-	}
 }
 
 static int flower_parse_port(char *str, __u8 ip_proto, bool is_src,
@@ -505,7 +503,8 @@ static void flower_print_ip_addr(FILE *f, char *name, __be16 eth_type,
 
 static void flower_print_port(FILE *f, char *name, struct rtattr *attr)
 {
-	fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
+	if (attr)
+		fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
 }
 
 static int flower_print_opt(struct filter_util *qu, FILE *f,
@@ -514,6 +513,7 @@ static int flower_print_opt(struct filter_util *qu, FILE *f,
 	struct rtattr *tb[TCA_FLOWER_MAX + 1];
 	__be16 eth_type = 0;
 	__u8 ip_proto = 0xff;
+	int nl_type;
 
 	if (!opt)
 		return 0;
@@ -568,10 +568,12 @@ static int flower_print_opt(struct filter_util *qu, FILE *f,
 			     tb[TCA_FLOWER_KEY_IPV6_SRC],
 			     tb[TCA_FLOWER_KEY_IPV6_SRC_MASK]);
 
-	flower_print_port(f, "dst_port",
-			  tb[flower_port_attr_type(ip_proto, false)]);
-	flower_print_port(f, "src_port",
-			  tb[flower_port_attr_type(ip_proto, true)]);
+	nl_type = flower_port_attr_type(ip_proto, false);
+	if (nl_type >= 0)
+		flower_print_port(f, "dst_port", tb[nl_type]);
+	nl_type = flower_port_attr_type(ip_proto, true);
+	if (nl_type >= 0)
+		flower_print_port(f, "src_port", tb[nl_type]);
 
 	if (tb[TCA_FLOWER_FLAGS])  {
 		__u32 flags = rta_getattr_u32(tb[TCA_FLOWER_FLAGS]);
-- 
2.7.0.rc3.207.g0ac5344

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox