* [RFC PATCH] net: add a tx_queue attribute rate_queue_limits in Mbps
@ 2013-06-24 3:24 John Fastabend
2013-06-24 15:11 ` Stephen Hemminger
2013-06-25 19:00 ` Ben Hutchings
0 siblings, 2 replies; 5+ messages in thread
From: John Fastabend @ 2013-06-24 3:24 UTC (permalink / raw)
To: netdev; +Cc: therbert, ben, jesse.brandeburg, jeffrey.t.kirsher
This adds a rate_queue_limit attribute to the tx_queue sysfs entry
to allow rate limiting in units of Mbps. Along with mqprio and BQL
this provides another knob to tune queue performance. By default it
is disabled with a setting of '0'.
By adding this as a queue attribute and _not_ a qdisc option allows
using rate limits with qdisc schemes that may not align with tx rings
and also allows using QOS schemes along with rate limits.
A sample implementation is provided for ixgbe. Any improvements or
suggestions welcome. I would also be interested to know if this works
with other hardware and if Mbps is a good default unit.
I tested this briefly with iperf/netperf,
# echo 4000 > /sys/class/net/p3p2/queues/tx-0/tx_rate_limit
# cat /sys/class/net/p3p2/queues/tx-0/tx_rate_limit
4000
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1
drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 47 ++++++++++++++--
drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 2 +
include/linux/netdevice.h | 12 ++++
net/core/net-sysfs.c | 69 +++++++++++++++++++-----
5 files changed, 110 insertions(+), 21 deletions(-)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 047ebaa..8d168a0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7244,6 +7244,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_fdb_add = ixgbe_ndo_fdb_add,
.ndo_bridge_setlink = ixgbe_ndo_bridge_setlink,
.ndo_bridge_getlink = ixgbe_ndo_bridge_getlink,
+ .ndo_set_ratelimit = ixgbe_set_rate_limit,
};
/**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 1e7d587..22f3df2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -1107,17 +1107,14 @@ static int ixgbe_link_mbps(struct ixgbe_adapter *adapter)
}
}
-static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
+static u32 ixgbe_bcnrc_from_rate(struct ixgbe_adapter *adapter,
+ u16 tx_rate, int link_speed)
{
- struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ];
- struct ixgbe_hw *hw = &adapter->hw;
u32 bcnrc_val = 0;
- u16 queue, queues_per_pool;
- u16 tx_rate = adapter->vfinfo[vf].tx_rate;
if (tx_rate) {
/* start with base link speed value */
- bcnrc_val = adapter->vf_rate_link_speed;
+ bcnrc_val = link_speed;
/* Calculate the rate factor values to set */
bcnrc_val <<= IXGBE_RTTBCNRC_RF_INT_SHIFT;
@@ -1131,6 +1128,11 @@ static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
bcnrc_val |= IXGBE_RTTBCNRC_RS_ENA;
}
+ return bcnrc_val;
+}
+
+static void ixgbe_set_xmit_compensation(struct ixgbe_hw *hw)
+{
/*
* Set global transmit compensation time to the MMW_SIZE in RTTBCNRM
* register. Typically MMW_SIZE=0x014 if 9728-byte jumbo is supported
@@ -1146,6 +1148,39 @@ static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
default:
break;
}
+}
+
+int ixgbe_set_rate_limit(struct net_device *dev, int index, u32 *tx_rate)
+{
+ struct ixgbe_adapter *a = netdev_priv(dev);
+ struct ixgbe_hw *hw = &a->hw;
+ int linkspeed = ixgbe_link_mbps(a);
+ u8 reg_idx = a->tx_ring[index]->reg_idx;
+ u32 bcnrc = ixgbe_bcnrc_from_rate(a, *tx_rate, linkspeed);
+
+ /* rate limit cannot be less than 10Mbps or greater than link speed */
+ if (*tx_rate && ((*tx_rate <= 10) || (*tx_rate > linkspeed)))
+ return -EINVAL;
+
+ ixgbe_set_xmit_compensation(hw);
+
+ IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, reg_idx);
+ IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, bcnrc);
+
+ return 0;
+}
+
+static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
+{
+ struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ];
+ struct ixgbe_hw *hw = &adapter->hw;
+ u32 bcnrc_val = 0;
+ u16 queue, queues_per_pool;
+ u16 tx_rate = adapter->vfinfo[vf].tx_rate;
+
+ bcnrc_val = ixgbe_bcnrc_from_rate(adapter, tx_rate,
+ adapter->vf_rate_link_speed);
+ ixgbe_set_xmit_compensation(hw);
/* determine how many queues per pool based on VMDq mask */
queues_per_pool = __ALIGN_MASK(1, ~vmdq->mask);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
index 4713f9f..d8b4bbe 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
@@ -56,5 +56,7 @@ static inline void ixgbe_set_vmvir(struct ixgbe_adapter *adapter,
IXGBE_WRITE_REG(hw, IXGBE_VMVIR(vf), vmvir);
}
+int ixgbe_set_rate_limit(struct net_device *dev, int index, u32 *tx_rate);
+
#endif /* _IXGBE_SRIOV_H_ */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 09b4188..d84d69a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -574,6 +574,7 @@ struct netdev_queue {
#ifdef CONFIG_BQL
struct dql dql;
#endif
+ unsigned long rate_limit;
} ____cacheline_aligned_in_smp;
static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
@@ -932,6 +933,14 @@ struct netdev_fcoe_hbainfo {
* that determine carrier state from physical hardware properties (eg
* network cables) or protocol-dependent mechanisms (eg
* USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
+ *
+ * int (*ndo_set_ratelimit)(struct net_device *dev,
+ * int queue_index, u32 *maxrate)
+ * Called to set the rate limit in Mbps specified by maxrate of the
+ * specified queue_index. It is expected that hardware may quantize
+ * the rate limits. In these cases the driver should guarantee the
+ * specified maxrate is not exceeded and return the set value in maxrate.
+ * Zero should be returned on success otherwise use appropriate error code.
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
@@ -1060,6 +1069,9 @@ struct net_device_ops {
struct nlmsghdr *nlh);
int (*ndo_change_carrier)(struct net_device *dev,
bool new_carrier);
+ int (*ndo_set_ratelimit)(struct net_device *dev,
+ int queue_index,
+ u32 *max_rate);
};
/*
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 981fed3..ff61852 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -819,6 +819,59 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue,
return sprintf(buf, "%lu", trans_timeout);
}
+static ssize_t show_rate_limit(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ return sprintf(buf, "%lu", queue->rate_limit);
+}
+
+static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+{
+ struct net_device *dev = queue->dev;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++)
+ if (queue == &dev->_tx[i])
+ break;
+
+ BUG_ON(i >= dev->num_tx_queues);
+
+ return i;
+}
+
+static ssize_t set_tx_rate_limit(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct net_device *dev = queue->dev;
+ int err, index = get_netdev_queue_index(queue);
+ u32 rate = 0;
+
+ err = kstrtou32(buf, 10, &rate);
+ if (err < 0)
+ return err;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev->netdev_ops->ndo_set_ratelimit)
+ err = dev->netdev_ops->ndo_set_ratelimit(dev, index, &rate);
+ else
+ err = -EOPNOTSUPP;
+ rtnl_unlock();
+
+ if (err < 0)
+ return err;
+
+ queue->rate_limit = rate;
+ return len;
+}
+
+static struct netdev_queue_attribute queue_rate_limit =
+ __ATTR(tx_rate_limit, S_IRUGO | S_IWUSR,
+ show_rate_limit, set_tx_rate_limit);
+
static struct netdev_queue_attribute queue_trans_timeout =
__ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
@@ -933,21 +986,6 @@ static struct attribute_group dql_group = {
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
-static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
-{
- struct net_device *dev = queue->dev;
- int i;
-
- for (i = 0; i < dev->num_tx_queues; i++)
- if (queue == &dev->_tx[i])
- break;
-
- BUG_ON(i >= dev->num_tx_queues);
-
- return i;
-}
-
-
static ssize_t show_xps_map(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute, char *buf)
{
@@ -1032,6 +1070,7 @@ static struct attribute *netdev_queue_default_attrs[] = {
#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
#endif
+ &queue_rate_limit.attr,
NULL
};
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [RFC PATCH] net: add a tx_queue attribute rate_queue_limits in Mbps
2013-06-24 3:24 [RFC PATCH] net: add a tx_queue attribute rate_queue_limits in Mbps John Fastabend
@ 2013-06-24 15:11 ` Stephen Hemminger
2013-06-24 18:03 ` John Fastabend
2013-06-25 19:00 ` Ben Hutchings
1 sibling, 1 reply; 5+ messages in thread
From: Stephen Hemminger @ 2013-06-24 15:11 UTC (permalink / raw)
To: John Fastabend; +Cc: netdev, therbert, ben, jesse.brandeburg, jeffrey.t.kirsher
On Sun, 23 Jun 2013 20:24:11 -0700
John Fastabend <john.fastabend@gmail.com> wrote:
> This adds a rate_queue_limit attribute to the tx_queue sysfs entry
> to allow rate limiting in units of Mpbs. Along with mqprio and BQL
> this provides another knob to tune queue performance. By default it
> is disabled with a setting of '0'.
>
> By adding this as a queue attribute and _not_ a qdisc option allows
> using rate limits with qdisc schemes that may not align with tx rings
> and also allows using QOS schemes along with rate limits.
>
> A sample implementation is provided for ixgbe. Any improvements or
> suggestions welcome I would also be interested to know if this works
> with other hardware and if Mbps is a good default unit.
>
> I tested this briefly with iperf/netperf,
>
> # echo 4000 > /sys/class/net/p3p2/queues/tx-0/tx_rate_limit
> # cat /sys/class/net/p3p2/queues/tx-0/tx_rate_limit
> 4000
I like the facility and there is a real need for it, but we need
to think about what best API for it is.
There are several possible API's for this. My preference in order is:
1. Netlink:
PRO: most well structured and can notify control applications
CON: doesn't propagate well down to device
2. Ethtool:
PRO: fits current model of speed/duplex
CON: still and ioctl based model, non-extensible structures
3. Sysfs
PRO: easy to add
CON: not a general mechanism, ends up being per-device
4. Module parameter
PRO: easy to code
CON: hard to manage from application, hard to associate with multiple devices
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC PATCH] net: add a tx_queue attribute rate_queue_limits in Mbps
2013-06-24 15:11 ` Stephen Hemminger
@ 2013-06-24 18:03 ` John Fastabend
0 siblings, 0 replies; 5+ messages in thread
From: John Fastabend @ 2013-06-24 18:03 UTC (permalink / raw)
To: Stephen Hemminger
Cc: netdev, therbert, ben, jesse.brandeburg, jeffrey.t.kirsher
On 06/24/2013 08:11 AM, Stephen Hemminger wrote:
> On Sun, 23 Jun 2013 20:24:11 -0700
> John Fastabend <john.fastabend@gmail.com> wrote:
>
>> This adds a rate_queue_limit attribute to the tx_queue sysfs entry
>> to allow rate limiting in units of Mpbs. Along with mqprio and BQL
>> this provides another knob to tune queue performance. By default it
>> is disabled with a setting of '0'.
>>
>> By adding this as a queue attribute and _not_ a qdisc option allows
>> using rate limits with qdisc schemes that may not align with tx rings
>> and also allows using QOS schemes along with rate limits.
>>
>> A sample implementation is provided for ixgbe. Any improvements or
>> suggestions welcome I would also be interested to know if this works
>> with other hardware and if Mbps is a good default unit.
>>
>> I tested this briefly with iperf/netperf,
>>
>> # echo 4000 > /sys/class/net/p3p2/queues/tx-0/tx_rate_limit
>> # cat /sys/class/net/p3p2/queues/tx-0/tx_rate_limit
>> 4000
>
> I like the facility and there is a real need for it, but we need
> to think about what best API for it is.
>
> There are several possible API's for this. My preference in order is:
> 1. Netlink:
> PRO: most well structured and can notify control applications
> CON: doesn't propagate well down to device
Not sure I understand the con here. We push a lot of netlink messages
down to the device already via ndo_ops/rtnetlink.
Having a mechanism in place to notify control applications is nice.
> 2. Ethtool:
> PRO: fits current model of speed/duplex
> CON: still and ioctl based model, non-extensible structures
I prefer netlink or sysfs but this is likely my own bias. Being able
to extend the interface easily if/when we get more queue attributes
will help in the future.
> 3. Sysfs
> PRO: easy to add
> CON: not a general mechanism, ends up being per-device
There are already entries for other queue attributes here namely
byte_queue_limits, tx_timeout, and xps_cpus. And net-sysfs.c keeps all
of this reasonable sane IMO and not really device specific. My thought
for using this was to keep all the queue attributes in one location.
> 4. Module parameter
> PRO: easy to code
> CON: hard to manage from application, hard to associate with multiple devices
>
>
>
I would prefer Netlink (for notifications) or Sysfs (to keep attributes
in one place). I don't really have a strong preference either way.
.John
--
John Fastabend Intel Corporation
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC PATCH] net: add a tx_queue attribute rate_queue_limits in Mbps
2013-06-24 3:24 [RFC PATCH] net: add a tx_queue attribute rate_queue_limits in Mbps John Fastabend
2013-06-24 15:11 ` Stephen Hemminger
@ 2013-06-25 19:00 ` Ben Hutchings
2013-06-25 21:08 ` John Fastabend
1 sibling, 1 reply; 5+ messages in thread
From: Ben Hutchings @ 2013-06-25 19:00 UTC (permalink / raw)
To: John Fastabend; +Cc: netdev, therbert, ben, jesse.brandeburg, jeffrey.t.kirsher
On Sun, 2013-06-23 at 20:24 -0700, John Fastabend wrote:
[...]
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
[...]
> +int ixgbe_set_rate_limit(struct net_device *dev, int index, u32 *tx_rate)
> +{
> + struct ixgbe_adapter *a = netdev_priv(dev);
> + struct ixgbe_hw *hw = &a->hw;
> + int linkspeed = ixgbe_link_mbps(a);
> + u8 reg_idx = a->tx_ring[index]->reg_idx;
> + u32 bcnrc = ixgbe_bcnrc_from_rate(a, *tx_rate, linkspeed);
> +
> + /* rate limit cannot be less than 10Mbs or greater than link speed */
> + if (*tx_rate && ((*tx_rate <= 10) || (*tx_rate > linkspeed)))
> + return -EINVAL;
What if the link is not up?
What if the link speed is currently higher than tx_rate, but is later
reduced below tx_rate?
[...]
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -574,6 +574,7 @@ struct netdev_queue {
> #ifdef CONFIG_BQL
> struct dql dql;
> #endif
> + unsigned long rate_limit;
Should be u32, consistent with variables in ndo_set_ratelimit and
set_tx_rate_limit.
> } ____cacheline_aligned_in_smp;
>
> static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
> @@ -932,6 +933,14 @@ struct netdev_fcoe_hbainfo {
> * that determine carrier state from physical hardware properties (eg
> * network cables) or protocol-dependent mechanisms (eg
> * USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
> + *
> + * int (*ndo_set_ratelimit)(struct net_device *dev,
> + * int queue_index, u32 *maxrate)
> + * Called to set the rate limit in Mpbs specified by maxrate of the
> + * specified queue_index. It is expected that hardware way may quantize
> + * the rate limits. In these cases the driver should guarentee the
> + * specfied maxrate is not exceeded and return the set value in maxrate.
> + * Zero should be returned on sucess otherwise use appropriate error code.
[...]
In general, how does maxrate interact with link speed?
There are also several spelling and grammar errors in this text.
Ben.
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [RFC PATCH] net: add a tx_queue attribute rate_queue_limits in Mbps
2013-06-25 19:00 ` Ben Hutchings
@ 2013-06-25 21:08 ` John Fastabend
0 siblings, 0 replies; 5+ messages in thread
From: John Fastabend @ 2013-06-25 21:08 UTC (permalink / raw)
To: Ben Hutchings; +Cc: netdev, therbert, ben, jesse.brandeburg, jeffrey.t.kirsher
On 06/25/2013 12:00 PM, Ben Hutchings wrote:
> On Sun, 2013-06-23 at 20:24 -0700, John Fastabend wrote:
> [...]
>> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
>> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
> [...]
>> +int ixgbe_set_rate_limit(struct net_device *dev, int index, u32 *tx_rate)
>> +{
>> + struct ixgbe_adapter *a = netdev_priv(dev);
>> + struct ixgbe_hw *hw = &a->hw;
>> + int linkspeed = ixgbe_link_mbps(a);
>> + u8 reg_idx = a->tx_ring[index]->reg_idx;
>> + u32 bcnrc = ixgbe_bcnrc_from_rate(a, *tx_rate, linkspeed);
>> +
>> + /* rate limit cannot be less than 10Mbs or greater than link speed */
>> + if (*tx_rate && ((*tx_rate <= 10) || (*tx_rate > linkspeed)))
>> + return -EINVAL;
>
> What if the link is not up?
>
As its written this would always be an error because link_speed is 0.
Not likely what we want.
> What if the link speed is currently higher than tx_rate, but is later
> reduced below tx_rate?
As its written you just end up with a tx_rate limit that is broken and
because the hw calculation depends on link speed the value won't be
correct any more.
Both of these cases are probably broken as is.
>
> [...]
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -574,6 +574,7 @@ struct netdev_queue {
>> #ifdef CONFIG_BQL
>> struct dql dql;
>> #endif
>> + unsigned long rate_limit;
>
> Should be u32, consistent with variables in ndo_set_ratelimit and
> set_tx_rate_limit.
>
>> } ____cacheline_aligned_in_smp;
>>
>> static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
>> @@ -932,6 +933,14 @@ struct netdev_fcoe_hbainfo {
>> * that determine carrier state from physical hardware properties (eg
>> * network cables) or protocol-dependent mechanisms (eg
>> * USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
>> + *
>> + * int (*ndo_set_ratelimit)(struct net_device *dev,
>> + * int queue_index, u32 *maxrate)
>> + * Called to set the rate limit in Mpbs specified by maxrate of the
>> + * specified queue_index. It is expected that hardware way may quantize
>> + * the rate limits. In these cases the driver should guarentee the
>> + * specfied maxrate is not exceeded and return the set value in maxrate.
>> + * Zero should be returned on sucess otherwise use appropriate error code.
> [...]
>
> In general, how does maxrate interact with link speed?
I think the best approach is to allow a max_rate that is greater than
the link speed. This seems more straight forward than arbitrarily
resetting the value or doing something else.
ixgbe and maybe other drivers will need to reconfigure their internal
hardware limiters if they are derived from link speed.
>
> There are also several spelling and grammar errors in this text.
>
Yeah, must have been late, not enough coffee. Thanks for looking at it
Ben.
Also I was thinking about pushing it over to netlink as Stephen
preferred.
--
John Fastabend Intel Corporation
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2013-06-25 21:08 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-06-24 3:24 [RFC PATCH] net: add a tx_queue attribute rate_queue_limits in Mbps John Fastabend
2013-06-24 15:11 ` Stephen Hemminger
2013-06-24 18:03 ` John Fastabend
2013-06-25 19:00 ` Ben Hutchings
2013-06-25 21:08 ` John Fastabend
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).