* [PATCH 1/4 iproute2-next] lib: Update backend of print_size to accept 64 bit size
2025-09-07 1:42 [PATCH iproute2-next] tc/police: Allow 64 bit burst size Jay Vosburgh
@ 2025-09-07 1:42 ` Jay Vosburgh
2025-09-07 1:42 ` [PATCH 2/4 iproute2-next] tc: Add get_size64 and get_size64_and_cell Jay Vosburgh
` (3 subsequent siblings)
4 siblings, 0 replies; 13+ messages in thread
From: Jay Vosburgh @ 2025-09-07 1:42 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, David Ahern
In preparation for accepting 64 bit burst sizes, modify
sprint_size, the formatting function behind print_size, to accept __u64 as
its size parameter. Also include a "Gb" size category.
Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
---
include/json_print.h | 4 ++--
lib/json_print_math.c | 11 +++++++----
2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/include/json_print.h b/include/json_print.h
index daebcf5d25f5..59edd5b2467e 100644
--- a/include/json_print.h
+++ b/include/json_print.h
@@ -68,7 +68,7 @@ _PRINT_FUNC(on_off, bool)
_PRINT_FUNC(null, const char*)
_PRINT_FUNC(string, const char*)
_PRINT_FUNC(uint, unsigned int)
-_PRINT_FUNC(size, __u32)
+_PRINT_FUNC(size, __u64)
_PRINT_FUNC(u64, uint64_t)
_PRINT_FUNC(hhu, unsigned char)
_PRINT_FUNC(hu, unsigned short)
@@ -109,6 +109,6 @@ static inline int print_bool_opt(enum output_type type,
}
/* A backdoor to the size formatter. Please use print_size() instead. */
-char *sprint_size(__u32 sz, char *buf);
+char *sprint_size(__u64 sz, char *buf);
#endif /* _JSON_PRINT_H_ */
diff --git a/lib/json_print_math.c b/lib/json_print_math.c
index f4d504995924..3e951cd9f504 100644
--- a/lib/json_print_math.c
+++ b/lib/json_print_math.c
@@ -7,25 +7,28 @@
#include "utils.h"
#include "json_print.h"
-char *sprint_size(__u32 sz, char *buf)
+char *sprint_size(__u64 sz, char *buf)
{
long kilo = 1024;
long mega = kilo * kilo;
+ long giga = mega * kilo;
size_t len = SPRINT_BSIZE - 1;
double tmp = sz;
- if (sz >= mega && fabs(mega * rint(tmp / mega) - sz) < 1024)
+ if (sz >= giga && fabs(giga * rint(tmp / giga) - sz) < 1024)
+ snprintf(buf, len, "%gGb", rint(tmp / giga));
+ else if (sz >= mega && fabs(mega * rint(tmp / mega) - sz) < 1024)
snprintf(buf, len, "%gMb", rint(tmp / mega));
else if (sz >= kilo && fabs(kilo * rint(tmp / kilo) - sz) < 16)
snprintf(buf, len, "%gKb", rint(tmp / kilo));
else
- snprintf(buf, len, "%ub", sz);
+ snprintf(buf, len, "%llub", sz);
return buf;
}
int print_color_size(enum output_type type, enum color_attr color,
- const char *key, const char *fmt, __u32 sz)
+ const char *key, const char *fmt, __u64 sz)
{
SPRINT_BUF(buf);
--
2.25.1
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH 2/4 iproute2-next] tc: Add get_size64 and get_size64_and_cell
2025-09-07 1:42 [PATCH iproute2-next] tc/police: Allow 64 bit burst size Jay Vosburgh
2025-09-07 1:42 ` [PATCH 1/4 iproute2-next] lib: Update backend of print_size to accept 64 bit size Jay Vosburgh
@ 2025-09-07 1:42 ` Jay Vosburgh
2025-09-07 1:42 ` [PATCH 3/4 iproute2-next] tc: Expand tc_calc_xmittime, tc_calc_xmitsize to u64 Jay Vosburgh
` (2 subsequent siblings)
4 siblings, 0 replies; 13+ messages in thread
From: Jay Vosburgh @ 2025-09-07 1:42 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, David Ahern
In preparation for accepting 64 bit burst sizes, create 64-bit
versions of get_size and get_size_and_cell. The 32-bit versions become
wrappers around the 64-bit versions.
Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
---
include/utils.h | 1 +
lib/utils_math.c | 19 ++++++++++++++++++-
tc/tc_util.c | 21 +++++++++++++++++++--
tc/tc_util.h | 1 +
4 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/include/utils.h b/include/utils.h
index 9a81494dd3e3..128cbb59cb90 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -163,6 +163,7 @@ int get_addr64(__u64 *ap, const char *cp);
int get_rate(unsigned int *rate, const char *str);
int get_rate64(__u64 *rate, const char *str);
int get_size(unsigned int *size, const char *str);
+int get_size64(__u64 *size, const char *str);
int hex2mem(const char *buf, uint8_t *mem, int count);
char *hexstring_n2a(const __u8 *str, int len, char *buf, int blen);
diff --git a/lib/utils_math.c b/lib/utils_math.c
index 9ef3dd6ed93b..a7e747440039 100644
--- a/lib/utils_math.c
+++ b/lib/utils_math.c
@@ -87,7 +87,7 @@ int get_rate64(__u64 *rate, const char *str)
return 0;
}
-int get_size(unsigned int *size, const char *str)
+int get_size64(__u64 *size, const char *str)
{
double sz;
char *p;
@@ -121,3 +121,20 @@ int get_size(unsigned int *size, const char *str)
return 0;
}
+
+int get_size(unsigned int *size, const char *str)
+{
+ __u64 sz64;
+ int rv;
+
+ rv = get_size64(&sz64, str);
+ *size = sz64;
+
+ if (rv)
+ return rv;
+
+ if (sz64 > UINT_MAX)
+ return -1;
+
+ return 0;
+}
diff --git a/tc/tc_util.c b/tc/tc_util.c
index ff0ac170730b..45d76e7578d4 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -257,14 +257,14 @@ tc_print_rate(enum output_type t, const char *key, const char *fmt,
print_rate(use_iec, t, key, fmt, rate);
}
-int get_size_and_cell(unsigned int *size, int *cell_log, char *str)
+int get_size64_and_cell(__u64 *size, int *cell_log, char *str)
{
char *slash = strchr(str, '/');
if (slash)
*slash = 0;
- if (get_size(size, str))
+ if (get_size64(size, str))
return -1;
if (slash) {
@@ -286,6 +286,23 @@ int get_size_and_cell(unsigned int *size, int *cell_log, char *str)
return 0;
}
+int get_size_and_cell(unsigned int *size, int *cell_log, char *str)
+{
+ __u64 size64;
+ int rv;
+
+ rv = get_size64_and_cell(&size64, cell_log, str);
+ if (rv)
+ return rv;
+
+ if (size64 > UINT32_MAX)
+ return -1;
+
+ *size = size64;
+
+ return 0;
+}
+
void print_devname(enum output_type type, int ifindex)
{
const char *ifname = ll_index_to_name(ifindex);
diff --git a/tc/tc_util.h b/tc/tc_util.h
index ec2063729b07..8ebca3963d94 100644
--- a/tc/tc_util.h
+++ b/tc/tc_util.h
@@ -80,6 +80,7 @@ int get_qdisc_handle(__u32 *h, const char *str);
int get_percent_rate(unsigned int *rate, const char *str, const char *dev);
int get_percent_rate64(__u64 *rate, const char *str, const char *dev);
int get_size_and_cell(unsigned int *size, int *cell_log, char *str);
+int get_size64_and_cell(__u64 *size, int *cell_log, char *str);
int get_linklayer(unsigned int *val, const char *arg);
void tc_print_rate(enum output_type t, const char *key, const char *fmt,
--
2.25.1
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH 3/4 iproute2-next] tc: Expand tc_calc_xmittime, tc_calc_xmitsize to u64
2025-09-07 1:42 [PATCH iproute2-next] tc/police: Allow 64 bit burst size Jay Vosburgh
2025-09-07 1:42 ` [PATCH 1/4 iproute2-next] lib: Update backend of print_size to accept 64 bit size Jay Vosburgh
2025-09-07 1:42 ` [PATCH 2/4 iproute2-next] tc: Add get_size64 and get_size64_and_cell Jay Vosburgh
@ 2025-09-07 1:42 ` Jay Vosburgh
2025-09-07 1:42 ` [PATCH 4/4 iproute2-next] tc/police: enable use of 64 bit burst parameter Jay Vosburgh
2025-09-10 3:32 ` [PATCH iproute2-next] tc/police: Allow 64 bit burst size Jamal Hadi Salim
4 siblings, 0 replies; 13+ messages in thread
From: Jay Vosburgh @ 2025-09-07 1:42 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, David Ahern
In preparation for accepting 64-bit burst sizes, modify
tc_calc_xmittime and tc_calc_xmitsize to handle 64-bit values.
tc_calc_xmittime continues to return a 32-bit value, as its range
is limited by the kernel API, but overflow is now detected and the return
value is limited to UINT_MAX.
Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
---
tc/tc_core.c | 9 ++++++---
tc/tc_core.h | 4 ++--
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/tc/tc_core.c b/tc/tc_core.c
index 32fd094f6a05..a422e02c8795 100644
--- a/tc/tc_core.c
+++ b/tc/tc_core.c
@@ -43,12 +43,15 @@ unsigned int tc_core_ktime2time(unsigned int ktime)
return ktime / clock_factor;
}
-unsigned int tc_calc_xmittime(__u64 rate, unsigned int size)
+unsigned int tc_calc_xmittime(__u64 rate, __u64 size)
{
- return ceil(tc_core_time2tick(TIME_UNITS_PER_SEC*((double)size/(double)rate)));
+ double val;
+
+ val = ceil(tc_core_time2tick(TIME_UNITS_PER_SEC*((double)size/(double)rate)));
+ return val > UINT_MAX ? UINT_MAX : val;
}
-unsigned int tc_calc_xmitsize(__u64 rate, unsigned int ticks)
+__u64 tc_calc_xmitsize(__u64 rate, unsigned int ticks)
{
return ((double)rate*tc_core_tick2time(ticks))/TIME_UNITS_PER_SEC;
}
diff --git a/tc/tc_core.h b/tc/tc_core.h
index c0fb7481420a..d80370360dec 100644
--- a/tc/tc_core.h
+++ b/tc/tc_core.h
@@ -15,8 +15,8 @@ enum link_layer {
double tc_core_tick2time(double tick);
unsigned tc_core_time2ktime(unsigned time);
unsigned tc_core_ktime2time(unsigned ktime);
-unsigned tc_calc_xmittime(__u64 rate, unsigned size);
-unsigned tc_calc_xmitsize(__u64 rate, unsigned ticks);
+unsigned tc_calc_xmittime(__u64 rate, __u64 size);
+__u64 tc_calc_xmitsize(__u64 rate, unsigned ticks);
int tc_calc_rtable(struct tc_ratespec *r, __u32 *rtab,
int cell_log, unsigned mtu, enum link_layer link_layer);
int tc_calc_rtable_64(struct tc_ratespec *r, __u32 *rtab,
--
2.25.1
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH 4/4 iproute2-next] tc/police: enable use of 64 bit burst parameter
2025-09-07 1:42 [PATCH iproute2-next] tc/police: Allow 64 bit burst size Jay Vosburgh
` (2 preceding siblings ...)
2025-09-07 1:42 ` [PATCH 3/4 iproute2-next] tc: Expand tc_calc_xmittime, tc_calc_xmitsize to u64 Jay Vosburgh
@ 2025-09-07 1:42 ` Jay Vosburgh
2025-09-10 3:32 ` [PATCH iproute2-next] tc/police: Allow 64 bit burst size Jamal Hadi Salim
4 siblings, 0 replies; 13+ messages in thread
From: Jay Vosburgh @ 2025-09-07 1:42 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, David Ahern
Modify tc police to permit burst sizes up to the limit of the
kernel API, which may exceed 4 GB of burst size at higher rates.
As presently implemented, the tc police burst option limits the
size of the burst to 4 GB in size. This is a reasonable limit for the
rates common when this was developed. However, the underlying
implementation of burst is expressed in terms of time at the specified
rate, and for higher rates, a burst size exceeding 4 GB is feasible
without modification to the kernel.
The kernel API specifies the burst size as the number of "psched
ticks" needed to send the burst at the specified rate. As each psched
tick is 64 nsec, the actual kernel limit on burst size is approximately
274.88 seconds (UINT_MAX * 64 / NSEC_PER_SEC).
For example, at a rate of 10 Gbit/sec, the current 4 GB size limit
corresponds to just under 3.5 seconds.
Additionally, overflows (burst values that exceed UINT_MAX psched
ticks) are now correctly detected, and flagged as an error, rather than
passing arbitrary psched tick values to the kernel.
Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
---
tc/m_police.c | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/tc/m_police.c b/tc/m_police.c
index 5c7438b94d83..f5c538c93d2c 100644
--- a/tc/m_police.c
+++ b/tc/m_police.c
@@ -54,12 +54,12 @@ static int act_parse_police(const struct action_util *a, int *argc_p, char ***ar
__u32 ptab[256];
__u32 avrate = 0;
int presult = 0;
- unsigned buffer = 0, mtu = 0, mpu = 0;
+ unsigned mtu = 0, mpu = 0;
unsigned short overhead = 0;
unsigned int linklayer = LINKLAYER_ETHERNET; /* Assume ethernet */
int Rcell_log = -1, Pcell_log = -1;
struct rtattr *tail;
- __u64 rate64 = 0, prate64 = 0;
+ __u64 rate64 = 0, prate64 = 0, buffer64 = 0;
__u64 pps64 = 0, ppsburst64 = 0;
if (a) /* new way of doing things */
@@ -78,9 +78,10 @@ static int act_parse_police(const struct action_util *a, int *argc_p, char ***ar
strcmp(*argv, "buffer") == 0 ||
strcmp(*argv, "maxburst") == 0) {
NEXT_ARG();
- if (buffer)
+ if (buffer64)
duparg("buffer/burst", *argv);
- if (get_size_and_cell(&buffer, &Rcell_log, *argv) < 0)
+ if (get_size64_and_cell(&buffer64, &Rcell_log,
+ *argv) < 0)
invarg("buffer", *argv);
} else if (strcmp(*argv, "mtu") == 0 ||
strcmp(*argv, "minburst") == 0) {
@@ -173,7 +174,7 @@ action_ctrl_ok:
}
/* When the TB policer is used, burst is required */
- if (rate64 && !buffer && !avrate) {
+ if (rate64 && !buffer64 && !avrate) {
fprintf(stderr, "'burst' requires 'rate'.\n");
return -1;
}
@@ -210,7 +211,11 @@ action_ctrl_ok:
fprintf(stderr, "POLICE: failed to calculate rate table.\n");
return -1;
}
- p.burst = tc_calc_xmittime(rate64, buffer);
+ p.burst = tc_calc_xmittime(rate64, buffer64);
+ if (p.burst == UINT_MAX) {
+ fprintf(stderr, "POLICE: burst out of range\n");
+ return -1;
+ }
}
p.mtu = mtu;
if (prate64) {
@@ -265,9 +270,8 @@ static int print_police(const struct action_util *a, FILE *funused, struct rtatt
SPRINT_BUF(b2);
struct tc_police *p;
struct rtattr *tb[TCA_POLICE_MAX+1];
- unsigned int buffer;
unsigned int linklayer;
- __u64 rate64, prate64;
+ __u64 rate64, prate64, buffer64;
__u64 pps64, ppsburst64;
print_string(PRINT_JSON, "kind", "%s", "police");
@@ -296,8 +300,8 @@ static int print_police(const struct action_util *a, FILE *funused, struct rtatt
print_hex(PRINT_FP, NULL, " police 0x%x ", p->index);
print_uint(PRINT_JSON, "index", NULL, p->index);
tc_print_rate(PRINT_FP, NULL, "rate %s ", rate64);
- buffer = tc_calc_xmitsize(rate64, p->burst);
- print_size(PRINT_FP, NULL, "burst %s ", buffer);
+ buffer64 = tc_calc_xmitsize(rate64, p->burst);
+ print_size(PRINT_FP, NULL, "burst %s ", buffer64);
print_size(PRINT_FP, NULL, "mtu %s ", p->mtu);
if (show_raw)
print_hex(PRINT_FP, NULL, "[%08x] ", p->burst);
--
2.25.1
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH iproute2-next] tc/police: Allow 64 bit burst size
2025-09-07 1:42 [PATCH iproute2-next] tc/police: Allow 64 bit burst size Jay Vosburgh
` (3 preceding siblings ...)
2025-09-07 1:42 ` [PATCH 4/4 iproute2-next] tc/police: enable use of 64 bit burst parameter Jay Vosburgh
@ 2025-09-10 3:32 ` Jamal Hadi Salim
2025-09-10 4:04 ` Jay Vosburgh
2025-09-11 20:50 ` David Ahern
4 siblings, 2 replies; 13+ messages in thread
From: Jamal Hadi Salim @ 2025-09-10 3:32 UTC (permalink / raw)
To: Jay Vosburgh; +Cc: netdev, Stephen Hemminger, David Ahern
On Sat, Sep 6, 2025 at 9:50 PM Jay Vosburgh <jay.vosburgh@canonical.com> wrote:
>
>
> In summary, this patchset changes the user space handling of the
> tc police burst parameter to permit burst sizes that exceed 4 GB when the
> specified rate is high enough that the kernel API for burst can accomodate
> such.
>
> Additionally, if the burst exceeds the upper limit of the kernel
> API, this is now flagged as an error. The existing behavior silently
> overflows, resulting in arbitrary values passed to the kernel.
>
> In detail, as presently implemented, the tc police burst option
> limits the size of the burst to to 4 GB, i.e., UINT_MAX for a 32 bit
> unsigned int. This is a reasonable limit for the low rates common when
> this was developed. However, the underlying implementation of burst is
> computed as "time at the specified rate," and for higher rates, a burst
> size exceeding 4 GB is feasible without modification to the kernel.
>
> The burst size provided on the command line is translated into a
> duration, representing how much time is required at the specified rate to
> transmit the given burst size.
>
> This time is calculated in units of "psched ticks," each of which
> is 64 nsec[0]. The computed number of psched ticks is sent to the kernel
> as a __u32 value.
>
Please run tdc tests. David/Stephen - can we please make this a
requirement for iproute2 tc related changes?
Jay, your patches fail at least one test because you changed the unit outputs.
Either we fix the tdc test or you make your changes backward compatible.
In the future also cc kernel tc maintainers (I only saw this because
someone pointed it to me).
Overall the changes look fine.
cheers,
jamal
> Because burst is ultimately calculated as a time duration, the
> real upper limit for a burst is UINT_MAX psched ticks, i.e.,
>
> UINT_MAX * psched tick duration / NSEC_PER_SEC
> (2^32-1) * 64 / 1E9
>
> which is roughly 274.88 seconds (274.8779...).
>
> At low rates, e.g., 5 Mbit/sec, UINT_MAX psched ticks does not
> correspond to a burst size in excess of 4 GB, so the above is moot, e.g.,
>
> 5Mbit/sec / 8 = 625000 MBytes/sec
> 625000 * ~274.88 seconds = ~171800000 max burst size, below UINT_MAX
>
> Thus, the burst size at 5Mbit/sec is limited by the __u32 size of
> the psched tick field in the kernel API, not the 4 GB limit of the tc
> police burst user space API.
>
> However, at higher rates, e.g., 10 Gbit/sec, the burst size is
> currently limited by the 4 GB maximum for the burst command line parameter
> value, rather than UINT_MAX psched ticks:
>
> 10 Gbit/sec / 8 = 1250000000 MBbytes/sec
> 1250000000 * ~274.88 seconds = ~343600000000, more than UINT_MAX
>
> Here, the maximum duration of a burst the kernel can handle
> exceeds 4 GB of burst size.
>
> While the above maximum may be an excessively large burst value,
> at 10 Gbit/sec, a 4 GB burst size corresponds to just under 3.5 seconds in
> duration:
>
> 2^32 bytes / 10 Gbit/sec
> 2^32 bytes / 1250000000 bytes/sec
> equals ~3.43 sec
>
> So, at higher rates, burst sizes exceeding 4 GB are both
> reasonable and feasible, up to the UINT_MAX limit for psched ticks.
> Enabling this requires changes only to the user space processing of the
> burst size parameter in tc.
>
> In principle, the other packet schedulers utilizing psched ticks
> for burst sizing, htb and tbf, could be similarly changed to permit larger
> burst sizes, but this patch set does not do so.
>
> Separately, for the burst duration calculation overflow (i.e.,
> that the number of psched ticks exceeds UINT_MAX), under the current
> implementation, one example of overflow is as follows:
>
> # /sbin/tc filter add dev eth0 protocol ip prio 1 parent ffff: handle 1 fw police rate 1Mbit peakrate 10Gbit burst 34375000 mtu 64Kb conform-exceed reclassify
>
> # /sbin/tc -raw filter get dev eth0 ingress protocol ip pref 1 handle 1 fw
> filter ingress protocol ip pref 1 fw chain 0 handle 0x1 police 0x1 rate 1Mbit burst 15261b mtu 64Kb [001d1bf8] peakrate 10Gbit action reclassify overhead 0b
> ref 1 bind 1
>
> Note that the returned burst value is 15261b, which does not match
> the supplied value of 34375000. With this patch set applied, this
> situation is flagged as an error.
>
>
> [0] psched ticks are defined in the kernel in include/net/pkt_sched.h:
>
> #define PSCHED_SHIFT 6
> #define PSCHED_TICKS2NS(x) ((s64)(x) << PSCHED_SHIFT)
> #define PSCHED_NS2TICKS(x) ((x) >> PSCHED_SHIFT)
>
> #define PSCHED_TICKS_PER_SEC PSCHED_NS2TICKS(NSEC_PER_SEC)
>
> where PSCHED_TICKS_PER_SEC is 15625000.
>
> These values are exported to user space via /proc/net/psched, the
> second field being PSCHED_TICKS2NS(1), which at present is 64 (0x40). tc
> uses this value to compute its internal "tick_in_usec" variable containing
> the number of psched ticks per usec (15.625) used for the psched tick
> computations.
>
> Lastly, note that PSCHED_SHIFT was previously 10, and changed to 6
> in commit a4a710c4a7490 in 2009. I have not tested backwards
> compatibility of these changes with kernels of that era.
>
>
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH iproute2-next] tc/police: Allow 64 bit burst size
2025-09-10 3:32 ` [PATCH iproute2-next] tc/police: Allow 64 bit burst size Jamal Hadi Salim
@ 2025-09-10 4:04 ` Jay Vosburgh
2025-09-10 4:29 ` Jamal Hadi Salim
2025-09-11 20:50 ` David Ahern
1 sibling, 1 reply; 13+ messages in thread
From: Jay Vosburgh @ 2025-09-10 4:04 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: netdev, Stephen Hemminger, David Ahern
Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>On Sat, Sep 6, 2025 at 9:50 PM Jay Vosburgh <jay.vosburgh@canonical.com> wrote:
>>
>>
>> In summary, this patchset changes the user space handling of the
>> tc police burst parameter to permit burst sizes that exceed 4 GB when the
>> specified rate is high enough that the kernel API for burst can accomodate
>> such.
>>
>> Additionally, if the burst exceeds the upper limit of the kernel
>> API, this is now flagged as an error. The existing behavior silently
>> overflows, resulting in arbitrary values passed to the kernel.
>>
>> In detail, as presently implemented, the tc police burst option
>> limits the size of the burst to to 4 GB, i.e., UINT_MAX for a 32 bit
>> unsigned int. This is a reasonable limit for the low rates common when
>> this was developed. However, the underlying implementation of burst is
>> computed as "time at the specified rate," and for higher rates, a burst
>> size exceeding 4 GB is feasible without modification to the kernel.
>>
>> The burst size provided on the command line is translated into a
>> duration, representing how much time is required at the specified rate to
>> transmit the given burst size.
>>
>> This time is calculated in units of "psched ticks," each of which
>> is 64 nsec[0]. The computed number of psched ticks is sent to the kernel
>> as a __u32 value.
>>
>
>Please run tdc tests. David/Stephen - can we please make this a
>requirement for iproute2 tc related changes?
I was not familiar with those tests (but see them now that I
look in the kernel source). I did run the tests included in the
iproute2-next git repository.
>Jay, your patches fail at least one test because you changed the unit outputs.
>Either we fix the tdc test or you make your changes backward compatible.
I'll run the tdc tests and have a look at the failures.
>In the future also cc kernel tc maintainers (I only saw this because
>someone pointed it to me).
>Overall the changes look fine.
Understood, but this isn't documented in iproute2. Perhaps the
iproute2 MAINTAINERS should have a tc section to clarify this
expectation?
-J
>cheers,
>jamal
>
>> Because burst is ultimately calculated as a time duration, the
>> real upper limit for a burst is UINT_MAX psched ticks, i.e.,
>>
>> UINT_MAX * psched tick duration / NSEC_PER_SEC
>> (2^32-1) * 64 / 1E9
>>
>> which is roughly 274.88 seconds (274.8779...).
>>
>> At low rates, e.g., 5 Mbit/sec, UINT_MAX psched ticks does not
>> correspond to a burst size in excess of 4 GB, so the above is moot, e.g.,
>>
>> 5Mbit/sec / 8 = 625000 MBytes/sec
>> 625000 * ~274.88 seconds = ~171800000 max burst size, below UINT_MAX
>>
>> Thus, the burst size at 5Mbit/sec is limited by the __u32 size of
>> the psched tick field in the kernel API, not the 4 GB limit of the tc
>> police burst user space API.
>>
>> However, at higher rates, e.g., 10 Gbit/sec, the burst size is
>> currently limited by the 4 GB maximum for the burst command line parameter
>> value, rather than UINT_MAX psched ticks:
>>
>> 10 Gbit/sec / 8 = 1250000000 MBbytes/sec
>> 1250000000 * ~274.88 seconds = ~343600000000, more than UINT_MAX
>>
>> Here, the maximum duration of a burst the kernel can handle
>> exceeds 4 GB of burst size.
>>
>> While the above maximum may be an excessively large burst value,
>> at 10 Gbit/sec, a 4 GB burst size corresponds to just under 3.5 seconds in
>> duration:
>>
>> 2^32 bytes / 10 Gbit/sec
>> 2^32 bytes / 1250000000 bytes/sec
>> equals ~3.43 sec
>>
>> So, at higher rates, burst sizes exceeding 4 GB are both
>> reasonable and feasible, up to the UINT_MAX limit for psched ticks.
>> Enabling this requires changes only to the user space processing of the
>> burst size parameter in tc.
>>
>> In principle, the other packet schedulers utilizing psched ticks
>> for burst sizing, htb and tbf, could be similarly changed to permit larger
>> burst sizes, but this patch set does not do so.
>>
>> Separately, for the burst duration calculation overflow (i.e.,
>> that the number of psched ticks exceeds UINT_MAX), under the current
>> implementation, one example of overflow is as follows:
>>
>> # /sbin/tc filter add dev eth0 protocol ip prio 1 parent ffff: handle 1 fw police rate 1Mbit peakrate 10Gbit burst 34375000 mtu 64Kb conform-exceed reclassify
>>
>> # /sbin/tc -raw filter get dev eth0 ingress protocol ip pref 1 handle 1 fw
>> filter ingress protocol ip pref 1 fw chain 0 handle 0x1 police 0x1 rate 1Mbit burst 15261b mtu 64Kb [001d1bf8] peakrate 10Gbit action reclassify overhead 0b
>> ref 1 bind 1
>>
>> Note that the returned burst value is 15261b, which does not match
>> the supplied value of 34375000. With this patch set applied, this
>> situation is flagged as an error.
>>
>>
>> [0] psched ticks are defined in the kernel in include/net/pkt_sched.h:
>>
>> #define PSCHED_SHIFT 6
>> #define PSCHED_TICKS2NS(x) ((s64)(x) << PSCHED_SHIFT)
>> #define PSCHED_NS2TICKS(x) ((x) >> PSCHED_SHIFT)
>>
>> #define PSCHED_TICKS_PER_SEC PSCHED_NS2TICKS(NSEC_PER_SEC)
>>
>> where PSCHED_TICKS_PER_SEC is 15625000.
>>
>> These values are exported to user space via /proc/net/psched, the
>> second field being PSCHED_TICKS2NS(1), which at present is 64 (0x40). tc
>> uses this value to compute its internal "tick_in_usec" variable containing
>> the number of psched ticks per usec (15.625) used for the psched tick
>> computations.
>>
>> Lastly, note that PSCHED_SHIFT was previously 10, and changed to 6
>> in commit a4a710c4a7490 in 2009. I have not tested backwards
>> compatibility of these changes with kernels of that era.
---
-Jay Vosburgh, jv@jvosburgh.net
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH iproute2-next] tc/police: Allow 64 bit burst size
2025-09-10 4:04 ` Jay Vosburgh
@ 2025-09-10 4:29 ` Jamal Hadi Salim
0 siblings, 0 replies; 13+ messages in thread
From: Jamal Hadi Salim @ 2025-09-10 4:29 UTC (permalink / raw)
To: Jay Vosburgh; +Cc: netdev, Stephen Hemminger, David Ahern
On Wed, Sep 10, 2025 at 12:04 AM Jay Vosburgh <jv@jvosburgh.net> wrote:
>
> Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> >On Sat, Sep 6, 2025 at 9:50 PM Jay Vosburgh <jay.vosburgh@canonical.com> wrote:
> >>
> >>
> >> In summary, this patchset changes the user space handling of the
> >> tc police burst parameter to permit burst sizes that exceed 4 GB when the
> >> specified rate is high enough that the kernel API for burst can accomodate
> >> such.
> >>
> >> Additionally, if the burst exceeds the upper limit of the kernel
> >> API, this is now flagged as an error. The existing behavior silently
> >> overflows, resulting in arbitrary values passed to the kernel.
> >>
> >> In detail, as presently implemented, the tc police burst option
> >> limits the size of the burst to to 4 GB, i.e., UINT_MAX for a 32 bit
> >> unsigned int. This is a reasonable limit for the low rates common when
> >> this was developed. However, the underlying implementation of burst is
> >> computed as "time at the specified rate," and for higher rates, a burst
> >> size exceeding 4 GB is feasible without modification to the kernel.
> >>
> >> The burst size provided on the command line is translated into a
> >> duration, representing how much time is required at the specified rate to
> >> transmit the given burst size.
> >>
> >> This time is calculated in units of "psched ticks," each of which
> >> is 64 nsec[0]. The computed number of psched ticks is sent to the kernel
> >> as a __u32 value.
> >>
> >
> >Please run tdc tests. David/Stephen - can we please make this a
> >requirement for iproute2 tc related changes?
>
> I was not familiar with those tests (but see them now that I
> look in the kernel source). I did run the tests included in the
> iproute2-next git repository.
>
> >Jay, your patches fail at least one test because you changed the unit outputs.
> >Either we fix the tdc test or you make your changes backward compatible.
>
> I'll run the tdc tests and have a look at the failures.
>
> >In the future also cc kernel tc maintainers (I only saw this because
> >someone pointed it to me).
> >Overall the changes look fine.
>
> Understood, but this isn't documented in iproute2. Perhaps the
> iproute2 MAINTAINERS should have a tc section to clarify this
> expectation?
>
Ive been asking for a while now;->
cheers,
jamal
> -J
>
> >cheers,
> >jamal
> >
> >> Because burst is ultimately calculated as a time duration, the
> >> real upper limit for a burst is UINT_MAX psched ticks, i.e.,
> >>
> >> UINT_MAX * psched tick duration / NSEC_PER_SEC
> >> (2^32-1) * 64 / 1E9
> >>
> >> which is roughly 274.88 seconds (274.8779...).
> >>
> >> At low rates, e.g., 5 Mbit/sec, UINT_MAX psched ticks does not
> >> correspond to a burst size in excess of 4 GB, so the above is moot, e.g.,
> >>
> >> 5Mbit/sec / 8 = 625000 MBytes/sec
> >> 625000 * ~274.88 seconds = ~171800000 max burst size, below UINT_MAX
> >>
> >> Thus, the burst size at 5Mbit/sec is limited by the __u32 size of
> >> the psched tick field in the kernel API, not the 4 GB limit of the tc
> >> police burst user space API.
> >>
> >> However, at higher rates, e.g., 10 Gbit/sec, the burst size is
> >> currently limited by the 4 GB maximum for the burst command line parameter
> >> value, rather than UINT_MAX psched ticks:
> >>
> >> 10 Gbit/sec / 8 = 1250000000 MBbytes/sec
> >> 1250000000 * ~274.88 seconds = ~343600000000, more than UINT_MAX
> >>
> >> Here, the maximum duration of a burst the kernel can handle
> >> exceeds 4 GB of burst size.
> >>
> >> While the above maximum may be an excessively large burst value,
> >> at 10 Gbit/sec, a 4 GB burst size corresponds to just under 3.5 seconds in
> >> duration:
> >>
> >> 2^32 bytes / 10 Gbit/sec
> >> 2^32 bytes / 1250000000 bytes/sec
> >> equals ~3.43 sec
> >>
> >> So, at higher rates, burst sizes exceeding 4 GB are both
> >> reasonable and feasible, up to the UINT_MAX limit for psched ticks.
> >> Enabling this requires changes only to the user space processing of the
> >> burst size parameter in tc.
> >>
> >> In principle, the other packet schedulers utilizing psched ticks
> >> for burst sizing, htb and tbf, could be similarly changed to permit larger
> >> burst sizes, but this patch set does not do so.
> >>
> >> Separately, for the burst duration calculation overflow (i.e.,
> >> that the number of psched ticks exceeds UINT_MAX), under the current
> >> implementation, one example of overflow is as follows:
> >>
> >> # /sbin/tc filter add dev eth0 protocol ip prio 1 parent ffff: handle 1 fw police rate 1Mbit peakrate 10Gbit burst 34375000 mtu 64Kb conform-exceed reclassify
> >>
> >> # /sbin/tc -raw filter get dev eth0 ingress protocol ip pref 1 handle 1 fw
> >> filter ingress protocol ip pref 1 fw chain 0 handle 0x1 police 0x1 rate 1Mbit burst 15261b mtu 64Kb [001d1bf8] peakrate 10Gbit action reclassify overhead 0b
> >> ref 1 bind 1
> >>
> >> Note that the returned burst value is 15261b, which does not match
> >> the supplied value of 34375000. With this patch set applied, this
> >> situation is flagged as an error.
> >>
> >>
> >> [0] psched ticks are defined in the kernel in include/net/pkt_sched.h:
> >>
> >> #define PSCHED_SHIFT 6
> >> #define PSCHED_TICKS2NS(x) ((s64)(x) << PSCHED_SHIFT)
> >> #define PSCHED_NS2TICKS(x) ((x) >> PSCHED_SHIFT)
> >>
> >> #define PSCHED_TICKS_PER_SEC PSCHED_NS2TICKS(NSEC_PER_SEC)
> >>
> >> where PSCHED_TICKS_PER_SEC is 15625000.
> >>
> >> These values are exported to user space via /proc/net/psched, the
> >> second field being PSCHED_TICKS2NS(1), which at present is 64 (0x40). tc
> >> uses this value to compute its internal "tick_in_usec" variable containing
> >> the number of psched ticks per usec (15.625) used for the psched tick
> >> computations.
> >>
> >> Lastly, note that PSCHED_SHIFT was previously 10, and changed to 6
> >> in commit a4a710c4a7490 in 2009. I have not tested backwards
> >> compatibility of these changes with kernels of that era.
>
> ---
> -Jay Vosburgh, jv@jvosburgh.net
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH iproute2-next] tc/police: Allow 64 bit burst size
2025-09-10 3:32 ` [PATCH iproute2-next] tc/police: Allow 64 bit burst size Jamal Hadi Salim
2025-09-10 4:04 ` Jay Vosburgh
@ 2025-09-11 20:50 ` David Ahern
2025-09-12 4:19 ` Jay Vosburgh
2025-09-15 7:51 ` Jamal Hadi Salim
1 sibling, 2 replies; 13+ messages in thread
From: David Ahern @ 2025-09-11 20:50 UTC (permalink / raw)
To: Jamal Hadi Salim, Jay Vosburgh; +Cc: netdev, Stephen Hemminger
On 9/9/25 9:32 PM, Jamal Hadi Salim wrote:
>
> Please run tdc tests. David/Stephen - can we please make this a
> requirement for iproute2 tc related changes?
I will try to remember to run tdc tests for tc patches. Without an
automated setup, there will be misses over time.
>
> Jay, your patches fail at least one test because you changed the unit outputs.
> Either we fix the tdc test or you make your changes backward compatible.
> In the future also cc kernel tc maintainers (I only saw this because
> someone pointed it to me).
> Overall the changes look fine.
Sent a patch to add a tc entry to iproute2 maintainers file.
You say the change looks fine but at least one test fails meaning
changes are requested?
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH iproute2-next] tc/police: Allow 64 bit burst size
2025-09-11 20:50 ` David Ahern
@ 2025-09-12 4:19 ` Jay Vosburgh
2025-09-12 14:31 ` Jamal Hadi Salim
2025-09-15 7:51 ` Jamal Hadi Salim
1 sibling, 1 reply; 13+ messages in thread
From: Jay Vosburgh @ 2025-09-12 4:19 UTC (permalink / raw)
To: David Ahern; +Cc: Jamal Hadi Salim, netdev, Stephen Hemminger
David Ahern <dsahern@gmail.com> wrote:
>On 9/9/25 9:32 PM, Jamal Hadi Salim wrote:
>>
>> Please run tdc tests. David/Stephen - can we please make this a
>> requirement for iproute2 tc related changes?
>
>I will try to remember to run tdc tests for tc patches. Without an
>automated setup, there will be misses over time.
>
>>
>> Jay, your patches fail at least one test because you changed the unit outputs.
>> Either we fix the tdc test or you make your changes backward compatible.
>> In the future also cc kernel tc maintainers (I only saw this because
>> someone pointed it to me).
>> Overall the changes look fine.
>
>Sent a patch to add a tc entry to iproute2 maintainers file.
>
>You say the change looks fine but at least one test fails meaning
>changes are requested?
Yes, I ran the tests and saw one failure, in the following:
"cmdUnderTest": "$TC actions add action police pkts_rate 1000 pkts_burst
200 index 1",
"expExitCode": "0",
"verifyCmd": "$TC actions ls action police",
"matchPattern": "action order [0-9]*: police 0x1 rate 0bit burst 0b mtu 4096Mb pkts_rate 1000 pkts_burst 200",
Which is trying to match a returned mtu value of "4096Mb" but
the new code prints "4Gb"; should be straightforward to change the test
to accept either returned value.
Or I can take out the bit that prints sufficiently large values
in units of Gb, if you've got a preference for leaving that part alone.
Doing so would ease the lockstep problem between the tests in the kernel
tree and the change in iproute2. The numeric formatting isn't the
important part of the patch set, so I'm ok either way.
-J
---
-Jay Vosburgh, jay.vosburgh@canonical.com
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH iproute2-next] tc/police: Allow 64 bit burst size
2025-09-12 4:19 ` Jay Vosburgh
@ 2025-09-12 14:31 ` Jamal Hadi Salim
2025-09-12 15:48 ` Victor Nogueira
0 siblings, 1 reply; 13+ messages in thread
From: Jamal Hadi Salim @ 2025-09-12 14:31 UTC (permalink / raw)
To: Jay Vosburgh, Victor Nogueira; +Cc: David Ahern, netdev, Stephen Hemminger
On Fri, Sep 12, 2025 at 12:19 AM Jay Vosburgh
<jay.vosburgh@canonical.com> wrote:
>
> David Ahern <dsahern@gmail.com> wrote:
>
> >On 9/9/25 9:32 PM, Jamal Hadi Salim wrote:
> >>
> >> Please run tdc tests. David/Stephen - can we please make this a
> >> requirement for iproute2 tc related changes?
> >
> >I will try to remember to run tdc tests for tc patches. Without an
> >automated setup, there will be misses over time.
> >
> >>
> >> Jay, your patches fail at least one test because you changed the unit outputs.
> >> Either we fix the tdc test or you make your changes backward compatible.
> >> In the future also cc kernel tc maintainers (I only saw this because
> >> someone pointed it to me).
> >> Overall the changes look fine.
> >
> >Sent a patch to add a tc entry to iproute2 maintainers file.
> >
> >You say the change looks fine but at least one test fails meaning
> >changes are requested?
>
> Yes, I ran the tests and saw one failure, in the following:
>
> "cmdUnderTest": "$TC actions add action police pkts_rate 1000 pkts_burst
> 200 index 1",
> "expExitCode": "0",
> "verifyCmd": "$TC actions ls action police",
> "matchPattern": "action order [0-9]*: police 0x1 rate 0bit burst 0b mtu 4096Mb pkts_rate 1000 pkts_burst 200",
>
> Which is trying to match a returned mtu value of "4096Mb" but
> the new code prints "4Gb"; should be straightforward to change the test
> to accept either returned value.
>
> Or I can take out the bit that prints sufficiently large values
> in units of Gb, if you've got a preference for leaving that part alone.
> Doing so would ease the lockstep problem between the tests in the kernel
> tree and the change in iproute2. The numeric formatting isn't the
> important part of the patch set, so I'm ok either way.
>
For backward compat we need to support both. IOW, if someone was using
an older tc then a new kernel should work fine and not fail because of
different output expected. @Victor Nogueira wanna take a crack at
fixing the test? Then when the iproute side is merged as is - both
should work.
cheers,
jamal
> -J
>
> ---
> -Jay Vosburgh, jay.vosburgh@canonical.com
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH iproute2-next] tc/police: Allow 64 bit burst size
2025-09-12 14:31 ` Jamal Hadi Salim
@ 2025-09-12 15:48 ` Victor Nogueira
0 siblings, 0 replies; 13+ messages in thread
From: Victor Nogueira @ 2025-09-12 15:48 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: Jay Vosburgh, David Ahern, netdev, Stephen Hemminger
On Fri, Sep 12, 2025 at 11:31 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On Fri, Sep 12, 2025 at 12:19 AM Jay Vosburgh
> <jay.vosburgh@canonical.com> wrote:
> >
> > > [...]
> > >You say the change looks fine but at least one test fails meaning
> > >changes are requested?
> >
> > Yes, I ran the tests and saw one failure, in the following:
> >
> > "cmdUnderTest": "$TC actions add action police pkts_rate 1000 pkts_burst
> > 200 index 1",
> > "expExitCode": "0",
> > "verifyCmd": "$TC actions ls action police",
> > "matchPattern": "action order [0-9]*: police 0x1 rate 0bit burst 0b mtu 4096Mb pkts_rate 1000 pkts_burst 200",
> >
> > Which is trying to match a returned mtu value of "4096Mb" but
> > the new code prints "4Gb"; should be straightforward to change the test
> > to accept either returned value.
> > [...]
> >
>
> For backward compat we need to support both. IOW, if someone was using
> an older tc then a new kernel should work fine and not fail because of
> different output expected. @Victor Nogueira wanna take a crack at
> fixing the test? Then when the iproute side is merged as is - both
> should work.
Yes, I just sent a patch making the test work with both Mb and Gb [1].
[1] https://lore.kernel.org/netdev/20250912154616.67489-1-victor@mojatatu.com/
cheers,
Victor
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH iproute2-next] tc/police: Allow 64 bit burst size
2025-09-11 20:50 ` David Ahern
2025-09-12 4:19 ` Jay Vosburgh
@ 2025-09-15 7:51 ` Jamal Hadi Salim
1 sibling, 0 replies; 13+ messages in thread
From: Jamal Hadi Salim @ 2025-09-15 7:51 UTC (permalink / raw)
To: David Ahern; +Cc: Jay Vosburgh, netdev, Stephen Hemminger
On Thu, Sep 11, 2025 at 4:50 PM David Ahern <dsahern@gmail.com> wrote:
>
> On 9/9/25 9:32 PM, Jamal Hadi Salim wrote:
> >
> > Please run tdc tests. David/Stephen - can we please make this a
> > requirement for iproute2 tc related changes?
>
> I will try to remember to run tdc tests for tc patches. Without an
> automated setup, there will be misses over time.
>
> >
> > Jay, your patches fail at least one test because you changed the unit outputs.
> > Either we fix the tdc test or you make your changes backward compatible.
> > In the future also cc kernel tc maintainers (I only saw this because
> > someone pointed it to me).
> > Overall the changes look fine.
>
> Sent a patch to add a tc entry to iproute2 maintainers file.
>
> You say the change looks fine but at least one test fails meaning
> changes are requested?
Pending the kernel patch being applied...
Acked-by: Jamal Hadi Salim
cheers,
jamal
^ permalink raw reply [flat|nested] 13+ messages in thread