* [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
@ 2017-04-19 1:14 Jamal Hadi Salim
2017-04-19 1:14 ` [PATCH net-next 2/2 v2] net sched actions: add time filter for action dumping Jamal Hadi Salim
` (2 more replies)
0 siblings, 3 replies; 14+ messages in thread
From: Jamal Hadi Salim @ 2017-04-19 1:14 UTC (permalink / raw)
To: davem; +Cc: netdev, eric.dumazet, jiri, xiyou.wangcong, Jamal Hadi Salim
From: Jamal Hadi Salim <jhs@mojatatu.com>
When you dump hundreds of thousands of actions, getting only 32 per
dump batch even when the socket buffer and memory allocations allow
is inefficient.
With this change, the user will get as many as possibly fitting
within the given constraints available to the kernel.
A new top level TLV space is introduced. An attribute
TCAA_ACT_FLAGS is used to carry the flags indicating the user
is capable of processing these large dumps. Older user space which
doesnt set this flag doesnt get the large (than 32) batches.
The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
actions are put in a single batch. As such user space app knows how long
to iterate (independent of the type of action being dumped)
instead of hardcoded maximum of 32.
Some results dumping 1.5M actions, first unpatched tc which the
kernel doesnt help:
prompt$ time -p tc actions ls action gact | grep index | wc -l
1500000
real 1388.43
user 2.07
sys 1386.79
Now lets see a patched tc which sets the correct flags when requesting
a dump:
prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
1500000
real 178.13
user 2.02
sys 176.96
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
MAINTAINERS | 1 +
include/uapi/linux/rtnetlink.h | 21 +++++++++++++++++++--
net/sched/act_api.c | 43 ++++++++++++++++++++++++++++++++----------
3 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index cce0613..c7080ec 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -674,10 +674,27 @@ struct tcamsg {
unsigned char tca__pad1;
unsigned short tca__pad2;
};
+
+enum {
+ TCAA_UNSPEC,
+ TCAA_ACT_TAB,
+ TCAA_ACT_FLAGS,
+ TCAA_ACT_COUNT,
+ __TCAA_MAX
+};
+
+#define TCAA_MAX (__TCAA_MAX - 1)
#define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
#define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
-#define TCA_ACT_TAB 1 /* attr type must be >=1 */
-#define TCAA_MAX 1
+#define TCA_ACT_TAB TCAA_ACT_TAB
+/* tcamsg flags stored in attribute TCAA_ACT_FLAGS
+ *
+ * ACT_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
+ * actions in a dump. All dump responses will contain the number of actions
+ * being dumped stored in for user app's consumption in TCAA_ACT_COUNT
+ *
+ */
+#define ACT_LARGE_DUMP_ON (1 << 0)
/* New extended info filters for IFLA_EXT_MASK */
#define RTEXT_FILTER_VF (1 << 0)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 82b1d48..45e1cf7 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
struct netlink_callback *cb)
{
int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
+ unsigned short act_flags = cb->args[2];
struct nlattr *nest;
spin_lock_bh(&hinfo->lock);
@@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
}
nla_nest_end(skb, nest);
n_i++;
- if (n_i >= TCA_ACT_MAX_PRIO)
+ if (!(act_flags & ACT_LARGE_DUMP_ON) &&
+ n_i >= TCA_ACT_MAX_PRIO)
goto done;
}
}
done:
spin_unlock_bh(&hinfo->lock);
- if (n_i)
+ if (n_i) {
cb->args[0] += n_i;
+ if (act_flags & ACT_LARGE_DUMP_ON)
+ cb->args[1] = n_i;
+ }
return n_i;
nla_put_failure:
@@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
return tcf_add_notify(net, n, &actions, portid);
}
+static const struct nla_policy tcaa_policy[TCAA_MAX + 1] = {
+ [TCAA_ACT_FLAGS] = { .type = NLA_U32 },
+};
+
static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
- struct nlattr *tca[TCA_ACT_MAX + 1];
+ struct nlattr *tca[TCAA_MAX + 1];
u32 portid = skb ? NETLINK_CB(skb).portid : 0;
int ret = 0, ovr = 0;
@@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
- ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL,
+ ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, tcaa_policy,
extack);
if (ret < 0)
return ret;
@@ -1046,16 +1055,12 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
return ret;
}
-static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
+static struct nlattr *find_dump_kind(struct nlattr **nla)
{
struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
- struct nlattr *nla[TCAA_MAX + 1];
struct nlattr *kind;
- if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
- NULL, NULL) < 0)
- return NULL;
tb1 = nla[TCA_ACT_TAB];
if (tb1 == NULL)
return NULL;
@@ -1081,9 +1086,18 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
struct nlattr *nest;
struct tc_action_ops *a_o;
int ret = 0;
+ struct nlattr *tcaa[TCAA_MAX + 1];
struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
- struct nlattr *kind = find_dump_kind(cb->nlh);
+ struct nlattr *kind = NULL;
+ u32 act_flags = 0;
+
+ ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tcaa, TCAA_MAX,
+ tcaa_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ kind = find_dump_kind(tcaa);
if (kind == NULL) {
pr_info("tc_dump_action: action bad kind\n");
return 0;
@@ -1093,10 +1107,16 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
if (a_o == NULL)
return 0;
+ if (tcaa[TCAA_ACT_FLAGS])
+ act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]);
+
nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
cb->nlh->nlmsg_type, sizeof(*t), 0);
if (!nlh)
goto out_module_put;
+
+ cb->args[2] = act_flags;
+
t = nlmsg_data(nlh);
t->tca_family = AF_UNSPEC;
t->tca__pad1 = 0;
@@ -1113,6 +1133,9 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
if (ret > 0) {
nla_nest_end(skb, nest);
ret = skb->len;
+ if (nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1]))
+ goto out_module_put;
+ cb->args[1] = 0;
} else
nlmsg_trim(skb, b);
--
1.9.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH net-next 2/2 v2] net sched actions: add time filter for action dumping
2017-04-19 1:14 [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch Jamal Hadi Salim
@ 2017-04-19 1:14 ` Jamal Hadi Salim
2017-04-19 1:35 ` Eric Dumazet
2017-04-19 1:49 ` [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch Eric Dumazet
2017-04-19 4:55 ` Roopa Prabhu
2 siblings, 1 reply; 14+ messages in thread
From: Jamal Hadi Salim @ 2017-04-19 1:14 UTC (permalink / raw)
To: davem; +Cc: netdev, eric.dumazet, jiri, xiyou.wangcong, Jamal Hadi Salim
From: Jamal Hadi Salim <jhs@mojatatu.com>
This adds support for filtering based on time since last used.
When we are dumping a large number of actions it is useful to
have the option of filtering based on when the action was last
used to reduce the amount of data crossing to user space.
With this patch the user space app sets the TCAA_ACT_TIME_FILTER
attribute with the value in milliseconds with "time of interest
since now". The kernel converts this to jiffies and does the
filtering comparison matching entries that have seen activity
since then and returns them to user space.
Old kernels and old tc continue to work in legacy mode since
they dont specify this attribute.
Some example (we have 400 actions bound to 400 filters); at installation
time using hacked tc which sets the time of interest to 120 seconds:
prompt$ hackedtc actions ls action gact | grep index | wc -l
400
go get some coffee and wait for > 120 seconds and try again:
prompt$ hackedtc actions ls action gact | grep index | wc -l
0
Lets see a filter bound to one of these actions:
..
filter pref 10 u32
filter pref 10 u32 fh 800: ht divisor 1
filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10 (rule hit 2 success 1)
match 7f000002/ffffffff at 12 (success 1 )
action order 1: gact action pass
random type none pass val 0
index 23 ref 2 bind 1 installed 1145 sec used 802 sec
Action statistics:
Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
....
that coffee took long, no?
Now lets ping -c 1 127.0.0.2, then run the actions again:
prompt$ hackedtc actions ls action gact | grep index | wc -l
1
More details please:
prompt$ hackedtc -s actions ls action gact
total acts 1 flags 0x3
action order 0: gact action pass
random type none pass val 0
index 23 ref 2 bind 1 installed 1270 sec used 30 sec
Action statistics:
Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
And the filter?
filter pref 10 u32
filter pref 10 u32 fh 800: ht divisor 1
filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10 (rule hit 4 success 2)
match 7f000002/ffffffff at 12 (success 2 )
action order 1: gact action pass
random type none pass val 0
index 23 ref 2 bind 1 installed 1324 sec used 84 sec
Action statistics:
Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
include/uapi/linux/rtnetlink.h | 1 +
net/sched/act_api.c | 23 +++++++++++++++++++++--
2 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index c7080ec..1b36cc0 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -680,6 +680,7 @@ enum {
TCAA_ACT_TAB,
TCAA_ACT_FLAGS,
TCAA_ACT_COUNT,
+ TCAA_ACT_TIME_FILTER,
__TCAA_MAX
};
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 45e1cf7..cb5b23d 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -84,11 +84,12 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
{
int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
unsigned short act_flags = cb->args[2];
+ unsigned long jiffy_filter = cb->args[3];
struct nlattr *nest;
spin_lock_bh(&hinfo->lock);
- s_i = cb->args[0];
+ s_i = cb->args[4];
for (i = 0; i < (hinfo->hmask + 1); i++) {
struct hlist_head *head;
@@ -101,6 +102,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
if (index < s_i)
continue;
+ if (jiffy_filter &&
+ time_after(jiffy_filter,
+ (unsigned long)p->tcfa_tm.lastuse))
+ continue;
+
nest = nla_nest_start(skb, n_i);
if (nest == NULL)
goto nla_put_failure;
@@ -118,6 +124,9 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
}
}
done:
+ if (index > 0)
+ cb->args[4] = index + 1;
+
spin_unlock_bh(&hinfo->lock);
if (n_i) {
cb->args[0] += n_i;
@@ -1090,13 +1099,14 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
struct nlattr *kind = NULL;
u32 act_flags = 0;
+ u32 msecs_filter = 0;
+ unsigned long jiffy_wanted = 0;
ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tcaa, TCAA_MAX,
tcaa_policy, NULL);
if (ret < 0)
return ret;
-
kind = find_dump_kind(tcaa);
if (kind == NULL) {
pr_info("tc_dump_action: action bad kind\n");
@@ -1110,12 +1120,21 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
if (tcaa[TCAA_ACT_FLAGS])
act_flags = nla_get_u32(tcaa[TCAA_ACT_FLAGS]);
+ if (tcaa[TCAA_ACT_TIME_FILTER])
+ msecs_filter = nla_get_u32(tcaa[TCAA_ACT_TIME_FILTER]);
+
nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
cb->nlh->nlmsg_type, sizeof(*t), 0);
if (!nlh)
goto out_module_put;
+ if (msecs_filter) {
+ unsigned long jiffy_msecs = msecs_to_jiffies(msecs_filter);
+ jiffy_wanted = jiffies - jiffy_msecs;
+ }
+
cb->args[2] = act_flags;
+ cb->args[3] = jiffy_wanted;
t = nlmsg_data(nlh);
t->tca_family = AF_UNSPEC;
--
1.9.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 2/2 v2] net sched actions: add time filter for action dumping
2017-04-19 1:14 ` [PATCH net-next 2/2 v2] net sched actions: add time filter for action dumping Jamal Hadi Salim
@ 2017-04-19 1:35 ` Eric Dumazet
2017-04-19 1:39 ` Jamal Hadi Salim
0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2017-04-19 1:35 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: davem, netdev, jiri, xiyou.wangcong
On Tue, 2017-04-18 at 21:14 -0400, Jamal Hadi Salim wrote:
> From: Jamal Hadi Salim <jhs@mojatatu.com>
>
> + if (tcaa[TCAA_ACT_TIME_FILTER])
> + msecs_filter = nla_get_u32(tcaa[TCAA_ACT_TIME_FILTER]);
You forgot to add TCAA_ACT_TIME_FILTER in tcaa_policy
There is no guarantee user passed 32bit data here.
> +
> nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
> cb->nlh->nlmsg_type, sizeof(*t), 0);
> if (!nlh)
> goto out_module_put;
>
> + if (msecs_filter) {
> + unsigned long jiffy_msecs = msecs_to_jiffies(msecs_filter);
> + jiffy_wanted = jiffies - jiffy_msecs;
> + }
> +
> cb->args[2] = act_flags;
> + cb->args[3] = jiffy_wanted;
>
> t = nlmsg_data(nlh);
> t->tca_family = AF_UNSPEC;
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 2/2 v2] net sched actions: add time filter for action dumping
2017-04-19 1:35 ` Eric Dumazet
@ 2017-04-19 1:39 ` Jamal Hadi Salim
0 siblings, 0 replies; 14+ messages in thread
From: Jamal Hadi Salim @ 2017-04-19 1:39 UTC (permalink / raw)
To: Eric Dumazet; +Cc: davem, netdev, jiri, xiyou.wangcong
On 17-04-18 09:35 PM, Eric Dumazet wrote:
> On Tue, 2017-04-18 at 21:14 -0400, Jamal Hadi Salim wrote:
>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>>
>
>> + if (tcaa[TCAA_ACT_TIME_FILTER])
>> + msecs_filter = nla_get_u32(tcaa[TCAA_ACT_TIME_FILTER]);
>
>
> You forgot to add TCAA_ACT_TIME_FILTER in tcaa_policy
>
I will fix that.
> There is no guarantee user passed 32bit data here.
>
Did you mean because we dont have the tcaa_policy or
is it for some other reason?
cheers,
jamal
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
2017-04-19 1:14 [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch Jamal Hadi Salim
2017-04-19 1:14 ` [PATCH net-next 2/2 v2] net sched actions: add time filter for action dumping Jamal Hadi Salim
@ 2017-04-19 1:49 ` Eric Dumazet
2017-04-19 2:32 ` Jamal Hadi Salim
2017-04-19 4:55 ` Roopa Prabhu
2 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2017-04-19 1:49 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: davem, netdev, jiri, xiyou.wangcong
On Tue, 2017-04-18 at 21:14 -0400, Jamal Hadi Salim wrote:
> From: Jamal Hadi Salim <jhs@mojatatu.com>
So there is no more limit ? How user is supposed to size the buffer for
recvmsg() ?
> t->tca__pad1 = 0;
> @@ -1113,6 +1133,9 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
> if (ret > 0) {
> nla_nest_end(skb, nest);
> ret = skb->len;
> + if (nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1]))
> + goto out_module_put;
> + cb->args[1] = 0;
I really do not see how you manage to get room to add one additional
attribute, if dump had to stop at N actions, filling whole skb.
You might be lucky because nla_put_u32() wants a bit less space than an
action ?
Presumably you need to reserve space before the dump of actions.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
2017-04-19 1:49 ` [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch Eric Dumazet
@ 2017-04-19 2:32 ` Jamal Hadi Salim
2017-04-19 3:17 ` Eric Dumazet
2017-04-19 6:12 ` Cong Wang
0 siblings, 2 replies; 14+ messages in thread
From: Jamal Hadi Salim @ 2017-04-19 2:32 UTC (permalink / raw)
To: Eric Dumazet; +Cc: davem, netdev, jiri, xiyou.wangcong
On 17-04-18 09:49 PM, Eric Dumazet wrote:
> On Tue, 2017-04-18 at 21:14 -0400, Jamal Hadi Salim wrote:
>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>
> So there is no more limit ? How user is supposed to size the buffer for
> recvmsg() ?
>
That part doesnt change. Ok, I believe more clarity is needed:->
Current code allows only 32 actions to be dumped at a time.
This code:
So lets i have 128 actions. I can fit them into 10K.
I do recvmsg(..., 32K). This will make about 32K space for me to fill
up the 128 actions. But the code only allows me to send 32
TCA_ACT_MAX_PRIO which takes about 2-3K.
So I will do 4 runs from kernel->user to get 128 actions ;->
User space in tc is trained to expect no more than
TCA_ACT_MAX_PRIO in every batch.
With this change I ask the kernel to fit as many actions as
possible in the 32K (all these 128 will fit in one batch).
Then it has to tell user space how many are in that batch
using TCAA_ACT_COUNT attribute.
Make sense?
cheers,
jamal
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
2017-04-19 2:32 ` Jamal Hadi Salim
@ 2017-04-19 3:17 ` Eric Dumazet
2017-04-19 11:24 ` Jamal Hadi Salim
2017-04-19 6:12 ` Cong Wang
1 sibling, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2017-04-19 3:17 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: davem, netdev, jiri, xiyou.wangcong
On Tue, 2017-04-18 at 22:32 -0400, Jamal Hadi Salim wrote:
> On 17-04-18 09:49 PM, Eric Dumazet wrote:
> > On Tue, 2017-04-18 at 21:14 -0400, Jamal Hadi Salim wrote:
> >> From: Jamal Hadi Salim <jhs@mojatatu.com>
> >
> > So there is no more limit ? How user is supposed to size the buffer for
> > recvmsg() ?
> >
>
> That part doesnt change. Ok, I believe more clarity is needed:->
>
> Current code allows only 32 actions to be dumped at a time.
> This code:
>
> So lets i have 128 actions. I can fit them into 10K.
> I do recvmsg(..., 32K). This will make about 32K space for me to fill
> up the 128 actions. But the code only allows me to send 32
> TCA_ACT_MAX_PRIO which takes about 2-3K.
> So I will do 4 runs from kernel->user to get 128 actions ;->
> User space in tc is trained to expect no more than
> TCA_ACT_MAX_PRIO in every batch.
>
> With this change I ask the kernel to fit as many actions as
> possible in the 32K (all these 128 will fit in one batch).
> Then it has to tell user space how many are in that batch
> using TCAA_ACT_COUNT attribute.
>
> Make sense?
What if we have 1024 actions, and user provides a 4KB buffer ?
Normally multiple recvmsg() calls would be needed, but I do not see how
the nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1]) can always succeed.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
2017-04-19 1:14 [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch Jamal Hadi Salim
2017-04-19 1:14 ` [PATCH net-next 2/2 v2] net sched actions: add time filter for action dumping Jamal Hadi Salim
2017-04-19 1:49 ` [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch Eric Dumazet
@ 2017-04-19 4:55 ` Roopa Prabhu
2017-04-19 11:02 ` Jamal Hadi Salim
2 siblings, 1 reply; 14+ messages in thread
From: Roopa Prabhu @ 2017-04-19 4:55 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: davem, netdev, eric.dumazet, jiri, xiyou.wangcong
On 4/18/17, 6:14 PM, Jamal Hadi Salim wrote:
> From: Jamal Hadi Salim <jhs@mojatatu.com>
>
> When you dump hundreds of thousands of actions, getting only 32 per
> dump batch even when the socket buffer and memory allocations allow
> is inefficient.
>
> With this change, the user will get as many as possibly fitting
> within the given constraints available to the kernel.
>
> A new top level TLV space is introduced. An attribute
> TCAA_ACT_FLAGS is used to carry the flags indicating the user
> is capable of processing these large dumps. Older user space which
> doesnt set this flag doesnt get the large (than 32) batches.
> The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
> actions are put in a single batch. As such user space app knows how long
> to iterate (independent of the type of action being dumped)
> instead of hardcoded maximum of 32.
Is the count attribute really needed ?. user space knows how to iterate multiple
attributes of the same type till the end of msg. eg. fdb dumps don't use count.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
2017-04-19 2:32 ` Jamal Hadi Salim
2017-04-19 3:17 ` Eric Dumazet
@ 2017-04-19 6:12 ` Cong Wang
2017-04-19 11:06 ` Jamal Hadi Salim
1 sibling, 1 reply; 14+ messages in thread
From: Cong Wang @ 2017-04-19 6:12 UTC (permalink / raw)
To: Jamal Hadi Salim
Cc: Eric Dumazet, David Miller, Linux Kernel Network Developers,
Jiri Pirko
On Tue, Apr 18, 2017 at 7:32 PM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> On 17-04-18 09:49 PM, Eric Dumazet wrote:
>>
>> On Tue, 2017-04-18 at 21:14 -0400, Jamal Hadi Salim wrote:
>>>
>>> From: Jamal Hadi Salim <jhs@mojatatu.com>
>>
>>
>> So there is no more limit ? How user is supposed to size the buffer for
>> recvmsg() ?
>>
>
> That part doesnt change. Ok, I believe more clarity is needed:->
>
> Current code allows only 32 actions to be dumped at a time.
> This code:
>
> So lets i have 128 actions. I can fit them into 10K.
> I do recvmsg(..., 32K). This will make about 32K space for me to fill
> up the 128 actions. But the code only allows me to send 32
> TCA_ACT_MAX_PRIO which takes about 2-3K.
> So I will do 4 runs from kernel->user to get 128 actions ;->
> User space in tc is trained to expect no more than
> TCA_ACT_MAX_PRIO in every batch.
>
> With this change I ask the kernel to fit as many actions as
> possible in the 32K (all these 128 will fit in one batch).
> Then it has to tell user space how many are in that batch
> using TCAA_ACT_COUNT attribute.
>
> Make sense?
Hmm? How do tc filters do dumping? There is no max and no
COUNT attribute either, IIUC.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
2017-04-19 4:55 ` Roopa Prabhu
@ 2017-04-19 11:02 ` Jamal Hadi Salim
0 siblings, 0 replies; 14+ messages in thread
From: Jamal Hadi Salim @ 2017-04-19 11:02 UTC (permalink / raw)
To: Roopa Prabhu; +Cc: davem, netdev, eric.dumazet, jiri, xiyou.wangcong
On 17-04-19 12:55 AM, Roopa Prabhu wrote:
> On 4/18/17, 6:14 PM, Jamal Hadi Salim wrote:
>>
>> A new top level TLV space is introduced. An attribute
>> TCAA_ACT_FLAGS is used to carry the flags indicating the user
>> is capable of processing these large dumps. Older user space which
>> doesnt set this flag doesnt get the large (than 32) batches.
>> The kernel uses the TCAA_ACT_COUNT attribute to tell the user how many
>> actions are put in a single batch. As such user space app knows how long
>> to iterate (independent of the type of action being dumped)
>> instead of hardcoded maximum of 32.
> Is the count attribute really needed ?. user space knows how to iterate multiple
> attributes of the same type till the end of msg. eg. fdb dumps don't use count.
>
fdb dumping is different. One nlmsg contains one fdb entry.
So the boundary for checking for iproute2 is the nlmsg header.
So you wouldnt need such a counter.
OTOH, actions are packed, many per nlmsg:
I can put X or more policers in one nlmsg and batch many nlmsgs;
all constrained by the dump skb size.
Back in the day it was considered that a filter having more than
32 actions in its pipeline was probably enough. So in a dump, the
kernel would not send more than 32(TCA_ACT_MAX_PRIO) at a time.
This patch basically makes it fit as many as possible.
The other difference is there is only one type of fdb. There are
many types of actions - each with different sizes and different
optionally appearing entities. So if you are writing generic code
to print for example, you can play acrobatics in user space and
look at TLV lengths, deduce where the boundaries are for each type
and then for each type of action do speacial handling.
If the kernel tells you how many actions are packed then a major
change in iproute2 is:
- for (i = 0; i < TCA_ACT_MAX_PRIO; i++) {
+ for (i = 0; i < tot_acts_from_kernel; i++) {
cheers,
jamal
PS: I think fdb is a good candidate for time filtering since
it keeps track of lastused time. You dont really have to change it to
be as efficient as actions in terms of packing but it would
definetely improve status quo since you now have stats.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
2017-04-19 6:12 ` Cong Wang
@ 2017-04-19 11:06 ` Jamal Hadi Salim
0 siblings, 0 replies; 14+ messages in thread
From: Jamal Hadi Salim @ 2017-04-19 11:06 UTC (permalink / raw)
To: Cong Wang
Cc: Eric Dumazet, David Miller, Linux Kernel Network Developers,
Jiri Pirko
On 17-04-19 02:12 AM, Cong Wang wrote:
> On Tue, Apr 18, 2017 at 7:32 PM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>> On 17-04-18 09:49 PM, Eric Dumazet wrote:
>> With this change I ask the kernel to fit as many actions as
>> possible in the 32K (all these 128 will fit in one batch).
>> Then it has to tell user space how many are in that batch
>> using TCAA_ACT_COUNT attribute.
>>
>> Make sense?
>
> Hmm? How do tc filters do dumping? There is no max and no
> COUNT attribute either, IIUC.
>
They dont have this hard coded 32 entries limit. So standard
dumping works fine. Note: count is a very speacial attribute to
make sure we can distinguish between old kernels which send
only 32 and new kernels which could send more.
cheers,
jamal
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
2017-04-19 3:17 ` Eric Dumazet
@ 2017-04-19 11:24 ` Jamal Hadi Salim
2017-04-19 13:47 ` Eric Dumazet
0 siblings, 1 reply; 14+ messages in thread
From: Jamal Hadi Salim @ 2017-04-19 11:24 UTC (permalink / raw)
To: Eric Dumazet; +Cc: davem, netdev, jiri, xiyou.wangcong
On 17-04-18 11:17 PM, Eric Dumazet wrote:
> On Tue, 2017-04-18 at 22:32 -0400, Jamal Hadi Salim wrote:
>> On 17-04-18 09:49 PM, Eric Dumazet wrote:
>>> On Tue, 2017-04-18 at 21:14 -0400, Jamal Hadi Salim wrote:
>>
>> Make sense?
>
> What if we have 1024 actions, and user provides a 4KB buffer ?
>
No problem - we will fit as many per batch to consume 4KB
and will send them as long as user calls recvmsg.
> Normally multiple recvmsg() calls would be needed, but I do not see how
> the nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1]) can always succeed.
Oh, I see the cross-talk Eric;->
We dont pack the actions in TCAA_ACT_COUNT - we put them in
TCAA_ACT_TAB.
Here's some strace capture that best describes what happened before
and after which i hope will make sense. Granted tc uses 32KB from
user space and not 4KB you mention.
We have 400 actions in the kernel at this point:
tc with no changes (doesnt for this large dump):
---
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"\240\16\0\0002\0\2\0^6\367XE\3\0\0\0\0\0\0\204\16\1\0\200\0\0\0\t\0\1\0"...,
32768,g}], msg_controllen=0, msg_flags=0}, 0) = 3744
===> total actions dumped 29
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"
\20\0\0002\0\2\0s6\367XI\3\0\0\0\0\0\0\4\20\1\0\200\0\0\0\t\0\1\0"...,
32768}], msg_controllen=0, msg_flags=0}, 0) = 4128
==> total actions dumped 32
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"
\20\0\0002\0\2\0s6\367XI\3\0\0\0\0\0\0\4\20\1\0\200\0\0\0\t\0\1\0"...,
32768}], msg_controllen=0, msg_flags=0}, 0) = 4128
==> total actions dumped 32
....
.....
.........
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"\24\0\0\0\3\0\2\0s6\367XI\3\0\0\0\0\0\0", 32768}],
msg_controllen=0, msg_flags=0}, 0) = 20
-----
Goes on a few times until we get all 400 entries - last recvmsg (with
20B) has no actions and indicates dump is complete.
Issue: The kernel is refusing to add more than 32 entries in the skb
even though we get allocated 32KB for the skb.
So now lets see what happens with this change:
------
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"\240\16\0\0002\0\2\0^6\367XE\3\0\0\0\0\0\0\204\16\1\0\200\0\0\0\t\0\1\0"...,
32768,g}], msg_controllen=0, msg_flags=0}, 0) = 3744
==> total actions dumped 29
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"\240~\0\0002\0\2\0^6\367XE\3\0\0\0\0\0\0\204~\1\0\200\0\0\0\t\0\1\0"...,
32768,g}], msg_controllen=0, msg_flags=0}, 0) = 32416
==> total actions dumped 253
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"
;\0\0002\0\2\0^6\367XE\3\0\0\0\0\0\0\4;\1\0\200\0\0\0\t\0\1\0"...,
32768,g}], msg_controllen=0, msg_flags=0}, 0) = 15136
==> total actions dumped 118
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"\24\0\0\0\3\0\2\0s6\367XI\3\0\0\0\0\0\0", 32768}],
msg_controllen=0, msg_flags=0}, 0) = 20
--------
We got all 400 in 3 requests. Imagine what we have to deal with
when we have 2M actions (the improvement is about 10x).
cheers,
jamal
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
2017-04-19 11:24 ` Jamal Hadi Salim
@ 2017-04-19 13:47 ` Eric Dumazet
2017-04-19 15:43 ` Jamal Hadi Salim
0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2017-04-19 13:47 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: davem, netdev, jiri, xiyou.wangcong
On Wed, 2017-04-19 at 07:24 -0400, Jamal Hadi Salim wrote:
> On 17-04-18 11:17 PM, Eric Dumazet wrote:
> > On Tue, 2017-04-18 at 22:32 -0400, Jamal Hadi Salim wrote:
> >> On 17-04-18 09:49 PM, Eric Dumazet wrote:
> >>> On Tue, 2017-04-18 at 21:14 -0400, Jamal Hadi Salim wrote:
>
> >>
> >> Make sense?
> >
> > What if we have 1024 actions, and user provides a 4KB buffer ?
> >
>
> No problem - we will fit as many per batch to consume 4KB
> and will send them as long as user calls recvmsg.
>
> > Normally multiple recvmsg() calls would be needed, but I do not see how
> > the nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1]) can always succeed.
>
> Oh, I see the cross-talk Eric;->
> We dont pack the actions in TCAA_ACT_COUNT - we put them in
> TCAA_ACT_TAB.
>
> Here's some strace capture that best describes what happened before
> and after which i hope will make sense. Granted tc uses 32KB from
> user space and not 4KB you mention.
>
> We have 400 actions in the kernel at this point:
>
> tc with no changes (doesnt for this large dump):
> ---
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
> msg_iov(1)=[{"\240\16\0\0002\0\2\0^6\367XE\3\0\0\0\0\0\0\204\16\1\0\200\0\0\0\t\0\1\0"...,
> 32768,g}], msg_controllen=0, msg_flags=0}, 0) = 3744
>
> ===> total actions dumped 29
>
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
> msg_iov(1)=[{"
> \20\0\0002\0\2\0s6\367XI\3\0\0\0\0\0\0\4\20\1\0\200\0\0\0\t\0\1\0"...,
> 32768}], msg_controllen=0, msg_flags=0}, 0) = 4128
>
> ==> total actions dumped 32
>
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
> msg_iov(1)=[{"
> \20\0\0002\0\2\0s6\367XI\3\0\0\0\0\0\0\4\20\1\0\200\0\0\0\t\0\1\0"...,
> 32768}], msg_controllen=0, msg_flags=0}, 0) = 4128
>
> ==> total actions dumped 32
> ....
> .....
> .........
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
> msg_iov(1)=[{"\24\0\0\0\3\0\2\0s6\367XI\3\0\0\0\0\0\0", 32768}],
> msg_controllen=0, msg_flags=0}, 0) = 20
> -----
>
> Goes on a few times until we get all 400 entries - last recvmsg (with
> 20B) has no actions and indicates dump is complete.
> Issue: The kernel is refusing to add more than 32 entries in the skb
> even though we get allocated 32KB for the skb.
>
> So now lets see what happens with this change:
> ------
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
> msg_iov(1)=[{"\240\16\0\0002\0\2\0^6\367XE\3\0\0\0\0\0\0\204\16\1\0\200\0\0\0\t\0\1\0"...,
> 32768,g}], msg_controllen=0, msg_flags=0}, 0) = 3744
>
> ==> total actions dumped 29
>
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
> msg_iov(1)=[{"\240~\0\0002\0\2\0^6\367XE\3\0\0\0\0\0\0\204~\1\0\200\0\0\0\t\0\1\0"...,
> 32768,g}], msg_controllen=0, msg_flags=0}, 0) = 32416
>
> ==> total actions dumped 253
>
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
> msg_iov(1)=[{"
> ;\0\0002\0\2\0^6\367XE\3\0\0\0\0\0\0\4;\1\0\200\0\0\0\t\0\1\0"...,
> 32768,g}], msg_controllen=0, msg_flags=0}, 0) = 15136
>
> ==> total actions dumped 118
>
> recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
> msg_iov(1)=[{"\24\0\0\0\3\0\2\0s6\367XI\3\0\0\0\0\0\0", 32768}],
> msg_controllen=0, msg_flags=0}, 0) = 20
>
> --------
>
> We got all 400 in 3 requests. Imagine what we have to deal with
> when we have 2M actions (the improvement is about 10x).
Just because your strace works with the size of 32768 does not mean it
will work in the future.
Please read again my question.
Try to _not_ use 32768 bytes for the recvmsg() sizes, but 4KB
You pack XXX actions until 4KB skb is full.
Then code does :
nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1])
This might fail, then you
goto out_module_put;
Then we are stuck ?
What am I missing ?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
2017-04-19 13:47 ` Eric Dumazet
@ 2017-04-19 15:43 ` Jamal Hadi Salim
0 siblings, 0 replies; 14+ messages in thread
From: Jamal Hadi Salim @ 2017-04-19 15:43 UTC (permalink / raw)
To: Eric Dumazet; +Cc: davem, netdev, jiri, xiyou.wangcong
On 17-04-19 09:47 AM, Eric Dumazet wrote:
> Try to _not_ use 32768 bytes for the recvmsg() sizes, but 4KB
>
> You pack XXX actions until 4KB skb is full.
>
> Then code does :
>
> nla_put_u32(skb, TCAA_ACT_COUNT, cb->args[1])
>
> This might fail, then you
>
> goto out_module_put;
>
>
> Then we are stuck ?
>
> What am I missing ?
>
>
Ok, I understand you now ;->
You are saying we need make sure there is at
least 64 bits reserved so we can stick this TLV in there.
We may have to review a lot of other dumping code in general
to make sure this doesnt happen. Thanks Eric.
cheers,
jamal
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2017-04-19 15:44 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-04-19 1:14 [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch Jamal Hadi Salim
2017-04-19 1:14 ` [PATCH net-next 2/2 v2] net sched actions: add time filter for action dumping Jamal Hadi Salim
2017-04-19 1:35 ` Eric Dumazet
2017-04-19 1:39 ` Jamal Hadi Salim
2017-04-19 1:49 ` [PATCH net-next 1/2 v2] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch Eric Dumazet
2017-04-19 2:32 ` Jamal Hadi Salim
2017-04-19 3:17 ` Eric Dumazet
2017-04-19 11:24 ` Jamal Hadi Salim
2017-04-19 13:47 ` Eric Dumazet
2017-04-19 15:43 ` Jamal Hadi Salim
2017-04-19 6:12 ` Cong Wang
2017-04-19 11:06 ` Jamal Hadi Salim
2017-04-19 4:55 ` Roopa Prabhu
2017-04-19 11:02 ` Jamal Hadi Salim
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).