From: Aaron Conole <aconole@redhat.com>
To: Adrian Moreno <amorenoz@redhat.com>
Cc: Ilya Maximets <i.maximets@ovn.org>,
netdev@vger.kernel.org, jiri@resnulli.us,
xiyou.wangcong@gmail.com, cmi@nvidia.com, yotam.gi@gmail.com,
echaudro@redhat.com, horms@kernel.org
Subject: Re: [RFC net-next v2 2/5] net: psample: add multicast filtering on group_id
Date: Tue, 09 Apr 2024 10:43:08 -0400 [thread overview]
Message-ID: <f7til0qmwar.fsf@redhat.com> (raw)
In-Reply-To: <801ccf5c-5ac9-455f-8d9a-48517d7db614@redhat.com> (Adrian Moreno's message of "Mon, 8 Apr 2024 21:24:17 +0200")
Adrian Moreno <amorenoz@redhat.com> writes:
> On 4/8/24 15:18, Ilya Maximets wrote:
>> [copying my previous reply since this version actually has netdev@ in Cc]
>> On 4/8/24 14:57, Adrian Moreno wrote:
>>> Packet samples can come from several places (e.g: different tc sample
>>> actions), typically using the sample group (PSAMPLE_ATTR_SAMPLE_GROUP)
>>> to differentiate them.
>>>
>>> Likewise, sample consumers that listen on the multicast group may only
>>> be interested in a single group. However, they are currently forced to
>>> receive all samples and discard the ones that are not relevant, causing
>>> unnecessary overhead.
>>>
>>> Allow users to filter on the desired group_id by adding a new command
>>> SAMPLE_FILTER_SET that can be used to pass the desired group id.
>>> Store this filter on the per-socket private pointer and use it for
>>> filtering multicasted samples.
>>>
>>> Signed-off-by: Adrian Moreno <amorenoz@redhat.com>
>>> ---
>>> include/uapi/linux/psample.h | 1 +
>>> net/psample/psample.c | 127 +++++++++++++++++++++++++++++++++--
>>> 2 files changed, 122 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h
>>> index e585db5bf2d2..5e0305b1520d 100644
>>> --- a/include/uapi/linux/psample.h
>>> +++ b/include/uapi/linux/psample.h
>>> @@ -28,6 +28,7 @@ enum psample_command {
>>> PSAMPLE_CMD_GET_GROUP,
>>> PSAMPLE_CMD_NEW_GROUP,
>>> PSAMPLE_CMD_DEL_GROUP,
>>> + PSAMPLE_CMD_SAMPLE_FILTER_SET,
>> Other commands are named PSAMPLE_CMD_VERB_NOUN, so this new one
>> should be PSAMPLE_CMD_SET_FILTER. (The SAMPLE part seems unnecessary.)
>> Some functions/structures need to be renamed accordingly.
>>
>
> Sure, I'll rename it when I send the next version.
>
>>> };
>>> enum psample_tunnel_key_attr {
>>> diff --git a/net/psample/psample.c b/net/psample/psample.c
>>> index a5d9b8446f77..a0cef63dfdec 100644
>>> --- a/net/psample/psample.c
>>> +++ b/net/psample/psample.c
>>> @@ -98,13 +98,84 @@ static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg,
>>> return msg->len;
>>> }
>>> -static const struct genl_small_ops psample_nl_ops[] = {
>>> +struct psample_obj_desc {
>>> + struct rcu_head rcu;
>>> + u32 group_num;
>>> + bool group_num_valid;
>>> +};
>>> +
>>> +struct psample_nl_sock_priv {
>>> + struct psample_obj_desc __rcu *flt;
>> Can we call it 'filter'? I find it hard to read the code with
>> this unnecessary abbreviation. Same for the lock below.
>>
>
> Sure.
>
>>> + spinlock_t flt_lock; /* Protects flt. */
>>> +};
>>> +
>>> +static void psample_nl_sock_priv_init(void *priv)
>>> +{
>>> + struct psample_nl_sock_priv *sk_priv = priv;
>>> +
>>> + spin_lock_init(&sk_priv->flt_lock);
>>> +}
>>> +
>>> +static void psample_nl_sock_priv_destroy(void *priv)
>>> +{
>>> + struct psample_nl_sock_priv *sk_priv = priv;
>>> + struct psample_obj_desc *flt;
>>> +
>>> + flt = rcu_dereference_protected(sk_priv->flt, true);
>>> + kfree_rcu(flt, rcu);
>>> +}
>>> +
>>> +static int psample_nl_sample_filter_set_doit(struct sk_buff *skb,
>>> + struct genl_info *info)
>>> +{
>>> + struct psample_nl_sock_priv *sk_priv;
>>> + struct nlattr **attrs = info->attrs;
>>> + struct psample_obj_desc *flt;
>>> +
>>> + flt = kzalloc(sizeof(*flt), GFP_KERNEL);
>>> +
>>> + if (attrs[PSAMPLE_ATTR_SAMPLE_GROUP]) {
>>> + flt->group_num = nla_get_u32(attrs[PSAMPLE_ATTR_SAMPLE_GROUP]);
>>> + flt->group_num_valid = true;
>>> + }
>>> +
>>> + if (!flt->group_num_valid) {
>>> + kfree(flt);
>> Might be better to not allocate it in the first place.
>>
>
> Absolutely.
>
>>> + flt = NULL;
>>> + }
>>> +
>>> + sk_priv = genl_sk_priv_get(&psample_nl_family, NETLINK_CB(skb).sk);
>>> + if (IS_ERR(sk_priv)) {
>>> + kfree(flt);
>>> + return PTR_ERR(sk_priv);
>>> + }
>>> +
>>> + spin_lock(&sk_priv->flt_lock);
>>> + flt = rcu_replace_pointer(sk_priv->flt, flt,
>>> + lockdep_is_held(&sk_priv->flt_lock));
>>> + spin_unlock(&sk_priv->flt_lock);
>>> + kfree_rcu(flt, rcu);
>>> + return 0;
>>> +}
>>> +
>>> +static const struct nla_policy
>>> + psample_sample_filter_set_policy[PSAMPLE_ATTR_SAMPLE_GROUP + 1] = {
>>> + [PSAMPLE_ATTR_SAMPLE_GROUP] = { .type = NLA_U32, },
>> This indentation is confusing, though I'm not sure what's a better
>> way.
>>
>
> I know! I'll try to move it around and see if it improves things.
>
>>> +};
>>> +
>>> +static const struct genl_ops psample_nl_ops[] = {
>>> {
>>> .cmd = PSAMPLE_CMD_GET_GROUP,
>>> .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
>>> .dumpit = psample_nl_cmd_get_group_dumpit,
>>> /* can be retrieved by unprivileged users */
>>> - }
>>> + },
>>> + {
>>> + .cmd = PSAMPLE_CMD_SAMPLE_FILTER_SET,
>>> + .doit = psample_nl_sample_filter_set_doit,
>>> + .policy = psample_sample_filter_set_policy,
>>> + .flags = 0,
>>> + },
>>> };
>>> static struct genl_family psample_nl_family __ro_after_init = {
>>> @@ -114,10 +185,13 @@ static struct genl_family psample_nl_family __ro_after_init = {
>>> .netnsok = true,
>>> .module = THIS_MODULE,
>>> .mcgrps = psample_nl_mcgrps,
>>> - .small_ops = psample_nl_ops,
>>> - .n_small_ops = ARRAY_SIZE(psample_nl_ops),
>>> + .ops = psample_nl_ops,
>>> + .n_ops = ARRAY_SIZE(psample_nl_ops),
>>> .resv_start_op = PSAMPLE_CMD_GET_GROUP + 1,
>>> .n_mcgrps = ARRAY_SIZE(psample_nl_mcgrps),
>>> + .sock_priv_size = sizeof(struct psample_nl_sock_priv),
>>> + .sock_priv_init = psample_nl_sock_priv_init,
>>> + .sock_priv_destroy = psample_nl_sock_priv_destroy,
>>> };
>>> static void psample_group_notify(struct psample_group *group,
>>> @@ -360,6 +434,42 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info)
>>> }
>>> #endif
>>> +static inline void psample_nl_obj_desc_init(struct
>>> psample_obj_desc *desc,
>>> + u32 group_num)
>>> +{
>>> + memset(desc, 0, sizeof(*desc));
>>> + desc->group_num = group_num;
>>> + desc->group_num_valid = true;
>>> +}
>>> +
>>> +static bool psample_obj_desc_match(struct psample_obj_desc *desc,
>>> + struct psample_obj_desc *flt)
>>> +{
>>> + if (desc->group_num_valid && flt->group_num_valid &&
>>> + desc->group_num != flt->group_num)
>>> + return false;
>>> + return true;
>> This function returns 'true' if one of the arguments is not valid.
>> I'd not expect such behavior from a 'match' function.
>> I understand the intention that psample should sample everything
>> to sockets that do not request filters, but that should not be part
>> of the 'match' logic, or more appropriate function name should be
>> chosen. Also, if the group is not initialized, but the filter is,
>> it should not match, logically. The validity on filter and the
>> current sample is not symmetric.
>>
>
> The descriptor should always be initialized but I think double
> checking should be OK as in the context of this particular function,
> it might not be clear it is.
>
>> And I'm not really sure if the 'group_num_valid' is actually needed.
>> Can the NULL pointer be used as an indicator? If so, then maybe
>> the whole psample_obj_desc structure is not needed as it will
>> contain a single field.
>
> If we only filter on group_id, then yes. However, as I was writing
> this, I thought about maybe opening the door to filtering on more fields
> such as the protocol in/out interfaces, etc. Now that I read this I
> understand the current code is confusing: I should have left a comment
> or mentioned it in the commit message.
If you want to have such filtering options, does it make sense to
have the listening program send a set of bpf instructions for
filtering instead? I think the data should be available at the point
where simple bpf is attached (SO_ATTACH_BPF to the psample socket, and
the filter should run as part of the broadcast message IIRC since it
populates the sk_filter field).
>>
>>> +}
>>> +
>>> +static int psample_nl_sample_filter(struct sock *dsk, struct sk_buff *skb,
>>> + void *data)
>>> +{
>>> + struct psample_obj_desc *desc = data;
>>> + struct psample_nl_sock_priv *sk_priv;
>>> + struct psample_obj_desc *flt;
>>> + int ret = 0;
>>> +
>>> + rcu_read_lock();
>>> + sk_priv = __genl_sk_priv_get(&psample_nl_family, dsk);
>>> + if (!IS_ERR_OR_NULL(sk_priv)) {
>>> + flt = rcu_dereference(sk_priv->flt);
>>> + if (flt)
>>> + ret = !psample_obj_desc_match(desc, flt);
>>> + }
>>> + rcu_read_unlock();
>>> + return ret;
>>> +}
>>> +
>>> void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
>>> u32 sample_rate, const struct psample_metadata *md)
>>> {
>>> @@ -370,6 +480,7 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
>>> #ifdef CONFIG_INET
>>> struct ip_tunnel_info *tun_info;
>>> #endif
>>> + struct psample_obj_desc desc;
>>> struct sk_buff *nl_skb;
>>> int data_len;
>>> int meta_len;
>>> @@ -487,8 +598,12 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
>>> #endif
>>> genlmsg_end(nl_skb, data);
>>> - genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0,
>>> - PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC);
>>> + psample_nl_obj_desc_init(&desc, group->group_num);
>>> + genlmsg_multicast_netns_filtered(&psample_nl_family,
>>> + group->net, nl_skb, 0,
>>> + PSAMPLE_NL_MCGRP_SAMPLE,
>>> + GFP_ATOMIC, psample_nl_sample_filter,
>>> + &desc);
>>> return;
>>> error:
>>
next prev parent reply other threads:[~2024-04-09 14:43 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-04-08 12:57 [RFC net-next v2 0/5] net: openvswitch: Add sample multicasting Adrian Moreno
2024-04-08 12:57 ` [RFC net-next v2 1/5] net: netlink: export genl private pointer getters Adrian Moreno
2024-04-09 21:48 ` Jakub Kicinski
2024-04-08 12:57 ` [RFC net-next v2 2/5] net: psample: add multicast filtering on group_id Adrian Moreno
2024-04-08 13:18 ` Ilya Maximets
2024-04-08 19:24 ` Adrian Moreno
2024-04-09 14:43 ` Aaron Conole [this message]
2024-04-10 13:32 ` Adrian Moreno
2024-04-10 13:06 ` Ido Schimmel
2024-04-10 13:42 ` Adrian Moreno
2024-04-08 12:57 ` [RFC net-next v2 3/5] net: psample: add user cookie Adrian Moreno
2024-04-08 13:19 ` Ilya Maximets
2024-04-08 19:28 ` Adrian Moreno
2024-04-08 20:28 ` Ilya Maximets
2024-04-08 12:57 ` [RFC net-next v2 4/5] net:sched:act_sample: add action cookie to sample Adrian Moreno
2024-04-08 13:20 ` Ilya Maximets
2024-04-11 8:40 ` Adrian Moreno
2024-04-08 12:57 ` [RFC net-next v2 5/5] net:openvswitch: add psample support Adrian Moreno
2024-04-08 13:37 ` Ilya Maximets
2024-04-08 19:48 ` Adrian Moreno
2024-04-08 20:40 ` Ilya Maximets
2024-04-09 8:16 ` Adrian Moreno
2024-04-09 9:35 ` Ilya Maximets
2024-04-09 21:49 ` Jakub Kicinski
2024-04-10 13:44 ` Adrian Moreno
2024-04-08 13:16 ` [RFC net-next v2 0/5] net: openvswitch: Add sample multicasting Ilya Maximets
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=f7til0qmwar.fsf@redhat.com \
--to=aconole@redhat.com \
--cc=amorenoz@redhat.com \
--cc=cmi@nvidia.com \
--cc=echaudro@redhat.com \
--cc=horms@kernel.org \
--cc=i.maximets@ovn.org \
--cc=jiri@resnulli.us \
--cc=netdev@vger.kernel.org \
--cc=xiyou.wangcong@gmail.com \
--cc=yotam.gi@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.