From: Vlad Buslov <vladbu@mellanox.com>
To: Saeed Mahameed <saeedm@mellanox.com>
Cc: "netdev@vger.kernel.org" <netdev@vger.kernel.org>,
Vlad Buslov <vladbu@mellanox.com>,
"jhs@mojatatu.com" <jhs@mojatatu.com>,
"davem@davemloft.net" <davem@davemloft.net>,
"jakub.kicinski@netronome.com" <jakub.kicinski@netronome.com>,
"xiyou.wangcong@gmail.com" <xiyou.wangcong@gmail.com>,
"jiri@resnulli.us" <jiri@resnulli.us>
Subject: Re: [PATCH net-next] net: sched: flower: refactor reoffload for concurrent access
Date: Tue, 23 Apr 2019 07:43:28 +0000 [thread overview]
Message-ID: <vbf4l6prwtg.fsf@mellanox.com>
In-Reply-To: <19ac9266d24379b11294a50fa4e702dca296109d.camel@mellanox.com>
On Mon 22 Apr 2019 at 23:34, Saeed Mahameed <saeedm@mellanox.com> wrote:
> On Mon, 2019-04-22 at 10:21 +0300, Vlad Buslov wrote:
>> Recent changes that introduced unlocked flower did not properly
>> account for the case when reoffload is initiated concurrently with
>> filter updates. To fix the issue, extend flower with a 'hw_filters'
>> list that is used to store filters that don't have the 'skip_hw' flag
>> set. A filter is added to the list when it is inserted into hardware
>> and is only removed from it after being unoffloaded from all drivers
>> that the parent block is attached to. This ensures that a concurrent
>> reoffload can still access a filter that is being deleted, and
>> prevents a race condition where a driver callback could be removed
>> while the filter is no longer accessible through the idr but is still
>> present in hardware.
>>
>> Refactor fl_change() to respect the new filter reference counter and
>> to release the filter reference with __fl_put() in case of error,
>> instead of directly deallocating the filter memory. This allows
>> concurrent access to the filter from fl_reoffload() and protects it
>> with reference counting. Refactor fl_reoffload() to iterate over the
>> hw_filters list instead of the idr. Implement the
>> fl_get_next_hw_filter() helper function, which iterates over the
>> hw_filters list with reference counting and skips filters that are
>> being concurrently deleted.
>>
>> Fixes: 92149190067d ("net: sched: flower: set unlocked flag for flower proto ops")
>> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>> Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>> ---
>> net/sched/cls_flower.c | 77 +++++++++++++++++++++++++++++++-----------
>> 1 file changed, 57 insertions(+), 20 deletions(-)
>>
>> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
>> index 4b5585358699..524b86560af3 100644
>> --- a/net/sched/cls_flower.c
>> +++ b/net/sched/cls_flower.c
>> @@ -90,6 +90,7 @@ struct cls_fl_head {
>> struct rhashtable ht;
>> spinlock_t masks_lock; /* Protect masks list */
>> struct list_head masks;
>> + struct list_head hw_filters;
>> struct rcu_work rwork;
>> struct idr handle_idr;
>> };
>> @@ -102,6 +103,7 @@ struct cls_fl_filter {
>> struct tcf_result res;
>> struct fl_flow_key key;
>> struct list_head list;
>> + struct list_head hw_list;
>> u32 handle;
>> u32 flags;
>> u32 in_hw_count;
>> @@ -315,6 +317,7 @@ static int fl_init(struct tcf_proto *tp)
>>
>> spin_lock_init(&head->masks_lock);
>> INIT_LIST_HEAD_RCU(&head->masks);
>> + INIT_LIST_HEAD(&head->hw_filters);
>> rcu_assign_pointer(tp->root, head);
>> idr_init(&head->handle_idr);
>>
>> @@ -352,6 +355,16 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask)
>> return true;
>> }
>>
>> +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
>> +{
>> + /* Flower classifier only changes root pointer during init and destroy.
>> + * Users must obtain reference to tcf_proto instance before calling its
>> + * API, so tp->root pointer is protected from concurrent call to
>> + * fl_destroy() by reference counting.
>> + */
>> + return rcu_dereference_raw(tp->root);
>> +}
>> +
>> static void __fl_destroy_filter(struct cls_fl_filter *f)
>> {
>> tcf_exts_destroy(&f->exts);
>> @@ -382,6 +395,8 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
>>
>> tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
>> spin_lock(&tp->lock);
>> + if (!list_empty(&f->hw_list))
>> + list_del_init(&f->hw_list);
>> tcf_block_offload_dec(block, &f->flags);
>> spin_unlock(&tp->lock);
>>
>> @@ -393,6 +408,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
>> struct cls_fl_filter *f, bool rtnl_held,
>> struct netlink_ext_ack *extack)
>> {
>> + struct cls_fl_head *head = fl_head_dereference(tp);
>> struct tc_cls_flower_offload cls_flower = {};
>> struct tcf_block *block = tp->chain->block;
>> bool skip_sw = tc_skip_sw(f->flags);
>> @@ -444,6 +460,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
>> goto errout;
>> }
>>
>> + spin_lock(&tp->lock);
>> + list_add(&f->hw_list, &head->hw_filters);
>> + spin_unlock(&tp->lock);
>> errout:
>> if (!rtnl_held)
>> rtnl_unlock();
>> @@ -475,23 +494,11 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
>> rtnl_unlock();
>> }
>>
>> -static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
>> -{
>> - /* Flower classifier only changes root pointer during init and destroy.
>> - * Users must obtain reference to tcf_proto instance before calling its
>> - * API, so tp->root pointer is protected from concurrent call to
>> - * fl_destroy() by reference counting.
>> - */
>> - return rcu_dereference_raw(tp->root);
>> -}
>> -
>> static void __fl_put(struct cls_fl_filter *f)
>> {
>> if (!refcount_dec_and_test(&f->refcnt))
>> return;
>>
>> - WARN_ON(!f->deleted);
>> -
>> if (tcf_exts_get_net(&f->exts))
>> tcf_queue_work(&f->rwork, fl_destroy_filter_work);
>> else
>> @@ -1522,6 +1529,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> err = -ENOBUFS;
>> goto errout_tb;
>> }
>> + INIT_LIST_HEAD(&fnew->hw_list);
>> refcount_set(&fnew->refcnt, 1);
>>
>> err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
>> @@ -1569,7 +1577,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> goto errout_hw;
>> }
>>
>> - refcount_inc(&fnew->refcnt);
>> if (fold) {
>> /* Fold filter was deleted concurrently. Retry lookup. */
>> if (fold->deleted) {
>> @@ -1591,6 +1598,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> in_ht = true;
>> }
>>
>> + refcount_inc(&fnew->refcnt);
>> rhashtable_remove_fast(&fold->mask->ht,
>> &fold->ht_node,
>> fold->mask->filter_ht_params);
>> @@ -1631,6 +1639,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> if (err)
>> goto errout_hw;
>>
>> + refcount_inc(&fnew->refcnt);
>> fnew->handle = handle;
>> list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
>> spin_unlock(&tp->lock);
>> @@ -1642,19 +1651,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>> kfree(mask);
>> return 0;
>>
>> +errout_ht:
>> + spin_lock(&tp->lock);
>> errout_hw:
>> + fnew->deleted = true;
>> spin_unlock(&tp->lock);
>> if (!tc_skip_hw(fnew->flags))
>> fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
>> -errout_ht:
>> if (in_ht)
>> rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
>> fnew->mask->filter_ht_params);
>> errout_mask:
>> fl_mask_put(head, fnew->mask);
>> errout:
>> - tcf_exts_get_net(&fnew->exts);
>> - tcf_queue_work(&fnew->rwork, fl_destroy_filter_work);
>> + __fl_put(fnew);
>> errout_tb:
>> kfree(tb);
>> errout_mask_alloc:
>> @@ -1699,16 +1709,44 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
>> }
>> }
>>
>> +static struct cls_fl_filter *
>> +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add)
>> +{
>> + struct cls_fl_head *head = fl_head_dereference(tp);
>> +
>> + spin_lock(&tp->lock);
>> + if (!f) {
>> + if (list_empty(&head->hw_filters)) {
>> + spin_unlock(&tp->lock);
>> + return NULL;
>> + }
>
> Shouldn't this be a precondition of the whole function? I mean,
> regardless of whether 'f' is NULL or not?
The list can't be empty if we already have an element of it (f), so
why check this on every iteration?
>
>> +
>> + f = list_first_entry(&head->hw_filters, struct cls_fl_filter,
>> + hw_list);
>> + } else {
>> + f = list_next_entry(f, hw_list);
>> + }
>> +
>
> Maybe if you use list_for_each_entry_continue below, it might simplify
> the above logic. It is weird that you need to figure out the next
> entry and then call list_for_each_entry_from; the list 'continue'
> variation is made for such use cases.
list_for_each_entry_continue requires an initialized cursor and will
skip the first element if we obtain the initial cursor with
list_first_entry(). We could have two loops - one that uses
list_for_each_entry for the initial iteration, and another that uses
list_for_each_entry_continue for the case when f != NULL - but I don't
see how that would be any simpler.
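
Just to illustrate the shape, an untested sketch of the two-loop
variant (the loop body gets duplicated, and we need a label to tell a
match apart from running off the end of the list):

	spin_lock(&tp->lock);
	if (!f) {
		list_for_each_entry(f, &head->hw_filters, hw_list) {
			if (!(add && f->deleted) &&
			    refcount_inc_not_zero(&f->refcnt))
				goto found;
		}
	} else {
		list_for_each_entry_continue(f, &head->hw_filters,
					     hw_list) {
			if (!(add && f->deleted) &&
			    refcount_inc_not_zero(&f->refcnt))
				goto found;
		}
	}
	/* Both loops can run to completion, in which case the cursor
	 * points at the list head sentinel rather than a valid filter.
	 */
	f = NULL;
found:
	spin_unlock(&tp->lock);
	return f;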
>
>> + list_for_each_entry_from(f, &head->hw_filters, hw_list) {
>> + if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) {
>> + spin_unlock(&tp->lock);
>> + return f;
>> + }
>> + }
>> +
>> + spin_unlock(&tp->lock);
>> + return NULL;
>> +}
>> +
static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>> void *cb_priv, struct netlink_ext_ack *extack)
>> {
>> struct tc_cls_flower_offload cls_flower = {};
>> struct tcf_block *block = tp->chain->block;
>> - unsigned long handle = 0;
>> - struct cls_fl_filter *f;
>> + struct cls_fl_filter *f = NULL;
>> int err;
>>
>> - while ((f = fl_get_next_filter(tp, &handle))) {
>> + while ((f = fl_get_next_hw_filter(tp, f, add))) {
>> if (tc_skip_hw(f->flags))
>> goto next_flow;
>
> this can never be true, as it is already a precondition for
> fl_hw_replace_filter(), which actually adds to the hw_filters list; I
> think it needs to be removed. If any check is kept, it should be part
> of fl_get_next_hw_filter() and not the caller's responsibility.
Good catch.
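
Since fl_hw_replace_filter() is the only place that adds filters to
hw_filters and is never called for skip_hw filters, the check in
fl_reoffload() is indeed dead code. If we wanted a defensive check
inside the helper instead, it could look roughly like this (untested
sketch, for illustration only):

	list_for_each_entry_from(f, &head->hw_filters, hw_list) {
		/* Defensive only: fl_hw_replace_filter() never adds
		 * skip_hw filters to hw_filters.
		 */
		if (tc_skip_hw(f->flags))
			continue;
		if (!(add && f->deleted) &&
		    refcount_inc_not_zero(&f->refcnt)) {
			spin_unlock(&tp->lock);
			return f;
		}
	}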
>
>>
>> @@ -1757,7 +1795,6 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
>> add);
>> spin_unlock(&tp->lock);
>> next_flow:
>> - handle++;
>> __fl_put(f);
>> }
>>
Thread overview: 9+ messages
2019-04-22 7:21 [PATCH net-next] net: sched: flower: refactor reoffload for concurrent access Vlad Buslov
2019-04-22 20:34 ` Saeed Mahameed
2019-04-23 7:43 ` Vlad Buslov [this message]
2019-04-23 16:52 ` Saeed Mahameed
2019-04-24 7:50 ` Vlad Buslov
2019-04-22 21:02 ` Jakub Kicinski
2019-04-23 7:34 ` Vlad Buslov
2019-04-23 16:51 ` Jakub Kicinski
2019-04-24 7:50 ` Vlad Buslov