Re: [ovs-dev] [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted

From: Ilya Maximets <i.maximets@ovn.org>
To: Eelco Chaudron <echaudro@redhat.com>, Ilya Maximets <i.maximets@ovn.org>
Cc: netdev@vger.kernel.org, dev@openvswitch.org,
	bigeasy@linutronix.de, jlelli@redhat.com, kuba@kernel.org,
	pabeni@redhat.com, davem@davemloft.net
Subject: Re: [ovs-dev] [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted
Date: Thu, 15 Oct 2020 13:04:37 +0200	[thread overview]
Message-ID: <e6be9a9c-2ff0-11f1-e10b-cd2f54820622@ovn.org> (raw)
In-Reply-To: <06EADE58-FBCA-44C8-9EEC-FDAACD7DD126@redhat.com>

On 10/15/20 12:54 PM, Eelco Chaudron wrote:
> 
> 
> On 15 Oct 2020, at 12:27, Ilya Maximets wrote:
> 
>> On 10/15/20 11:46 AM, Eelco Chaudron wrote:
>>> The flow_lookup() function uses per CPU variables, which must not be
>>> preempted. However, this is fine in the general napi use case where
>>> the local BH is disabled. But, it's also called in the netlink
>>> context, which is preemptible. The below patch makes sure that even
>>> in the netlink path, preemption is disabled.
>>>
>>> In addition, the u64_stats_update_begin() sync point was not protected,
>>> making the sync point part of the per CPU variable fixed this.
>>>
>>> Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage")
>>> Reported-by: Juri Lelli <jlelli@redhat.com>
>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>>> ---
>>> v2: - Add u64_stats_update_begin() sync point protection
>>>     - Moved patch to net from net-next branch
>>>
>>>  net/openvswitch/flow_table.c |   56 +++++++++++++++++++++++++-----------------
>>>  net/openvswitch/flow_table.h |    8 +++++-
>>>  2 files changed, 39 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
>>> index e2235849a57e..d90b4af6f539 100644
>>> --- a/net/openvswitch/flow_table.c
>>> +++ b/net/openvswitch/flow_table.c
>>> @@ -172,7 +172,7 @@ static struct table_instance *table_instance_alloc(int new_size)
>>>
>>>  static void __mask_array_destroy(struct mask_array *ma)
>>>  {
>>> -    free_percpu(ma->masks_usage_cntr);
>>> +    free_percpu(ma->masks_usage_stats);
>>>      kfree(ma);
>>>  }
>>>
>>> @@ -196,15 +196,15 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
>>>          ma->masks_usage_zero_cntr[i] = 0;
>>>
>>>          for_each_possible_cpu(cpu) {
>>> -            u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>>> -                              cpu);
>>> +            struct mask_array_stats *stats;
>>>              unsigned int start;
>>>              u64 counter;
>>>
>>> +            stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>>              do {
>>> -                start = u64_stats_fetch_begin_irq(&ma->syncp);
>>> -                counter = usage_counters[i];
>>> -            } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>>> +                start = u64_stats_fetch_begin_irq(&stats->syncp);
>>> +                counter = stats->usage_cntrs[i];
>>> +            } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
>>>
>>>              ma->masks_usage_zero_cntr[i] += counter;
>>>          }
>>> @@ -227,9 +227,10 @@ static struct mask_array *tbl_mask_array_alloc(int size)
>>>                           sizeof(struct sw_flow_mask *) *
>>>                           size);
>>>
>>> -    new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
>>> -                           __alignof__(u64));
>>> -    if (!new->masks_usage_cntr) {
>>> +    new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
>>> +                        sizeof(u64) * size,
>>> +                        __alignof__(u64));
>>> +    if (!new->masks_usage_stats) {
>>>          kfree(new);
>>>          return NULL;
>>>      }
>>> @@ -732,7 +733,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>                     u32 *n_cache_hit,
>>>                     u32 *index)
>>>  {
>>> -    u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
>>> +    struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
>>>      struct sw_flow *flow;
>>>      struct sw_flow_mask *mask;
>>>      int i;
>>> @@ -742,9 +743,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>          if (mask) {
>>>              flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>>>              if (flow) {
>>> -                u64_stats_update_begin(&ma->syncp);
>>> -                usage_counters[*index]++;
>>> -                u64_stats_update_end(&ma->syncp);
>>> +                u64_stats_update_begin(&stats->syncp);
>>> +                stats->usage_cntrs[*index]++;
>>> +                u64_stats_update_end(&stats->syncp);
>>>                  (*n_cache_hit)++;
>>>                  return flow;
>>>              }
>>> @@ -763,9 +764,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>          flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>>>          if (flow) { /* Found */
>>>              *index = i;
>>> -            u64_stats_update_begin(&ma->syncp);
>>> -            usage_counters[*index]++;
>>> -            u64_stats_update_end(&ma->syncp);
>>> +            u64_stats_update_begin(&stats->syncp);
>>> +            stats->usage_cntrs[*index]++;
>>> +            u64_stats_update_end(&stats->syncp);
>>>              return flow;
>>>          }
>>>      }
>>> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
>>>      struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>>>      u32 __always_unused n_mask_hit;
>>>      u32 __always_unused n_cache_hit;
>>> +    struct sw_flow *flow;
>>>      u32 index = 0;
>>>
>>> -    return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
>>> +    /* This function gets called trough the netlink interface and therefore
>>> +     * is preemptible. However, flow_lookup() function needs to be called
>>> +     * with preemption disabled due to CPU specific variables.
>>
>> Is it possible to add some kind of assertion inside flow_lookup() to avoid
>> this kind of issues in the future?
> 
> We could do something like WARN_ON_ONCE(preemptible()) but do not think such check should be added to the fast path.

I meant something like lockdep_assert_preemption_disabled().  This will not
impact fast path if CONFIG_PROVE_LOCKING disabled, but will allow to catch
issues during development.

> 
>> It might be also good to update the comment for flow_lookup() function itself.
> 
> Good idea, will do this in a v3
> 
>>> +     */
>>> +    local_bh_disable();
>>> +    flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
>>> +    local_bh_enable();
>>> +    return flow;
>>>  }
>>>
>>>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
>>> @@ -1109,7 +1118,6 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>>>
>>>      for (i = 0; i < ma->max; i++)  {
>>>          struct sw_flow_mask *mask;
>>> -        unsigned int start;
>>>          int cpu;
>>>
>>>          mask = rcu_dereference_ovsl(ma->masks[i]);
>>> @@ -1120,14 +1128,16 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>>>          masks_and_count[i].counter = 0;
>>>
>>>          for_each_possible_cpu(cpu) {
>>> -            u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>>> -                              cpu);
>>> +            struct mask_array_stats *stats;
>>> +            unsigned int start;
>>>              u64 counter;
>>>
>>> +            stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>>              do {
>>> -                start = u64_stats_fetch_begin_irq(&ma->syncp);
>>> -                counter = usage_counters[i];
>>> -            } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>>> +                start = u64_stats_fetch_begin_irq(&stats->syncp);
>>> +                counter = stats->usage_cntrs[i];
>>> +            } while (u64_stats_fetch_retry_irq(&stats->syncp,
>>> +                               start));
>>>
>>>              masks_and_count[i].counter += counter;
>>>          }
>>> diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
>>> index 6e7d4ac59353..43144396e192 100644
>>> --- a/net/openvswitch/flow_table.h
>>> +++ b/net/openvswitch/flow_table.h
>>> @@ -38,12 +38,16 @@ struct mask_count {
>>>      u64 counter;
>>>  };
>>>
>>> +struct mask_array_stats {
>>> +    struct u64_stats_sync syncp;
>>> +    u64 usage_cntrs[];
>>> +};
>>> +
>>>  struct mask_array {
>>>      struct rcu_head rcu;
>>>      int count, max;
>>> -    u64 __percpu *masks_usage_cntr;
>>> +    struct mask_array_stats __percpu *masks_usage_stats;
>>>      u64 *masks_usage_zero_cntr;
>>> -    struct u64_stats_sync syncp;
>>>      struct sw_flow_mask __rcu *masks[];
>>>  };
>>>
>>>
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>
>