Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH v4 0/3] mm, treewide: Rename kzfree() to kfree_sensitive()
From: Waiman Long @ 2020-06-16 20:01 UTC (permalink / raw)
  To: Joe Perches, Andrew Morton, David Howells, Jarkko Sakkinen,
	James Morris, Serge E. Hallyn, Linus Torvalds, Matthew Wilcox,
	David Rientjes
  Cc: Michal Hocko, Johannes Weiner, Dan Carpenter, David Sterba,
	Jason A . Donenfeld, linux-mm, keyrings, linux-kernel,
	linux-crypto, linux-pm, linux-stm32, linux-amlogic,
	linux-mediatek, linuxppc-dev, virtualization, netdev, linux-ppp,
	wireguard, linux-wireless, devel, linux-scsi, target-devel,
	linux-btrfs, linux-cifs, linux-fscrypt, ecryptfs, kasan-dev,
	linux-bluetooth, linux-wpan, linux-sctp, linux-nfs,
	tipc-discussion, linux-security-module, linux-integrity
In-Reply-To: <fe3b9a437be4aeab3bac68f04193cb6daaa5bee4.camel@perches.com>

On 6/16/20 2:53 PM, Joe Perches wrote:
> On Mon, 2020-06-15 at 21:57 -0400, Waiman Long wrote:
>>   v4:
>>    - Break out the memzero_explicit() change as suggested by Dan Carpenter
>>      so that it can be backported to stable.
>>    - Drop the "crypto: Remove unnecessary memzero_explicit()" patch for
>>      now as there can be a bit more discussion on what is best. It will be
>>      introduced as a separate patch later on after this one is merged.
> To this larger audience and last week without reply:
> https://lore.kernel.org/lkml/573b3fbd5927c643920e1364230c296b23e7584d.camel@perches.com/
>
> Are there _any_ fastpath uses of kfree or vfree?
>
> Many patches have been posted recently to fix mispairings
> of specific types of alloc and free functions.
>
> To eliminate these mispairings at a runtime cost of four
> comparisons, should the kfree/vfree/kvfree/kfree_const
> functions be consolidated into a single kfree?
>
> Something like the below:
>
>     void kfree(const void *addr)
>     {
>     	if (is_kernel_rodata((unsigned long)addr))
>     		return;
>
>     	if (is_vmalloc_addr(addr))
>     		_vfree(addr);
>     	else
>     		_kfree(addr);
>     }
>
>     #define kvfree		kfree
>     #define vfree		kfree
>     #define kfree_const	kfree
>
>
How about adding CONFIG_DEBUG_VM code to check for invalid address 
ranges in kfree() and vfree()? By doing this, we can catch unmatched 
pairing in debug mode, but won't have the overhead when debug mode is off.

Thought?

Cheers,
Longman


^ permalink raw reply

* Re: [PATCH net v3 1/4] flow_offload: fix incorrect cleanup for flowtable indirect flow_blocks
From: Pablo Neira Ayuso @ 2020-06-16 20:11 UTC (permalink / raw)
  To: wenxu; +Cc: netdev, davem, vladbu
In-Reply-To: <1592277580-5524-2-git-send-email-wenxu@ucloud.cn>

On Tue, Jun 16, 2020 at 11:19:37AM +0800, wenxu@ucloud.cn wrote:
> From: wenxu <wenxu@ucloud.cn>
> 
> The cleanup operation based on the setup callback. But in the mlx5e
> driver there are tc and flowtable indrict setup callback and shared
> the same release callbacks. So when the representor is removed,
> then identify the indirect flow_blocks that need to be removed by  
> the release callback.
> 
> Fixes: 1fac52da5942 ("net: flow_offload: consolidate indirect flow_block infrastructure")
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> ---
>  drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c        | 2 +-
>  drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 2 +-
>  drivers/net/ethernet/netronome/nfp/flower/main.c    | 2 +-
>  drivers/net/ethernet/netronome/nfp/flower/main.h    | 3 +--
>  drivers/net/ethernet/netronome/nfp/flower/offload.c | 6 +++---
>  include/net/flow_offload.h                          | 2 +-
>  net/core/flow_offload.c                             | 9 +++++----
>  7 files changed, 13 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
> index 0eef4f5..ef7f6bc 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
> @@ -2074,7 +2074,7 @@ void bnxt_shutdown_tc(struct bnxt *bp)
>  		return;
>  
>  	flow_indr_dev_unregister(bnxt_tc_setup_indr_cb, bp,
> -				 bnxt_tc_setup_indr_block_cb);
> +				 bnxt_tc_setup_indr_rel);
>  	rhashtable_destroy(&tc_info->flow_table);
>  	rhashtable_destroy(&tc_info->l2_table);
>  	rhashtable_destroy(&tc_info->decap_l2_table);
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
> index 80713123..a62bcf0 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
> @@ -496,7 +496,7 @@ int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv)
>  void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv)
>  {
>  	flow_indr_dev_unregister(mlx5e_rep_indr_setup_cb, rpriv,
> -				 mlx5e_rep_indr_setup_tc_cb);
> +				 mlx5e_rep_indr_block_unbind);
>  }
>  
>  #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
> index c393276..bb448c8 100644
> --- a/drivers/net/ethernet/netronome/nfp/flower/main.c
> +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
> @@ -861,7 +861,7 @@ static void nfp_flower_clean(struct nfp_app *app)
>  	flush_work(&app_priv->cmsg_work);
>  
>  	flow_indr_dev_unregister(nfp_flower_indr_setup_tc_cb, app,
> -				 nfp_flower_setup_indr_block_cb);
> +				 nfp_flower_setup_indr_tc_release);
>  
>  	if (app_priv->flower_ext_feats & NFP_FL_FEATS_VF_RLIM)
>  		nfp_flower_qos_cleanup(app);
> diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
> index 6c3dc3b..c983337 100644
> --- a/drivers/net/ethernet/netronome/nfp/flower/main.h
> +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
> @@ -460,8 +460,7 @@ int nfp_flower_setup_qos_offload(struct nfp_app *app, struct net_device *netdev,
>  void nfp_flower_stats_rlim_reply(struct nfp_app *app, struct sk_buff *skb);
>  int nfp_flower_indr_setup_tc_cb(struct net_device *netdev, void *cb_priv,
>  				enum tc_setup_type type, void *type_data);
> -int nfp_flower_setup_indr_block_cb(enum tc_setup_type type, void *type_data,
> -				   void *cb_priv);
> +void nfp_flower_setup_indr_tc_release(void *cb_priv);
>  
>  void
>  __nfp_flower_non_repr_priv_get(struct nfp_flower_non_repr_priv *non_repr_priv);
> diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
> index 695d24b9..28de905 100644
> --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
> +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
> @@ -1619,8 +1619,8 @@ struct nfp_flower_indr_block_cb_priv {
>  	return NULL;
>  }
>  
> -int nfp_flower_setup_indr_block_cb(enum tc_setup_type type,
> -				   void *type_data, void *cb_priv)
> +static int nfp_flower_setup_indr_block_cb(enum tc_setup_type type,
> +					  void *type_data, void *cb_priv)
>  {
>  	struct nfp_flower_indr_block_cb_priv *priv = cb_priv;
>  	struct flow_cls_offload *flower = type_data;
> @@ -1637,7 +1637,7 @@ int nfp_flower_setup_indr_block_cb(enum tc_setup_type type,
>  	}
>  }
>  
> -static void nfp_flower_setup_indr_tc_release(void *cb_priv)
> +void nfp_flower_setup_indr_tc_release(void *cb_priv)
>  {
>  	struct nfp_flower_indr_block_cb_priv *priv = cb_priv;
>  
> diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
> index f2c8311..3a2d6b4 100644
> --- a/include/net/flow_offload.h
> +++ b/include/net/flow_offload.h
> @@ -536,7 +536,7 @@ typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv,
>  
>  int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv);
>  void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv,
> -			      flow_setup_cb_t *setup_cb);
> +			      void (*release)(void *cb_priv));
>  int flow_indr_dev_setup_offload(struct net_device *dev,
>  				enum tc_setup_type type, void *data,
>  				struct flow_block_offload *bo,
> diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
> index 0cfc35e..b288d2f 100644
> --- a/net/core/flow_offload.c
> +++ b/net/core/flow_offload.c
> @@ -372,13 +372,14 @@ int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
>  }
>  EXPORT_SYMBOL(flow_indr_dev_register);
>  
> -static void __flow_block_indr_cleanup(flow_setup_cb_t *setup_cb, void *cb_priv,
> +static void __flow_block_indr_cleanup(void (*release)(void *cb_priv),
> +				      void *cb_priv,
>  				      struct list_head *cleanup_list)
>  {
>  	struct flow_block_cb *this, *next;
>  
>  	list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) {
> -		if (this->cb == setup_cb &&
> +		if (this->release == release &&
>  		    this->cb_priv == cb_priv) {
>  			list_move(&this->indr.list, cleanup_list);
>  			return;
> @@ -397,7 +398,7 @@ static void flow_block_indr_notify(struct list_head *cleanup_list)
>  }
>  
>  void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv,
> -			      flow_setup_cb_t *setup_cb)
> +			      void (*release)(void *cb_priv))

If you use cb_priv to identify the callback, then this should be
instead cb_ident?

^ permalink raw reply

* Re: [PATCH net v3 2/4] flow_offload: fix incorrect cb_priv check for flow_block_cb
From: Pablo Neira Ayuso @ 2020-06-16 20:13 UTC (permalink / raw)
  To: wenxu; +Cc: netdev, davem, vladbu
In-Reply-To: <1592277580-5524-3-git-send-email-wenxu@ucloud.cn>

On Tue, Jun 16, 2020 at 11:19:38AM +0800, wenxu@ucloud.cn wrote:
> From: wenxu <wenxu@ucloud.cn>
> 
> In the function __flow_block_indr_cleanup, The match stataments
> this->cb_priv == cb_priv is always false, the flow_block_cb->cb_priv
> is totally different data with the flow_indr_dev->cb_priv.
> 
> Store the representor cb_priv to the flow_block_cb->indr.cb_priv in
> the driver.
> 
> Fixes: 1fac52da5942 ("net: flow_offload: consolidate indirect flow_block infrastructure")
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> ---
>  drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c        | 1 +
>  drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 2 +-
>  drivers/net/ethernet/netronome/nfp/flower/offload.c | 1 +
>  include/net/flow_offload.h                          | 1 +
>  net/core/flow_offload.c                             | 2 +-
>  5 files changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
> index ef7f6bc..042c285 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
> @@ -1918,6 +1918,7 @@ static int bnxt_tc_setup_indr_block(struct net_device *netdev, struct bnxt *bp,
>  
>  		flow_block_cb_add(block_cb, f);
>  		list_add_tail(&block_cb->driver_list, &bnxt_block_cb_list);
> +		block_cb->indr.cb_priv = bp;

cb_indent ?

Why are you splitting the fix in multiple patches? This makes it
harder to review.

I think patch 1/4, 2/4 and 4/4 belong to the same logical change?
Collapse them.

^ permalink raw reply

* Re: [PATCH net v3 3/4] net/sched: cls_api: fix nooffloaddevcnt warning dmesg log
From: Pablo Neira Ayuso @ 2020-06-16 20:17 UTC (permalink / raw)
  To: wenxu; +Cc: netdev, davem, vladbu
In-Reply-To: <1592277580-5524-4-git-send-email-wenxu@ucloud.cn>

On Tue, Jun 16, 2020 at 11:19:39AM +0800, wenxu@ucloud.cn wrote:
> From: wenxu <wenxu@ucloud.cn>
> 
> When a indr device add in offload success. The block->nooffloaddevcnt
> should be 0. After the representor go away. When the dir device go away
> the flow_block UNBIND operation with -EOPNOTSUPP which lead the warning
> dmesg log. 
> 
> The block->nooffloaddevcnt should always count for indr block.
> even the indr block offload successful. The representor maybe
> gone away and the ingress qdisc can work in software mode.
> 
> block->nooffloaddevcnt warning with following dmesg log:
> 
> [  760.667058] #####################################################
> [  760.668186] ## TEST test-ecmp-add-vxlan-encap-disable-sriov.sh ##
> [  760.669179] #####################################################
> [  761.780655] :test: Fedora 30 (Thirty)
> [  761.783794] :test: Linux reg-r-vrt-018-180 5.7.0+
> [  761.822890] :test: NIC ens1f0 FW 16.26.6000 PCI 0000:81:00.0 DEVICE 0x1019 ConnectX-5 Ex
> [  761.860244] mlx5_core 0000:81:00.0 ens1f0: Link up
> [  761.880693] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready
> [  762.059732] mlx5_core 0000:81:00.1 ens1f1: Link up
> [  762.234341] :test: unbind vfs of ens1f0
> [  762.257825] :test: Change ens1f0 eswitch (0000:81:00.0) mode to switchdev
> [  762.291363] :test: unbind vfs of ens1f1
> [  762.306914] :test: Change ens1f1 eswitch (0000:81:00.1) mode to switchdev
> [  762.309237] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(LEGACY), nvfs(2), active vports(3)
> [  763.282598] mlx5_core 0000:81:00.1: E-Switch: Supported tc offload range - chains: 4294967294, prios: 4294967295
> [  763.362825] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
> [  763.444465] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0
> [  763.460088] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
> [  763.502586] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
> [  763.552429] ens1f1_0: renamed from eth0
> [  763.569569] mlx5_core 0000:81:00.1: E-Switch: Enable: mode(OFFLOADS), nvfs(2), active vports(3)
> [  763.629694] ens1f1_1: renamed from eth1
> [  764.631552] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_0: link becomes ready
> [  764.670841] :test: unbind vfs of ens1f0
> [  764.681966] :test: unbind vfs of ens1f1
> [  764.726762] mlx5_core 0000:81:00.0 ens1f0: Link up
> [  764.766511] mlx5_core 0000:81:00.1 ens1f1: Link up
> [  764.797325] :test: Add multipath vxlan encap rule and disable sriov
> [  764.798544] :test: config multipath route
> [  764.812732] mlx5_core 0000:81:00.0: lag map port 1:2 port 2:2
> [  764.874556] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:2
> [  765.603681] :test: OK
> [  765.659048] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_1: link becomes ready
> [  765.675085] :test: verify rule in hw
> [  765.694237] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready
> [  765.711892] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1: link becomes ready
> [  766.979230] :test: OK
> [  768.125419] :test: OK
> [  768.127519] :test: - disable sriov ens1f1
> [  768.131160] pci 0000:81:02.2: Removing from iommu group 75
> [  768.132646] pci 0000:81:02.3: Removing from iommu group 76
> [  769.179749] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3)
> [  769.455627] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:1
> [  769.703990] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
> [  769.988637] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0
> [  769.990022] :test: - disable sriov ens1f0
> [  769.994922] pci 0000:81:00.2: Removing from iommu group 73
> [  769.997048] pci 0000:81:00.3: Removing from iommu group 74
> [  771.035813] mlx5_core 0000:81:00.0: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3)
> [  771.339091] ------------[ cut here ]------------
> [  771.340812] WARNING: CPU: 6 PID: 3448 at net/sched/cls_api.c:749 tcf_block_offload_unbind.isra.0+0x5c/0x60
> [  771.341728] Modules linked in: act_mirred act_tunnel_key cls_flower dummy vxlan ip6_udp_tunnel udp_tunnel sch_ingress nfsv3 nfs_acl nfs lockd grace fscache tun bridge stp llc sunrpc rdma_ucm rdma_cm iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp mlxfw act_ct nf_flow_table kvm_intel nf_nat kvm nf_conntrack irqbypass crct10dif_pclmul igb crc32_pclmul nf_defrag_ipv6 libcrc32c nf_defrag_ipv4 crc32c_intel ghash_clmulni_intel ptp ipmi_ssif intel_cstate pps_c
> ore ses intel_uncore mei_me iTCO_wdt joydev ipmi_si iTCO_vendor_support i2c_i801 enclosure mei ioatdma dca lpc_ich wmi ipmi_devintf pcspkr acpi_power_meter ipmi_msghandler acpi_pad ast i2c_algo_bit drm_vram_helper drm_kms_helper drm_ttm_helper ttm drm mpt3sas raid_class scsi_transport_sas
> [  771.347818] CPU: 6 PID: 3448 Comm: test-ecmp-add-v Not tainted 5.7.0+ #1146
> [  771.348727] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
> [  771.349646] RIP: 0010:tcf_block_offload_unbind.isra.0+0x5c/0x60
> [  771.350553] Code: 4a fd ff ff 83 f8 a1 74 0e 5b 4c 89 e7 5d 41 5c 41 5d e9 07 93 89 ff 8b 83 a0 00 00 00 8d 50 ff 89 93 a0 00 00 00 85 c0 75 df <0f> 0b eb db 0f 1f 44 00 00 41 57 41 56 41 55 41 89 cd 41 54 49 89
> [  771.352420] RSP: 0018:ffffb33144cd3b00 EFLAGS: 00010246
> [  771.353353] RAX: 0000000000000000 RBX: ffff8b37cf4b2800 RCX: 0000000000000000
> [  771.354294] RDX: 00000000ffffffff RSI: ffff8b3b9aad0000 RDI: ffffffff8d5c6e20
> [  771.355245] RBP: ffff8b37eb546948 R08: ffffffffc0b7a348 R09: ffff8b3b9aad0000
> [  771.356189] R10: 0000000000000001 R11: ffff8b3ba7a0a1c0 R12: ffff8b37cf4b2850
> [  771.357123] R13: ffff8b3b9aad0000 R14: ffff8b37cf4b2820 R15: ffff8b37cf4b2820
> [  771.358039] FS:  00007f8a19b6e740(0000) GS:ffff8b3befa00000(0000) knlGS:0000000000000000
> [  771.358965] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  771.359885] CR2: 00007f3afb91c1a0 CR3: 000000045133c004 CR4: 00000000001606e0
> [  771.360825] Call Trace:
> [  771.361764]  __tcf_block_put+0x84/0x150
> [  771.362712]  ingress_destroy+0x1b/0x20 [sch_ingress]
> [  771.363658]  qdisc_destroy+0x3e/0xc0
> [  771.364594]  dev_shutdown+0x7a/0xa5
> [  771.365522]  rollback_registered_many+0x20d/0x530
> [  771.366458]  ? netdev_upper_dev_unlink+0x15d/0x1c0
> [  771.367387]  unregister_netdevice_many.part.0+0xf/0x70
> [  771.368310]  vxlan_netdevice_event+0xa4/0x110 [vxlan]
> [  771.369454]  notifier_call_chain+0x4c/0x70
> [  771.370579]  rollback_registered_many+0x2f5/0x530
> [  771.371719]  rollback_registered+0x56/0x90
> [  771.372843]  unregister_netdevice_queue+0x73/0xb0
> [  771.373982]  unregister_netdev+0x18/0x20
> [  771.375168]  mlx5e_vport_rep_unload+0x56/0xc0 [mlx5_core]
> [  771.376327]  esw_offloads_disable+0x81/0x90 [mlx5_core]
> [  771.377512]  mlx5_eswitch_disable_locked.cold+0xcb/0x1af [mlx5_core]
> [  771.378679]  mlx5_eswitch_disable+0x44/0x60 [mlx5_core]
> [  771.379822]  mlx5_device_disable_sriov+0xad/0xb0 [mlx5_core]
> [  771.380968]  mlx5_core_sriov_configure+0xc1/0xe0 [mlx5_core]
> [  771.382087]  sriov_numvfs_store+0xfc/0x130
> [  771.383195]  kernfs_fop_write+0xce/0x1b0
> [  771.384302]  vfs_write+0xb6/0x1a0
> [  771.385410]  ksys_write+0x5f/0xe0
> [  771.386500]  do_syscall_64+0x5b/0x1d0
> [  771.387569]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Fixes: 0fdcf78d5973 ("net: use flow_indr_dev_setup_offload()")
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> ---
>  net/sched/cls_api.c | 24 ++++++++++++++----------
>  1 file changed, 14 insertions(+), 10 deletions(-)
> 
> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index a00a203..86c3937 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -671,25 +671,29 @@ static int tcf_block_offload_cmd(struct tcf_block *block,
>  				 struct netlink_ext_ack *extack)
>  {
>  	struct flow_block_offload bo = {};
> -	int err;
>  
>  	tcf_block_offload_init(&bo, dev, command, ei->binder_type,
>  			       &block->flow_block, tcf_block_shared(block),
>  			       extack);
>  
> -	if (dev->netdev_ops->ndo_setup_tc)
> +	if (dev->netdev_ops->ndo_setup_tc) {
> +		int err;
> +
>  		err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
> -	else
> -		err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block,
> -						  &bo, tc_block_indr_cleanup);
> +		if (err < 0) {
> +			if (err != -EOPNOTSUPP)
> +				NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed");
> +			return err;
> +		}
>  
> -	if (err < 0) {
> -		if (err != -EOPNOTSUPP)
> -			NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed");
> -		return err;
> +		return tcf_block_setup(block, &bo);
>  	}
>  
> -	return tcf_block_setup(block, &bo);
> +	flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, &bo,
> +				    tc_block_indr_cleanup);
> +	tcf_block_setup(block, &bo);
> +
> +	return -EOPNOTSUPP;

So tcf_block_offload_cmd() always return -EOPNOTSUPP for _BIND and
_UNBIND operations after this patch ?

^ permalink raw reply

* Re: [PATCH bpf 2/2] selftests/bpf: add variable-length data concatenation pattern test
From: Daniel Borkmann @ 2020-06-16 20:21 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf, netdev, ast
  Cc: andrii.nakryiko, kernel-team, Christoph Hellwig
In-Reply-To: <20200616050432.1902042-2-andriin@fb.com>

On 6/16/20 7:04 AM, Andrii Nakryiko wrote:
> Add selftest that validates variable-length data reading and concatentation
> with one big shared data array. This is a common pattern in production use for
> monitoring and tracing applications, that potentially can read a lot of data,
> but usually reads much less. Such pattern allows to determine precisely what
> amount of data needs to be sent over perfbuf/ringbuf and maximize efficiency.
> 
> This is the first BPF selftest that at all looks at and tests
> bpf_probe_read_str()-like helper's return value, closing a major gap in BPF
> testing. It surfaced the problem with bpf_probe_read_kernel_str() returning
> 0 on success, instead of amount of bytes successfully read.
> 
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>

Fix looks good, but I'm seeing an issue in the selftest on my side. With latest
Clang/LLVM I'm getting:

# ./test_progs -t varlen
#86 varlen:OK
Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED

All good, however, the test_progs-no_alu32 fails for me with:

# ./test_progs-no_alu32 -t varlen
Switching to flavor 'no_alu32' subdirectory...
libbpf: load bpf program failed: Invalid argument
libbpf: -- BEGIN DUMP LOG ---
libbpf:
arg#0 type is not a struct
Unrecognized arg#0 type PTR
; int pid = bpf_get_current_pid_tgid() >> 32;
0: (85) call bpf_get_current_pid_tgid#14
; int pid = bpf_get_current_pid_tgid() >> 32;
1: (77) r0 >>= 32
; if (test_pid != pid || !capture)
2: (18) r1 = 0xffffb14a4010c200
4: (61) r1 = *(u32 *)(r1 +0)
  R0_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=map_value(id=0,off=512,ks=4,vs=1056,imm=0) R10=fp0
; if (test_pid != pid || !capture)
5: (5d) if r1 != r0 goto pc+43
  R0_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0
6: (18) r1 = 0xffffb14a4010c204
8: (71) r1 = *(u8 *)(r1 +0)
  R0_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=map_value(id=0,off=516,ks=4,vs=1056,imm=0) R10=fp0
9: (15) if r1 == 0x0 goto pc+39
  R0=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0
; len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in1[0]);
10: (18) r6 = 0xffffb14a4010c220
12: (18) r1 = 0xffffb14a4010c220
14: (b7) r2 = 256
15: (18) r3 = 0xffffb14a4010c000
17: (85) call bpf_probe_read_kernel_str#115
  R0=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=map_value(id=0,off=544,ks=4,vs=1056,imm=0) R2_w=inv256 R3_w=map_value(id=0,off=0,ks=4,vs=1056,imm=0) R6_w=map_value(id=0,off=544,ks=4,vs=1056,imm=0) R10=fp0
last_idx 17 first_idx 9
regs=4 stack=0 before 15: (18) r3 = 0xffffb14a4010c000
regs=4 stack=0 before 14: (b7) r2 = 256
18: (67) r0 <<= 32
19: (bf) r1 = r0
20: (77) r1 >>= 32
; if (len <= MAX_LEN) {
21: (25) if r1 > 0x100 goto pc+7
  R0=inv(id=0,smax_value=1099511627776,umax_value=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min_value=0,s32_max_value=0,u32_max_value=0) R1=inv(id=0,umax_value=256,var_off=(0x0; 0x1ff)) R6=map_value(id=0,off=544,ks=4,vs=1056,imm=0) R10=fp0
;
22: (c7) r0 s>>= 32
; payload1_len1 = len;
23: (18) r1 = 0xffffb14a4010c208
25: (7b) *(u64 *)(r1 +0) = r0
  R0_w=inv(id=0,smin_value=-2147483648,smax_value=256) R1_w=map_value(id=0,off=520,ks=4,vs=1056,imm=0) R6=map_value(id=0,off=544,ks=4,vs=1056,imm=0) R10=fp0
; payload += len;
26: (18) r6 = 0xffffb14a4010c220
28: (0f) r6 += r0
last_idx 28 first_idx 21
regs=1 stack=0 before 26: (18) r6 = 0xffffb14a4010c220
regs=1 stack=0 before 25: (7b) *(u64 *)(r1 +0) = r0
regs=1 stack=0 before 23: (18) r1 = 0xffffb14a4010c208
regs=1 stack=0 before 22: (c7) r0 s>>= 32
regs=1 stack=0 before 21: (25) if r1 > 0x100 goto pc+7
  R0_rw=invP(id=0,smax_value=1099511627776,umax_value=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min_value=0,s32_max_value=0,u32_max_value=0) R1_rw=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R6_w=map_value(id=0,off=544,ks=4,vs=1056,imm=0) R10=fp0
parent didn't have regs=1 stack=0 marks
last_idx 20 first_idx 9
regs=1 stack=0 before 20: (77) r1 >>= 32
regs=1 stack=0 before 19: (bf) r1 = r0
regs=1 stack=0 before 18: (67) r0 <<= 32
regs=1 stack=0 before 17: (85) call bpf_probe_read_kernel_str#115
value -2147483648 makes map_value pointer be out of bounds
processed 22 insns (limit 1000000) max_states_per_insn 0 total_states 2 peak_states 2 mark_read 1

libbpf: -- END LOG --
libbpf: failed to load program 'raw_tp/sys_enter'
libbpf: failed to load object 'test_varlen'
libbpf: failed to load BPF skeleton 'test_varlen': -4007
test_varlen:FAIL:skel_open failed to open skeleton
#86 varlen:FAIL
Summary: 0/0 PASSED, 0 SKIPPED, 1 FAILED

^ permalink raw reply

* [PATCH net v3 0/2] two fixes for 'act_gate' control plane
From: Davide Caratti @ 2020-06-16 20:25 UTC (permalink / raw)
  To: Po Liu, Cong Wang, Vladimir Oltean; +Cc: David S. Miller, netdev

- patch 1/2 attempts to fix the error path of tcf_gate_init() when users
  try to configure 'act_gate' rules with wrong parameters
- patch 2/2 is a follow-up of a recent fix for NULL dereference in
  the error path of tcf_gate_init()

further work will introduce a tdc test for 'act_gate'.

changes since v2:
  - fix undefined behavior in patch 1/2
  - improve comment in patch 2/2
changes since v1:
  coding style fixes in patch 1/2 and 2/2

Davide Caratti (2):
  net/sched: act_gate: fix NULL dereference in tcf_gate_init()
  net/sched: act_gate: fix configuration of the periodic timer

 net/sched/act_gate.c | 126 +++++++++++++++++++++++--------------------
 1 file changed, 68 insertions(+), 58 deletions(-)

-- 
2.26.2


^ permalink raw reply

* [PATCH net v3 1/2] net/sched: act_gate: fix NULL dereference in tcf_gate_init()
From: Davide Caratti @ 2020-06-16 20:25 UTC (permalink / raw)
  To: Po Liu, Cong Wang, Vladimir Oltean; +Cc: David S. Miller, netdev
In-Reply-To: <cover.1592338450.git.dcaratti@redhat.com>

it is possible to see a KASAN use-after-free, immediately followed by a
NULL dereference crash, with the following command:

 # tc action add action gate index 3 cycle-time 100000000ns \
 > cycle-time-ext 100000000ns clockid CLOCK_TAI

 BUG: KASAN: use-after-free in tcf_action_init_1+0x8eb/0x960
 Write of size 1 at addr ffff88810a5908bc by task tc/883

 CPU: 0 PID: 883 Comm: tc Not tainted 5.7.0+ #188
 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014
 Call Trace:
  dump_stack+0x75/0xa0
  print_address_description.constprop.6+0x1a/0x220
  kasan_report.cold.9+0x37/0x7c
  tcf_action_init_1+0x8eb/0x960
  tcf_action_init+0x157/0x2a0
  tcf_action_add+0xd9/0x2f0
  tc_ctl_action+0x2a3/0x39d
  rtnetlink_rcv_msg+0x5f3/0x920
  netlink_rcv_skb+0x120/0x380
  netlink_unicast+0x439/0x630
  netlink_sendmsg+0x714/0xbf0
  sock_sendmsg+0xe2/0x110
  ____sys_sendmsg+0x5b4/0x890
  ___sys_sendmsg+0xe9/0x160
  __sys_sendmsg+0xd3/0x170
  do_syscall_64+0x9a/0x370
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

[...]

 KASAN: null-ptr-deref in range [0x0000000000000070-0x0000000000000077]
 CPU: 0 PID: 883 Comm: tc Tainted: G    B             5.7.0+ #188
 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014
 RIP: 0010:tcf_action_fill_size+0xa3/0xf0
 [....]
 RSP: 0018:ffff88813a48f250 EFLAGS: 00010212
 RAX: dffffc0000000000 RBX: 0000000000000094 RCX: ffffffffa47c3eb6
 RDX: 000000000000000e RSI: 0000000000000008 RDI: 0000000000000070
 RBP: ffff88810a590800 R08: 0000000000000004 R09: ffffed1027491e03
 R10: 0000000000000003 R11: ffffed1027491e03 R12: 0000000000000000
 R13: 0000000000000000 R14: dffffc0000000000 R15: ffff88810a590800
 FS:  00007f62cae8ce40(0000) GS:ffff888147c00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007f62c9d20a10 CR3: 000000013a52a000 CR4: 0000000000340ef0
 Call Trace:
  tcf_action_init+0x172/0x2a0
  tcf_action_add+0xd9/0x2f0
  tc_ctl_action+0x2a3/0x39d
  rtnetlink_rcv_msg+0x5f3/0x920
  netlink_rcv_skb+0x120/0x380
  netlink_unicast+0x439/0x630
  netlink_sendmsg+0x714/0xbf0
  sock_sendmsg+0xe2/0x110
  ____sys_sendmsg+0x5b4/0x890
  ___sys_sendmsg+0xe9/0x160
  __sys_sendmsg+0xd3/0x170
  do_syscall_64+0x9a/0x370
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

this is caused by the test on 'cycletime_ext', that is still unassigned
when the action is newly created. This makes the action .init() return 0
without calling tcf_idr_insert(), hence the UAF + crash.

rework the logic that prevents zero values of cycle-time, as follows:

1) 'tcfg_cycletime_ext' seems to be unused in the action software path,
   and it was already possible by other means to obtain non-zero
   cycletime and zero cycletime-ext. So, removing that test should not
   cause any damage.
2) while at it, we must prevent overwriting configuration data with wrong
   ones: use a temporary variable for 'tcfg_cycletime', and validate it
   preserving the original semantic (that allowed computing the cycle
   time as the sum of all intervals, when not specified by
   TCA_GATE_CYCLE_TIME).
3) remove the test on 'tcfg_cycletime', no more useful, and avoid
   returning -EFAULT, which did not seem an appropriate return value for
   a wrong netlink attribute.

v3: fix uninitialized 'cycletime' (thanks to Vladimir Oltean)
v2: remove useless 'return;' at the end of void gate_get_start_time()

Fixes: a51c328df310 ("net: qos: introduce a gate control flow action")
CC: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/sched/act_gate.c | 36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
index 9c628591f452..94e560c2f81d 100644
--- a/net/sched/act_gate.c
+++ b/net/sched/act_gate.c
@@ -32,7 +32,7 @@ static ktime_t gate_get_time(struct tcf_gate *gact)
 	return KTIME_MAX;
 }
 
-static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start)
+static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start)
 {
 	struct tcf_gate_params *param = &gact->param;
 	ktime_t now, base, cycle;
@@ -43,18 +43,13 @@ static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start)
 
 	if (ktime_after(base, now)) {
 		*start = base;
-		return 0;
+		return;
 	}
 
 	cycle = param->tcfg_cycletime;
 
-	/* cycle time should not be zero */
-	if (!cycle)
-		return -EFAULT;
-
 	n = div64_u64(ktime_sub_ns(now, base), cycle);
 	*start = ktime_add_ns(base, (n + 1) * cycle);
-	return 0;
 }
 
 static void gate_start_timer(struct tcf_gate *gact, ktime_t start)
@@ -287,12 +282,12 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 	enum tk_offsets tk_offset = TK_OFFS_TAI;
 	struct nlattr *tb[TCA_GATE_MAX + 1];
 	struct tcf_chain *goto_ch = NULL;
+	u64 cycletime = 0, basetime = 0;
 	struct tcf_gate_params *p;
 	s32 clockid = CLOCK_TAI;
 	struct tcf_gate *gact;
 	struct tc_gate *parm;
 	int ret = 0, err;
-	u64 basetime = 0;
 	u32 gflags = 0;
 	s32 prio = -1;
 	ktime_t start;
@@ -375,11 +370,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 	spin_lock_bh(&gact->tcf_lock);
 	p = &gact->param;
 
-	if (tb[TCA_GATE_CYCLE_TIME]) {
-		p->tcfg_cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]);
-		if (!p->tcfg_cycletime_ext)
-			goto chain_put;
-	}
+	if (tb[TCA_GATE_CYCLE_TIME])
+		cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]);
 
 	if (tb[TCA_GATE_ENTRY_LIST]) {
 		err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack);
@@ -387,14 +379,19 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 			goto chain_put;
 	}
 
-	if (!p->tcfg_cycletime) {
+	if (!cycletime) {
 		struct tcfg_gate_entry *entry;
 		ktime_t cycle = 0;
 
 		list_for_each_entry(entry, &p->entries, list)
 			cycle = ktime_add_ns(cycle, entry->interval);
-		p->tcfg_cycletime = cycle;
+		cycletime = cycle;
+		if (!cycletime) {
+			err = -EINVAL;
+			goto chain_put;
+		}
 	}
+	p->tcfg_cycletime = cycletime;
 
 	if (tb[TCA_GATE_CYCLE_TIME_EXT])
 		p->tcfg_cycletime_ext =
@@ -408,14 +405,7 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 	gact->tk_offset = tk_offset;
 	hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT);
 	gact->hitimer.function = gate_timer_func;
-
-	err = gate_get_start_time(gact, &start);
-	if (err < 0) {
-		NL_SET_ERR_MSG(extack,
-			       "Internal error: failed get start time");
-		release_entry_list(&p->entries);
-		goto chain_put;
-	}
+	gate_get_start_time(gact, &start);
 
 	gact->current_close_time = start;
 	gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING;
-- 
2.26.2


^ permalink raw reply related

* [PATCH net v3 2/2] net/sched: act_gate: fix configuration of the periodic timer
From: Davide Caratti @ 2020-06-16 20:25 UTC (permalink / raw)
  To: Po Liu, Cong Wang, Vladimir Oltean; +Cc: David S. Miller, netdev
In-Reply-To: <cover.1592338450.git.dcaratti@redhat.com>

assigning a dummy value of 'clock_id' to avoid cancellation of the cycle
timer before its initialization was a temporary solution, and we still
need to handle the case where act_gate timer parameters are changed by
commands like the following one:

 # tc action replace action gate <parameters>

the fix consists in the following items:

1) remove the workaround assignment of 'clock_id', and init the list of
   entries before the first error path after IDR atomic check/allocation
2) validate 'clock_id' earlier: there is no need to do IDR atomic
   check/allocation if we know that 'clock_id' is a bad value
3) use a dedicated function, 'gate_setup_timer()', to ensure that the
   timer is cancelled and re-initialized on action overwrite, and also
   ensure we initialize the timer in the error path of tcf_gate_init()

v3: improve comment in the error path of tcf_gate_init() (thanks to
    Vladimir Oltean)
v2: avoid 'goto' in gate_setup_timer (thanks to Cong Wang)

CC: Ivan Vecera <ivecera@redhat.com>
Fixes: a01c245438c5 ("net/sched: fix a couple of splats in the error path of tfc_gate_init()")
Fixes: a51c328df310 ("net: qos: introduce a gate control flow action")
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/sched/act_gate.c | 90 +++++++++++++++++++++++++++-----------------
 1 file changed, 55 insertions(+), 35 deletions(-)

diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c
index 94e560c2f81d..323ae7f6315d 100644
--- a/net/sched/act_gate.c
+++ b/net/sched/act_gate.c
@@ -272,6 +272,27 @@ static int parse_gate_list(struct nlattr *list_attr,
 	return err;
 }
 
+static void gate_setup_timer(struct tcf_gate *gact, u64 basetime,
+			     enum tk_offsets tko, s32 clockid,
+			     bool do_init)
+{
+	if (!do_init) {
+		if (basetime == gact->param.tcfg_basetime &&
+		    tko == gact->tk_offset &&
+		    clockid == gact->param.tcfg_clockid)
+			return;
+
+		spin_unlock_bh(&gact->tcf_lock);
+		hrtimer_cancel(&gact->hitimer);
+		spin_lock_bh(&gact->tcf_lock);
+	}
+	gact->param.tcfg_basetime = basetime;
+	gact->param.tcfg_clockid = clockid;
+	gact->tk_offset = tko;
+	hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT);
+	gact->hitimer.function = gate_timer_func;
+}
+
 static int tcf_gate_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind, bool rtnl_held,
@@ -303,6 +324,27 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 	if (!tb[TCA_GATE_PARMS])
 		return -EINVAL;
 
+	if (tb[TCA_GATE_CLOCKID]) {
+		clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]);
+		switch (clockid) {
+		case CLOCK_REALTIME:
+			tk_offset = TK_OFFS_REAL;
+			break;
+		case CLOCK_MONOTONIC:
+			tk_offset = TK_OFFS_MAX;
+			break;
+		case CLOCK_BOOTTIME:
+			tk_offset = TK_OFFS_BOOT;
+			break;
+		case CLOCK_TAI:
+			tk_offset = TK_OFFS_TAI;
+			break;
+		default:
+			NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
+			return -EINVAL;
+		}
+	}
+
 	parm = nla_data(tb[TCA_GATE_PARMS]);
 	index = parm->index;
 
@@ -326,10 +368,6 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 		tcf_idr_release(*a, bind);
 		return -EEXIST;
 	}
-	if (ret == ACT_P_CREATED) {
-		to_gate(*a)->param.tcfg_clockid = -1;
-		INIT_LIST_HEAD(&(to_gate(*a)->param.entries));
-	}
 
 	if (tb[TCA_GATE_PRIORITY])
 		prio = nla_get_s32(tb[TCA_GATE_PRIORITY]);
@@ -340,33 +378,14 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 	if (tb[TCA_GATE_FLAGS])
 		gflags = nla_get_u32(tb[TCA_GATE_FLAGS]);
 
-	if (tb[TCA_GATE_CLOCKID]) {
-		clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]);
-		switch (clockid) {
-		case CLOCK_REALTIME:
-			tk_offset = TK_OFFS_REAL;
-			break;
-		case CLOCK_MONOTONIC:
-			tk_offset = TK_OFFS_MAX;
-			break;
-		case CLOCK_BOOTTIME:
-			tk_offset = TK_OFFS_BOOT;
-			break;
-		case CLOCK_TAI:
-			tk_offset = TK_OFFS_TAI;
-			break;
-		default:
-			NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
-			goto release_idr;
-		}
-	}
+	gact = to_gate(*a);
+	if (ret == ACT_P_CREATED)
+		INIT_LIST_HEAD(&gact->param.entries);
 
 	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
 	if (err < 0)
 		goto release_idr;
 
-	gact = to_gate(*a);
-
 	spin_lock_bh(&gact->tcf_lock);
 	p = &gact->param;
 
@@ -397,14 +416,10 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 		p->tcfg_cycletime_ext =
 			nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]);
 
+	gate_setup_timer(gact, basetime, tk_offset, clockid,
+			 ret == ACT_P_CREATED);
 	p->tcfg_priority = prio;
-	p->tcfg_basetime = basetime;
-	p->tcfg_clockid = clockid;
 	p->tcfg_flags = gflags;
-
-	gact->tk_offset = tk_offset;
-	hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT);
-	gact->hitimer.function = gate_timer_func;
 	gate_get_start_time(gact, &start);
 
 	gact->current_close_time = start;
@@ -433,6 +448,13 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla,
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
 release_idr:
+	/* action is not inserted in any list: it's safe to init hitimer
+	 * without taking tcf_lock.
+	 */
+	if (ret == ACT_P_CREATED)
+		gate_setup_timer(gact, gact->param.tcfg_basetime,
+				 gact->tk_offset, gact->param.tcfg_clockid,
+				 true);
 	tcf_idr_release(*a, bind);
 	return err;
 }
@@ -443,9 +465,7 @@ static void tcf_gate_cleanup(struct tc_action *a)
 	struct tcf_gate_params *p;
 
 	p = &gact->param;
-	if (p->tcfg_clockid != -1)
-		hrtimer_cancel(&gact->hitimer);
-
+	hrtimer_cancel(&gact->hitimer);
 	release_entry_list(&p->entries);
 }
 
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH net v3 3/4] net/sched: cls_api: fix nooffloaddevcnt warning dmesg log
From: Pablo Neira Ayuso @ 2020-06-16 20:30 UTC (permalink / raw)
  To: wenxu; +Cc: netdev, davem, vladbu
In-Reply-To: <20200616201750.GA27024@salvia>

On Tue, Jun 16, 2020 at 10:17:50PM +0200, Pablo Neira Ayuso wrote:
> On Tue, Jun 16, 2020 at 11:19:39AM +0800, wenxu@ucloud.cn wrote:
> > From: wenxu <wenxu@ucloud.cn>
> > 
> > When a indr device add in offload success. The block->nooffloaddevcnt
> > should be 0. After the representor go away. When the dir device go away
> > the flow_block UNBIND operation with -EOPNOTSUPP which lead the warning
> > dmesg log. 
> > 
> > The block->nooffloaddevcnt should always count for indr block.
> > even the indr block offload successful. The representor maybe
> > gone away and the ingress qdisc can work in software mode.
> > 
> > block->nooffloaddevcnt warning with following dmesg log:
> > 
> > [  760.667058] #####################################################
> > [  760.668186] ## TEST test-ecmp-add-vxlan-encap-disable-sriov.sh ##
> > [  760.669179] #####################################################
> > [  761.780655] :test: Fedora 30 (Thirty)
> > [  761.783794] :test: Linux reg-r-vrt-018-180 5.7.0+
> > [  761.822890] :test: NIC ens1f0 FW 16.26.6000 PCI 0000:81:00.0 DEVICE 0x1019 ConnectX-5 Ex
> > [  761.860244] mlx5_core 0000:81:00.0 ens1f0: Link up
> > [  761.880693] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready
> > [  762.059732] mlx5_core 0000:81:00.1 ens1f1: Link up
> > [  762.234341] :test: unbind vfs of ens1f0
> > [  762.257825] :test: Change ens1f0 eswitch (0000:81:00.0) mode to switchdev
> > [  762.291363] :test: unbind vfs of ens1f1
> > [  762.306914] :test: Change ens1f1 eswitch (0000:81:00.1) mode to switchdev
> > [  762.309237] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(LEGACY), nvfs(2), active vports(3)
> > [  763.282598] mlx5_core 0000:81:00.1: E-Switch: Supported tc offload range - chains: 4294967294, prios: 4294967295
> > [  763.362825] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
> > [  763.444465] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0
> > [  763.460088] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
> > [  763.502586] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
> > [  763.552429] ens1f1_0: renamed from eth0
> > [  763.569569] mlx5_core 0000:81:00.1: E-Switch: Enable: mode(OFFLOADS), nvfs(2), active vports(3)
> > [  763.629694] ens1f1_1: renamed from eth1
> > [  764.631552] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_0: link becomes ready
> > [  764.670841] :test: unbind vfs of ens1f0
> > [  764.681966] :test: unbind vfs of ens1f1
> > [  764.726762] mlx5_core 0000:81:00.0 ens1f0: Link up
> > [  764.766511] mlx5_core 0000:81:00.1 ens1f1: Link up
> > [  764.797325] :test: Add multipath vxlan encap rule and disable sriov
> > [  764.798544] :test: config multipath route
> > [  764.812732] mlx5_core 0000:81:00.0: lag map port 1:2 port 2:2
> > [  764.874556] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:2
> > [  765.603681] :test: OK
> > [  765.659048] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_1: link becomes ready
> > [  765.675085] :test: verify rule in hw
> > [  765.694237] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready
> > [  765.711892] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1: link becomes ready
> > [  766.979230] :test: OK
> > [  768.125419] :test: OK
> > [  768.127519] :test: - disable sriov ens1f1
> > [  768.131160] pci 0000:81:02.2: Removing from iommu group 75
> > [  768.132646] pci 0000:81:02.3: Removing from iommu group 76
> > [  769.179749] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3)
> > [  769.455627] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:1
> > [  769.703990] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
> > [  769.988637] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0
> > [  769.990022] :test: - disable sriov ens1f0
> > [  769.994922] pci 0000:81:00.2: Removing from iommu group 73
> > [  769.997048] pci 0000:81:00.3: Removing from iommu group 74
> > [  771.035813] mlx5_core 0000:81:00.0: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3)
> > [  771.339091] ------------[ cut here ]------------
> > [  771.340812] WARNING: CPU: 6 PID: 3448 at net/sched/cls_api.c:749 tcf_block_offload_unbind.isra.0+0x5c/0x60
> > [  771.341728] Modules linked in: act_mirred act_tunnel_key cls_flower dummy vxlan ip6_udp_tunnel udp_tunnel sch_ingress nfsv3 nfs_acl nfs lockd grace fscache tun bridge stp llc sunrpc rdma_ucm rdma_cm iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp mlxfw act_ct nf_flow_table kvm_intel nf_nat kvm nf_conntrack irqbypass crct10dif_pclmul igb crc32_pclmul nf_defrag_ipv6 libcrc32c nf_defrag_ipv4 crc32c_intel ghash_clmulni_intel ptp ipmi_ssif intel_cstate pps_c
> > ore ses intel_uncore mei_me iTCO_wdt joydev ipmi_si iTCO_vendor_support i2c_i801 enclosure mei ioatdma dca lpc_ich wmi ipmi_devintf pcspkr acpi_power_meter ipmi_msghandler acpi_pad ast i2c_algo_bit drm_vram_helper drm_kms_helper drm_ttm_helper ttm drm mpt3sas raid_class scsi_transport_sas
> > [  771.347818] CPU: 6 PID: 3448 Comm: test-ecmp-add-v Not tainted 5.7.0+ #1146
> > [  771.348727] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
> > [  771.349646] RIP: 0010:tcf_block_offload_unbind.isra.0+0x5c/0x60
> > [  771.350553] Code: 4a fd ff ff 83 f8 a1 74 0e 5b 4c 89 e7 5d 41 5c 41 5d e9 07 93 89 ff 8b 83 a0 00 00 00 8d 50 ff 89 93 a0 00 00 00 85 c0 75 df <0f> 0b eb db 0f 1f 44 00 00 41 57 41 56 41 55 41 89 cd 41 54 49 89
> > [  771.352420] RSP: 0018:ffffb33144cd3b00 EFLAGS: 00010246
> > [  771.353353] RAX: 0000000000000000 RBX: ffff8b37cf4b2800 RCX: 0000000000000000
> > [  771.354294] RDX: 00000000ffffffff RSI: ffff8b3b9aad0000 RDI: ffffffff8d5c6e20
> > [  771.355245] RBP: ffff8b37eb546948 R08: ffffffffc0b7a348 R09: ffff8b3b9aad0000
> > [  771.356189] R10: 0000000000000001 R11: ffff8b3ba7a0a1c0 R12: ffff8b37cf4b2850
> > [  771.357123] R13: ffff8b3b9aad0000 R14: ffff8b37cf4b2820 R15: ffff8b37cf4b2820
> > [  771.358039] FS:  00007f8a19b6e740(0000) GS:ffff8b3befa00000(0000) knlGS:0000000000000000
> > [  771.358965] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [  771.359885] CR2: 00007f3afb91c1a0 CR3: 000000045133c004 CR4: 00000000001606e0
> > [  771.360825] Call Trace:
> > [  771.361764]  __tcf_block_put+0x84/0x150
> > [  771.362712]  ingress_destroy+0x1b/0x20 [sch_ingress]
> > [  771.363658]  qdisc_destroy+0x3e/0xc0
> > [  771.364594]  dev_shutdown+0x7a/0xa5
> > [  771.365522]  rollback_registered_many+0x20d/0x530
> > [  771.366458]  ? netdev_upper_dev_unlink+0x15d/0x1c0
> > [  771.367387]  unregister_netdevice_many.part.0+0xf/0x70
> > [  771.368310]  vxlan_netdevice_event+0xa4/0x110 [vxlan]
> > [  771.369454]  notifier_call_chain+0x4c/0x70
> > [  771.370579]  rollback_registered_many+0x2f5/0x530
> > [  771.371719]  rollback_registered+0x56/0x90
> > [  771.372843]  unregister_netdevice_queue+0x73/0xb0
> > [  771.373982]  unregister_netdev+0x18/0x20
> > [  771.375168]  mlx5e_vport_rep_unload+0x56/0xc0 [mlx5_core]
> > [  771.376327]  esw_offloads_disable+0x81/0x90 [mlx5_core]
> > [  771.377512]  mlx5_eswitch_disable_locked.cold+0xcb/0x1af [mlx5_core]
> > [  771.378679]  mlx5_eswitch_disable+0x44/0x60 [mlx5_core]
> > [  771.379822]  mlx5_device_disable_sriov+0xad/0xb0 [mlx5_core]
> > [  771.380968]  mlx5_core_sriov_configure+0xc1/0xe0 [mlx5_core]
> > [  771.382087]  sriov_numvfs_store+0xfc/0x130
> > [  771.383195]  kernfs_fop_write+0xce/0x1b0
> > [  771.384302]  vfs_write+0xb6/0x1a0
> > [  771.385410]  ksys_write+0x5f/0xe0
> > [  771.386500]  do_syscall_64+0x5b/0x1d0
> > [  771.387569]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > 
> > Fixes: 0fdcf78d5973 ("net: use flow_indr_dev_setup_offload()")
> > Signed-off-by: wenxu <wenxu@ucloud.cn>
> > ---
> >  net/sched/cls_api.c | 24 ++++++++++++++----------
> >  1 file changed, 14 insertions(+), 10 deletions(-)
> > 
> > diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> > index a00a203..86c3937 100644
> > --- a/net/sched/cls_api.c
> > +++ b/net/sched/cls_api.c
> > @@ -671,25 +671,29 @@ static int tcf_block_offload_cmd(struct tcf_block *block,
> >  				 struct netlink_ext_ack *extack)
> >  {
> >  	struct flow_block_offload bo = {};
> > -	int err;
> >  
> >  	tcf_block_offload_init(&bo, dev, command, ei->binder_type,
> >  			       &block->flow_block, tcf_block_shared(block),
> >  			       extack);
> >  
> > -	if (dev->netdev_ops->ndo_setup_tc)
> > +	if (dev->netdev_ops->ndo_setup_tc) {
> > +		int err;
> > +
> >  		err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
> > -	else
> > -		err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block,
> > -						  &bo, tc_block_indr_cleanup);
> > +		if (err < 0) {
> > +			if (err != -EOPNOTSUPP)
> > +				NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed");
> > +			return err;
> > +		}
> >  
> > -	if (err < 0) {
> > -		if (err != -EOPNOTSUPP)
> > -			NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed");
> > -		return err;
> > +		return tcf_block_setup(block, &bo);
> >  	}
> >  
> > -	return tcf_block_setup(block, &bo);
> > +	flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, &bo,
> > +				    tc_block_indr_cleanup);
> > +	tcf_block_setup(block, &bo);
> > +
> > +	return -EOPNOTSUPP;
> 
> So tcf_block_offload_cmd() always return -EOPNOTSUPP for _BIND and
> _UNBIND operations after this patch ?

tcf_block_offload_unbind() is not called from the
tc_block_indr_cleanup() path.

How is this patch related to 1/4, 2/4 and 4/4 ?

^ permalink raw reply

* Re: [PATCH bpf 1/2] flow_dissector: reject invalid attach_flags
From: Alexei Starovoitov @ 2020-06-16 20:37 UTC (permalink / raw)
  To: Lorenz Bauer
  Cc: Alexei Starovoitov, Daniel Borkmann, Jakub Sitnicki, kernel-team,
	Network Development, bpf, LKML, John Fastabend
In-Reply-To: <CACAyw99Szs3nUTx=DSmh0x8iTBLNF9TTLGC0GQLZ=FifVnbzBA@mail.gmail.com>

On Tue, Jun 16, 2020 at 1:30 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
>
> On Tue, 16 Jun 2020 at 04:55, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Jun 15, 2020 at 7:43 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
> > >
> > > On Fri, 12 Jun 2020 at 23:36, Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Fri, Jun 12, 2020 at 9:02 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
> > > > >
> > > > > Using BPF_PROG_ATTACH on a flow dissector program supports neither flags
> > > > > nor target_fd but accepts any value. Return EINVAL if either are non-zero.
> > > > >
> > > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > > > > Fixes: b27f7bb590ba ("flow_dissector: Move out netns_bpf prog callbacks")
> > > > > ---
> > > > >  kernel/bpf/net_namespace.c | 3 +++
> > > > >  1 file changed, 3 insertions(+)
> > > > >
> > > > > diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
> > > > > index 78cf061f8179..56133e78ae4f 100644
> > > > > --- a/kernel/bpf/net_namespace.c
> > > > > +++ b/kernel/bpf/net_namespace.c
> > > > > @@ -192,6 +192,9 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
> > > > >         struct net *net;
> > > > >         int ret;
> > > > >
> > > > > +       if (attr->attach_flags || attr->target_fd)
> > > > > +               return -EINVAL;
> > > > > +
> > > >
> > > > In theory it makes sense, but how did you test it?
> > >
> > > Not properly it seems, sorry!
> > >
> > > > test_progs -t flow
> > > > fails 5 tests.
> > >
> > > I spent today digging through this, and the issue is actually more annoying than
> > > I thought. BPF_PROG_DETACH for sockmap and flow_dissector ignores
> > > attach_bpf_fd. The cgroup and lirc2 attach point use this to make sure that the
> > > program being detached is actually what user space expects. We actually have
> > > tests that set attach_bpf_fd for these to attach points, which tells
> > > me that this is
> > > an easy mistake to make.
> > >
> > > Unfortunately I can't come up with a good fix that seems backportable:
> > > - Making sockmap and flow_dissector have the same semantics as cgroup
> > >   and lirc2 requires a bunch of changes (probably a new function for sockmap)
> >
> > making flow dissector pass prog_fd as cg and lirc is certainly my preference.
> > Especially since tests are passing fd user code is likely doing the same,
> > so breakage is unlikely. Also it wasn't done that long ago, so
> > we can backport far enough.
> > It will remove cap_net_admin ugly check in bpf_prog_detach()
> > which is the only exception now in cap model.
>
> SGTM. What about sockmap though? The code for that has been around for ages.

you mean the second patch that enforces sock_map_get_from_fd doesn't
use attach_flags?
I think it didn't break anything, so enforcing is fine.

or the detach part that doesn't use prog_fd ?
I'm not sure what's the best here.
At least from cap perspective it's fine because map_fd is there.

John, wdyt?

^ permalink raw reply

* Re: [PATCH net v3 2/4] flow_offload: fix incorrect cb_priv check for flow_block_cb
From: Pablo Neira Ayuso @ 2020-06-16 20:38 UTC (permalink / raw)
  To: Simon Horman; +Cc: wenxu, netdev, davem, vladbu
In-Reply-To: <20200616154716.GA16382@netronome.com>

On Tue, Jun 16, 2020 at 05:47:17PM +0200, Simon Horman wrote:
> On Tue, Jun 16, 2020 at 11:18:16PM +0800, wenxu wrote:
> > 
> > 在 2020/6/16 22:34, Simon Horman 写道:
> > > On Tue, Jun 16, 2020 at 10:20:46PM +0800, wenxu wrote:
> > >> 在 2020/6/16 18:51, Simon Horman 写道:
> > >>> On Tue, Jun 16, 2020 at 11:19:38AM +0800, wenxu@ucloud.cn wrote:
> > >>>> From: wenxu <wenxu@ucloud.cn>
> > >>>>
> > >>>> In the function __flow_block_indr_cleanup, The match stataments
> > >>>> this->cb_priv == cb_priv is always false, the flow_block_cb->cb_priv
> > >>>> is totally different data with the flow_indr_dev->cb_priv.
> > >>>>
> > >>>> Store the representor cb_priv to the flow_block_cb->indr.cb_priv in
> > >>>> the driver.
> > >>>>
> > >>>> Fixes: 1fac52da5942 ("net: flow_offload: consolidate indirect flow_block infrastructure")
> > >>>> Signed-off-by: wenxu <wenxu@ucloud.cn>
> > >>> Hi Wenxu,
> > >>>
> > >>> I wonder if this can be resolved by using the cb_ident field of struct
> > >>> flow_block_cb.
> > >>>
> > >>> I observe that mlx5e_rep_indr_setup_block() seems to be the only call-site
> > >>> where the value of the cb_ident parameter of flow_block_cb_alloc() is
> > >>> per-block rather than per-device. So part of my proposal is to change
> > >>> that.
> > >> I check all the xxdriver_indr_setup_block. It seems all the cb_ident parameter of
> > >>
> > >> flow_block_cb_alloc is per-block. Both in the nfp_flower_setup_indr_tc_block
> > >>
> > >> and bnxt_tc_setup_indr_block.
> > >>
> > >>
> > >> nfp_flower_setup_indr_tc_block:
> > >>
> > >> struct nfp_flower_indr_block_cb_priv *cb_priv;
> > >>
> > >> block_cb = flow_block_cb_alloc(nfp_flower_setup_indr_block_cb,
> > >>                                                cb_priv, cb_priv,
> > >>                                                nfp_flower_setup_indr_tc_release);
> > >>
> > >>
> > >> bnxt_tc_setup_indr_block:
> > >>
> > >> struct bnxt_flower_indr_block_cb_priv *cb_priv;
> > >>
> > >> block_cb = flow_block_cb_alloc(bnxt_tc_setup_indr_block_cb,
> > >>                                                cb_priv, cb_priv,
> > >>                                                bnxt_tc_setup_indr_rel);
> > >>
> > >>
> > >> And the function flow_block_cb_is_busy called in most place. Pass the
> > >>
> > >> parameter as cb_priv but not cb_indent .
> > > Thanks, I see that now. But I still think it would be useful to understand
> > > the purpose of cb_ident. It feels like it would lead to a clean solution
> > > to the problem you have highlighted.
> > 
> > I think The cb_ident means identify.  It is used to identify the each flow block cb.
> > 
> > In the both flow_block_cb_is_busy and flow_block_cb_lookup function check
> > 
> > the block_cb->cb_ident == cb_ident.
> 
> Thanks, I think that I now see what you mean about the different scope of
> cb_ident and your proposal to allow cleanup by flow_indr_dev_unregister().
> 
> I do, however, still wonder if there is a nicer way than reaching into
> the structure and manually setting block_cb->indr.cb_priv
> at each call-site.
> 
> Perhaps a variant of flow_block_cb_alloc() for indirect blocks
> would be nicer?

A follow up patch to add this new variant would be good. Probably
__flow_block_indr_binding() can go away with this new variant to set
up the indirect flow block.

^ permalink raw reply

* Re: [PATCH net] tcp: grow window for OOO packets only for SACK flows
From: David Miller @ 2020-06-16 20:39 UTC (permalink / raw)
  To: edumazet; +Cc: netdev, eric.dumazet, ncardwell, ycheng, venkat.x.venkatsubra
In-Reply-To: <20200616033707.145423-1-edumazet@google.com>

From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Jun 2020 20:37:07 -0700

> Back in 2013, we made a change that broke fast retransmit
> for non SACK flows.
> 
> Indeed, for these flows, a sender needs to receive three duplicate
> ACK before starting fast retransmit. Sending ACK with different
> receive window do not count.
> 
> Even if enabling SACK is strongly recommended these days,
> there still are some cases where it has to be disabled.
> 
> Not increasing the window seems better than having to
> rely on RTO.
> 
> After the fix, following packetdrill test gives :
 ...
> Fixes: 4e4f1fc22681 ("tcp: properly increase rcv_ssthresh for ofo packets")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>

Applied and queued up for -stable, thanks Eric.

^ permalink raw reply

* Re: [PATCH 1/1 v2] mvpp2: remove module bugfix
From: David Miller @ 2020-06-16 20:42 UTC (permalink / raw)
  To: sven.auhagen
  Cc: netdev, antoine.tenart, gregory.clement, maxime.chevallier,
	thomas.petazzoni, miquel.raynal, mw, lorenzo, technoboy85
In-Reply-To: <20200616043529.gs2vcdryka7t4hjo@SvensMacBookAir.sven.lan>

From: Sven Auhagen <sven.auhagen@voleatech.de>
Date: Tue, 16 Jun 2020 06:35:29 +0200

> The remove function does not destroy all
> BM Pools when per cpu pool is active.
> 
> When reloading the mvpp2 as a module the BM Pools
> are still active in hardware and due to the bug
> have twice the size now old + new.
> 
> This eventually leads to a kernel crash.
> 
> v2:
> * add Fixes tag
> 
> Fixes: 7d04b0b13b11 ("mvpp2: percpu buffers")
> Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH net] bareudp: Fixed configuration to avoid having garbage values
From: David Miller @ 2020-06-16 20:43 UTC (permalink / raw)
  To: martinvarghesenokia; +Cc: netdev, martin.varghese
In-Reply-To: <1592286538-6895-1-git-send-email-martinvarghesenokia@gmail.com>

From: Martin Varghese <martinvarghesenokia@gmail.com>
Date: Tue, 16 Jun 2020 11:18:58 +0530

> From: Martin <martin.varghese@nokia.com>
> 
> Code to initialize the conf structure while gathering the configuration
> of the device was missing.
> 
> Fixes: 571912c69f0e ("net: UDP tunnel encapsulation module for tunnelling different protocols like MPLS, IP, NSH etc.")
> Signed-off-by: Martin <martin.varghese@nokia.com>

Applied, thank you.

^ permalink raw reply

* Re: [PATCH net] mlxsw: spectrum: Adjust headroom buffers for 8x ports
From: David Miller @ 2020-06-16 20:47 UTC (permalink / raw)
  To: idosch; +Cc: netdev, kuba, jiri, mlxsw, idosch
In-Reply-To: <20200616071458.192798-1-idosch@idosch.org>

From: Ido Schimmel <idosch@idosch.org>
Date: Tue, 16 Jun 2020 10:14:58 +0300

> From: Ido Schimmel <idosch@mellanox.com>
> 
> The port's headroom buffers are used to store packets while they
> traverse the device's pipeline and also to store packets that are egress
> mirrored.
> 
> On Spectrum-3, ports with eight lanes use two headroom buffers between
> which the configured headroom size is split.
> 
> In order to prevent packet loss, multiply the calculated headroom size
> by two for 8x ports.
> 
> Fixes: da382875c616 ("mlxsw: spectrum: Extend to support Spectrum-3 ASIC")
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> Reviewed-by: Jiri Pirko <jiri@mellanox.com>

Applied and queued up for -stable, thank you.

^ permalink raw reply

* Re: [PATCH] net: usb: ax88179_178a: fix packet alignment padding
From: David Miller @ 2020-06-16 20:55 UTC (permalink / raw)
  To: jk; +Cc: netdev, allan, freddy, pfink, linux-usb, louis
In-Reply-To: <e780f13fdde89d03ef863618d8de3dd67ba53c72.camel@ozlabs.org>

From: Jeremy Kerr <jk@ozlabs.org>
Date: Tue, 16 Jun 2020 17:08:23 +0800

>> Because that code in this loop makes the same calculations:
>> 
>>                 ax_skb = skb_clone(skb, GFP_ATOMIC);
>>                 if (ax_skb) {
>>                         ax_skb->len = pkt_len;
>>                         ax_skb->data = skb->data + 2;
>>                         skb_set_tail_pointer(ax_skb, pkt_len);
>>                         ax_skb->truesize = pkt_len + sizeof(struct sk_buff);
>>                         ax88179_rx_checksum(ax_skb, pkt_hdr);
>>                         usbnet_skb_return(dev, ax_skb);
>> 
>> So if your change is right, it should be applied to this code block
>> as well.
> 
> Yep, my patch changes that block too (or did I miss something?)

Nope, you didn't miss anything.  I missed that completely.

>> And do we know that it's two extra tail bytes always?  Or some kind
>> of alignment padding the chip performs for every sub-packet?
> 
> I've assumed it's a constant two bytes of prefix padding, as that's all
> I've seen. But it would be great to have more detail from ASIX if
> possible.

I'll wait a bit for the ASIX folks to comment.

It seems logical to me that what the chip does is align up the total
sub-packet length to a multiple of 4 or larger, and then add those two
prefix padding bytes.  Otherwise the prefix padding won't necessarily
and reliably align the IP header after the link level header.

Thanks.

^ permalink raw reply

* Re: [PATCH] lan743x: add MODULE_DEVICE_TABLE for module loading alias
From: David Miller @ 2020-06-16 21:01 UTC (permalink / raw)
  To: tharvey; +Cc: bryan.whitehead, UNGLinuxDriver, kuba, netdev, linux-kernel
In-Reply-To: <1592321518-28464-1-git-send-email-tharvey@gateworks.com>

From: Tim Harvey <tharvey@gateworks.com>
Date: Tue, 16 Jun 2020 08:31:58 -0700

> Without a MODULE_DEVICE_TABLE the attributes are missing that create
> an alias for auto-loading the module in userspace via hotplug.
> 
> Signed-off-by: Tim Harvey <tharvey@gateworks.com>

Applied, thank you.

^ permalink raw reply

* Re: [PATCH v4 0/3] mm, treewide: Rename kzfree() to kfree_sensitive()
From: Matthew Wilcox @ 2020-06-16 21:14 UTC (permalink / raw)
  To: Joe Perches
  Cc: Waiman Long, Andrew Morton, David Howells, Jarkko Sakkinen,
	James Morris, Serge E. Hallyn, Linus Torvalds, David Rientjes,
	Michal Hocko, Johannes Weiner, Dan Carpenter, David Sterba,
	Jason A . Donenfeld, linux-mm, keyrings, linux-kernel,
	linux-crypto, linux-pm, linux-stm32, linux-amlogic,
	linux-mediatek, linuxppc-dev, virtualization, netdev, linux-ppp,
	wireguard, linux-wireless, devel, linux-scsi, target-devel,
	linux-btrfs, linux-cifs, linux-fscrypt, ecryptfs, kasan-dev,
	linux-bluetooth, linux-wpan, linux-sctp, linux-nfs,
	tipc-discussion, linux-security-module, linux-integrity
In-Reply-To: <fe3b9a437be4aeab3bac68f04193cb6daaa5bee4.camel@perches.com>

On Tue, Jun 16, 2020 at 11:53:50AM -0700, Joe Perches wrote:
> To this larger audience and last week without reply:
> https://lore.kernel.org/lkml/573b3fbd5927c643920e1364230c296b23e7584d.camel@perches.com/
> 
> Are there _any_ fastpath uses of kfree or vfree?

I worked on adding a 'free' a couple of years ago.  That was capable
of freeing percpu, vmalloc, kmalloc and alloc_pages memory.  I ran into
trouble when I tried to free kmem_cache_alloc memory -- it works for slab
and slub, but not slob (because slob needs the size from the kmem_cache).

My motivation for this was to change kfree_rcu() to just free_rcu().

> To eliminate these mispairings at a runtime cost of four
> comparisons, should the kfree/vfree/kvfree/kfree_const
> functions be consolidated into a single kfree?

I would say to leave kfree() alone and just introduce free() as a new
default.  There's some weird places in the kernel that have a 'free'
symbol of their own, but those should be renamed anyway.

^ permalink raw reply

* RE: [PATCH] e1000e: add ifdef to avoid dead code
From: Kirsher, Jeffrey T @ 2020-06-16 21:20 UTC (permalink / raw)
  To: Eric Dumazet, Greg Thelen, David S. Miller, Jakub Kicinski,
	Lifshits, Vitaly
  Cc: intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <b88dc544-9f1b-75af-244e-9967ffeacf0e@gmail.com>

> -----Original Message-----
> From: Eric Dumazet <eric.dumazet@gmail.com>
> On 6/13/20 11:11 PM, Greg Thelen wrote:
> > Commit e086ba2fccda ("e1000e: disable s0ix entry and exit flows for ME
> > systems") added e1000e_check_me() but it's only called from
> > CONFIG_PM_SLEEP protected code.  Thus builds without CONFIG_PM_SLEEP
> > see:
> >   drivers/net/ethernet/intel/e1000e/netdev.c:137:13: warning:
> > 'e1000e_check_me' defined but not used [-Wunused-function]
> >
> > Add CONFIG_PM_SLEEP ifdef guard to avoid dead code.
> >
> > Fixes: e086ba2fccda ("e1000e: disable s0ix entry and exit flows for ME
> > systems")
> > Signed-off-by: Greg Thelen <gthelen@google.com>
> > ---
> >  drivers/net/ethernet/intel/e1000e/netdev.c | 2 ++
> >  1 file changed, 2 insertions(+)
> 
> Reviewed-by: Eric Dumazet <edumazet@google.com>
[Kirsher, Jeffrey T] 

This patch is not necessary, after Arnd's patch.  Here is his patch:
http://patchwork.ozlabs.org/project/intel-wired-lan/patch/20200527134716.948148-1-arnd@arndb.de/
 and I will be pushing it to Dave's net tree later tonight.

^ permalink raw reply

* Re: [PATCH bpf 2/2] selftests/bpf: add variable-length data concatenation pattern test
From: Andrii Nakryiko @ 2020-06-16 21:27 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov, Kernel Team,
	Christoph Hellwig
In-Reply-To: <5fed920d-aeb6-c8de-18c0-7c046bbfb242@iogearbox.net>

On Tue, Jun 16, 2020 at 1:21 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 6/16/20 7:04 AM, Andrii Nakryiko wrote:
> > Add selftest that validates variable-length data reading and concatentation
> > with one big shared data array. This is a common pattern in production use for
> > monitoring and tracing applications, that potentially can read a lot of data,
> > but usually reads much less. Such pattern allows to determine precisely what
> > amount of data needs to be sent over perfbuf/ringbuf and maximize efficiency.
> >
> > This is the first BPF selftest that at all looks at and tests
> > bpf_probe_read_str()-like helper's return value, closing a major gap in BPF
> > testing. It surfaced the problem with bpf_probe_read_kernel_str() returning
> > 0 on success, instead of amount of bytes successfully read.
> >
> > Signed-off-by: Andrii Nakryiko <andriin@fb.com>
>
> Fix looks good, but I'm seeing an issue in the selftest on my side. With latest
> Clang/LLVM I'm getting:
>
> # ./test_progs -t varlen
> #86 varlen:OK
> Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED
>
> All good, however, the test_progs-no_alu32 fails for me with:

Yeah, same here. It's due to Clang emitting unnecessary bit shifts
because bpf_probe_read_kernel_str() is defined as returning 32-bit
int. I have a patch ready locally, just waiting for bpf-next to open,
which switches those helpers to return long, which auto-matically
fixes this test.

If it's not a problem, I'd just wait for that patch to go into
bpf-next. If not, I can sprinkle bits of assembly magic around to
force the kernel to do those bitshifts earlier. But I figured having
test_progs-no_alu32 failing one selftest temporarily wasn't too bad.

>
> # ./test_progs-no_alu32 -t varlen
> Switching to flavor 'no_alu32' subdirectory...
> libbpf: load bpf program failed: Invalid argument
> libbpf: -- BEGIN DUMP LOG ---
> libbpf:
> arg#0 type is not a struct
> Unrecognized arg#0 type PTR
> ; int pid = bpf_get_current_pid_tgid() >> 32;
> 0: (85) call bpf_get_current_pid_tgid#14
> ; int pid = bpf_get_current_pid_tgid() >> 32;
> 1: (77) r0 >>= 32
> ; if (test_pid != pid || !capture)
> 2: (18) r1 = 0xffffb14a4010c200
> 4: (61) r1 = *(u32 *)(r1 +0)
>   R0_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=map_value(id=0,off=512,ks=4,vs=1056,imm=0) R10=fp0
> ; if (test_pid != pid || !capture)
> 5: (5d) if r1 != r0 goto pc+43
>   R0_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0
> 6: (18) r1 = 0xffffb14a4010c204
> 8: (71) r1 = *(u8 *)(r1 +0)
>   R0_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=map_value(id=0,off=516,ks=4,vs=1056,imm=0) R10=fp0
> 9: (15) if r1 == 0x0 goto pc+39
>   R0=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R10=fp0
> ; len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in1[0]);
> 10: (18) r6 = 0xffffb14a4010c220
> 12: (18) r1 = 0xffffb14a4010c220
> 14: (b7) r2 = 256
> 15: (18) r3 = 0xffffb14a4010c000
> 17: (85) call bpf_probe_read_kernel_str#115
>   R0=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=map_value(id=0,off=544,ks=4,vs=1056,imm=0) R2_w=inv256 R3_w=map_value(id=0,off=0,ks=4,vs=1056,imm=0) R6_w=map_value(id=0,off=544,ks=4,vs=1056,imm=0) R10=fp0
> last_idx 17 first_idx 9
> regs=4 stack=0 before 15: (18) r3 = 0xffffb14a4010c000
> regs=4 stack=0 before 14: (b7) r2 = 256
> 18: (67) r0 <<= 32
> 19: (bf) r1 = r0
> 20: (77) r1 >>= 32
> ; if (len <= MAX_LEN) {
> 21: (25) if r1 > 0x100 goto pc+7
>   R0=inv(id=0,smax_value=1099511627776,umax_value=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min_value=0,s32_max_value=0,u32_max_value=0) R1=inv(id=0,umax_value=256,var_off=(0x0; 0x1ff)) R6=map_value(id=0,off=544,ks=4,vs=1056,imm=0) R10=fp0
> ;
> 22: (c7) r0 s>>= 32
> ; payload1_len1 = len;
> 23: (18) r1 = 0xffffb14a4010c208
> 25: (7b) *(u64 *)(r1 +0) = r0
>   R0_w=inv(id=0,smin_value=-2147483648,smax_value=256) R1_w=map_value(id=0,off=520,ks=4,vs=1056,imm=0) R6=map_value(id=0,off=544,ks=4,vs=1056,imm=0) R10=fp0
> ; payload += len;
> 26: (18) r6 = 0xffffb14a4010c220
> 28: (0f) r6 += r0
> last_idx 28 first_idx 21
> regs=1 stack=0 before 26: (18) r6 = 0xffffb14a4010c220
> regs=1 stack=0 before 25: (7b) *(u64 *)(r1 +0) = r0
> regs=1 stack=0 before 23: (18) r1 = 0xffffb14a4010c208
> regs=1 stack=0 before 22: (c7) r0 s>>= 32
> regs=1 stack=0 before 21: (25) if r1 > 0x100 goto pc+7
>   R0_rw=invP(id=0,smax_value=1099511627776,umax_value=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min_value=0,s32_max_value=0,u32_max_value=0) R1_rw=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R6_w=map_value(id=0,off=544,ks=4,vs=1056,imm=0) R10=fp0
> parent didn't have regs=1 stack=0 marks
> last_idx 20 first_idx 9
> regs=1 stack=0 before 20: (77) r1 >>= 32
> regs=1 stack=0 before 19: (bf) r1 = r0
> regs=1 stack=0 before 18: (67) r0 <<= 32
> regs=1 stack=0 before 17: (85) call bpf_probe_read_kernel_str#115
> value -2147483648 makes map_value pointer be out of bounds
> processed 22 insns (limit 1000000) max_states_per_insn 0 total_states 2 peak_states 2 mark_read 1
>
> libbpf: -- END LOG --
> libbpf: failed to load program 'raw_tp/sys_enter'
> libbpf: failed to load object 'test_varlen'
> libbpf: failed to load BPF skeleton 'test_varlen': -4007
> test_varlen:FAIL:skel_open failed to open skeleton
> #86 varlen:FAIL
> Summary: 0/0 PASSED, 0 SKIPPED, 1 FAILED

^ permalink raw reply

* Re: [PATCH 06/12] xen-blkfront: add callbacks for PM suspend and hibernation]
From: Anchal Agarwal @ 2020-06-16 21:49 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Boris Ostrovsky, tglx@linutronix.de, mingo@redhat.com,
	bp@alien8.de, hpa@zytor.com, x86@kernel.org, jgross@suse.com,
	linux-pm@vger.kernel.org, linux-mm@kvack.org, Kamata, Munehisa,
	sstabellini@kernel.org, konrad.wilk@oracle.com, axboe@kernel.dk,
	davem@davemloft.net, rjw@rjwysocki.net, len.brown@intel.com,
	pavel@ucw.cz, peterz@infradead.org, Valentin, Eduardo,
	Singh, Balbir, xen-devel@lists.xenproject.org,
	vkuznets@redhat.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Woodhouse, David,
	benh@kernel.crashing.org
In-Reply-To: <20200604070548.GH1195@Air-de-Roger>

On Thu, Jun 04, 2020 at 09:05:48AM +0200, Roger Pau Monné wrote:
> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> Hello,
> 
> On Wed, Jun 03, 2020 at 11:33:52PM +0000, Agarwal, Anchal wrote:
> >  CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> >
> >
> >
> >     On Tue, May 19, 2020 at 11:27:50PM +0000, Anchal Agarwal wrote:
> >     > From: Munehisa Kamata <kamatam@amazon.com>
> >     >
> >     > S4 power transition states are much different than xen
> >     > suspend/resume. Former is visible to the guest and frontend drivers should
> >     > be aware of the state transitions and should be able to take appropriate
> >     > actions when needed. In transition to S4 we need to make sure that at least
> >     > all the in-flight blkif requests get completed, since they probably contain
> >     > bits of the guest's memory image and that's not going to get saved any
> >     > other way. Hence, re-issuing of in-flight requests as in case of xen resume
> >     > will not work here. This is in contrast to xen-suspend where we need to
> >     > freeze with as little processing as possible to avoid dirtying RAM late in
> >     > the migration cycle and we know that in-flight data can wait.
> >     >
> >     > Add freeze, thaw and restore callbacks for PM suspend and hibernation
> >     > support. All frontend drivers that needs to use PM_HIBERNATION/PM_SUSPEND
> >     > events, need to implement these xenbus_driver callbacks. The freeze handler
> >     > stops block-layer queue and disconnect the frontend from the backend while
> >     > freeing ring_info and associated resources. Before disconnecting from the
> >     > backend, we need to prevent any new IO from being queued and wait for existing
> >     > IO to complete. Freeze/unfreeze of the queues will guarantee that there are no
> >     > requests in use on the shared ring. However, for sanity we should check
> >     > state of the ring before disconnecting to make sure that there are no
> >     > outstanding requests to be processed on the ring. The restore handler
> >     > re-allocates ring_info, unquiesces and unfreezes the queue and re-connect to
> >     > the backend, so that rest of the kernel can continue to use the block device
> >     > transparently.
> >     >
> >     > Note:For older backends,if a backend doesn't have commit'12ea729645ace'
> >     > xen/blkback: unmap all persistent grants when frontend gets disconnected,
> >     > the frontend may see massive amount of grant table warning when freeing
> >     > resources.
> >     > [   36.852659] deferring g.e. 0xf9 (pfn 0xffffffffffffffff)
> >     > [   36.855089] xen:grant_table: WARNING:e.g. 0x112 still in use!
> >     >
> >     > In this case, persistent grants would need to be disabled.
> >     >
> >     > [Anchal Changelog: Removed timeout/request during blkfront freeze.
> >     > Reworked the whole patch to work with blk-mq and incorporate upstream's
> >     > comments]
> >
> >     Please tag versions using vX and it would be helpful if you could list
> >     the specific changes that you performed between versions. There where
> >     3 RFC versions IIRC, and there's no log of the changes between them.
> >
> > I will elaborate on "upstream's comments" in my changelog in my next round of patches.
> 
> Sorry for being picky, but can you please make sure your email client
> properly quotes previous emails on reply. Note the lack of '>' added
> to the quoted parts of your reply.
That was just my outlook probably. Note taken.
> 
> >     > +                     }
> >     > +
> >     >                       break;
> >     > +             }
> >     > +
> >     > +             /*
> >     > +              * We may somehow receive backend's Closed again while thawing
> >     > +              * or restoring and it causes thawing or restoring to fail.
> >     > +              * Ignore such unexpected state regardless of the backend state.
> >     > +              */
> >     > +             if (info->connected == BLKIF_STATE_FROZEN) {
> >
> >     I think you can join this with the previous dev->state == XenbusStateClosed?
> >
> >     Also, won't the device be in the Closed state already if it's in state
> >     frozen?
> > Yes but I think this mostly due to a hypothetical case if during thawing backend switches to Closed state.
> > I am not entirely sure if that could happen. Could use some expertise here.
> 
> I think the frontend seeing the backend in the closed state during
> restore would be a bug that should prevent the frontend from
> resuming.
> 
> >     > +     /* Kick the backend to disconnect */
> >     > +     xenbus_switch_state(dev, XenbusStateClosing);
> >     > +
> >     > +     /*
> >     > +      * We don't want to move forward before the frontend is diconnected
> >     > +      * from the backend cleanly.
> >     > +      */
> >     > +     timeout = wait_for_completion_timeout(&info->wait_backend_disconnected,
> >     > +                                           timeout);
> >     > +     if (!timeout) {
> >     > +             err = -EBUSY;
> >
> >     Note err is only used here, and I think could just be dropped.
> >
> > This err is what's being returned from the function. Am I missing anything?
> 
> Just 'return -EBUSY;' directly, and remove the top level variable. You
> can also use -EBUSY directly in the xenbus_dev_error call. Anyway, not
> that important.
> 
> >     > +             xenbus_dev_error(dev, err, "Freezing timed out;"
> >     > +                              "the device may become inconsistent state");
> >
> >     Leaving the device in this state is quite bad, as it's in a closed
> >     state and with the queues frozen. You should make an attempt to
> >     restore things to a working state.
> >
> > You mean if backend closed after timeout? Is there a way to know that? I understand it's not good to
> > leave it in this state however, I am still trying to find if there is a good way to know if backend is still connected after timeout.
> > Hence the message " the device may become inconsistent state".  I didn't see a timeout not even once on my end so that's why
> > I may be looking for an alternate perspective here. may be need to thaw everything back intentionally is one thing I could think of.
> 
> You can manually force this state, and then check that it will behave
> correctly. I would expect that on a failure to disconnect from the
> backend you should switch the frontend to the 'Init' state in order to
> try to reconnect to the backend when possible.
> 
From what I understand forcing manually is, failing the freeze without
disconnect and try to revive the connection by unfreezing the
queues->reconnecting to backend [which never got diconnected]. May be even
tearing down things manually because I am not sure what state will frontend
see if backend fails to to disconnect at any point in time. I assumed connected.
Then again if its "CONNECTED" I may not need to tear down everything and start
from Initialising state because that may not work.

So I am not so sure about backend's state so much, lets say if  xen_blkif_disconnect fail,
I don't see it getting handled in the backend then what will be backend's state?
Will it still switch xenbus state to 'Closed'? If not what will frontend see, 
if it tries to read backend's state through xenbus_read_driver_state ?

So the flow be like:
Front end marks XenbusStateClosing
Backend marks its state as XenbusStateClosing
    Frontend marks XenbusStateClosed
    Backend disconnects calls xen_blkif_disconnect
       Backend fails to disconnect, the above function returns EBUSY
       What will be state of backend here? 
       Frontend did not tear down the rings if backend does not switches the
       state to 'Closed' in case of failure.

If backend stays in CONNECTED state, then even if we mark it Initialised in frontend, backend
won't be calling connect(). {From reading code in frontend_changed}
IMU, Initialising will fail since backend dev->state != XenbusStateClosed plus
we did not tear down anything so calling talk_to_blkback may not be needed

Does that sound correct?
> >     > +     }
> >     > +
> >     > +     return err;
> >     > +}
> >     > +
> >     > +static int blkfront_restore(struct xenbus_device *dev)
> >     > +{
> >     > +     struct blkfront_info *info = dev_get_drvdata(&dev->dev);
> >     > +     int err = 0;
> >     > +
> >     > +     err = talk_to_blkback(dev, info);
> >     > +     blk_mq_unquiesce_queue(info->rq);
> >     > +     blk_mq_unfreeze_queue(info->rq);
> >     > +     if (!err)
> >     > +         blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings);
> >
> >     Bad indentation. Also shouldn't you first update the queues and then
> >     unfreeze them?
> > Please correct me if I am wrong, blk_mq_update_nr_hw_queues freezes the queue
> > So I don't think the order could be reversed.
> 
> Regardless of what blk_mq_update_nr_hw_queues does, I don't think it's
> correct to unfreeze the queues without having updated them. Also the
> freezing/unfreezing uses a refcount, so I think it's perfectly fine to
> call blk_mq_update_nr_hw_queues first and then unfreeze the queues.
> 
> Also note that talk_to_blkback returning an error should likely
> prevent any unfreezing, as the queues won't be updated to match the
> parameters of the backend.
>
I think you are right here. Will send out fixes in V2
> Thanks, Roger.
> 
Thanks,
Anchal

^ permalink raw reply

* Re: [PATCH RFC v7 03/14] vhost: use batched get_vq_desc version
From: Michael S. Tsirkin @ 2020-06-16 22:08 UTC (permalink / raw)
  To: Eugenio Perez Martin
  Cc: linux-kernel, kvm list, virtualization, netdev, Jason Wang
In-Reply-To: <CAJaqyWeX7knekVPcsZ2+AAf8zvZhPvt46fZncAsLhwYJ3eUa1g@mail.gmail.com>

On Tue, Jun 16, 2020 at 05:23:43PM +0200, Eugenio Perez Martin wrote:
> On Mon, Jun 15, 2020 at 6:05 PM Eugenio Pérez <eperezma@redhat.com> wrote:
> >
> > On Thu, 2020-06-11 at 07:30 -0400, Michael S. Tsirkin wrote:
> > > On Wed, Jun 10, 2020 at 06:18:32PM +0200, Eugenio Perez Martin wrote:
> > > > On Wed, Jun 10, 2020 at 5:13 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > On Wed, Jun 10, 2020 at 02:37:50PM +0200, Eugenio Perez Martin wrote:
> > > > > > > +/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> > > > > > > + * A negative code is returned on error. */
> > > > > > > +static int fetch_descs(struct vhost_virtqueue *vq)
> > > > > > > +{
> > > > > > > +       int ret;
> > > > > > > +
> > > > > > > +       if (unlikely(vq->first_desc >= vq->ndescs)) {
> > > > > > > +               vq->first_desc = 0;
> > > > > > > +               vq->ndescs = 0;
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       if (vq->ndescs)
> > > > > > > +               return 1;
> > > > > > > +
> > > > > > > +       for (ret = 1;
> > > > > > > +            ret > 0 && vq->ndescs <= vhost_vq_num_batch_descs(vq);
> > > > > > > +            ret = fetch_buf(vq))
> > > > > > > +               ;
> > > > > >
> > > > > > (Expanding comment in V6):
> > > > > >
> > > > > > We get an infinite loop this way:
> > > > > > * vq->ndescs == 0, so we call fetch_buf() here
> > > > > > * fetch_buf gets less than vhost_vq_num_batch_descs(vq); descriptors. ret = 1
> > > > > > * This loop calls again fetch_buf, but vq->ndescs > 0 (and avail_vq ==
> > > > > > last_avail_vq), so it just return 1
> > > > >
> > > > > That's what
> > > > >          [PATCH RFC v7 08/14] fixup! vhost: use batched get_vq_desc version
> > > > > is supposed to fix.
> > > > >
> > > >
> > > > Sorry, I forgot to include that fixup.
> > > >
> > > > With it I don't see CPU stalls, but with that version latency has
> > > > increased a lot and I see packet lost:
> > > > + ping -c 5 10.200.0.1
> > > > PING 10.200.0.1 (10.200.0.1) 56(84) bytes of data.
> > > > > From 10.200.0.2 icmp_seq=1 Destination Host Unreachable
> > > > > From 10.200.0.2 icmp_seq=2 Destination Host Unreachable
> > > > > From 10.200.0.2 icmp_seq=3 Destination Host Unreachable
> > > > 64 bytes from 10.200.0.1: icmp_seq=5 ttl=64 time=6848 ms
> > > >
> > > > --- 10.200.0.1 ping statistics ---
> > > > 5 packets transmitted, 1 received, +3 errors, 80% packet loss, time 76ms
> > > > rtt min/avg/max/mdev = 6848.316/6848.316/6848.316/0.000 ms, pipe 4
> > > > --
> > > >
> > > > I cannot even use netperf.
> > >
> > > OK so that's the bug to try to find and fix I think.
> > >
> > >
> > > > If I modify with my proposed version:
> > > > + ping -c 5 10.200.0.1
> > > > PING 10.200.0.1 (10.200.0.1) 56(84) bytes of data.
> > > > 64 bytes from 10.200.0.1: icmp_seq=1 ttl=64 time=7.07 ms
> > > > 64 bytes from 10.200.0.1: icmp_seq=2 ttl=64 time=0.358 ms
> > > > 64 bytes from 10.200.0.1: icmp_seq=3 ttl=64 time=5.35 ms
> > > > 64 bytes from 10.200.0.1: icmp_seq=4 ttl=64 time=2.27 ms
> > > > 64 bytes from 10.200.0.1: icmp_seq=5 ttl=64 time=0.426 ms
> > >
> > > Not sure which version this is.
> > >
> > > > [root@localhost ~]# netperf -H 10.200.0.1 -p 12865 -l 10 -t TCP_STREAM
> > > > MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > > > 10.200.0.1 () port 0 AF_INET
> > > > Recv   Send    Send
> > > > Socket Socket  Message  Elapsed
> > > > Size   Size    Size     Time     Throughput
> > > > bytes  bytes   bytes    secs.    10^6bits/sec
> > > >
> > > > 131072  16384  16384    10.01    4742.36
> > > > [root@localhost ~]# netperf -H 10.200.0.1 -p 12865 -l 10 -t UDP_STREAM
> > > > MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > > > 10.200.0.1 () port 0 AF_INET
> > > > Socket  Message  Elapsed      Messages
> > > > Size    Size     Time         Okay Errors   Throughput
> > > > bytes   bytes    secs            #      #   10^6bits/sec
> > > >
> > > > 212992   65507   10.00        9214      0     482.83
> > > > 212992           10.00        9214            482.83
> > > >
> > > > I will compare with the non-batch version for reference, but the
> > > > difference between the two is noticeable. Maybe it's worth finding a
> > > > good value for the if() inside fetch_buf?
> > > >
> > > > Thanks!
> > > >
> > >
> > > I don't think it's performance, I think it's a bug somewhere,
> > > e.g. maybe we corrupt a packet, or stall the queue, or
> > > something like this.
> > >
> > > Let's do this, I will squash the fixups and post v8 so you can bisect
> > > and then debug cleanly.
> >
> > Ok, so if we apply the patch proposed in v7 08/14 (Or the version 8 of the patchset sent), this is what happens:
> >
> > 1. Userland (virtio_test in my case) introduces just one buffer in vq, and it kicks
> > 2. vhost module reaches fetch_descs, called from vhost_get_vq_desc. From there we call fetch_buf in a for loop.
> > 3. The first time we call fetch_buf, it returns properly one buffer. However, the second time we call it, it returns 0
> > because vq->avail_idx == vq->last_avail_idx and vq->avail_idx == last_avail_idx code path.
> > 4. fetch_descs assign ret = 0, so it returns 0. vhost_get_vq_desc will goto err, and it will signal no new buffer
> > (returning vq->num).
> >
> > So to fix it and maintain the batching maybe we could return vq->ndescs in case ret == 0:
> >
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index c0dfb5e3d2af..5993d4f34ca9 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -2315,7 +2327,8 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> >
> >         /* On success we expect some descs */
> >         BUG_ON(ret > 0 && !vq->ndescs);
> > -       return ret;
> > +       return ret ?: vq->ndescs;


I'd rather we used standard C. Also ret < 0 needs
to be handled. Also - what if fetch of some descs fails
but some succeeds?
What do we want to do?
Maybe:

return vq->ndescs ? vq->ndescs : ret;


> >  }
> >
> >  /* Reverse the effects of fetch_descs */
> > --
> >
> > Another possibility could be to return different codes from fetch_buf, but I find the suggested modification easier.
> >
> > What do you think?
> >
> > Thanks!
> >
> 
> Hi!
> 
> I can send a proposed RFC v9 in case it is more convenient for you.
> 
> Thanks!

Excellent, pls go ahead!
And can you include the performance numbers?
It's enough to test the final version.

-- 
MST


^ permalink raw reply

* Re: [PATCH bpf 2/2] selftests/bpf: add variable-length data concatenation pattern test
From: Daniel Borkmann @ 2020-06-16 22:23 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Andrii Nakryiko, bpf, Networking, Alexei Starovoitov, Kernel Team,
	Christoph Hellwig
In-Reply-To: <CAEf4BzZQXKBFNqAtadcK6UArfgMDQ--5P0XA1m2f_d8KG6YRtg@mail.gmail.com>

On 6/16/20 11:27 PM, Andrii Nakryiko wrote:
> On Tue, Jun 16, 2020 at 1:21 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 6/16/20 7:04 AM, Andrii Nakryiko wrote:
>>> Add selftest that validates variable-length data reading and concatentation
>>> with one big shared data array. This is a common pattern in production use for
>>> monitoring and tracing applications, that potentially can read a lot of data,
>>> but usually reads much less. Such pattern allows to determine precisely what
>>> amount of data needs to be sent over perfbuf/ringbuf and maximize efficiency.
>>>
>>> This is the first BPF selftest that at all looks at and tests
>>> bpf_probe_read_str()-like helper's return value, closing a major gap in BPF
>>> testing. It surfaced the problem with bpf_probe_read_kernel_str() returning
>>> 0 on success, instead of amount of bytes successfully read.
>>>
>>> Signed-off-by: Andrii Nakryiko <andriin@fb.com>
>>
>> Fix looks good, but I'm seeing an issue in the selftest on my side. With latest
>> Clang/LLVM I'm getting:
>>
>> # ./test_progs -t varlen
>> #86 varlen:OK
>> Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED
>>
>> All good, however, the test_progs-no_alu32 fails for me with:
> 
> Yeah, same here. It's due to Clang emitting unnecessary bit shifts
> because bpf_probe_read_kernel_str() is defined as returning 32-bit
> int. I have a patch ready locally, just waiting for bpf-next to open,
> which switches those helpers to return long, which auto-matically
> fixes this test.
> 
> If it's not a problem, I'd just wait for that patch to go into
> bpf-next. If not, I can sprinkle bits of assembly magic around to
> force the kernel to do those bitshifts earlier. But I figured having
> test_progs-no_alu32 failing one selftest temporarily wasn't too bad.

Given {net,bpf}-next will open up soon, another option could be to take in the fix
itself to bpf and selftest would be submitted together with your other improvement;
any objections?

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH 06/12] xen-blkfront: add callbacks for PM suspend and hibernation]
From: Anchal Agarwal @ 2020-06-16 22:30 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: Boris Ostrovsky, tglx@linutronix.de, mingo@redhat.com,
	bp@alien8.de, hpa@zytor.com, x86@kernel.org, jgross@suse.com,
	linux-pm@vger.kernel.org, linux-mm@kvack.org, Kamata, Munehisa,
	sstabellini@kernel.org, konrad.wilk@oracle.com, axboe@kernel.dk,
	davem@davemloft.net, rjw@rjwysocki.net, len.brown@intel.com,
	pavel@ucw.cz, peterz@infradead.org, Valentin, Eduardo,
	Singh, Balbir, xen-devel@lists.xenproject.org,
	vkuznets@redhat.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Woodhouse, David,
	benh@kernel.crashing.org
In-Reply-To: <20200616214925.GA21684@dev-dsk-anchalag-2a-9c2d1d96.us-west-2.amazon.com>

On Tue, Jun 16, 2020 at 09:49:25PM +0000, Anchal Agarwal wrote:
> On Thu, Jun 04, 2020 at 09:05:48AM +0200, Roger Pau Monné wrote:
> > CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> > 
> > 
> > 
> > Hello,
> > 
> > On Wed, Jun 03, 2020 at 11:33:52PM +0000, Agarwal, Anchal wrote:
> > >  CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> > >
> > >
> > >
> > >     On Tue, May 19, 2020 at 11:27:50PM +0000, Anchal Agarwal wrote:
> > >     > From: Munehisa Kamata <kamatam@amazon.com>
> > >     >
> > >     > S4 power transition states are much different than xen
> > >     > suspend/resume. Former is visible to the guest and frontend drivers should
> > >     > be aware of the state transitions and should be able to take appropriate
> > >     > actions when needed. In transition to S4 we need to make sure that at least
> > >     > all the in-flight blkif requests get completed, since they probably contain
> > >     > bits of the guest's memory image and that's not going to get saved any
> > >     > other way. Hence, re-issuing of in-flight requests as in case of xen resume
> > >     > will not work here. This is in contrast to xen-suspend where we need to
> > >     > freeze with as little processing as possible to avoid dirtying RAM late in
> > >     > the migration cycle and we know that in-flight data can wait.
> > >     >
> > >     > Add freeze, thaw and restore callbacks for PM suspend and hibernation
> > >     > support. All frontend drivers that needs to use PM_HIBERNATION/PM_SUSPEND
> > >     > events, need to implement these xenbus_driver callbacks. The freeze handler
> > >     > stops block-layer queue and disconnect the frontend from the backend while
> > >     > freeing ring_info and associated resources. Before disconnecting from the
> > >     > backend, we need to prevent any new IO from being queued and wait for existing
> > >     > IO to complete. Freeze/unfreeze of the queues will guarantee that there are no
> > >     > requests in use on the shared ring. However, for sanity we should check
> > >     > state of the ring before disconnecting to make sure that there are no
> > >     > outstanding requests to be processed on the ring. The restore handler
> > >     > re-allocates ring_info, unquiesces and unfreezes the queue and re-connect to
> > >     > the backend, so that rest of the kernel can continue to use the block device
> > >     > transparently.
> > >     >
> > >     > Note:For older backends,if a backend doesn't have commit'12ea729645ace'
> > >     > xen/blkback: unmap all persistent grants when frontend gets disconnected,
> > >     > the frontend may see massive amount of grant table warning when freeing
> > >     > resources.
> > >     > [   36.852659] deferring g.e. 0xf9 (pfn 0xffffffffffffffff)
> > >     > [   36.855089] xen:grant_table: WARNING:e.g. 0x112 still in use!
> > >     >
> > >     > In this case, persistent grants would need to be disabled.
> > >     >
> > >     > [Anchal Changelog: Removed timeout/request during blkfront freeze.
> > >     > Reworked the whole patch to work with blk-mq and incorporate upstream's
> > >     > comments]
> > >
> > >     Please tag versions using vX and it would be helpful if you could list
> > >     the specific changes that you performed between versions. There where
> > >     3 RFC versions IIRC, and there's no log of the changes between them.
> > >
> > > I will elaborate on "upstream's comments" in my changelog in my next round of patches.
> > 
> > Sorry for being picky, but can you please make sure your email client
> > properly quotes previous emails on reply. Note the lack of '>' added
> > to the quoted parts of your reply.
> That was just my outlook probably. Note taken.
> > 
> > >     > +                     }
> > >     > +
> > >     >                       break;
> > >     > +             }
> > >     > +
> > >     > +             /*
> > >     > +              * We may somehow receive backend's Closed again while thawing
> > >     > +              * or restoring and it causes thawing or restoring to fail.
> > >     > +              * Ignore such unexpected state regardless of the backend state.
> > >     > +              */
> > >     > +             if (info->connected == BLKIF_STATE_FROZEN) {
> > >
> > >     I think you can join this with the previous dev->state == XenbusStateClosed?
> > >
> > >     Also, won't the device be in the Closed state already if it's in state
> > >     frozen?
> > > Yes but I think this mostly due to a hypothetical case if during thawing backend switches to Closed state.
> > > I am not entirely sure if that could happen. Could use some expertise here.
> > 
> > I think the frontend seeing the backend in the closed state during
> > restore would be a bug that should prevent the frontend from
> > resuming.
> > 
> > >     > +     /* Kick the backend to disconnect */
> > >     > +     xenbus_switch_state(dev, XenbusStateClosing);
> > >     > +
> > >     > +     /*
> > >     > +      * We don't want to move forward before the frontend is diconnected
> > >     > +      * from the backend cleanly.
> > >     > +      */
> > >     > +     timeout = wait_for_completion_timeout(&info->wait_backend_disconnected,
> > >     > +                                           timeout);
> > >     > +     if (!timeout) {
> > >     > +             err = -EBUSY;
> > >
> > >     Note err is only used here, and I think could just be dropped.
> > >
> > > This err is what's being returned from the function. Am I missing anything?
> > 
> > Just 'return -EBUSY;' directly, and remove the top level variable. You
> > can also use -EBUSY directly in the xenbus_dev_error call. Anyway, not
> > that important.
> > 
> > >     > +             xenbus_dev_error(dev, err, "Freezing timed out;"
> > >     > +                              "the device may become inconsistent state");
> > >
> > >     Leaving the device in this state is quite bad, as it's in a closed
> > >     state and with the queues frozen. You should make an attempt to
> > >     restore things to a working state.
> > >
> > > You mean if backend closed after timeout? Is there a way to know that? I understand it's not good to
> > > leave it in this state however, I am still trying to find if there is a good way to know if backend is still connected after timeout.
> > > Hence the message " the device may become inconsistent state".  I didn't see a timeout not even once on my end so that's why
> > > I may be looking for an alternate perspective here. may be need to thaw everything back intentionally is one thing I could think of.
> > 
> > You can manually force this state, and then check that it will behave
> > correctly. I would expect that on a failure to disconnect from the
> > backend you should switch the frontend to the 'Init' state in order to
> > try to reconnect to the backend when possible.
> > 
> From what I understand forcing manually is, failing the freeze without
> disconnect and try to revive the connection by unfreezing the
> queues->reconnecting to backend [which never got diconnected]. May be even
> tearing down things manually because I am not sure what state will frontend
> see if backend fails to to disconnect at any point in time. I assumed connected.
> Then again if its "CONNECTED" I may not need to tear down everything and start
> from Initialising state because that may not work.
> 
> So I am not so sure about backend's state so much, lets say if  xen_blkif_disconnect fail,
> I don't see it getting handled in the backend then what will be backend's state?
> Will it still switch xenbus state to 'Closed'? If not what will frontend see, 
> if it tries to read backend's state through xenbus_read_driver_state ?
> 
> So the flow be like:
> Front end marks XenbusStateClosing
> Backend marks its state as XenbusStateClosing
>     Frontend marks XenbusStateClosed
>     Backend disconnects calls xen_blkif_disconnect
>        Backend fails to disconnect, the above function returns EBUSY
>        What will be state of backend here? 
>        Frontend did not tear down the rings if backend does not switches the
>        state to 'Closed' in case of failure.
> 
> If backend stays in CONNECTED state, then even if we mark it Initialised in frontend, backend
> won't be calling connect(). {From reading code in frontend_changed}
> IMU, Initialising will fail since backend dev->state != XenbusStateClosed plus
> we did not tear down anything so calling talk_to_blkback may not be needed
> 
> Does that sound correct?
Send that too quickly, I also meant to add XenBusIntialised state should be ok
only if we expect backend will stay in "Connected" state. Also, I experimented
with that notion. I am little worried about the correctness here. 
Can the backend  come to an Unknown state somehow?
> > >     > +     }
> > >     > +
> > >     > +     return err;
> > >     > +}
> > >     > +
> > >     > +static int blkfront_restore(struct xenbus_device *dev)
> > >     > +{
> > >     > +     struct blkfront_info *info = dev_get_drvdata(&dev->dev);
> > >     > +     int err = 0;
> > >     > +
> > >     > +     err = talk_to_blkback(dev, info);
> > >     > +     blk_mq_unquiesce_queue(info->rq);
> > >     > +     blk_mq_unfreeze_queue(info->rq);
> > >     > +     if (!err)
> > >     > +         blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings);
> > >
> > >     Bad indentation. Also shouldn't you first update the queues and then
> > >     unfreeze them?
> > > Please correct me if I am wrong, blk_mq_update_nr_hw_queues freezes the queue
> > > So I don't think the order could be reversed.
> > 
> > Regardless of what blk_mq_update_nr_hw_queues does, I don't think it's
> > correct to unfreeze the queues without having updated them. Also the
> > freezing/unfreezing uses a refcount, so I think it's perfectly fine to
> > call blk_mq_update_nr_hw_queues first and then unfreeze the queues.
> > 
> > Also note that talk_to_blkback returning an error should likely
> > prevent any unfreezing, as the queues won't be updated to match the
> > parameters of the backend.
> >
> I think you are right here. Will send out fixes in V2
> > Thanks, Roger.
> > 
> Thanks,
> Anchal
> 

^ permalink raw reply

* [PATCH][next] ath10k: wmi: Use struct_size() helper in ath10k_wmi_alloc_skb()
From: Gustavo A. R. Silva @ 2020-06-16 22:51 UTC (permalink / raw)
  To: Kalle Valo, David S. Miller, Jakub Kicinski
  Cc: ath10k, linux-wireless, netdev, linux-kernel, Gustavo A. R. Silva

Make use of the struct_size() helper instead of an open-coded version
in order to avoid any potential type mistakes. Also, remove unnecessary
variable _len_.

This code was detected with the help of Coccinelle and, audited and
fixed manually.

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 drivers/net/wireless/ath/ath10k/wmi.c | 32 +++++++--------------------
 1 file changed, 8 insertions(+), 24 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index a81a1ab2de19..b89681394a15 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -6551,7 +6551,7 @@ static struct sk_buff *ath10k_wmi_op_gen_init(struct ath10k *ar)
 	struct wmi_init_cmd *cmd;
 	struct sk_buff *buf;
 	struct wmi_resource_config config = {};
-	u32 len, val;
+	u32 val;
 
 	config.num_vdevs = __cpu_to_le32(TARGET_NUM_VDEVS);
 	config.num_peers = __cpu_to_le32(TARGET_NUM_PEERS);
@@ -6603,10 +6603,7 @@ static struct sk_buff *ath10k_wmi_op_gen_init(struct ath10k *ar)
 	config.num_msdu_desc = __cpu_to_le32(TARGET_NUM_MSDU_DESC);
 	config.max_frag_entries = __cpu_to_le32(TARGET_MAX_FRAG_ENTRIES);
 
-	len = sizeof(*cmd) +
-	      (sizeof(struct host_memory_chunk) * ar->wmi.num_mem_chunks);
-
-	buf = ath10k_wmi_alloc_skb(ar, len);
+	buf = ath10k_wmi_alloc_skb(ar, struct_size(cmd, mem_chunks.items, ar->wmi.num_mem_chunks));
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
@@ -6624,7 +6621,7 @@ static struct sk_buff *ath10k_wmi_10_1_op_gen_init(struct ath10k *ar)
 	struct wmi_init_cmd_10x *cmd;
 	struct sk_buff *buf;
 	struct wmi_resource_config_10x config = {};
-	u32 len, val;
+	u32 val;
 
 	config.num_vdevs = __cpu_to_le32(TARGET_10X_NUM_VDEVS);
 	config.num_peers = __cpu_to_le32(TARGET_10X_NUM_PEERS);
@@ -6668,10 +6665,7 @@ static struct sk_buff *ath10k_wmi_10_1_op_gen_init(struct ath10k *ar)
 	config.num_msdu_desc = __cpu_to_le32(TARGET_10X_NUM_MSDU_DESC);
 	config.max_frag_entries = __cpu_to_le32(TARGET_10X_MAX_FRAG_ENTRIES);
 
-	len = sizeof(*cmd) +
-	      (sizeof(struct host_memory_chunk) * ar->wmi.num_mem_chunks);
-
-	buf = ath10k_wmi_alloc_skb(ar, len);
+	buf = ath10k_wmi_alloc_skb(ar, struct_size(cmd, mem_chunks.items, ar->wmi.num_mem_chunks));
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
@@ -6689,7 +6683,7 @@ static struct sk_buff *ath10k_wmi_10_2_op_gen_init(struct ath10k *ar)
 	struct wmi_init_cmd_10_2 *cmd;
 	struct sk_buff *buf;
 	struct wmi_resource_config_10x config = {};
-	u32 len, val, features;
+	u32 val, features;
 
 	config.num_vdevs = __cpu_to_le32(TARGET_10X_NUM_VDEVS);
 	config.num_peer_keys = __cpu_to_le32(TARGET_10X_NUM_PEER_KEYS);
@@ -6741,10 +6735,7 @@ static struct sk_buff *ath10k_wmi_10_2_op_gen_init(struct ath10k *ar)
 	config.num_msdu_desc = __cpu_to_le32(TARGET_10X_NUM_MSDU_DESC);
 	config.max_frag_entries = __cpu_to_le32(TARGET_10X_MAX_FRAG_ENTRIES);
 
-	len = sizeof(*cmd) +
-	      (sizeof(struct host_memory_chunk) * ar->wmi.num_mem_chunks);
-
-	buf = ath10k_wmi_alloc_skb(ar, len);
+	buf = ath10k_wmi_alloc_skb(ar, struct_size(cmd, mem_chunks.items, ar->wmi.num_mem_chunks));
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
@@ -6776,7 +6767,6 @@ static struct sk_buff *ath10k_wmi_10_4_op_gen_init(struct ath10k *ar)
 	struct wmi_init_cmd_10_4 *cmd;
 	struct sk_buff *buf;
 	struct wmi_resource_config_10_4 config = {};
-	u32 len;
 
 	config.num_vdevs = __cpu_to_le32(ar->max_num_vdevs);
 	config.num_peers = __cpu_to_le32(ar->max_num_peers);
@@ -6838,10 +6828,7 @@ static struct sk_buff *ath10k_wmi_10_4_op_gen_init(struct ath10k *ar)
 	config.iphdr_pad_config = __cpu_to_le32(TARGET_10_4_IPHDR_PAD_CONFIG);
 	config.qwrap_config = __cpu_to_le32(TARGET_10_4_QWRAP_CONFIG);
 
-	len = sizeof(*cmd) +
-	      (sizeof(struct host_memory_chunk) * ar->wmi.num_mem_chunks);
-
-	buf = ath10k_wmi_alloc_skb(ar, len);
+	buf = ath10k_wmi_alloc_skb(ar, struct_size(cmd, mem_chunks.items, ar->wmi.num_mem_chunks));
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
@@ -7549,12 +7536,9 @@ ath10k_wmi_op_gen_scan_chan_list(struct ath10k *ar,
 	struct sk_buff *skb;
 	struct wmi_channel_arg *ch;
 	struct wmi_channel *ci;
-	int len;
 	int i;
 
-	len = sizeof(*cmd) + arg->n_channels * sizeof(struct wmi_channel);
-
-	skb = ath10k_wmi_alloc_skb(ar, len);
+	skb = ath10k_wmi_alloc_skb(ar, struct_size(cmd, chan_info, arg->n_channels));
 	if (!skb)
 		return ERR_PTR(-EINVAL);
 
-- 
2.27.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox