Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH RFC 1/4] net: phy: swphy: emulate register MII_ESTATUS
From: Andrew Lunn @ 2019-08-14 15:34 UTC (permalink / raw)
  To: Heiner Kallweit
  Cc: Florian Fainelli, Marek Behun, David Miller,
	netdev@vger.kernel.org
In-Reply-To: <5e3d85cd-43b6-c581-be99-b6b0cf025771@gmail.com>

On Tue, Aug 13, 2019 at 11:24:56PM +0200, Heiner Kallweit wrote:
> When the genphy driver binds to a swphy it will call
> genphy_read_abilites that will try to read MII_ESTATUS if BMSR_ESTATEN
> is set in MII_BMSR. So far this would read the default value 0xffff
> and 1000FD and 1000HD are reported as supported just by chance.
> Better add explicit support for emulating MII_ESTATUS.
> 
> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* [patch net-next v3 0/2] netdevsim: implement support for devlink region and snapshots
From: Jiri Pirko @ 2019-08-14 15:37 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, mlxsw

From: Jiri Pirko <jiri@mellanox.com>

Implement devlink region support for netdevsim and test it.

---
Note the selftest patch depends on "[patch net-next] selftests:
netdevsim: add devlink params tests" patch sent earlier today.

Jiri Pirko (2):
  netdevsim: implement support for devlink region and snapshots
  selftests: netdevsim: add devlink regions tests

 drivers/net/netdevsim/dev.c                   | 63 ++++++++++++++++++-
 drivers/net/netdevsim/netdevsim.h             |  1 +
 .../drivers/net/netdevsim/devlink.sh          | 54 +++++++++++++++-
 3 files changed, 116 insertions(+), 2 deletions(-)

-- 
2.21.0


^ permalink raw reply

* [patch net-next v3 1/2] netdevsim: implement support for devlink region and snapshots
From: Jiri Pirko @ 2019-08-14 15:37 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, mlxsw
In-Reply-To: <20190814153735.6923-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Implement dummy region of size 32K and allow user to create snapshots
or random data using debugfs file trigger.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v2->v3:
- return -ENOMEM in case dummy_data cannot be allocated
  and don't print out error message.
- return err in case snapshot creation fails and kfree dummy_data.
- use PTR_ERR_OR_ZERO in nsim_dev_dummy_region_init().
---
 drivers/net/netdevsim/dev.c       | 63 ++++++++++++++++++++++++++++++-
 drivers/net/netdevsim/netdevsim.h |  1 +
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 08ca59fc189b..125a0358bc04 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -27,6 +27,41 @@
 
 static struct dentry *nsim_dev_ddir;
 
+#define NSIM_DEV_DUMMY_REGION_SIZE (1024 * 32)
+
+static ssize_t nsim_dev_take_snapshot_write(struct file *file,
+					    const char __user *data,
+					    size_t count, loff_t *ppos)
+{
+	struct nsim_dev *nsim_dev = file->private_data;
+	void *dummy_data;
+	u32 id;
+	int err;
+
+	dummy_data = kmalloc(NSIM_DEV_DUMMY_REGION_SIZE, GFP_KERNEL);
+	if (!dummy_data)
+		return -ENOMEM;
+
+	get_random_bytes(dummy_data, NSIM_DEV_DUMMY_REGION_SIZE);
+
+	id = devlink_region_shapshot_id_get(priv_to_devlink(nsim_dev));
+	err = devlink_region_snapshot_create(nsim_dev->dummy_region,
+					     dummy_data, id, kfree);
+	if (err) {
+		pr_err("Failed to create region snapshot\n");
+		kfree(dummy_data);
+		return err;
+	}
+
+	return count;
+}
+
+static const struct file_operations nsim_dev_take_snapshot_fops = {
+	.open = simple_open,
+	.write = nsim_dev_take_snapshot_write,
+	.llseek = generic_file_llseek,
+};
+
 static int nsim_dev_debugfs_init(struct nsim_dev *nsim_dev)
 {
 	char dev_ddir_name[16];
@@ -44,6 +79,8 @@ static int nsim_dev_debugfs_init(struct nsim_dev *nsim_dev)
 			   &nsim_dev->max_macs);
 	debugfs_create_bool("test1", 0600, nsim_dev->ddir,
 			    &nsim_dev->test1);
+	debugfs_create_file("take_snapshot", 0200, nsim_dev->ddir, nsim_dev,
+			    &nsim_dev_take_snapshot_fops);
 	return 0;
 }
 
@@ -248,6 +285,23 @@ static void nsim_devlink_param_load_driverinit_values(struct devlink *devlink)
 		nsim_dev->test1 = saved_value.vbool;
 }
 
+#define NSIM_DEV_DUMMY_REGION_SNAPSHOT_MAX 16
+
+static int nsim_dev_dummy_region_init(struct nsim_dev *nsim_dev,
+				      struct devlink *devlink)
+{
+	nsim_dev->dummy_region =
+		devlink_region_create(devlink, "dummy",
+				      NSIM_DEV_DUMMY_REGION_SNAPSHOT_MAX,
+				      NSIM_DEV_DUMMY_REGION_SIZE);
+	return PTR_ERR_OR_ZERO(nsim_dev->dummy_region);
+}
+
+static void nsim_dev_dummy_region_exit(struct nsim_dev *nsim_dev)
+{
+	devlink_region_destroy(nsim_dev->dummy_region);
+}
+
 static int nsim_dev_reload(struct devlink *devlink,
 			   struct netlink_ext_ack *extack)
 {
@@ -363,10 +417,14 @@ nsim_dev_create(struct nsim_bus_dev *nsim_bus_dev, unsigned int port_count)
 		goto err_dl_unregister;
 	nsim_devlink_set_params_init_values(nsim_dev, devlink);
 
-	err = nsim_dev_debugfs_init(nsim_dev);
+	err = nsim_dev_dummy_region_init(nsim_dev, devlink);
 	if (err)
 		goto err_params_unregister;
 
+	err = nsim_dev_debugfs_init(nsim_dev);
+	if (err)
+		goto err_dummy_region_exit;
+
 	err = nsim_bpf_dev_init(nsim_dev);
 	if (err)
 		goto err_debugfs_exit;
@@ -376,6 +434,8 @@ nsim_dev_create(struct nsim_bus_dev *nsim_bus_dev, unsigned int port_count)
 
 err_debugfs_exit:
 	nsim_dev_debugfs_exit(nsim_dev);
+err_dummy_region_exit:
+	nsim_dev_dummy_region_exit(nsim_dev);
 err_params_unregister:
 	devlink_params_unregister(devlink, nsim_devlink_params,
 				  ARRAY_SIZE(nsim_devlink_params));
@@ -396,6 +456,7 @@ static void nsim_dev_destroy(struct nsim_dev *nsim_dev)
 
 	nsim_bpf_dev_exit(nsim_dev);
 	nsim_dev_debugfs_exit(nsim_dev);
+	nsim_dev_dummy_region_exit(nsim_dev);
 	devlink_params_unregister(devlink, nsim_devlink_params,
 				  ARRAY_SIZE(nsim_devlink_params));
 	devlink_unregister(devlink);
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 95751a817508..4c758c6919f5 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -160,6 +160,7 @@ struct nsim_dev {
 	bool fw_update_status;
 	u32 max_macs;
 	bool test1;
+	struct devlink_region *dummy_region;
 };
 
 int nsim_dev_init(void);
-- 
2.21.0


^ permalink raw reply related

* [patch net-next v3 2/2] selftests: netdevsim: add devlink regions tests
From: Jiri Pirko @ 2019-08-14 15:37 UTC (permalink / raw)
  To: netdev; +Cc: davem, jakub.kicinski, mlxsw
In-Reply-To: <20190814153735.6923-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Test netdevsim devlink region implementation.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2:
- new patch
---
 .../drivers/net/netdevsim/devlink.sh          | 54 ++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
index 6828e9404460..115837355eaf 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
@@ -3,7 +3,7 @@
 
 lib_dir=$(dirname $0)/../../../net/forwarding
 
-ALL_TESTS="fw_flash_test params_test"
+ALL_TESTS="fw_flash_test params_test regions_test"
 NUM_NETIFS=0
 source $lib_dir/lib.sh
 
@@ -90,6 +90,58 @@ params_test()
 	log_test "params test"
 }
 
+check_region_size()
+{
+	local name=$1
+	local size
+
+	size=$(devlink region show $DL_HANDLE/$name -j | jq -e -r '.[][].size')
+	check_err $? "Failed to get $name region size"
+	[ $size -eq 32768 ]
+	check_err $? "Invalid $name region size"
+}
+
+check_region_snapshot_count()
+{
+	local name=$1
+	local phase_name=$2
+	local expected_count=$3
+	local count
+
+	count=$(devlink region show $DL_HANDLE/$name -j | jq -e -r '.[][].snapshot | length')
+	[ $count -eq $expected_count ]
+	check_err $? "Unexpected $phase_name snapshot count"
+}
+
+regions_test()
+{
+	RET=0
+
+	local count
+
+	check_region_size dummy
+	check_region_snapshot_count dummy initial 0
+
+	echo ""> $DEBUGFS_DIR/take_snapshot
+	check_err $? "Failed to take first dummy region snapshot"
+	check_region_snapshot_count dummy post-first-snapshot 1
+
+	echo ""> $DEBUGFS_DIR/take_snapshot
+	check_err $? "Failed to take second dummy region snapshot"
+	check_region_snapshot_count dummy post-second-snapshot 2
+
+	echo ""> $DEBUGFS_DIR/take_snapshot
+	check_err $? "Failed to take third dummy region snapshot"
+	check_region_snapshot_count dummy post-third-snapshot 3
+
+	devlink region del $DL_HANDLE/dummy snapshot 1
+	check_err $? "Failed to delete first dummy region snapshot"
+
+	check_region_snapshot_count dummy post-first-delete 2
+
+	log_test "regions test"
+}
+
 setup_prepare()
 {
 	modprobe netdevsim
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH bpf-next v4 8/8] net/mlx5e: Add AF_XDP need_wakeup support
From: Jonathan Lemon @ 2019-08-14 15:40 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: bjorn.topel, ast, daniel, netdev, brouer, maximmi, bpf,
	bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	kiran.patil, axboe, maciej.fijalkowski, maciejromanfijalkowski,
	intel-wired-lan
In-Reply-To: <1565767643-4908-9-git-send-email-magnus.karlsson@intel.com>

On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:

> From: Maxim Mikityanskiy <maximmi@mellanox.com>
>
> This commit adds support for the new need_wakeup feature of AF_XDP. The
> applications can opt-in by using the XDP_USE_NEED_WAKEUP bind() flag.
> When this feature is enabled, some behavior changes:
>
> RX side: If the Fill Ring is empty, instead of busy-polling, set the
> flag to tell the application to kick the driver when it refills the Fill
> Ring.
>
> TX side: If there are pending completions or packets queued for
> transmission, set the flag to tell the application that it can skip the
> sendto() syscall and save time.
>
> The performance testing was performed on a machine with the following
> configuration:
>
> - 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz
> - Mellanox ConnectX-5 Ex with 100 Gbit/s link
>
> The results with retpoline disabled:
>
>        | without need_wakeup  | with need_wakeup     |
>        |----------------------|----------------------|
>        | one core | two cores | one core | two cores |
> -------|----------|-----------|----------|-----------|
> txonly | 20.1     | 33.5      | 29.0     | 34.2      |
> rxdrop | 0.065    | 14.1      | 12.0     | 14.1      |
> l2fwd  | 0.032    | 7.3       | 6.6      | 7.2       |
>
> "One core" means the application and NAPI run on the same core. "Two
> cores" means they are pinned to different cores.
>
> Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
> Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
> Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>

^ permalink raw reply

* Re: [PATCH net v2] ibmveth: Convert multicast list size for little-endian system
From: Thomas Falcon @ 2019-08-14 15:41 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, liuhangbin, davem, joe
In-Reply-To: <20190813194037.464bea2c@cakuba.netronome.com>


On 8/13/19 9:43 PM, Jakub Kicinski wrote:
> On Mon, 12 Aug 2019 16:13:06 -0500, Thomas Falcon wrote:
>> The ibm,mac-address-filters property defines the maximum number of
>> addresses the hypervisor's multicast filter list can support. It is
>> encoded as a big-endian integer in the OF device tree, but the virtual
>> ethernet driver does not convert it for use by little-endian systems.
>> As a result, the driver is not behaving as it should on affected systems
>> when a large number of multicast addresses are assigned to the device.
>>
>> Reported-by: Hangbin Liu <liuhangbin@gmail.com>
>> Signed-off-by: Thomas Falcon <tlfalcon@linux.ibm.com>
> Okay, applied, but:

Thanks!

...

> ibmveth_init_link_settings() is part of your net-next patch which
> you're respining, so I had to apply manually. Please double check your
> patches apply cleanly to the designated tree.

Sorry about that, I thought I fixed that but maybe I got patches mixed 
up somehow.  Thanks again.

Tom


^ permalink raw reply

* Re: [PATCH bpf-next v4 3/8] i40e: add support for AF_XDP need_wakeup feature
From: Jonathan Lemon @ 2019-08-14 15:41 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: bjorn.topel, ast, daniel, netdev, brouer, maximmi, bpf,
	bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	kiran.patil, axboe, maciej.fijalkowski, maciejromanfijalkowski,
	intel-wired-lan
In-Reply-To: <1565767643-4908-4-git-send-email-magnus.karlsson@intel.com>

On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:

> This patch adds support for the need_wakeup feature of AF_XDP. If the
> application has told the kernel that it might sleep using the new bind
> flag XDP_USE_NEED_WAKEUP, the driver will then set this flag if it has
> no more buffers on the NIC Rx ring and yield to the application. For
> Tx, it will set the flag if it has no outstanding Tx completion
> interrupts and return to the application.
>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>

^ permalink raw reply

* Re: [PATCH bpf-next v4 4/8] ixgbe: add support for AF_XDP need_wakeup feature
From: Jonathan Lemon @ 2019-08-14 15:41 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: bjorn.topel, ast, daniel, netdev, brouer, maximmi, bpf,
	bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	kiran.patil, axboe, maciej.fijalkowski, maciejromanfijalkowski,
	intel-wired-lan
In-Reply-To: <1565767643-4908-5-git-send-email-magnus.karlsson@intel.com>



On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:

> This patch adds support for the need_wakeup feature of AF_XDP. If the
> application has told the kernel that it might sleep using the new bind
> flag XDP_USE_NEED_WAKEUP, the driver will then set this flag if it has
> no more buffers on the NIC Rx ring and yield to the application. For
> Tx, it will set the flag if it has no outstanding Tx completion
> interrupts and return to the application.
>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>

^ permalink raw reply

* Re: [PATCH bpf-next v4 3/8] i40e: add support for AF_XDP need_wakeup feature
From: Jonathan Lemon @ 2019-08-14 15:42 UTC (permalink / raw)
  To: Magnus Karlsson
  Cc: Magnus Karlsson, Björn Töpel, Alexei Starovoitov,
	Daniel Borkmann, Network Development, Jesper Dangaard Brouer,
	Maxim Mikityanskiy, bpf, bruce.richardson, ciara.loftus,
	Jakub Kicinski, Ye Xiaolong, Zhang, Qi Z, Samudrala, Sridhar,
	Kevin Laatz, ilias.apalodimas, Kiran, axboe, Fijalkowski, Maciej,
	Maciej Fijalkowski, intel-wired-lan
In-Reply-To: <CAJ8uoz0Tnb=i-LkGqLU87be9BuYqxmu2pN1Mte0UEWA2+f8bTQ@mail.gmail.com>



On 14 Aug 2019, at 7:59, Magnus Karlsson wrote:

> On Wed, Aug 14, 2019 at 4:48 PM Jonathan Lemon 
> <jonathan.lemon@gmail.com> wrote:
>>
>>
>>
>> On 14 Aug 2019, at 0:27, Magnus Karlsson wrote:
>>
>>> This patch adds support for the need_wakeup feature of AF_XDP. If 
>>> the
>>> application has told the kernel that it might sleep using the new 
>>> bind
>>> flag XDP_USE_NEED_WAKEUP, the driver will then set this flag if it 
>>> has
>>> no more buffers on the NIC Rx ring and yield to the application. For
>>> Tx, it will set the flag if it has no outstanding Tx completion
>>> interrupts and return to the application.
>>>
>>> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
>>> ---
>>>  drivers/net/ethernet/intel/i40e/i40e_xsk.c | 18 ++++++++++++++++++
>>>  1 file changed, 18 insertions(+)
>>>
>>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>>> b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>>> index d0ff5d8..42c9012 100644
>>> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>>> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
>>> @@ -626,6 +626,15 @@ int i40e_clean_rx_irq_zc(struct i40e_ring
>>> *rx_ring, int budget)
>>>
>>>       i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
>>>       i40e_update_rx_stats(rx_ring, total_rx_bytes, 
>>> total_rx_packets);
>>> +
>>> +     if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
>>> +             if (failure || rx_ring->next_to_clean == 
>>> rx_ring->next_to_use)
>>> +                     xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
>>> +             else
>>> +                     xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
>>> +
>>> +             return (int)total_rx_packets;
>>> +     }
>>>       return failure ? budget : (int)total_rx_packets;
>>
>> Can you elaborate why we're not returning the total budget on failure
>> for the wakeup case?
>
> In the non need_wakeup case (the old behavior), when allocation fails
> from the fill queue we want to retry right away basically busy
> spinning on the fill queue until we find at least one entry and then
> go on processing packets. Works well when the app and the driver are
> on different cores, but a lousy strategy when they execute on the same
> core. That is why in the need_wakeup feature case, we do not return
> the total budget if there is a failure. We will just come back at a
> later point in time from a syscall since the need_wakeup flag will
> have been set and check the fill queue again. We do not want a
> busy-spinning behavior in this case.

That makes sense.  Thanks for all the work on this, Magnus!
-- 
Jonathan

^ permalink raw reply

* Re: [PATCH net-next] net: openvswitch: Set OvS recirc_id from tc chain index
From: Pravin Shelar @ 2019-08-14 15:53 UTC (permalink / raw)
  To: Paul Blakey
  Cc: Linux Kernel Network Developers, David S. Miller, Justin Pettit,
	Simon Horman, Marcelo Ricardo Leitner, Vlad Buslov, Jiri Pirko,
	Roi Dayan, Yossi Kuperman, Rony Efraim, Oz Shlomo
In-Reply-To: <68e7a65c-162a-8bc5-4d80-f4f245944b9c@mellanox.com>

On Tue, Aug 13, 2019 at 1:29 AM Paul Blakey <paulb@mellanox.com> wrote:
>
>
> On 8/12/2019 7:18 PM, Pravin Shelar wrote:
> > On Sun, Aug 11, 2019 at 3:46 AM Paul Blakey <paulb@mellanox.com> wrote:
> >>
> >> On 8/8/2019 11:53 PM, Pravin Shelar wrote:
> >>> On Wed, Aug 7, 2019 at 5:08 AM Paul Blakey <paulb@mellanox.com> wrote:
> >>>> Offloaded OvS datapath rules are translated one to one to tc rules,
> >>>> for example the following simplified OvS rule:
> >>>>
> >>>> recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2)
> >>>>
> >>>> Will be translated to the following tc rule:
> >>>>
> >>>> $ tc filter add dev dev1 ingress \
> >>>>               prio 1 chain 0 proto ip \
> >>>>                   flower tcp ct_state -trk \
> >>>>                   action ct pipe \
> >>>>                   action goto chain 2
> >>>>
> >>>> Received packets will first travel though tc, and if they aren't stolen
> >>>> by it, like in the above rule, they will continue to OvS datapath.
> >>>> Since we already did some actions (action ct in this case) which might
> >>>> modify the packets, and updated action stats, we would like to continue
> >>>> the proccessing with the correct recirc_id in OvS (here recirc_id(2))
> >>>> where we left off.
> >>>>
> >>>> To support this, introduce a new skb extension for tc, which
> >>>> will be used for translating tc chain to ovs recirc_id to
> >>>> handle these miss cases. Last tc chain index will be set
> >>>> by tc goto chain action and read by OvS datapath.
> >>>>
> >>>> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> >>>> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> >>>> Acked-by: Jiri Pirko <jiri@mellanox.com>
> >>>> ---
> >>>>    include/linux/skbuff.h    | 13 +++++++++++++
> >>>>    include/net/sch_generic.h |  5 ++++-
> >>>>    net/core/skbuff.c         |  6 ++++++
> >>>>    net/openvswitch/flow.c    |  9 +++++++++
> >>>>    net/sched/Kconfig         | 13 +++++++++++++
> >>>>    net/sched/act_api.c       |  1 +
> >>>>    net/sched/cls_api.c       | 12 ++++++++++++
> >>>>    7 files changed, 58 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> >>>> index 3aef8d8..fb2a792 100644
> >>>> --- a/include/linux/skbuff.h
> >>>> +++ b/include/linux/skbuff.h
> >>>> @@ -279,6 +279,16 @@ struct nf_bridge_info {
> >>>>    };
> >>>>    #endif
> >>>>
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +/* Chain in tc_skb_ext will be used to share the tc chain with
> >>>> + * ovs recirc_id. It will be set to the current chain by tc
> >>>> + * and read by ovs to recirc_id.
> >>>> + */
> >>>> +struct tc_skb_ext {
> >>>> +       __u32 chain;
> >>>> +};
> >>>> +#endif
> >>>> +
> >>>>    struct sk_buff_head {
> >>>>           /* These two members must be first. */
> >>>>           struct sk_buff  *next;
> >>>> @@ -4050,6 +4060,9 @@ enum skb_ext_id {
> >>>>    #ifdef CONFIG_XFRM
> >>>>           SKB_EXT_SEC_PATH,
> >>>>    #endif
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +       TC_SKB_EXT,
> >>>> +#endif
> >>>>           SKB_EXT_NUM, /* must be last */
> >>>>    };
> >>>>
> >>>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> >>>> index 6b6b012..871feea 100644
> >>>> --- a/include/net/sch_generic.h
> >>>> +++ b/include/net/sch_generic.h
> >>>> @@ -275,7 +275,10 @@ struct tcf_result {
> >>>>                           unsigned long   class;
> >>>>                           u32             classid;
> >>>>                   };
> >>>> -               const struct tcf_proto *goto_tp;
> >>>> +               struct {
> >>>> +                       const struct tcf_proto *goto_tp;
> >>>> +                       u32 goto_index;
> >>>> +               };
> >>>>
> >>>>                   /* used in the skb_tc_reinsert function */
> >>>>                   struct {
> >>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>>> index ea8e8d3..2b40b5a 100644
> >>>> --- a/net/core/skbuff.c
> >>>> +++ b/net/core/skbuff.c
> >>>> @@ -4087,6 +4087,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> >>>>    #ifdef CONFIG_XFRM
> >>>>           [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
> >>>>    #endif
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +       [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
> >>>> +#endif
> >>>>    };
> >>>>
> >>>>    static __always_inline unsigned int skb_ext_total_length(void)
> >>>> @@ -4098,6 +4101,9 @@ static __always_inline unsigned int skb_ext_total_length(void)
> >>>>    #ifdef CONFIG_XFRM
> >>>>                   skb_ext_type_len[SKB_EXT_SEC_PATH] +
> >>>>    #endif
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +               skb_ext_type_len[TC_SKB_EXT] +
> >>>> +#endif
> >>>>                   0;
> >>>>    }
> >>>>
> >>>> diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
> >>>> index bc89e16..0287ead 100644
> >>>> --- a/net/openvswitch/flow.c
> >>>> +++ b/net/openvswitch/flow.c
> >>>> @@ -816,6 +816,9 @@ static int key_extract_mac_proto(struct sk_buff *skb)
> >>>>    int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
> >>>>                            struct sk_buff *skb, struct sw_flow_key *key)
> >>>>    {
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +       struct tc_skb_ext *tc_ext;
> >>>> +#endif
> >>>>           int res, err;
> >>>>
> >>>>           /* Extract metadata from packet. */
> >>>> @@ -848,7 +851,13 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
> >>>>           if (res < 0)
> >>>>                   return res;
> >>>>           key->mac_proto = res;
> >>>> +
> >>>> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> >>>> +       tc_ext = skb_ext_find(skb, TC_SKB_EXT);
> >>>> +       key->recirc_id = tc_ext ? tc_ext->chain : 0;
> >>>> +#else
> >>>>           key->recirc_id = 0;
> >>>> +#endif
> >>>>
> >>> Most of cases the config would be turned on, so the ifdef is not that
> >>> useful. Can you add static key to avoid searching the skb-ext in non
> >>> offload cases.
> >> Hi,
> >>
> >> What do you mean by a static key?
> >>
> > https://www.kernel.org/doc/Documentation/static-keys.txt
> >
> > Static key can be enabled when a flow is added to the tc filter.
>
> Hi and thanks for the feedback,
>
> The skb_ext_find() just checks a single bit on the
> skb->active_extensions, and if so returns an offset. Do you think it
> will impact performance much?
>
I do not see much down side of adding static key here.

>
> But to your suggestion, do you mean that the first tc goto action
> instance with the relevant ifdef (CONFIG_NET_TC_SKB_EXT) it will enable
> the OvS static key that guards this skb_ext_find()?
>
> I guess calling it in tcf_action_set_ctrlact() if goto_chain != 0.
>
> This will expose some OvS helper function (or static key) to
> net/sched/act_api.c right?

Right, The patch adds this dependency anyways, so this symbol
definition in OVS would make it explicit to user.

^ permalink raw reply

* Re: [PATCH bpf-next 1/3] libbpf: add asm/unistd.h to xsk to get __NR_mmap2
From: Yonghong Song @ 2019-08-14 15:51 UTC (permalink / raw)
  To: Andrii Nakryiko, Magnus Karlsson, Björn Töpel,
	David S. Miller, Jesper Dangaard Brouer, john fastabend,
	Jakub Kicinski, Daniel Borkmann, Networking, bpf,
	xdp-newbies@vger.kernel.org, open list
In-Reply-To: <20190814092403.GA4142@khorivan>



On 8/14/19 2:24 AM, Ivan Khoronzhuk wrote:
> On Tue, Aug 13, 2019 at 04:38:13PM -0700, Andrii Nakryiko wrote:
> 
> Hi, Andrii
> 
>> On Tue, Aug 13, 2019 at 3:24 AM Ivan Khoronzhuk
>> <ivan.khoronzhuk@linaro.org> wrote:
>>>
>>> That's needed to get __NR_mmap2 when mmap2 syscall is used.
>>>
>>> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>>> ---
>>>  tools/lib/bpf/xsk.c | 1 +
>>>  1 file changed, 1 insertion(+)
>>>
>>> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
>>> index 5007b5d4fd2c..f2fc40f9804c 100644
>>> --- a/tools/lib/bpf/xsk.c
>>> +++ b/tools/lib/bpf/xsk.c
>>> @@ -12,6 +12,7 @@
>>>  #include <stdlib.h>
>>>  #include <string.h>
>>>  #include <unistd.h>
>>> +#include <asm/unistd.h>
>>
>> asm/unistd.h is not present in Github libbpf projection. Is there any
> 
> Look on includes from
> tools/lib/bpf/libpf.c
> tools/lib/bpf/bpf.c
> 
> That's how it's done... Copping headers to arch/arm will not
> solve this, it includes both of them anyway, and anyway it needs
> asm/unistd.h inclusion here, only because xsk.c needs __NR_*
> 
> 
>> way to avoid including this header? Generally, libbpf can't easily use
>> all of kernel headers, we need to re-implemented all the extra used
>> stuff for Github version of libbpf, so we try to minimize usage of new
>> headers that are not just plain uapi headers from include/uapi.
> 
> Yes I know, it's far away from real number of changes needed.
> I faced enough about this already and kernel headers, especially
> for arm32 it's a bit decency problem. But this patch it's part of
> normal one. I have couple issues despite this normally fixed mmap2
> that is the same even if uapi includes are coppied to tools/arch/arm.
> 
> In continuation of kernel headers inclusion and arm build:
> 
> For instance, what about this rough "kernel headers" hack:
> https://github.com/ikhorn/af_xdp_stuff/commit/aa645ccca4d844f404ec3c2b27402d4d7848d1b5 

The ".syntax unified" is mentioned a couple of times
in bcc mailing list as well. llvm bpf backend might
be able to solve it. I have not looked at the details though.

> 
> or this one related for arm32 only:
> https://github.com/ikhorn/af_xdp_stuff/commit/2c6c6d538605aac39600dcb3c9b66de11c70b963 

This may not work if bpf program tries to handle kernel headers.
bpf program may get wrong layout.

Anyway, the above two comments are irrelevant to this patch set
and if needed should be discussed separately.

> 
> 
> I have more...
> 
>>
>>>  #include <arpa/inet.h>
>>>  #include <asm/barrier.h>
>>>  #include <linux/compiler.h>
>>> -- 
>>> 2.17.1
>>>
> 

^ permalink raw reply

* Re: [PATCH net-next 1/5] RDS: Re-add pf/sol access via sysctl
From: Doug Ledford @ 2019-08-14 15:56 UTC (permalink / raw)
  To: Gerd Rausch, Santosh Shilimkar, netdev, linux-rdma, rds-devel
  Cc: David Miller
In-Reply-To: <e0397d30-7405-a7af-286c-fe76887caf0a@oracle.com>

[-- Attachment #1: Type: text/plain, Size: 2686 bytes --]

On Tue, 2019-08-13 at 11:20 -0700, Gerd Rausch wrote:
> From: Andy Grover <andy.grover@oracle.com>
> Date: Tue, 24 Nov 2009 15:35:51 -0800
> 
> Although RDS has an official PF_RDS value now, existing software
> expects to look for rds sysctls to determine it. We need to maintain
> these for now, for backwards compatibility.
> 
> Signed-off-by: Andy Grover <andy.grover@oracle.com>
> Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
> ---
>  net/rds/sysctl.c | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
> 
> diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
> index e381bbcd9cc1..9760292a0af4 100644
> --- a/net/rds/sysctl.c
> +++ b/net/rds/sysctl.c
> @@ -49,6 +49,13 @@ unsigned int  rds_sysctl_max_unacked_bytes = (16 <<
> 20);
>  
>  unsigned int rds_sysctl_ping_enable = 1;
>  
> +/*
> + * We have official values, but must maintain the sysctl interface
> for existing
> + * software that expects to find these values here.
> + */
> +static int rds_sysctl_pf_rds = PF_RDS;
> +static int rds_sysctl_sol_rds = SOL_RDS;
> +
>  static struct ctl_table rds_sysctl_rds_table[] = {
>  	{
>  		.procname       = "reconnect_min_delay_ms",
> @@ -68,6 +75,20 @@ static struct ctl_table rds_sysctl_rds_table[] = {
>  		.extra1		= &rds_sysctl_reconnect_min_jiffies,
>  		.extra2		= &rds_sysctl_reconnect_max,
>  	},
> +	{
> +		.procname       = "pf_rds",
> +		.data		= &rds_sysctl_pf_rds,
> +		.maxlen         = sizeof(int),
> +		.mode           = 0444,
> +		.proc_handler   = &proc_dointvec,
> +	},
> +	{
> +		.procname       = "sol_rds",
> +		.data		= &rds_sysctl_sol_rds,
> +		.maxlen         = sizeof(int),
> +		.mode           = 0444,
> +		.proc_handler   = &proc_dointvec,
> +	},
>  	{
>  		.procname	= "max_unacked_packets",
>  		.data		= &rds_sysctl_max_unacked_packets,

Good Lord...RDS was taken into the kernel in Feb of 2009, so over 10
years ago.  The patch to put PF_RDS/AF_RDS/SOL_RDS was taken into
include/linux/socket.h Feb 26, 2009.  The RDS ports were allocated by
IANA on Feb 27 and May 20, 2009.  And you *still* have software that
needs this?  The only software that has ever used RDS was Oracle
software.  I would have expected you guys to update your source code to
do the right thing long before now.  In fact, I would expect you were
ready to retire all of the legacy software that needs this by now.  As
of today, does your current build of Oracle software still require this,
or have you at least fixed it up in your modern builds?

-- 
Doug Ledford <dledford@redhat.com>
    GPG KeyID: B826A3330E572FDD
    Fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 0/5] net/rds: Fixes from internal Oracle repo
From: Doug Ledford @ 2019-08-14 15:59 UTC (permalink / raw)
  To: Gerd Rausch, Santosh Shilimkar, netdev, linux-rdma, rds-devel
  Cc: David Miller
In-Reply-To: <0b05ed75-6772-e339-11e6-1fb5714981c0@oracle.com>

[-- Attachment #1: Type: text/plain, Size: 1110 bytes --]

On Tue, 2019-08-13 at 11:20 -0700, Gerd Rausch wrote:
> This is the first set of (mostly old) patches from our internal
> repository
> in an effort to synchronize what Oracle had been using internally
> with what is shipped with the Linux kernel.
> 
> Andy Grover (2):
>   RDS: Re-add pf/sol access via sysctl
>   rds: check for excessive looping in rds_send_xmit
> 
> Chris Mason (2):
>   RDS: limit the number of times we loop in rds_send_xmit
>   RDS: don't use GFP_ATOMIC for sk_alloc in rds_create
> 
> Gerd Rausch (1):
>   net/rds: Add a few missing rds_stat_names entries
> 
>  net/rds/af_rds.c  |  2 +-
>  net/rds/ib_recv.c | 12 +++++++++++-
>  net/rds/rds.h     |  2 +-
>  net/rds/send.c    | 12 ++++++++++++
>  net/rds/stats.c   |  3 +++
>  net/rds/sysctl.c  | 21 +++++++++++++++++++++
>  6 files changed, 49 insertions(+), 3 deletions(-)
> 

That first patch looks like a total bogon to me, but the remaining four
look fine.

-- 
Doug Ledford <dledford@redhat.com>
    GPG KeyID: B826A3330E572FDD
    Fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 0/4] net: bridge: mdb: allow dump/add/del of host-joined entries
From: Nikolay Aleksandrov @ 2019-08-14 16:01 UTC (permalink / raw)
  To: netdev; +Cc: davem, roopa, bridge
In-Reply-To: <20190814144024.9710-1-nikolay@cumulusnetworks.com>

On 8/14/19 5:40 PM, Nikolay Aleksandrov wrote:
> Hi,
> This set makes the bridge dump host-joined mdb entries, they should be
> treated as normal entries since they take a slot and are aging out.
> We already have notifications for them but we couldn't dump them until
> now so they remained hidden. We dump them similar to how they're
> notified, in order to keep user-space compatibility with the dumped
> objects (e.g. iproute2 dumps mdbs in a format which can be fed into
> add/del commands) we allow host-joined groups also to be added/deleted via
> mdb commands. That can later be used for L2 mcast MAC manipulation as
> was recently discussed. Note that iproute2 changes are not necessary,
> this set will work with the current user-space mdb code.
> 
> Patch 01 - a trivial comment move
> Patch 02 - factors out the mdb filling code so it can be
>            re-used for the host-joined entries
> Patch 03 - dumps host-joined entries
> Patch 04 - allows manipulation of host-joined entries via standard mdb
>            calls
> 
> Thanks,
>  Nik
> 
> Nikolay Aleksandrov (4):
>   net: bridge: mdb: move vlan comments
>   net: bridge: mdb: factor out mdb filling
>   net: bridge: mdb: dump host-joined entries as well
>   net: bridge: mdb: allow add/delete for host-joined groups
> 
>  net/bridge/br_mdb.c       | 171 +++++++++++++++++++++++++-------------
>  net/bridge/br_multicast.c |  24 ++++--
>  net/bridge/br_private.h   |   2 +
>  3 files changed, 133 insertions(+), 64 deletions(-)
> 

Self-NAK
There's a double notification sent for manual add/del of host groups.
It's a trivial fix, I'll spin v2 later after running more tests.


^ permalink raw reply

* Re: tc - mirred ingress not supported at the moment
From: Stephen Hemminger @ 2019-08-14 16:14 UTC (permalink / raw)
  To: Martin Olsson; +Cc: Cong Wang, netdev
In-Reply-To: <CAAT+qEbDAuQWGZa5BQYMZfBRQM+mDS=CMb9GTPz6Nxz_WD0M8Q@mail.gmail.com>

On Wed, 14 Aug 2019 11:25:25 +0200
Martin Olsson <martin.olsson+netdev@sentorsecurity.com> wrote:

> Hi Cong!
> 
> Ah sorry.
> Already implemented. Great!
> 
> Hmmm. Then why don't the manual at
> https://www.linux.org/docs/man8/tc-mirred.html to reflect the changes?
> That was the place I checked to see if ingress was still not implemented.
> In the commit you point at, the sentence "Currently only egress is
> implemented" has been removed.

The man pages on linux.org are not controlled by kernel/iproute developers.
Not sure who builds/owns these and don't care.


> Question:
> Is there any form of performance penalty if I send the mirrored
> traffic to the ingress queue of the destination interface rather than
> to the egress queue?
> I mean, in the kernel there is the possibility to perform far more
> actions on the ingress queue than on the egress, but if I leave both
> queues at their defaults, will mirrored packets to ingress use more
> CPU cycles than to the egress destination, or are they more or less
> identical?
> 
> 
> Question 2:
> Given the commit
> https://git.kernel.org/pub/scm/network/iproute2/iproute2.git/commit/?id=5eca0a3701223619a513c7209f7d9335ca1b4cfa,
> how can I see in what kernel version it was added?

Look at the tags

$ git tag --contains 5eca0a3701223619a513c7209f7d9335ca1b4cfa 
v4.10.0
v4.11.0
v4.12.0
v4.13.0
v4.14.0
v4.14.1
v4.15.0
v4.16.0
v4.17.0
v4.18.0
v4.19.0
v4.20.0
v5.0.0
v5.1.0
v5.2.0

https://stackoverflow.com/questions/27886537/how-to-find-out-which-releases-contain-a-given-git-commit

^ permalink raw reply

* Re: [PATCH net-next,v4 08/12] drivers: net: use flow block API
From: Edward Cree @ 2019-08-14 16:17 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netdev, netfilter-devel
In-Reply-To: <20190813195126.ilwtoljk2csco73m@salvia>

On 13/08/2019 20:51, Pablo Neira Ayuso wrote:
> On Mon, Aug 12, 2019 at 06:50:09PM +0100, Edward Cree wrote:
>> Pablo, can you explain (because this commit message doesn't) why these per-
>>  driver lists are needed, and what the information/state is that has module
>>  (rather than, say, netdevice) scope?
> The idea is to update drivers to support one flow_block per subsystem,
> one for ethtool, one for tc, and so on. So far, existing drivers only
> allow for binding one single flow_block to one of the existing
> subsystems. So this limitation applies at driver level.
That argues for per-driver _code_, not for per-driver _state_.  For instance,
 each driver could (more logically) store this information in the netdev
 private data, rather than a static global.  Or even, since each driver
 instance has a unique cb_ident = netdev_priv(net_dev), this doesn't need to
 be local to the driver at all and could just belong to the device owning the
 flow_block (which isn't necessarily the device doing the offload, per
 indirect blocks).

TBH I'm still not clear why you need a flow_block per subsystem, rather than
 just having multiple subsystems feed their offload requests through the same
 flow_block but with different enum tc_setup_type or enum tc_fl_command or
 some other indication that this is "netfilter" rather than "tc" asking for a
 tc_cls_flower_offload.

I'd also like to concur with what Jakub said on v2: "this series is really
 hard to follow... the number of things called some combination of block cb
 and list makes my head hurt :/".

This really needs a design document explaining what all the bits are, how
 they fit together, and why they need to be like that.

^ permalink raw reply

* Re: [PATCH bpf-next 1/3] libbpf: add asm/unistd.h to xsk to get __NR_mmap2
From: Yonghong Song @ 2019-08-14 16:17 UTC (permalink / raw)
  To: Björn Töpel, Andrii Nakryiko, Magnus Karlsson,
	Björn Töpel, David S. Miller, Jesper Dangaard Brouer,
	john fastabend, Jakub Kicinski, Daniel Borkmann, Networking, bpf,
	Xdp, open list
In-Reply-To: <CAJ+HfNiqu7WEoBFnfK3znU4tVyAmpPVabTjTSKH1ZVo2W1rrXg@mail.gmail.com>



On 8/14/19 6:32 AM, Björn Töpel wrote:
> On Wed, 14 Aug 2019 at 13:57, Ivan Khoronzhuk
> <ivan.khoronzhuk@linaro.org> wrote:
>>
>> On Wed, Aug 14, 2019 at 12:24:05PM +0300, Ivan Khoronzhuk wrote:
>>> On Tue, Aug 13, 2019 at 04:38:13PM -0700, Andrii Nakryiko wrote:
>>>
>>> Hi, Andrii
>>>
>>>> On Tue, Aug 13, 2019 at 3:24 AM Ivan Khoronzhuk
>>>> <ivan.khoronzhuk@linaro.org> wrote:
>>>>>
>>>>> That's needed to get __NR_mmap2 when mmap2 syscall is used.
>>>>>
>>>>> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>>>>> ---
>>>>> tools/lib/bpf/xsk.c | 1 +
>>>>> 1 file changed, 1 insertion(+)
>>>>>
>>>>> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
>>>>> index 5007b5d4fd2c..f2fc40f9804c 100644
>>>>> --- a/tools/lib/bpf/xsk.c
>>>>> +++ b/tools/lib/bpf/xsk.c
>>>>> @@ -12,6 +12,7 @@
>>>>> #include <stdlib.h>
>>>>> #include <string.h>
>>>>> #include <unistd.h>
>>>>> +#include <asm/unistd.h>
>>>>
>>>> asm/unistd.h is not present in Github libbpf projection. Is there any
>>>
>>> Look on includes from
>>> tools/lib/bpf/libpf.c
>>> tools/lib/bpf/bpf.c
>>>
>>> That's how it's done... Copping headers to arch/arm will not
>>> solve this, it includes both of them anyway, and anyway it needs
>>> asm/unistd.h inclusion here, only because xsk.c needs __NR_*
>>>
>>>
>>
>> There is one more radical solution for this I can send, but I'm not sure how it
>> can impact on other syscals/arches...
>>
>> Looks like:
>>
>>
>> diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
>> index 9312066a1ae3..8b2f8ff7ce44 100644
>> --- a/tools/lib/bpf/Makefile
>> +++ b/tools/lib/bpf/Makefile
>> @@ -113,6 +113,7 @@ override CFLAGS += -Werror -Wall
>>   override CFLAGS += -fPIC
>>   override CFLAGS += $(INCLUDES)
>>   override CFLAGS += -fvisibility=hidden
>> +override CFLAGS += -D_FILE_OFFSET_BITS=64
>>
> 
> Hmm, isn't this glibc-ism? Does is it work for, say, musl or bionic?
> 
> If this is portable, and works on 32-, and 64-bit archs, I'm happy
> with the patch. :-)

Second here. Looks defining -D_FILE_OFFSET_BITS=64 is a well known
fix for 32bit system to deal with files > 2GB.
I remembered I used it in distant past. The below link
also explains the case.
https://digital-domain.net/largefiles.html

Testing on musl is necessary as Arnaldo's perf test suite
indeed tested it. Probably bionic too, not really familiar with that.

> 
> 
> Björn
> 
>>   ifeq ($(VERBOSE),1)
>>     Q =
>> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
>> index f2fc40f9804c..ff2d03b8380d 100644
>> --- a/tools/lib/bpf/xsk.c
>> +++ b/tools/lib/bpf/xsk.c
>> @@ -75,23 +75,6 @@ struct xsk_nl_info {
>>          int fd;
>>   };
>>
>> -/* For 32-bit systems, we need to use mmap2 as the offsets are 64-bit.
>> - * Unfortunately, it is not part of glibc.
>> - */
>> -static inline void *xsk_mmap(void *addr, size_t length, int prot, int flags,
>> -                            int fd, __u64 offset)
>> -{
>> -#ifdef __NR_mmap2
>> -       unsigned int page_shift = __builtin_ffs(getpagesize()) - 1;
>> -       long ret = syscall(__NR_mmap2, addr, length, prot, flags, fd,
>> -                          (off_t)(offset >> page_shift));
>> -
>> -       return (void *)ret;
>> -#else
>> -       return mmap(addr, length, prot, flags, fd, offset);
>> -#endif
>> -}
>> -
>>   int xsk_umem__fd(const struct xsk_umem *umem)
>>   {
>>          return umem ? umem->fd : -EINVAL;
>> @@ -211,10 +194,9 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
>>                  goto out_socket;
>>          }
>>
>> -       map = xsk_mmap(NULL, off.fr.desc +
>> -                      umem->config.fill_size * sizeof(__u64),
>> -                      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
>> -                      umem->fd, XDP_UMEM_PGOFF_FILL_RING);
>> +       map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
>> +                  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, umem->fd,
>> +                  XDP_UMEM_PGOFF_FILL_RING);
>>          if (map == MAP_FAILED) {
>>                  err = -errno;
>>                  goto out_socket;
>> @@ -228,10 +210,9 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
>>          fill->ring = map + off.fr.desc;
>>          fill->cached_cons = umem->config.fill_size;
>>
>> -       map = xsk_mmap(NULL,
>> -                      off.cr.desc + umem->config.comp_size * sizeof(__u64),
>> -                      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
>> -                      umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
>> +       map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
>> +                  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, umem->fd,
>> +                  XDP_UMEM_PGOFF_COMPLETION_RING);
>>          if (map == MAP_FAILED) {
>>                  err = -errno;
>>                  goto out_mmap;
>> @@ -552,11 +533,10 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
>>          }
>>
>>          if (rx) {
>> -               rx_map = xsk_mmap(NULL, off.rx.desc +
>> -                                 xsk->config.rx_size * sizeof(struct xdp_desc),
>> -                                 PROT_READ | PROT_WRITE,
>> -                                 MAP_SHARED | MAP_POPULATE,
>> -                                 xsk->fd, XDP_PGOFF_RX_RING);
>> +               rx_map = mmap(NULL, off.rx.desc +
>> +                             xsk->config.rx_size * sizeof(struct xdp_desc),
>> +                             PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
>> +                             xsk->fd, XDP_PGOFF_RX_RING);
>>                  if (rx_map == MAP_FAILED) {
>>                          err = -errno;
>>                          goto out_socket;
>> @@ -571,11 +551,10 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
>>          xsk->rx = rx;
>>
>>          if (tx) {
>> -               tx_map = xsk_mmap(NULL, off.tx.desc +
>> -                                 xsk->config.tx_size * sizeof(struct xdp_desc),
>> -                                 PROT_READ | PROT_WRITE,
>> -                                 MAP_SHARED | MAP_POPULATE,
>> -                                 xsk->fd, XDP_PGOFF_TX_RING);
>> +               tx_map = mmap(NULL, off.tx.desc +
>> +                             xsk->config.tx_size * sizeof(struct xdp_desc),
>> +                             PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
>> +                             xsk->fd, XDP_PGOFF_TX_RING);
>>                  if (tx_map == MAP_FAILED) {
>>                          err = -errno;
>>                          goto out_mmap_rx;
>>
>>
>> If maintainers are ready to accept this I can send.
>> What do you say?
>>
>> --
>> Regards,
>> Ivan Khoronzhuk

^ permalink raw reply

* RE: [PATCH v2 0/2] Simplify mtty driver and mdev core
From: Parav Pandit @ 2019-08-14 16:21 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Cornelia Huck, Kirti Wankhede, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, cjia@nvidia.com, Jiri Pirko,
	netdev@vger.kernel.org
In-Reply-To: <20190814085746.26b5f2a3@x1.home>



> -----Original Message-----
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Wednesday, August 14, 2019 8:28 PM
> To: Parav Pandit <parav@mellanox.com>
> Cc: Cornelia Huck <cohuck@redhat.com>; Kirti Wankhede
> <kwankhede@nvidia.com>; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; cjia@nvidia.com; Jiri Pirko <jiri@mellanox.com>;
> netdev@vger.kernel.org
> Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> 
> On Wed, 14 Aug 2019 13:45:49 +0000
> Parav Pandit <parav@mellanox.com> wrote:
> 
> > > -----Original Message-----
> > > From: Cornelia Huck <cohuck@redhat.com>
> > > Sent: Wednesday, August 14, 2019 6:39 PM
> > > To: Parav Pandit <parav@mellanox.com>
> > > Cc: Alex Williamson <alex.williamson@redhat.com>; Kirti Wankhede
> > > <kwankhede@nvidia.com>; kvm@vger.kernel.org; linux-
> > > kernel@vger.kernel.org; cjia@nvidia.com; Jiri Pirko
> > > <jiri@mellanox.com>; netdev@vger.kernel.org
> > > Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> > >
> > > On Wed, 14 Aug 2019 12:27:01 +0000
> > > Parav Pandit <parav@mellanox.com> wrote:
> > >
> > > > + Jiri, + netdev
> > > > To get perspective on the ndo->phys_port_name for the representor
> > > > netdev
> > > of mdev.
> > > >
> > > > Hi Cornelia,
> > > >
> > > > > -----Original Message-----
> > > > > From: Cornelia Huck <cohuck@redhat.com>
> > > > > Sent: Wednesday, August 14, 2019 1:32 PM
> > > > > To: Parav Pandit <parav@mellanox.com>
> > > > > Cc: Alex Williamson <alex.williamson@redhat.com>; Kirti Wankhede
> > > > > <kwankhede@nvidia.com>; kvm@vger.kernel.org; linux-
> > > > > kernel@vger.kernel.org; cjia@nvidia.com
> > > > > Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> > > > >
> > > > > On Wed, 14 Aug 2019 05:54:36 +0000 Parav Pandit
> > > > > <parav@mellanox.com> wrote:
> > > > >
> > > > > > > > I get that part. I prefer to remove the UUID itself from
> > > > > > > > the structure and therefore removing this API makes lot more
> sense?
> > > > > > >
> > > > > > > Mdev and support tools around mdev are based on UUIDs
> > > > > > > because it's
> > > > > defined
> > > > > > > in the documentation.
> > > > > > When we introduce newer device naming scheme, it will update
> > > > > > the
> > > > > documentation also.
> > > > > > May be that is the time to move to .rst format too.
> > > > >
> > > > > You are aware that there are existing tools that expect a uuid
> > > > > naming scheme, right?
> > > > >
> > > > Yes, Alex mentioned too.
> > > > The good tool that I am aware of is [1], which is 4 months old.
> > > > Not sure if it is
> > > part of any distros yet.
> > > >
> > > > README also says, that it is in 'early in development. So we have
> > > > scope to
> > > improve it for non UUID names, but lets discuss that more below.
> > >
> > > The up-to-date reference for mdevctl is
> > > https://github.com/mdevctl/mdevctl. There is currently an effort to
> > > get this packaged in Fedora.
> > >
> > Awesome.
> >
> > > >
> > > > > >
> > > > > > > I don't think it's as simple as saying "voila, UUID
> > > > > > > dependencies are removed, users are free to use arbitrary
> > > > > > > strings".  We'd need to create some kind of naming policy,
> > > > > > > what characters are allows so that we can potentially expand
> > > > > > > the creation parameters as has been proposed a couple times,
> > > > > > > how do we deal with collisions and races, and why should we
> > > > > > > make such a change when a UUID is a perfectly reasonable
> > > > > > > devices name.  Thanks,
> > > > > > >
> > > > > > Sure, we should define a policy on device naming to be more relaxed.
> > > > > > We have enough examples in-kernel.
> > > > > > Few that I am aware of are netdev (vxlan, macvlan, ipvlan, lot
> > > > > > more), rdma
> > > > > etc which has arbitrary device names and ID based device names.
> > > > > >
> > > > > > Collisions and race is already taken care today in the mdev core.
> > > > > > Same
> > > > > unique device names continue.
> > > > >
> > > > > I'm still completely missing a rationale _why_ uuids are
> > > > > supposedly bad/restricting/etc.
> > > > There is nothing bad about uuid based naming.
> > > > Its just too long name to derive phys_port_name of a netdev.
> > > > In details below.
> > > >
> > > > For a given mdev of networking type, we would like to have
> > > > (a) representor netdevice [2]
> > > > (b) associated devlink port [3]
> > > >
> > > > Currently these representor netdevice exist only for the PCIe SR-IOV VFs.
> > > > It is further getting extended for mdev without SR-IOV.
> > > >
> > > > Each of the devlink port is attached to representor netdevice [4].
> > > >
> > > > This netdevice phys_port_name should be a unique derived from some
> > > property of mdev.
> > > > Udev/systemd uses phys_port_name to derive unique representor
> > > > netdev
> > > name.
> > > > This netdev name is further use by orchestration and switching
> > > > software in
> > > user space.
> > > > One such distro supported switching software is ovs [4], which
> > > > relies on the
> > > persistent device name of the representor netdevice.
> > >
> > > Ok, let me rephrase this to check that I understand this correctly.
> > > I'm not sure about some of the terms you use here (even after
> > > looking at the linked doc/code), but that's probably still ok.
> > >
> > > We want to derive an unique (and probably persistent?) netdev name
> > > so that userspace can refer to a representor netdevice. Makes sense.
> > > For generating that name, udev uses the phys_port_name (which
> > > represents the devlink port, IIUC). Also makes sense.
> > >
> > You understood it correctly.
> >
> > > >
> > > > phys_port_name has limitation to be only 15 characters long.
> > > > UUID doesn't fit in phys_port_name.
> > >
> > > Understood. But why do we need to derive the phys_port_name from the
> > > mdev device name? This netdevice use case seems to be just one use
> > > case for using mdev devices? If this is a specialized mdev type for
> > > this setup, why not just expose a shorter identifier via an extra attribute?
> > >
> > Representor netdev, represents mdev's switch port (like PCI SRIOV VF's switch
> port).
> > So user must be able to relate this two objects in similar manner as SRIOV
> VFs.
> > Phys_port_name is derived from the PCI PF and VF numbering scheme.
> > Similarly mdev's such port should be derived from mdev's id/name/attribute.
> >
> > > > Longer UUID names are creating snow ball effect, not just in
> > > > networking stack
> > > but many user space tools too.
> > >
> > > This snowball effect mainly comes from the device name ->
> > > phys_port_name setup, IIUC.
> > >
> > Right.
> >
> > > > (as opposed to recently introduced mdevctl, are they more mdev
> > > > tools which has dependency on UUID name?)
> > >
> > > I am aware that people have written scripts etc. to manage their mdevs.
> > > Given that the mdev infrastructure has been around for quite some
> > > time, I'd say the chance of some of those scripts relying on uuid names is
> non-zero.
> > >
> > Ok. but those scripts have never managed networking devices.
> > So those scripts won't break because they will always create mdev devices
> using UUID.
> > When they use these new networking devices, they need more things than
> their scripts.
> > So user space upgrade for such mixed mode case is reasonable.
> 
> Tools like mdevctl are agnostic of the type of mdev device they're managing, it
> shouldn't matter than they've never managed a networking mdev previously, it
> follows the standards of mdev management.
> 
> > > >
> > > > Instead of mdev subsystem creating such effect, one option we are
> > > considering is to have shorter mdev names.
> > > > (Similar to netdev, rdma, nvme devices).
> > > > Such as mdev1, mdev2000 etc.
> 
> Note that these are kernel generated names, as are the other examples.
No. I probably gave the wrong examples.
Mdev user provided names can be 'foo', 'bar', 'foo1'.

> In the case of mdev, the user is providing the UUID, which becomes the device
> name.  When a user writes to the create attribute, there needs to be
> determinism that the user can identify the device they created vs another that
> may have been created concurrently.  I don't see that we can put users in the
> path of managing device instance numbers.
No. Its just user provided names.

> 
> > > > Second option I was considering is to have an optional alias for
> > > > UUID based
> > > mdev.
> > > > This name alias is given at time of mdev creation.
> > > > Devlink port's phys_port_name is derived out of this shorter mdev
> > > > name
> > > alias.
> > > > This way, mdev remains to be UUID based with optional extension.
> > > > However, I prefer first option to relax mdev naming scheme.
> > >
> > > Actually, I think that second option makes much more sense, as you
> > > avoid potentially breaking existing tooling.
> > Let's first understand of what exactly will break with existing tool
> > if they see non_uuid based device.
> 
> Do we really want a mixed namespace of device names, some UUID, some...
> something else?  That seems like a mess.
> 
So you prefer alias as an attribute? If so, it should be an optional additional parameter during create time, 
because it is desired to not invent new callbacks for such attributes setting and (and rewrite them).

> > Existing tooling continue to work with UUID devices.
> > Do you have example of what can break if they see non_uuid based
> > device name? I think you are clear, but to be sure, UUID based
> > creation will continue to be there. Optionally mdev will be created
> > with alpha-numeric string, if we don't it as additional attribute.
> 
> I'm not onboard with a UUID being just one of the possible naming strings via
> which we can create mdev devices.  I think that becomes untenable for
> userspace.  I don't think a sufficient argument has been made against the alias
> approach, which seems to keep the UUID as a canonical name, providing a
> consistent namespace, augmented with user or kernel provided short alias.
> Thanks,
> 
If I understand you correctly, you prefer alias name approach to keep UUID naming scheme intact in mdev?

> Alex
> 
> > > >
> > > > > We want to uniquely identify a device, across different types of
> > > > > vendor drivers. An uuid is a unique identifier and even a
> > > > > well-defined one. Tools (e.g. mdevctl) are relying on it for
> > > > > mdev devices
> > > today.
> > > > >
> > > > > What is the problem you're trying to solve?
> > > > Unique device naming is still achieved without UUID scheme by
> > > > various
> > > subsystems in kernel using alpha-numeric string.
> > > > Having such string based continue to provide unique names.
> > > >
> > > > I hope I described the problem and two solutions above.
> > > >
> > > > [1] https://github.com/awilliam/mdevctl
> > > > [2]
> > > > https://elixir.bootlin.com/linux/v5.3-rc4/source/drivers/net/ether
> > > > net/
> > > > mellanox/mlx5/core/en_rep.c [3]
> > > > http://man7.org/linux/man-pages/man8/devlink-port.8.html
> > > > [4]
> > > > https://elixir.bootlin.com/linux/v5.3-rc4/source/net/core/devlink.
> > > > c#L6
> > > > 921
> > > > [5] https://www.openvswitch.org/
> > > >
> >


^ permalink raw reply

* [PATCH] lan78xx: Fix memory leaks
From: Wenwen Wang @ 2019-08-14 16:23 UTC (permalink / raw)
  To: Wenwen Wang
  Cc: Woojung Huh, Microchip Linux Driver Support, David S. Miller,
	open list:USB LAN78XX ETHERNET DRIVER,
	open list:USB NETWORKING DRIVERS, open list

In lan78xx_probe(), a new urb is allocated through usb_alloc_urb() and
saved to 'dev->urb_intr'. However, in the following execution, if an error
occurs, 'dev->urb_intr' is not deallocated, leading to memory leaks. To fix
this issue, invoke usb_free_urb() to free the allocated urb before
returning from the function.

Signed-off-by: Wenwen Wang <wenwen@cs.uga.edu>
---
 drivers/net/usb/lan78xx.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 3d92ea6..f033fee 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -3792,7 +3792,7 @@ static int lan78xx_probe(struct usb_interface *intf,
 	ret = register_netdev(netdev);
 	if (ret != 0) {
 		netif_err(dev, probe, netdev, "couldn't register the device\n");
-		goto out3;
+		goto out4;
 	}
 
 	usb_set_intfdata(intf, dev);
@@ -3807,12 +3807,14 @@ static int lan78xx_probe(struct usb_interface *intf,
 
 	ret = lan78xx_phy_init(dev);
 	if (ret < 0)
-		goto out4;
+		goto out5;
 
 	return 0;
 
-out4:
+out5:
 	unregister_netdev(netdev);
+out4:
+	usb_free_urb(dev->urb_intr);
 out3:
 	lan78xx_unbind(dev, intf);
 out2:
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH RFC 2/4] net: phy: allow to bind genphy driver at probe time
From: Florian Fainelli @ 2019-08-14 16:30 UTC (permalink / raw)
  To: Heiner Kallweit, Andrew Lunn, Marek Behun, David Miller
  Cc: netdev@vger.kernel.org
In-Reply-To: <7225e653-6f93-63fc-8d61-a712318d1949@gmail.com>

On 8/13/19 4:02 PM, Heiner Kallweit wrote:
> On 14.08.2019 00:53, Florian Fainelli wrote:
>> On 8/13/19 2:25 PM, Heiner Kallweit wrote:
>>> In cases like a fixed phy that is never attached to a net_device we
>>> may want to bind the genphy driver at probe time. Setting a PHY ID of
>>> 0xffffffff to bind the genphy driver would fail due to a check in
>>> get_phy_device(). Therefore let's change the PHY ID the genphy driver
>>> binds to to 0xfffffffe. This still shouldn't match any real PHY,
>>> and it will pass the check in get_phy_devcie().
>>>
>>> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
>>> ---
>>>  drivers/net/phy/phy_device.c | 3 +--
>>>  include/linux/phy.h          | 4 ++++
>>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>>> index 163295dbc..54f80af31 100644
>>> --- a/drivers/net/phy/phy_device.c
>>> +++ b/drivers/net/phy/phy_device.c
>>> @@ -2388,8 +2388,7 @@ void phy_drivers_unregister(struct phy_driver *drv, int n)
>>>  EXPORT_SYMBOL(phy_drivers_unregister);
>>>  
>>>  static struct phy_driver genphy_driver = {
>>> -	.phy_id		= 0xffffffff,
>>> -	.phy_id_mask	= 0xffffffff,
>>> +	PHY_ID_MATCH_EXACT(GENPHY_ID),
>>>  	.name		= "Generic PHY",
>>>  	.soft_reset	= genphy_no_soft_reset,
>>>  	.get_features	= genphy_read_abilities,
>>> diff --git a/include/linux/phy.h b/include/linux/phy.h
>>> index 5ac7d2137..3b07bce78 100644
>>> --- a/include/linux/phy.h
>>> +++ b/include/linux/phy.h
>>> @@ -37,6 +37,10 @@
>>>  #define PHY_1000BT_FEATURES	(SUPPORTED_1000baseT_Half | \
>>>  				 SUPPORTED_1000baseT_Full)
>>>  
>>> +#define GENPHY_ID_HIGH		0xffffU
>>> +#define GENPHY_ID_LOW		0xfffeU
>>> +#define GENPHY_ID		((GENPHY_ID_HIGH << 16) | GENPHY_ID_LOW)
>>
>> This is a possible user ABI change here, if there is anything that
>> relies on reading 0xffff_ffff as a valid PHY OUI, you would be breaking
>> it. We might as well try to assign ourselves a specific PHY OUI, very
>> much like the Linux USB hubs show up with a Linux Foundation vendor ID.
>>
> 
> I see the point. However in get_phy_device() we have the following check
> that should cause a PHY with ID 0xffff_ffff to be ignored. Therefore
> I doubt there's any such PHY ID in use.
> 
> 	/* If the phy_id is mostly Fs, there is no device there */
> 	if ((phy_id & 0x1fffffff) == 0x1fffffff)
> 		return ERR_PTR(-ENODEV);

Indeed, it looks like the phy_id reported through sysfs for fixed PHY is
actually 0, so your change should be fine then, thanks!
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next,v4 07/12] net: sched: use flow block API
From: Edward Cree @ 2019-08-14 16:32 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netdev, netfilter-devel
In-Reply-To: <20190709205550.3160-8-pablo@netfilter.org>

On 09/07/2019 21:55, Pablo Neira Ayuso wrote:
> This patch adds tcf_block_setup() which uses the flow block API.
>
> This infrastructure takes the flow block callbacks coming from the
> driver and register/unregister to/from the cls_api core.
>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
> <snip>
> @@ -796,13 +804,20 @@ static int tcf_block_offload_cmd(struct tcf_block *block,
>  				 struct netlink_ext_ack *extack)
>  {
>  	struct tc_block_offload bo = {};
> +	int err;
>  
>  	bo.net = dev_net(dev);
>  	bo.command = command;
>  	bo.binder_type = ei->binder_type;
>  	bo.block = block;
>  	bo.extack = extack;
> -	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
> +	INIT_LIST_HEAD(&bo.cb_list);
> +
> +	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
> +	if (err < 0)
> +		return err;
> +
> +	return tcf_block_setup(block, &bo);
>  }
>  
>  static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
> @@ -1636,6 +1651,77 @@ void tcf_block_cb_unregister(struct tcf_block *block,
>  }
>  EXPORT_SYMBOL(tcf_block_cb_unregister);
>  
> +static int tcf_block_bind(struct tcf_block *block,
> +			  struct flow_block_offload *bo)
> +{
> +	struct flow_block_cb *block_cb, *next;
> +	int err, i = 0;
> +
> +	list_for_each_entry(block_cb, &bo->cb_list, list) {
> +		err = tcf_block_playback_offloads(block, block_cb->cb,
> +						  block_cb->cb_priv, true,
> +						  tcf_block_offload_in_use(block),
> +						  bo->extack);
> +		if (err)
> +			goto err_unroll;
> +
> +		i++;
> +	}
> +	list_splice(&bo->cb_list, &block->cb_list);
> +
> +	return 0;
> +
> +err_unroll:
> +	list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
> +		if (i-- > 0) {
> +			list_del(&block_cb->list);
> +			tcf_block_playback_offloads(block, block_cb->cb,
> +						    block_cb->cb_priv, false,
> +						    tcf_block_offload_in_use(block),
> +						    NULL);
> +		}
> +		flow_block_cb_free(block_cb);
> +	}
> +
> +	return err;
> +}
Why has the replay been moved from the function called by the driver
 (__tcf_block_cb_register()) to work done by the driver's caller based on
 what the driver has left on this flow_block_offload.cb_list?  This makes
 it impossible for the driver to (say) unregister a block outside of an
 explicit request from ndo_setup_tc().
In my under-development driver, I have a teardown path called on PCI
 remove, which calls tcf_block_cb_unregister() on all my block bindings
 (of which the driver keeps track), to ensure that no flow rules are still
 in place when unregister_netdev() is called; this is needed because some
 of the driver's state for certain rules involves taking a reference on
 the netdevice (dev_hold()).  Your structural changes here make that
 impossible; is there any reason why they're necessary?

-Ed

^ permalink raw reply

* Re: [PATCH net] batman-adv: fix uninit-value in batadv_netlink_get_ifindex()
From: David Miller @ 2019-08-14 16:36 UTC (permalink / raw)
  To: edumazet; +Cc: netdev, eric.dumazet, syzkaller, mareklindner, sw, a
In-Reply-To: <20190812115727.72149-1-edumazet@google.com>

From: Eric Dumazet <edumazet@google.com>
Date: Mon, 12 Aug 2019 04:57:27 -0700

> batadv_netlink_get_ifindex() needs to make sure user passed
> a correct u32 attribute.
 ...
> Fixes: b60620cf567b ("batman-adv: netlink: hardif query")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>

Simon, I assume I will get this ultimately from you.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next] mcast: ensure L-L IPv6 packets are accepted by bridge
From: Patrick Ruddy @ 2019-08-14 16:40 UTC (permalink / raw)
  To: Nikolay Aleksandrov, Ido Schimmel; +Cc: netdev, roopa, linus.luessing
In-Reply-To: <43ed59db-9228-9132-b9a5-31c8d1e8e9e9@cumulusnetworks.com>

Thanks both for the quick replies, answers inline...

On Wed, 2019-08-14 at 02:55 +0300, Nikolay Aleksandrov wrote:
> On 8/13/19 10:53 PM, Ido Schimmel wrote:
> > + Bridge maintainers, Linus
> > 
> 
> Good catch Ido, thanks!
> First I'd say the subject needs to reflect that this is a bridge change
> better, please rearrange it like so - bridge: mcast: ...
> More below,
> 
> > On Tue, Aug 13, 2019 at 03:18:04PM +0100, Patrick Ruddy wrote:
> > > At present only all-nodes IPv6 multicast packets are accepted by
> > > a bridge interface that is not in multicast router mode. Since
> > > other protocols can be running in the absense of multicast
> > > forwarding e.g. OSPFv3 IPv6 ND. Change the test to allow
> > > all of the FFx2::/16 range to be accepted when not in multicast
> > > router mode. This aligns the code with IPv4 link-local reception
> > > and RFC4291
> > 
> > Can you please quote the relevant part from RFC 4291?
> > 
> > > Signed-off-by: Patrick Ruddy <pruddy@vyatta.att-mail.com>
> > > ---
> > >  include/net/addrconf.h    | 15 +++++++++++++++
> > >  net/bridge/br_multicast.c |  2 +-
> > >  2 files changed, 16 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/include/net/addrconf.h b/include/net/addrconf.h
> > > index becdad576859..05b42867e969 100644
> > > --- a/include/net/addrconf.h
> > > +++ b/include/net/addrconf.h
> > > @@ -434,6 +434,21 @@ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr,
> > >  		      htonl(0xFF000000) | addr->s6_addr32[3]);
> > >  }
> > >  
> > > +/*
> > > + *      link local multicast address range ffx2::/16 rfc4291
> > > + */
> > > +static inline bool ipv6_addr_is_ll_mcast(const struct in6_addr *addr)
> > > +{
> > > +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
> > > +	__be64 *p = (__be64 *)addr;
> > > +	return ((p[0] & cpu_to_be64(0xff0f000000000000UL))
> > > +		^ cpu_to_be64(0xff02000000000000UL)) == 0UL;
> > > +#else
> > > +	return ((addr->s6_addr32[0] & htonl(0xff0f0000)) ^
> > > +		htonl(0xff020000)) == 0;
> > > +#endif
> > > +}
> > > +
> > >  static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr)
> > >  {
> > >  #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
> > > diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
> > > index 9b379e110129..ed3957381fa2 100644
> > > --- a/net/bridge/br_multicast.c
> > > +++ b/net/bridge/br_multicast.c
> > > @@ -1664,7 +1664,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
> > >  	err = ipv6_mc_check_mld(skb);
> > >  
> > >  	if (err == -ENOMSG) {
> > > -		if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
> > > +		if (!ipv6_addr_is_ll_mcast(&ipv6_hdr(skb)->daddr))
> > >  			BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
> > 
> > IIUC, you want IPv6 link-local packets to be locally received, but this
> > also changes how these packets are flooded. RFC 4541 says that packets
> 
> Indeed, we'll start flooding them all, not just the all hosts address.
> If that is at all required it'll definitely have to be optional.
> 
> > addressed to the all hosts address are a special case and should be
> > forwarded to all ports:
> > 
> > "In IPv6, the data forwarding rules are more straight forward because MLD is
> > mandated for addresses with scope 2 (link-scope) or greater. The only exception
> > is the address FF02::1 which is the all hosts link-scope address for which MLD
> > messages are never sent. Packets with the all hosts link-scope address should
> > be forwarded on all ports."
> > 
> 
> I wonder what is the problem for the host to join such group on behalf of the bridge ?
> Then you'll receive the traffic at least locally and the RFC says it itself - MLD is mandated
> for the other link-local addresses.
> It's very late here and maybe I'm missing something.. :)
> 
The group is being joined by MLD at the L3 level but the packets are
not being passed up to the l3 interface becasue there is a MLD querier
on the network

snippet from /proc/net/igmp6
...
40   sw1             ff0200000000000000000001ff008700     1 00000004 0
40   sw1             ff020000000000000000000000000002     1 00000004 0
40   sw1             ff020000000000000000000000000001     1 0000000C 0
40   sw1             ff010000000000000000000000000001     1 00000008 0
41   lo1             ff020000000000000000000000000001     1 0000000C 0
41   lo1             ff010000000000000000000000000001     1 00000008 0
42   sw1.1           ff020000000000000000000000000006     1 00000004 0
42   sw1.1           ff020000000000000000000000000005     1 00000004 0
42   sw1.1           ff0200000000000000000001ff000000     2 00000004 0
42   sw1.1           ff0200000000000000000001ff008700     1 00000004 0
42   sw1.1           ff0200000000000000000001ff000099     1 00000004 0
42   sw1.1           ff020000000000000000000000000002     1 00000004 0
42   sw1.1           ff020000000000000000000000000001     1 0000000C 0
42   sw1.1           ff010000000000000000000000000001     1 00000008 0
...

the bridge is sw1 and the l3 intervace is sw1.1

Ido is correct about the flooding - I will update the patch with the
comments and reissue.

Thanks again

-pr
>  
> > Maybe you want something like:
> > 
> 
> I think we can do without the new field, either pass local_rcv into br_multicast_rcv() or
> set it based on return value. The extra test will have to remain unfortunately, but we
> can reduce the tests by one if carefully done.
> 
> > diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
> > index 09b1dd8cd853..9f312a73f61c 100644
> > --- a/net/bridge/br_input.c
> > +++ b/net/bridge/br_input.c
> > @@ -132,7 +132,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
> >  		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
> >  		    br_multicast_querier_exists(br, eth_hdr(skb))) {
> >  			if ((mdst && mdst->host_joined) ||
> > -			    br_multicast_is_router(br)) {
> > +			    br_multicast_is_router(br) ||
> > +			    BR_INPUT_SKB_CB_LOCAL_RECEIVE(skb)) {
> >  				local_rcv = true;
> >  				br->dev->stats.multicast++;
> >  			}
> > diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
> > index 9b379e110129..f03cecf6174e 100644
> > --- a/net/bridge/br_multicast.c
> > +++ b/net/bridge/br_multicast.c
> > @@ -1667,6 +1667,9 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
> >  		if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
> >  			BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
> >  
> > +		if (ipv6_addr_is_ll_mcast(&ipv6_hdr(skb)->daddr))
> > +			BR_INPUT_SKB_CB(skb)->local_receive = 1;
> > +
> >  		if (ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) {
> >  			err = br_ip6_multicast_mrd_rcv(br, port, skb);
> >  
> > diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
> > index b7a4942ff1b3..d76394ca4059 100644
> > --- a/net/bridge/br_private.h
> > +++ b/net/bridge/br_private.h
> > @@ -426,6 +426,7 @@ struct br_input_skb_cb {
> >  #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
> >  	u8 igmp;
> >  	u8 mrouters_only:1;
> > +	u8 local_receive:1;
> >  #endif
> >  	u8 proxyarp_replied:1;
> >  	u8 src_port_isolated:1;
> > @@ -445,8 +446,10 @@ struct br_input_skb_cb {
> >  
> >  #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
> >  # define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb)	(BR_INPUT_SKB_CB(__skb)->mrouters_only)
> > +# define BR_INPUT_SKB_CB_LOCAL_RECEIVE(__skb)	(BR_INPUT_SKB_CB(__skb)->local_receive)
> >  #else
> >  # define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb)	(0)
> > +# define BR_INPUT_SKB_CB_LOCAL_RECEIVE(__skb)	(0)
> >  #endif
> >  
> >  #define br_printk(level, br, format, args...)	\
> > 


^ permalink raw reply

* Re: [RFC bpf-next 0/3] tools: bpftool: add subcommand to count map entries
From: Edward Cree @ 2019-08-14 16:45 UTC (permalink / raw)
  To: Quentin Monnet, Alexei Starovoitov
  Cc: Alexei Starovoitov, Daniel Borkmann, bpf, netdev, oss-drivers
In-Reply-To: <ab11a9f2-0fbd-d35f-fee1-784554a2705a@netronome.com>

On 14/08/2019 10:42, Quentin Monnet wrote:
> 2019-08-13 18:51 UTC-0700 ~ Alexei Starovoitov
> <alexei.starovoitov@gmail.com>
>> The same can be achieved by 'bpftool map dump|grep key|wc -l', no?
> To some extent (with subtleties for some other map types); and we use a
> similar command line as a workaround for now. But because of the rate of
> inserts/deletes in the map, the process often reports a number higher
> than the max number of entries (we observed up to ~750k when max_entries
> is 500k), even is the map is only half-full on average during the count.
> On the worst case (though not frequent), an entry is deleted just before
> we get the next key from it, and iteration starts all over again. This
> is not reliable to determine how much space is left in the map.
>
> I cannot see a solution that would provide a more accurate count from
> user space, when the map is under pressure?
This might be a really dumb suggestion, but: you're wanting to collect a
 summary statistic over an in-kernel data structure in a single syscall,
 because making a series of syscalls to examine every entry is slow and
 racy.  Isn't that exactly a job for an in-kernel virtual machine, and
 could you not supply an eBPF program which the kernel runs on each entry
 in the map, thus supporting people who want to calculate something else
 (mean, min and max, whatever) instead of count?

^ permalink raw reply

* [PATCH bpf-next 0/4] selftests/bpf: test_progs: misc fixes
From: Stanislav Fomichev @ 2019-08-14 16:47 UTC (permalink / raw)
  To: netdev, bpf; +Cc: davem, ast, daniel, Stanislav Fomichev, Andrii Nakryiko

* make output a bit easier to follow
* add test__skip to indicate skipped tests
* remove global success/error counts (use environment)
* remove asserts from the tests

Cc: Andrii Nakryiko <andriin@fb.com>

Stanislav Fomichev (4):
  selftests/bpf: test_progs: change formatting of the condenced output
  selftests/bpf: test_progs: test__skip
  selftests/bpf: test_progs: remove global fail/success counts
  selftests/bpf: test_progs: remove asserts from subtests

 .../selftests/bpf/prog_tests/bpf_obj_id.c     | 32 +++++--
 .../bpf/prog_tests/bpf_verif_scale.c          | 10 +-
 .../selftests/bpf/prog_tests/flow_dissector.c |  2 +-
 .../bpf/prog_tests/get_stack_raw_tp.c         |  2 +-
 .../selftests/bpf/prog_tests/global_data.c    | 10 +-
 .../selftests/bpf/prog_tests/l4lb_all.c       |  4 +-
 .../selftests/bpf/prog_tests/map_lock.c       | 28 +++---
 .../selftests/bpf/prog_tests/pkt_access.c     |  2 +-
 .../selftests/bpf/prog_tests/pkt_md_access.c  |  2 +-
 .../bpf/prog_tests/queue_stack_map.c          |  4 +-
 .../bpf/prog_tests/reference_tracking.c       |  2 +-
 .../selftests/bpf/prog_tests/send_signal.c    |  1 +
 .../selftests/bpf/prog_tests/spinlock.c       | 12 ++-
 .../bpf/prog_tests/stacktrace_build_id.c      | 11 ++-
 .../bpf/prog_tests/stacktrace_build_id_nmi.c  | 11 ++-
 .../selftests/bpf/prog_tests/stacktrace_map.c |  2 +-
 .../bpf/prog_tests/stacktrace_map_raw_tp.c    |  2 +-
 .../bpf/prog_tests/task_fd_query_rawtp.c      |  2 +-
 .../bpf/prog_tests/task_fd_query_tp.c         |  2 +-
 .../selftests/bpf/prog_tests/tcp_estats.c     |  2 +-
 tools/testing/selftests/bpf/prog_tests/xdp.c  |  2 +-
 .../bpf/prog_tests/xdp_adjust_tail.c          |  2 +-
 .../selftests/bpf/prog_tests/xdp_noinline.c   |  4 +-
 tools/testing/selftests/bpf/test_progs.c      | 93 +++++++++++--------
 tools/testing/selftests/bpf/test_progs.h      | 28 +++++-
 25 files changed, 165 insertions(+), 107 deletions(-)

-- 
2.23.0.rc1.153.gdeed80330f-goog

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox