Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next v2 4/4] xsk: lock the control mutex in sock_diag interface
From: Björn Töpel @ 2019-08-26  6:10 UTC (permalink / raw)
  To: ast, daniel, netdev
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, bpf,
	jonathan.lemon, syzbot+c82697e3043781e08802, hdanton, i.maximets
In-Reply-To: <20190826061053.15996-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

When accessing the members of an XDP socket, the control mutex should
be held. This commit fixes that.

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Fixes: a36b38aa2af6 ("xsk: add sock_diag interface for AF_XDP")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 net/xdp/xsk_diag.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index d5e06c8e0cbf..c8f4f11edbbc 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -97,6 +97,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
 	msg->xdiag_ino = sk_ino;
 	sock_diag_save_cookie(sk, msg->xdiag_cookie);
 
+	mutex_lock(&xs->mutex);
 	if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
 		goto out_nlmsg_trim;
 
@@ -117,10 +118,12 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
 	    sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
 		goto out_nlmsg_trim;
 
+	mutex_unlock(&xs->mutex);
 	nlmsg_end(nlskb, nlh);
 	return 0;
 
 out_nlmsg_trim:
+	mutex_unlock(&xs->mutex);
 	nlmsg_cancel(nlskb, nlh);
 	return -EMSGSIZE;
 }
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2] net: fix skb use after free in netpoll
From: Feng Sun @ 2019-08-26  6:13 UTC (permalink / raw)
  To: davem
  Cc: edumazet, dsterba, dbanerje, fw, davej, tglx, matwey,
	sakari.ailus, netdev, linux-kernel, Feng Sun, Xiaojun Zhao
In-Reply-To: <1566577920-20956-1-git-send-email-loyou85@gmail.com>

After commit baeababb5b85d5c4e6c917efe2a1504179438d3b
("tun: return NET_XMIT_DROP for dropped packets"),
when tun_net_xmit drops packets, it will free skb and return NET_XMIT_DROP,
netpoll_send_skb_on_dev will run into following use after free cases:
1. retry netpoll_start_xmit with freed skb;
2. queue freed skb in npinfo->txq.
queue_process will also run into use after free case.

hit netpoll_send_skb_on_dev first case with following kernel log:

[  117.864773] kernel BUG at mm/slub.c:306!
[  117.864773] invalid opcode: 0000 [#1] SMP PTI
[  117.864774] CPU: 3 PID: 2627 Comm: loop_printmsg Kdump: loaded Tainted: P           OE     5.3.0-050300rc5-generic #201908182231
[  117.864775] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[  117.864775] RIP: 0010:kmem_cache_free+0x28d/0x2b0
[  117.864781] Call Trace:
[  117.864781]  ? tun_net_xmit+0x21c/0x460
[  117.864781]  kfree_skbmem+0x4e/0x60
[  117.864782]  kfree_skb+0x3a/0xa0
[  117.864782]  tun_net_xmit+0x21c/0x460
[  117.864782]  netpoll_start_xmit+0x11d/0x1b0
[  117.864788]  netpoll_send_skb_on_dev+0x1b8/0x200
[  117.864789]  __br_forward+0x1b9/0x1e0 [bridge]
[  117.864789]  ? skb_clone+0x53/0xd0
[  117.864790]  ? __skb_clone+0x2e/0x120
[  117.864790]  deliver_clone+0x37/0x50 [bridge]
[  117.864790]  maybe_deliver+0x89/0xc0 [bridge]
[  117.864791]  br_flood+0x6c/0x130 [bridge]
[  117.864791]  br_dev_xmit+0x315/0x3c0 [bridge]
[  117.864792]  netpoll_start_xmit+0x11d/0x1b0
[  117.864792]  netpoll_send_skb_on_dev+0x1b8/0x200
[  117.864792]  netpoll_send_udp+0x2c6/0x3e8
[  117.864793]  write_msg+0xd9/0xf0 [netconsole]
[  117.864793]  console_unlock+0x386/0x4e0
[  117.864793]  vprintk_emit+0x17e/0x280
[  117.864794]  vprintk_default+0x29/0x50
[  117.864794]  vprintk_func+0x4c/0xbc
[  117.864794]  printk+0x58/0x6f
[  117.864795]  loop_fun+0x24/0x41 [printmsg_loop]
[  117.864795]  kthread+0x104/0x140
[  117.864795]  ? 0xffffffffc05b1000
[  117.864796]  ? kthread_park+0x80/0x80
[  117.864796]  ret_from_fork+0x35/0x40

Signed-off-by: Feng Sun <loyou85@gmail.com>
Signed-off-by: Xiaojun Zhao <xiaojunzhao141@gmail.com>
---
Changes in v2:
- change commit and title
- add netpoll_xmit_complete helper
- add one more return value check of netpoll_start_xmit
---
 net/core/netpoll.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 2cf27da..a3f20e9 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -95,6 +95,11 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
 	return status;
 }
 
+static inline bool netpoll_xmit_complete(int rc)
+{
+	return dev_xmit_complete(rc);
+}
+
 static void queue_process(struct work_struct *work)
 {
 	struct netpoll_info *npinfo =
@@ -122,7 +127,7 @@ static void queue_process(struct work_struct *work)
 		txq = netdev_get_tx_queue(dev, q_index);
 		HARD_TX_LOCK(dev, txq, smp_processor_id());
 		if (netif_xmit_frozen_or_stopped(txq) ||
-		    netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
+		    !netpoll_xmit_complete(netpoll_start_xmit(skb, dev, txq))) {
 			skb_queue_head(&npinfo->txq, skb);
 			HARD_TX_UNLOCK(dev, txq);
 			local_irq_restore(flags);
@@ -335,7 +340,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 
 				HARD_TX_UNLOCK(dev, txq);
 
-				if (status == NETDEV_TX_OK)
+				if (netpoll_xmit_complete(status))
 					break;
 
 			}
@@ -352,7 +357,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 
 	}
 
-	if (status != NETDEV_TX_OK) {
+	if (!netpoll_xmit_complete(status)) {
 		skb_queue_tail(&npinfo->txq, skb);
 		schedule_delayed_work(&npinfo->tx_work,0);
 	}
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH net-next 01/14] bnxt_en: Suppress all error messages in hwrm_do_send_msg() in silent mode.
From: Michael Chan @ 2019-08-26  6:17 UTC (permalink / raw)
  To: David Miller; +Cc: Netdev, Vasundhara Volam, Jiri Pirko, Ray Jui
In-Reply-To: <20190825.221507.1465677703637201643.davem@davemloft.net>

On Sun, Aug 25, 2019 at 10:15 PM David Miller <davem@davemloft.net> wrote:
>
> From: Michael Chan <michael.chan@broadcom.com>
> Date: Sun, 25 Aug 2019 23:54:52 -0400
>
> > If the silent parameter is set, suppress all messages when there is
> > no response from firmware.  When polling for firmware to come out of
> > reset, no response may be normal and we want to suppress the error
> > messages.  Also, don't poll for the firmware DMA response if Bus Master
> > is disabled.  This is in preparation for error recovery when firmware
> > may be in error or reset state or Bus Master is disabled.
> >
> > Signed-off-by: Michael Chan <michael.chan@broadcom.com>
>
> The function bnxt_hwrm_do_send_msg() seems to be an interesting mix of return
> values, what are the semantics?
>
> It seems to use 0 for success, some error codes, and -1.  Does -1 have special
> meaning?
>
> Just curious, and really this unorthodox return value semantic should
> be documented into a comment above the function.

Sadly, it was coded initially to return firmware defined error codes.
But in some cases, the return code gets propagated all the way back to
userspace.  The long term goal is to convert to standard error codes
and so we try to use standard error codes whenever we add new patches
related to this function.  I will see what I can do to make this
better in v2.  Thanks.

^ permalink raw reply

* Re: [PATCH v2] net: fix skb use after free in netpoll
From: David Miller @ 2019-08-26  6:20 UTC (permalink / raw)
  To: loyou85
  Cc: edumazet, dsterba, dbanerje, fw, davej, tglx, matwey,
	sakari.ailus, netdev, linux-kernel, xiaojunzhao141
In-Reply-To: <1566800020-10007-1-git-send-email-loyou85@gmail.com>

From: Feng Sun <loyou85@gmail.com>
Date: Mon, 26 Aug 2019 14:13:40 +0800

> +static inline bool netpoll_xmit_complete(int rc)
> +{
> +	return dev_xmit_complete(rc);
> +}

There is no need for this useless indirection, just call dev_xmit_complete()
directly.

Also, even if it was suitable, never use the inline keyword in foo.c files.

^ permalink raw reply

* Re: [PATCH net-next 05/14] bnxt_en: Discover firmware error recovery capabilities.
From: Michael Chan @ 2019-08-26  6:23 UTC (permalink / raw)
  To: David Miller; +Cc: Netdev, Vasundhara Volam, Jiri Pirko, Ray Jui
In-Reply-To: <20190825.224913.1760774642952210371.davem@davemloft.net>

On Sun, Aug 25, 2019 at 10:49 PM David Miller <davem@davemloft.net> wrote:
>
> From: Michael Chan <michael.chan@broadcom.com>
> Date: Sun, 25 Aug 2019 23:54:56 -0400
>
> > +static int bnxt_hwrm_error_recovery_qcfg(struct bnxt *bp)
> > +{
> > +     struct hwrm_error_recovery_qcfg_output *resp = bp->hwrm_cmd_resp_addr;
> > +     struct bnxt_fw_health *fw_health = bp->fw_health;
> > +     struct hwrm_error_recovery_qcfg_input req = {0};
> > +     int rc, i;
> > +
> > +     if (!(bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY))
> > +             return 0;
> > +
> > +     bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_ERROR_RECOVERY_QCFG, -1, -1);
> > +     mutex_lock(&bp->hwrm_cmd_lock);
> > +     rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
> > +     if (rc) {
> > +             rc = -EOPNOTSUPP;
> > +             goto err_recovery_out;
> > +     }
>
> How is this logically an unsupported operation if you're guarding its use
> with an appropriate capability check?

The BNXT_FW_CAP_ERROR_RECOVERY flag says that error recovery should be
supported, but if the firmware call to get the recovery parameters
fails, we return -EOPNOTSUPP to let the caller know that error
recovery cannot be supported.  Again, I will try to clean up the error
codes in v2.

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2019-08-26  6:29 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


1) Use 32-bit index for tail calls in s390 bpf JIT, from Ilya Leoshkevich.

2) Fix missed EPOLLOUT events in TCP, from Eric Dumazet.  Same fix for SMC
   from Jason Baron.

3) ipv6_mc_may_pull() should return 0 for malformed packets, not -EINVAL.
   From Stefano Brivio.

4) Don't forget to unpin umem xdp pages in error path of
   xdp_umem_reg().  From Ivan Khoronzhuk.

5) Fix sta object leak in mac80211, from Johannes Berg.

6) Fix regression by not configuring PHYLINK on CPU port of bcm_sf2
   switches.  From Florian Fainelli.

7) Revert DMA sync removal from r8169 which was causing regressions on some
   MIPS Loongson platforms.  From Heiner Kallweit.

8) Use after free in flow dissector, from Jakub Sitnicki.

9) Fix NULL derefs of net devices during ICMP processing across collect_md
   tunnels, from Hangbin Liu.

10) proto_register() memory leaks, from Zhang Lin.

11) Set NLM_F_MULTI flag in multipart netlink messages consistently, from
    John Fastabend.

Please pull, thanks a lot!

The following changes since commit 06821504fd47a5e5b641aeeb638a0ae10a216ef8:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net (2019-08-19 10:00:01 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git 

for you to fetch changes up to f53a7ad189594a112167efaf17ea8d0242b5ac00:

  r8152: Set memory to all 0xFFs on failed reg reads (2019-08-25 19:52:59 -0700)

----------------------------------------------------------------
Alexander Wetzel (1):
      cfg80211: Fix Extended Key ID key install checks

Alexei Starovoitov (1):
      bpf: fix precision tracking in presence of bpf2bpf calls

Alexey Kodanev (1):
      ipv4: mpls: fix mpls_xmit for iptunnel

Anders Roxell (2):
      selftests/bpf: add config fragment BPF_JIT
      selftests/bpf: install files test_xdp_vlan.sh

Andrew Lunn (1):
      MAINTAINERS: Add phylink keyword to SFF/SFP/SFP+ MODULE SUPPORT

Antoine Tenart (1):
      net: cpsw: fix NULL pointer exception in the probe error path

Christophe JAILLET (1):
      Kconfig: Fix the reference to the IDT77105 Phy driver in the description of ATM_NICSTAR_USE_IDT77105

Colin Ian King (1):
      net: ieee802154: remove redundant assignment to rc

Dan Carpenter (1):
      gve: Copy and paste bug in gve_get_stats()

Daniel Borkmann (1):
      bpf: fix use after free in prog symbol exposure

David Ahern (1):
      nexthop: Fix nexthop_num_path for blackhole nexthops

David S. Miller (8):
      Merge git://git.kernel.org/.../pablo/nf
      Merge tag 'mac80211-for-davem-2019-08-21' of git://git.kernel.org/.../jberg/mac80211
      Merge tag 'batadv-net-for-davem-20190821' of git://git.open-mesh.org/linux-merge
      Merge tag 'wireless-drivers-for-davem-2019-08-21' of git://git.kernel.org/.../kvalo/wireless-drivers
      Merge git://git.kernel.org/.../bpf/bpf
      Merge branch 'ieee802154-for-davem-2019-08-24' of git://git.kernel.org/.../sschmidt/wpan
      Merge branch 'collect_md-mode-dev-null'
      Merge tag 'mlx5-fixes-2019-08-22' of git://git.kernel.org/.../saeed/linux

Denis Efremov (2):
      MAINTAINERS: Remove IP MASQUERADING record
      MAINTAINERS: net_failover: Fix typo in a filepath

Emmanuel Grumbach (1):
      iwlwifi: pcie: fix the byte count table format for 22560 devices

Eran Ben Elisha (2):
      net/mlx5e: Add num bytes metadata to WQE info
      net/mlx5e: Remove ethernet segment from dump WQE

Eric Dumazet (2):
      batman-adv: fix uninit-value in batadv_netlink_get_ifindex()
      tcp: make sure EPOLLOUT wont be missed

Florian Fainelli (1):
      net: dsa: bcm_sf2: Do not configure PHYLINK on CPU port

Hangbin Liu (3):
      ipv6/addrconf: allow adding multicast addr if IFA_F_MCAUTOJOIN is set
      ipv4/icmp: fix rt dst dev null pointer dereference
      xfrm/xfrm_policy: fix dst dev null pointer dereference in collect_md mode

Heiner Kallweit (1):
      Revert "r8169: remove not needed call to dma_sync_single_for_device"

Hodaszi, Robert (1):
      Revert "cfg80211: fix processing world regdomain when non modular"

Ilan Peer (1):
      iwlwifi: mvm: Allow multicast data frames only when associated

Ilya Leoshkevich (6):
      s390/bpf: fix lcgr instruction encoding
      s390/bpf: use 32-bit index for tail calls
      selftests/bpf: fix "bind{4, 6} deny specific IP & port" on s390
      selftests/bpf: fix test_cgroup_storage on s390
      selftests/bpf: fix test_btf_dump with O=
      bpf: allow narrow loads of some sk_reuseport_md fields with offset > 0

Ivan Khoronzhuk (1):
      xdp: unpin xdp umem pages in error path

Jakub Sitnicki (1):
      flow_dissector: Fix potential use-after-free on BPF_PROG_DETACH

Jason Baron (1):
      net/smc: make sure EPOLLOUT is raised

Johannes Berg (1):
      mac80211: fix possible sta leak

John Fastabend (1):
      net: route dump netlink NLM_F_MULTI flag missing

Julian Wiedmann (1):
      s390/qeth: reject oversized SNMP requests

Juliana Rodrigueiro (1):
      netfilter: xt_nfacct: Fix alignment mismatch in xt_nfacct_match_info

Justin.Lee1@Dell.com (1):
      net/ncsi: Fix the payload copying for the request coming from Netlink

Li RongQing (2):
      net: fix __ip_mc_inc_group usage
      net: fix icmp_socket_deliver argument 2 input

Luca Coelho (2):
      iwlwifi: pcie: don't switch FW to qnj when ax201 is detected
      iwlwifi: pcie: fix recognition of QuZ devices

Masahiro Yamada (1):
      netfilter: add include guard to nf_conntrack_h323_types.h

Mike Rapoport (1):
      trivial: netns: fix typo in 'struct net.passive' description

Moshe Shemesh (2):
      net/mlx5: Fix crdump chunks print
      net/mlx5: Fix delay in fw fatal report handling due to fw report

Pablo Neira Ayuso (1):
      netfilter: nft_flow_offload: missing netlink attribute policy

Prashant Malani (1):
      r8152: Set memory to all 0xFFs on failed reg reads

Quentin Monnet (1):
      tools: bpftool: close prog FD before exit on showing a single program

Sabrina Dubroca (1):
      ipv6: propagate ipv6_add_dev's error returns out of ipv6_find_idev

Stanislaw Gruszka (2):
      mt76: mt76x0u: do not reset radio on resume
      rt2x00: clear IV's on start to fix AP mode regression

Stefano Brivio (1):
      ipv6: Fix return value of ipv6_mc_may_pull() for malformed packets

Terry S. Duncan (1):
      net/ncsi: Ensure 32-bit boundary for data cksum

Todd Seidelmann (1):
      netfilter: ebtables: Fix argument order to ADD_COUNTER

Vlad Buslov (1):
      nfp: flower: verify that block cb is not busy before binding

Wenwen Wang (1):
      qed: Add cleanup in qed_slowpath_start()

Yangbo Lu (1):
      ocelot_ace: fix action of trap

Yi-Hung Wei (2):
      openvswitch: Fix log message in ovs conntrack
      openvswitch: Fix conntrack cache with timeout

YueHaibing (2):
      ieee802154: hwsim: Fix error handle path in hwsim_init_module
      ieee802154: hwsim: unregister hw while hwsim_subscribe_all_others fails

Zhu Yanjun (1):
      net: rds: add service level support in rds-info

zhanglin (1):
      sock: fix potential memory leak in proto_register()

 MAINTAINERS                                                |  8 ++------
 arch/s390/net/bpf_jit_comp.c                               | 12 +++++++-----
 drivers/atm/Kconfig                                        |  2 +-
 drivers/net/dsa/bcm_sf2.c                                  | 10 ++++++++--
 drivers/net/ethernet/google/gve/gve_main.c                 |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 38 +++++++++++++++++---------------------
 drivers/net/ethernet/mellanox/mlx5/core/health.c           | 22 ++++++++++++----------
 drivers/net/ethernet/mscc/ocelot_ace.c                     |  2 +-
 drivers/net/ethernet/netronome/nfp/flower/offload.c        |  7 +++++++
 drivers/net/ethernet/qlogic/qed/qed_main.c                 |  4 +++-
 drivers/net/ethernet/realtek/r8169_main.c                  |  1 +
 drivers/net/ethernet/ti/cpsw.c                             |  2 +-
 drivers/net/ieee802154/mac802154_hwsim.c                   |  8 +++++---
 drivers/net/usb/r8152.c                                    |  5 ++++-
 drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c          | 33 ++++++++++++++++++++++++++++++---
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c          | 10 ++++++++++
 drivers/net/wireless/intel/iwlwifi/pcie/drv.c              | 17 +++++++++++++++++
 drivers/net/wireless/intel/iwlwifi/pcie/trans.c            |  1 +
 drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c          | 20 +++++++++++++-------
 drivers/net/wireless/mediatek/mt76/mt76x0/usb.c            |  8 ++++----
 drivers/net/wireless/ralink/rt2x00/rt2800lib.c             |  9 +++++++++
 drivers/net/wireless/ralink/rt2x00/rt2x00.h                |  1 +
 drivers/net/wireless/ralink/rt2x00/rt2x00dev.c             | 13 ++++++++-----
 drivers/s390/net/qeth_core_main.c                          |  4 ++++
 include/linux/netfilter/nf_conntrack_h323_types.h          |  5 +++++
 include/net/addrconf.h                                     |  2 +-
 include/net/net_namespace.h                                |  2 +-
 include/net/nexthop.h                                      |  6 ------
 include/net/route.h                                        |  2 +-
 include/uapi/linux/netfilter/xt_nfacct.h                   |  5 +++++
 include/uapi/linux/rds.h                                   |  2 ++
 kernel/bpf/syscall.c                                       | 30 ++++++++++++++++++------------
 kernel/bpf/verifier.c                                      |  9 +++++----
 net/batman-adv/netlink.c                                   |  2 +-
 net/bridge/netfilter/ebtables.c                            |  8 ++++----
 net/core/filter.c                                          |  8 ++++----
 net/core/flow_dissector.c                                  |  2 +-
 net/core/sock.c                                            | 31 +++++++++++++++++++++----------
 net/core/stream.c                                          | 16 +++++++++-------
 net/ieee802154/socket.c                                    |  2 +-
 net/ipv4/fib_trie.c                                        |  2 +-
 net/ipv4/icmp.c                                            | 10 ++++++++--
 net/ipv4/igmp.c                                            |  4 ++--
 net/ipv4/route.c                                           | 17 ++++++++++-------
 net/ipv6/addrconf.c                                        | 19 ++++++++++---------
 net/mac80211/cfg.c                                         |  9 +++++----
 net/mpls/mpls_iptunnel.c                                   |  8 ++++----
 net/ncsi/ncsi-cmd.c                                        | 13 ++++++++++---
 net/ncsi/ncsi-rsp.c                                        |  9 ++++++---
 net/netfilter/nft_flow_offload.c                           |  6 ++++++
 net/netfilter/xt_nfacct.c                                  | 36 +++++++++++++++++++++++++-----------
 net/openvswitch/conntrack.c                                | 15 ++++++++++++++-
 net/rds/ib.c                                               | 16 ++++++++++------
 net/rds/ib.h                                               |  1 +
 net/rds/ib_cm.c                                            |  3 +++
 net/rds/rdma_transport.c                                   | 10 ++++++++--
 net/smc/smc_tx.c                                           |  6 ++----
 net/wireless/reg.c                                         |  2 +-
 net/wireless/util.c                                        | 23 ++++++++++++++---------
 net/xdp/xdp_umem.c                                         |  4 +++-
 net/xfrm/xfrm_policy.c                                     |  4 ++--
 tools/bpf/bpftool/prog.c                                   |  4 +++-
 tools/testing/selftests/bpf/Makefile                       |  6 +++++-
 tools/testing/selftests/bpf/config                         |  1 +
 tools/testing/selftests/bpf/test_btf_dump.c                |  7 +++++++
 tools/testing/selftests/bpf/test_cgroup_storage.c          |  6 +++---
 tools/testing/selftests/bpf/test_sock.c                    |  7 +++++--
 67 files changed, 415 insertions(+), 204 deletions(-)

^ permalink raw reply

* Re: Unable to create htb tc classes more than 64K
From: Eric Dumazet @ 2019-08-26  6:32 UTC (permalink / raw)
  To: Cong Wang, Akshat Kakkar; +Cc: Anton Danilov, NetFilter, lartc, netdev
In-Reply-To: <CAM_iQpVtGUH6CAAegRtTgyemLtHsO+RFP8f6LH2WtiYu9-srfw@mail.gmail.com>



On 8/25/19 7:52 PM, Cong Wang wrote:
> On Wed, Aug 21, 2019 at 11:00 PM Akshat Kakkar <akshat.1984@gmail.com> wrote:
>>
>> On Thu, Aug 22, 2019 at 3:37 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>> I am using ipset +  iptables to classify and not filters. Besides, if
>>>> tc is allowing me to define qdisc -> classes -> qdisc -> classes
>>>> (1,2,3 ...) sort of structure (ie like the one shown in ascii tree)
>>>> then how can those lowest child classes be actually used or consumed?
>>>
>>> Just install tc filters on the lower level too.
>>
>> If I understand correctly, you are saying,
>> instead of :
>> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
>> 0x00000001 fw flowid 1:10
>> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
>> 0x00000002 fw flowid 1:20
>> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
>> 0x00000003 fw flowid 2:10
>> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
>> 0x00000004 fw flowid 2:20
>>
>>
>> I should do this: (i.e. changing parent to just immediate qdisc)
>> tc filter add dev eno2 parent 1: protocol ip prio 1 handle 0x00000001
>> fw flowid 1:10
>> tc filter add dev eno2 parent 1: protocol ip prio 1 handle 0x00000002
>> fw flowid 1:20
>> tc filter add dev eno2 parent 2: protocol ip prio 1 handle 0x00000003
>> fw flowid 2:10
>> tc filter add dev eno2 parent 2: protocol ip prio 1 handle 0x00000004
>> fw flowid 2:20
> 
> 
> Yes, this is what I meant.
> 
> 
>>
>> I tried this previously. But there is no change in the result.
>> Behaviour is exactly same, i.e. I am still getting 100Mbps and not
>> 100kbps or 300kbps
>>
>> Besides, as I mentioned previously I am using ipset + skbprio and not
>> filters stuff. Filters I used just to test.
>>
>> ipset  -N foo hash:ip,mark skbinfo
>>
>> ipset -A foo 10.10.10.10, 0x0x00000001 skbprio 1:10
>> ipset -A foo 10.10.10.20, 0x0x00000002 skbprio 1:20
>> ipset -A foo 10.10.10.30, 0x0x00000003 skbprio 2:10
>> ipset -A foo 10.10.10.40, 0x0x00000004 skbprio 2:20
>>
>> iptables -A POSTROUTING -j SET --map-set foo dst,dst --map-prio
> 
> Hmm..
> 
> I am not familiar with ipset, but it seems to save the skbprio into
> skb->priority, so it doesn't need TC filter to classify it again.
> 
> I guess your packets might go to the direct queue of HTB, which
> bypasses the token bucket. Can you dump the stats and check?

With more than 64K 'classes' I suggest to use a single FQ qdisc [1], and
an eBPF program using EDT model (Earliest Departure Time)

The BPF program would perform the classification, then find a data structure
based on the 'class', and then update/maintain class virtual times and skb->tstamp

TBF = bpf_map_lookup_elem(&map, &classid);

uint64_t now = bpf_ktime_get_ns();
uint64_t time_to_send = max(TBF->time_to_send, now);

time_to_send += (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC / TBF->rate;
if (time_to_send > TBF->max_horizon) {
    return TC_ACT_SHOT;
}
TBF->time_to_send = time_to_send;
skb->tstamp = max(time_to_send, skb->tstamp);
if (time_to_send - now > TBF->ecn_horizon)
    bpf_skb_ecn_set_ce(skb);
return TC_ACT_OK;

tools/testing/selftests/bpf/progs/test_tc_edt.c shows something similar.


[1]  MQ + FQ if the device is multi-queues.

   Note that this setup scales very well on SMP, since we no longer are forced
 to use a single HTB hierarchy (protected by a single spinlock)


^ permalink raw reply

* Re: libbpf distro packaging
From: Jiri Olsa @ 2019-08-26  6:42 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Julia Kartseva, Andrii Nakryiko, labbott@redhat.com,
	acme@kernel.org, debian-kernel@lists.debian.org,
	netdev@vger.kernel.org, Andrey Ignatov, Yonghong Song,
	jolsa@kernel.org, Daniel Borkmann
In-Reply-To: <a00bab9b-dae8-23d8-8de0-3751a1d1b023@fb.com>

On Fri, Aug 23, 2019 at 04:00:01PM +0000, Alexei Starovoitov wrote:
> On 8/23/19 2:22 AM, Jiri Olsa wrote:
> > btw, the libbpf GH repo tag v0.0.4 has 0.0.3 version set in Makefile:
> > 
> >    VERSION = 0
> >    PATCHLEVEL = 0
> >    EXTRAVERSION = 3
> > 
> > current code takes version from libbpf.map so it's fine,
> > but would be great to start from 0.0.5 so we don't need to
> > bother with rpm patches.. is 0.0.5 planned soon?
> 
> Technically we can bump it at any time.
> The goal was to bump it only when new kernel is released
> to capture a collection of new APIs in a given 0.0.X release.
> So that libbpf versions are synchronized with kernel versions
> in a somewhat loose way.
> In this case we can make an exception and bump it now.

I see, I don't think it's worth the exception now,
the patch is simple or we'll start with 0.0.3

jirka

^ permalink raw reply

* [PATCH v3] net: fix skb use after free in netpoll
From: Feng Sun @ 2019-08-26  6:46 UTC (permalink / raw)
  To: davem
  Cc: edumazet, dsterba, dbanerje, fw, davej, tglx, matwey,
	sakari.ailus, netdev, linux-kernel, Feng Sun, Xiaojun Zhao
In-Reply-To: <20190825.232003.1145950065287854577.davem@davemloft.net>

After commit baeababb5b85d5c4e6c917efe2a1504179438d3b
("tun: return NET_XMIT_DROP for dropped packets"),
when tun_net_xmit drops packets, it will free skb and return NET_XMIT_DROP,
netpoll_send_skb_on_dev will run into following use after free cases:
1. retry netpoll_start_xmit with freed skb;
2. queue freed skb in npinfo->txq.
queue_process will also run into use after free case.

hit netpoll_send_skb_on_dev first case with following kernel log:

[  117.864773] kernel BUG at mm/slub.c:306!
[  117.864773] invalid opcode: 0000 [#1] SMP PTI
[  117.864774] CPU: 3 PID: 2627 Comm: loop_printmsg Kdump: loaded Tainted: P           OE     5.3.0-050300rc5-generic #201908182231
[  117.864775] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[  117.864775] RIP: 0010:kmem_cache_free+0x28d/0x2b0
[  117.864781] Call Trace:
[  117.864781]  ? tun_net_xmit+0x21c/0x460
[  117.864781]  kfree_skbmem+0x4e/0x60
[  117.864782]  kfree_skb+0x3a/0xa0
[  117.864782]  tun_net_xmit+0x21c/0x460
[  117.864782]  netpoll_start_xmit+0x11d/0x1b0
[  117.864788]  netpoll_send_skb_on_dev+0x1b8/0x200
[  117.864789]  __br_forward+0x1b9/0x1e0 [bridge]
[  117.864789]  ? skb_clone+0x53/0xd0
[  117.864790]  ? __skb_clone+0x2e/0x120
[  117.864790]  deliver_clone+0x37/0x50 [bridge]
[  117.864790]  maybe_deliver+0x89/0xc0 [bridge]
[  117.864791]  br_flood+0x6c/0x130 [bridge]
[  117.864791]  br_dev_xmit+0x315/0x3c0 [bridge]
[  117.864792]  netpoll_start_xmit+0x11d/0x1b0
[  117.864792]  netpoll_send_skb_on_dev+0x1b8/0x200
[  117.864792]  netpoll_send_udp+0x2c6/0x3e8
[  117.864793]  write_msg+0xd9/0xf0 [netconsole]
[  117.864793]  console_unlock+0x386/0x4e0
[  117.864793]  vprintk_emit+0x17e/0x280
[  117.864794]  vprintk_default+0x29/0x50
[  117.864794]  vprintk_func+0x4c/0xbc
[  117.864794]  printk+0x58/0x6f
[  117.864795]  loop_fun+0x24/0x41 [printmsg_loop]
[  117.864795]  kthread+0x104/0x140
[  117.864795]  ? 0xffffffffc05b1000
[  117.864796]  ? kthread_park+0x80/0x80
[  117.864796]  ret_from_fork+0x35/0x40

Signed-off-by: Feng Sun <loyou85@gmail.com>
Signed-off-by: Xiaojun Zhao <xiaojunzhao141@gmail.com>
---
Changes in v3:
- use dev_xmit_complete directly

Changes in v2:
- change commit and title
- add netpoll_xmit_complete helper
- add one more return value check of netpoll_start_xmit
---
 net/core/netpoll.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 2cf27da..849380a6 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -122,7 +122,7 @@ static void queue_process(struct work_struct *work)
 		txq = netdev_get_tx_queue(dev, q_index);
 		HARD_TX_LOCK(dev, txq, smp_processor_id());
 		if (netif_xmit_frozen_or_stopped(txq) ||
-		    netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
+		    !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) {
 			skb_queue_head(&npinfo->txq, skb);
 			HARD_TX_UNLOCK(dev, txq);
 			local_irq_restore(flags);
@@ -335,7 +335,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 
 				HARD_TX_UNLOCK(dev, txq);
 
-				if (status == NETDEV_TX_OK)
+				if (dev_xmit_complete(status))
 					break;
 
 			}
@@ -352,7 +352,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 
 	}
 
-	if (status != NETDEV_TX_OK) {
+	if (!dev_xmit_complete(status)) {
 		skb_queue_tail(&npinfo->txq, skb);
 		schedule_delayed_work(&npinfo->tx_work,0);
 	}
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH] bpf: handle 32-bit zext during constant blinding
From: Naveen N. Rao @ 2019-08-26  6:59 UTC (permalink / raw)
  To: Jiong Wang, Alexei Starovoitov, Daniel Borkmann
  Cc: bpf, linux-kernel, linuxppc-dev, Michael Ellerman, netdev
In-Reply-To: <87zhk2faqg.fsf@netronome.com>

Jiong Wang wrote:
> 
> Naveen N. Rao writes:
> 
>> Since BPF constant blinding is performed after the verifier pass, the
>> ALU32 instructions inserted for doubleword immediate loads don't have a
>> corresponding zext instruction. This is causing a kernel oops on powerpc
>> and can be reproduced by running 'test_cgroup_storage' with
>> bpf_jit_harden=2.
>>
>> Fix this by emitting BPF_ZEXT during constant blinding if
>> prog->aux->verifier_zext is set.
>>
>> Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result")
>> Reported-by: Michael Ellerman <mpe@ellerman.id.au>
>> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> 
> Thanks for the fix.
> 
> Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
> 
> Just two other comments during review in case I am wrong on somewhere.
> 
>   - Use verifier_zext instead of bpf_jit_needs_zext() seems better, even
>     though the latter could avoid extending function argument.
> 
>     Because JIT back-ends look at verifier_zext, true means zext inserted
>     by verifier so JITs won't do the code-gen.
> 
>     Use verifier_zext is sort of keeping JIT blinding the same behaviour
>     as verifier even though blinding doesn't belong to verifier, but for
>     such insn patching, it could be seen as an extension of verifier,
>     therefore use verifier_zext seems better than bpf_jit_needs_zext() to
>     me.
>    
>   - JIT blinding is also escaping the HI32 randomization which happens
>     inside verifier, otherwise x86-64 regression should have caught this issue.

Jiong,
Thanks for the review.

Alexei, Daniel,
Can you please pick this up for v5.3? This is a regression and is 
causing a crash on powerpc.


- Naveen


^ permalink raw reply

* Re: [PATCH v2 -next] net: mediatek: remove set but not used variable 'status'
From: René van Dorst @ 2019-08-26  7:10 UTC (permalink / raw)
  To: Mao Wenan, sr
  Cc: nbd, john, sean.wang, nelson.chang, davem, matthias.bgg,
	kernel-janitors, netdev, linux-arm-kernel, linux-mediatek,
	linux-kernel
In-Reply-To: <20190826013118.22720-1-maowenan@huawei.com>

Let's add Stefan to the conversation.
He is the author of this commit.

Quoting Mao Wenan <maowenan@huawei.com>:

> Fixes gcc '-Wunused-but-set-variable' warning:
> drivers/net/ethernet/mediatek/mtk_eth_soc.c: In function mtk_handle_irq:
> drivers/net/ethernet/mediatek/mtk_eth_soc.c:1951:6: warning:  
> variable status set but not used [-Wunused-but-set-variable]
>
> Fixes: 296c9120752b ("net: ethernet: mediatek: Add MT7628/88 SoC support")
> Signed-off-by: Mao Wenan <maowenan@huawei.com>
> ---
>  v2: change format of 'Fixes' tag.
>  drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c  
> b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
> index 8ddbb8d..bb7d623 100644
> --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
> +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
> @@ -1948,9 +1948,7 @@ static irqreturn_t mtk_handle_irq_tx(int irq,  
> void *_eth)
>  static irqreturn_t mtk_handle_irq(int irq, void *_eth)
>  {
>  	struct mtk_eth *eth = _eth;
> -	u32 status;
>
> -	status = mtk_r32(eth, MTK_PDMA_INT_STATUS);

Hi Stefan,

You added an extra MTK_PDMA_INT_STATUS read in mtk_handle_irq()
Is that read necessary to work properly?

Greats,

René


>  	if (mtk_r32(eth, MTK_PDMA_INT_MASK) & MTK_RX_DONE_INT) {
>  		if (mtk_r32(eth, MTK_PDMA_INT_STATUS) & MTK_RX_DONE_INT)
>  			mtk_handle_irq_rx(irq, _eth);
> --
> 2.7.4





^ permalink raw reply

* [PATCH v4] rtw88: pci: Move a mass of jobs in hw IRQ to soft IRQ
From: Jian-Hong Pan @ 2019-08-26  7:08 UTC (permalink / raw)
  To: Yan-Hsuan Chuang, Kalle Valo, David S . Miller
  Cc: linux-wireless, netdev, linux-kernel, linux, Jian-Hong Pan
In-Reply-To: <F7CD281DE3E379468C6D07993EA72F84D18A5786@RTITMBSVM04.realtek.com.tw>

Originally, there is a mass of jobs between spin lock and unlock in the
hardware IRQ handler, which takes a lot of time. To make the system work
more efficiently, this patch moves the jobs to the soft IRQ (bottom half)
to reduce the time spent in the hardware IRQ.

Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
---
v2:
 Change the spin_lock_irqsave/unlock_irqrestore to spin_lock/unlock in
 rtw_pci_interrupt_handler. Because the interrupts are already disabled
 in the hardware interrupt handler.

v3:
 Extend the spin lock protecting area for the TX path in
 rtw_pci_interrupt_threadfn by Realtek's suggestion

v4:
 Remove the WiFi running check in rtw_pci_interrupt_threadfn to avoid AP
 connection failed by Realtek's suggestion.

 drivers/net/wireless/realtek/rtw88/pci.c | 32 +++++++++++++++++++-----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c
index 00ef229552d5..955dd6c6fb57 100644
--- a/drivers/net/wireless/realtek/rtw88/pci.c
+++ b/drivers/net/wireless/realtek/rtw88/pci.c
@@ -866,12 +866,29 @@ static irqreturn_t rtw_pci_interrupt_handler(int irq, void *dev)
 {
 	struct rtw_dev *rtwdev = dev;
 	struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv;
-	u32 irq_status[4];
 
 	spin_lock(&rtwpci->irq_lock);
 	if (!rtwpci->irq_enabled)
 		goto out;
 
+	/* disable RTW PCI interrupt to avoid more interrupts before the end of
+	 * thread function
+	 */
+	rtw_pci_disable_interrupt(rtwdev, rtwpci);
+out:
+	spin_unlock(&rtwpci->irq_lock);
+
+	return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t rtw_pci_interrupt_threadfn(int irq, void *dev)
+{
+	struct rtw_dev *rtwdev = dev;
+	struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv;
+	unsigned long flags;
+	u32 irq_status[4];
+
+	spin_lock_irqsave(&rtwpci->irq_lock, flags);
 	rtw_pci_irq_recognized(rtwdev, rtwpci, irq_status);
 
 	if (irq_status[0] & IMR_MGNTDOK)
@@ -891,8 +908,9 @@ static irqreturn_t rtw_pci_interrupt_handler(int irq, void *dev)
 	if (irq_status[0] & IMR_ROK)
 		rtw_pci_rx_isr(rtwdev, rtwpci, RTW_RX_QUEUE_MPDU);
 
-out:
-	spin_unlock(&rtwpci->irq_lock);
+	/* all of the jobs for this interrupt have been done */
+	rtw_pci_enable_interrupt(rtwdev, rtwpci);
+	spin_unlock_irqrestore(&rtwpci->irq_lock, flags);
 
 	return IRQ_HANDLED;
 }
@@ -1152,8 +1170,10 @@ static int rtw_pci_probe(struct pci_dev *pdev,
 		goto err_destroy_pci;
 	}
 
-	ret = request_irq(pdev->irq, &rtw_pci_interrupt_handler,
-			  IRQF_SHARED, KBUILD_MODNAME, rtwdev);
+	ret = devm_request_threaded_irq(rtwdev->dev, pdev->irq,
+					rtw_pci_interrupt_handler,
+					rtw_pci_interrupt_threadfn,
+					IRQF_SHARED, KBUILD_MODNAME, rtwdev);
 	if (ret) {
 		ieee80211_unregister_hw(hw);
 		goto err_destroy_pci;
@@ -1192,7 +1212,7 @@ static void rtw_pci_remove(struct pci_dev *pdev)
 	rtw_pci_disable_interrupt(rtwdev, rtwpci);
 	rtw_pci_destroy(rtwdev, pdev);
 	rtw_pci_declaim(rtwdev, pdev);
-	free_irq(rtwpci->pdev->irq, rtwdev);
+	devm_free_irq(rtwdev->dev, rtwpci->pdev->irq, rtwdev);
 	rtw_core_deinit(rtwdev);
 	ieee80211_free_hw(hw);
 }
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH v3] rtw88: pci: Move a mass of jobs in hw IRQ to soft IRQ
From: Jian-Hong Pan @ 2019-08-26  7:21 UTC (permalink / raw)
  To: Tony Chuang
  Cc: Kalle Valo, David S . Miller, linux-wireless@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux@endlessm.com
In-Reply-To: <F7CD281DE3E379468C6D07993EA72F84D18A5786@RTITMBSVM04.realtek.com.tw>

Tony Chuang <yhchuang@realtek.com> 於 2019年8月21日 週三 下午4:16寫道：
>
> Hi,
>
> > From: Jian-Hong Pan [mailto:jian-hong@endlessm.com]
> >
> > There is a mass of jobs between spin lock and unlock in the hardware
> > IRQ which will occupy much time originally. To make system work more
> > efficiently, this patch moves the jobs to the soft IRQ (bottom half) to
> > reduce the time in hardware IRQ.
> >
> > Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
> > ---
> > v2:
> >  Change the spin_lock_irqsave/unlock_irqrestore to spin_lock/unlock in
> >  rtw_pci_interrupt_handler. Because the interrupts are already disabled
> >  in the hardware interrupt handler.
> >
> > v3:
> >  Extend the spin lock protecting area for the TX path in
> >  rtw_pci_interrupt_threadfn by Realtek's suggestion
> >
> >  drivers/net/wireless/realtek/rtw88/pci.c | 33 +++++++++++++++++++-----
> >  1 file changed, 27 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/net/wireless/realtek/rtw88/pci.c
> > b/drivers/net/wireless/realtek/rtw88/pci.c
> > index 00ef229552d5..a8c17a01f318 100644
> > --- a/drivers/net/wireless/realtek/rtw88/pci.c
> > +++ b/drivers/net/wireless/realtek/rtw88/pci.c
> > @@ -866,12 +866,29 @@ static irqreturn_t rtw_pci_interrupt_handler(int irq,
> > void *dev)
> >  {
> >       struct rtw_dev *rtwdev = dev;
> >       struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv;
> > -     u32 irq_status[4];
> >
> >       spin_lock(&rtwpci->irq_lock);
> >       if (!rtwpci->irq_enabled)
> >               goto out;
> >
> > +     /* disable RTW PCI interrupt to avoid more interrupts before the end of
> > +      * thread function
> > +      */
> > +     rtw_pci_disable_interrupt(rtwdev, rtwpci);
> > +out:
> > +     spin_unlock(&rtwpci->irq_lock);
> > +
> > +     return IRQ_WAKE_THREAD;
> > +}
> > +
> > +static irqreturn_t rtw_pci_interrupt_threadfn(int irq, void *dev)
> > +{
> > +     struct rtw_dev *rtwdev = dev;
> > +     struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv;
> > +     unsigned long flags;
> > +     u32 irq_status[4];
> > +
> > +     spin_lock_irqsave(&rtwpci->irq_lock, flags);
> >       rtw_pci_irq_recognized(rtwdev, rtwpci, irq_status);
> >
> >       if (irq_status[0] & IMR_MGNTDOK)
> > @@ -891,8 +908,10 @@ static irqreturn_t rtw_pci_interrupt_handler(int irq,
> > void *dev)
> >       if (irq_status[0] & IMR_ROK)
> >               rtw_pci_rx_isr(rtwdev, rtwpci, RTW_RX_QUEUE_MPDU);
> >
> > -out:
> > -     spin_unlock(&rtwpci->irq_lock);
> > +     /* all of the jobs for this interrupt have been done */
> > +     if (rtw_flag_check(rtwdev, RTW_FLAG_RUNNING))
> > +             rtw_pci_enable_interrupt(rtwdev, rtwpci);
>
> I've applied this patch and tested it.
> But I failed to connect to AP, it seems that the
> scan_result is empty. And when I failed to connect
> to the AP, I found that the IMR is not enabled.
> I guess the check bypass the interrupt enable function.
>
> And I also found that *without MSI*, the driver is
> able to connect to the AP. Could you please verify
> this patch again with MSI interrupt enabled and
> send a v4?
>
> You can find my MSI patch on
> https://patchwork.kernel.org/patch/11081539/

I have just sent v4 patch.  Also tested the modified MSI patch like below:
The WiFi works fine on ASUS X512DK (including MSI enabled).

Jian-Hong Pan

diff --git a/drivers/net/wireless/realtek/rtw88/pci.c
b/drivers/net/wireless/realtek/rtw88/pci.c
index 955dd6c6fb57..d18f5aae1585 100644
--- a/drivers/net/wireless/realtek/rtw88/pci.c
+++ b/drivers/net/wireless/realtek/rtw88/pci.c
@@ -11,6 +11,10 @@
 #include "fw.h"
 #include "debug.h"

+static bool rtw_disable_msi;
+module_param_named(disable_msi, rtw_disable_msi, bool, 0644);
+MODULE_PARM_DESC(disable_msi, "Set Y to disable MSI interrupt support");
+
 static u32 rtw_pci_tx_queue_idx_addr[] = {
     [RTW_TX_QUEUE_BK]    = RTK_PCI_TXBD_IDX_BKQ,
     [RTW_TX_QUEUE_BE]    = RTK_PCI_TXBD_IDX_BEQ,
@@ -1116,6 +1120,48 @@ static struct rtw_hci_ops rtw_pci_ops = {
     .write_data_h2c = rtw_pci_write_data_h2c,
 };

+static int rtw_pci_request_irq(struct rtw_dev *rtwdev, struct pci_dev *pdev)
+{
+    struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv;
+    int ret;
+
+    if (!rtw_disable_msi) {
+        ret = pci_enable_msi(pdev);
+        if (ret) {
+            rtw_warn(rtwdev, "failed to enable msi %d, using legacy irq\n",
+                 ret);
+        } else {
+            rtw_warn(rtwdev, "pci msi enabled\n");
+            rtwpci->msi_enabled = true;
+        }
+    }
+
+    ret = devm_request_threaded_irq(rtwdev->dev, pdev->irq,
+                    rtw_pci_interrupt_handler,
+                    rtw_pci_interrupt_threadfn,
+                    IRQF_SHARED, KBUILD_MODNAME, rtwdev);
+    if (ret) {
+        rtw_err(rtwdev, "failed to request irq %d\n", ret);
+        if (rtwpci->msi_enabled) {
+            pci_disable_msi(pdev);
+            rtwpci->msi_enabled = false;
+        }
+    }
+
+    return ret;
+}
+
+static void rtw_pci_free_irq(struct rtw_dev *rtwdev, struct pci_dev *pdev)
+{
+    struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv;
+
+    devm_free_irq(rtwdev->dev, rtwpci->pdev->irq, rtwdev);
+    if (rtwpci->msi_enabled) {
+        pci_disable_msi(pdev);
+        rtwpci->msi_enabled = false;
+    }
+}
+
 static int rtw_pci_probe(struct pci_dev *pdev,
              const struct pci_device_id *id)
 {
@@ -1170,10 +1216,7 @@ static int rtw_pci_probe(struct pci_dev *pdev,
         goto err_destroy_pci;
     }

-    ret = devm_request_threaded_irq(rtwdev->dev, pdev->irq,
-                    rtw_pci_interrupt_handler,
-                    rtw_pci_interrupt_threadfn,
-                    IRQF_SHARED, KBUILD_MODNAME, rtwdev);
+    ret = rtw_pci_request_irq(rtwdev, pdev);
     if (ret) {
         ieee80211_unregister_hw(hw);
         goto err_destroy_pci;
@@ -1212,7 +1255,7 @@ static void rtw_pci_remove(struct pci_dev *pdev)
     rtw_pci_disable_interrupt(rtwdev, rtwpci);
     rtw_pci_destroy(rtwdev, pdev);
     rtw_pci_declaim(rtwdev, pdev);
-    devm_free_irq(rtwdev->dev, rtwpci->pdev->irq, rtwdev);
+    rtw_pci_free_irq(rtwdev, pdev);
     rtw_core_deinit(rtwdev);
     ieee80211_free_hw(hw);
 }
diff --git a/drivers/net/wireless/realtek/rtw88/pci.h
b/drivers/net/wireless/realtek/rtw88/pci.h
index 87824a4caba9..a8e369c5eaca 100644
--- a/drivers/net/wireless/realtek/rtw88/pci.h
+++ b/drivers/net/wireless/realtek/rtw88/pci.h
@@ -186,6 +186,7 @@ struct rtw_pci {
     spinlock_t irq_lock;
     u32 irq_mask[4];
     bool irq_enabled;
+    bool msi_enabled;

     u16 rx_tag;
     struct rtw_pci_tx_ring tx_rings[RTK_MAX_TX_QUEUE_NUM];


> > +     spin_unlock_irqrestore(&rtwpci->irq_lock, flags);
> >
> >       return IRQ_HANDLED;
> >  }
> > @@ -1152,8 +1171,10 @@ static int rtw_pci_probe(struct pci_dev *pdev,
> >               goto err_destroy_pci;
> >       }
> >
> > -     ret = request_irq(pdev->irq, &rtw_pci_interrupt_handler,
> > -                       IRQF_SHARED, KBUILD_MODNAME, rtwdev);
> > +     ret = devm_request_threaded_irq(rtwdev->dev, pdev->irq,
> > +                                     rtw_pci_interrupt_handler,
> > +                                     rtw_pci_interrupt_threadfn,
> > +                                     IRQF_SHARED, KBUILD_MODNAME, rtwdev);
> >       if (ret) {
> >               ieee80211_unregister_hw(hw);
> >               goto err_destroy_pci;
> > @@ -1192,7 +1213,7 @@ static void rtw_pci_remove(struct pci_dev *pdev)
> >       rtw_pci_disable_interrupt(rtwdev, rtwpci);
> >       rtw_pci_destroy(rtwdev, pdev);
> >       rtw_pci_declaim(rtwdev, pdev);
> > -     free_irq(rtwpci->pdev->irq, rtwdev);
> > +     devm_free_irq(rtwdev->dev, rtwpci->pdev->irq, rtwdev);
> >       rtw_core_deinit(rtwdev);
> >       ieee80211_free_hw(hw);
> >  }
> > --
>
>
> NACK
> Need to verify it with MSI https://patchwork.kernel.org/patch/11081539/
> And hope v4 could fix it.
>
> Yan-Hsuan
>

^ permalink raw reply related

* [PATCH] net: Adding parameter detection in __ethtool_get_link_ksettings.
From: Dongxu Liu @ 2019-08-26  7:23 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-kernel

The __ethtool_get_link_ksettings symbol will be exported,
and external users may use an illegal address.
We should check the parameters before using them,
otherwise the system will crash.

[ 8980.991134] BUG: unable to handle kernel NULL pointer dereference at           (null)
[ 8980.993049] IP: [<ffffffff8155aca7>] __ethtool_get_link_ksettings+0x27/0x140
[ 8980.994285] PGD 0
[ 8980.995013] Oops: 0000 [#1] SMP
[ 8980.995896] Modules linked in: sch_ingress ...
[ 8981.013220] CPU: 3 PID: 25174 Comm: kworker/3:3 Tainted: G           O   ----V-------   3.10.0-327.36.58.4.x86_64 #1
[ 8981.017667] Workqueue: events linkwatch_event
[ 8981.018652] task: ffff8800a8348000 ti: ffff8800b045c000 task.ti: ffff8800b045c000
[ 8981.020418] RIP: 0010:[<ffffffff8155aca7>]  [<ffffffff8155aca7>] __ethtool_get_link_ksettings+0x27/0x140
[ 8981.022383] RSP: 0018:ffff8800b045fc88  EFLAGS: 00010202
[ 8981.023453] RAX: 0000000000000000 RBX: ffff8800b045fcac RCX: 0000000000000000
[ 8981.024726] RDX: ffff8800b658f600 RSI: ffff8800b045fcac RDI: ffff8802296e0000
[ 8981.026000] RBP: ffff8800b045fc98 R08: 0000000000000000 R09: 0000000000000001
[ 8981.027273] R10: 00000000000073e0 R11: 0000082b0cc8adea R12: ffff8802296e0000
[ 8981.028561] R13: ffff8800b566e8c0 R14: ffff8800b658f600 R15: ffff8800b566e000
[ 8981.029841] FS:  0000000000000000(0000) GS:ffff88023ed80000(0000) knlGS:0000000000000000
[ 8981.031715] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 8981.032845] CR2: 0000000000000000 CR3: 00000000b39a9000 CR4: 00000000003407e0
[ 8981.034137] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 8981.035427] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 8981.036702] Stack:
[ 8981.037406]  ffff8800b658f600 0000000000009c40 ffff8800b045fce8 ffffffffa047a71d
[ 8981.039238]  000000000000004d ffff8800b045fcc8 ffff8800b045fd28 ffffffff815cb198
[ 8981.041070]  ffff8800b045fcd8 ffffffff810807e6 00000000e8212951 0000000000000001
[ 8981.042910] Call Trace:
[ 8981.043660]  [<ffffffffa047a71d>] bond_update_speed_duplex+0x3d/0x90 [bonding]
[ 8981.045424]  [<ffffffff815cb198>] ? inetdev_event+0x38/0x530
[ 8981.046554]  [<ffffffff810807e6>] ? put_online_cpus+0x56/0x80
[ 8981.047688]  [<ffffffffa0480d67>] bond_netdev_event+0x137/0x360 [bonding]
...

Signed-off-by: Dongxu Liu <liudongxu3@huawei.com>
---
 net/core/ethtool.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6288e69..9a50b64 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -545,6 +545,8 @@ int __ethtool_get_link_ksettings(struct net_device *dev,
 {
 	ASSERT_RTNL();
 
+	if (!dev || !dev->ethtool_ops)
+		return -EOPNOTSUPP;
 	if (!dev->ethtool_ops->get_link_ksettings)
 		return -EOPNOTSUPP;
 
-- 
2.12.3



^ permalink raw reply related

* Re: [PATCH v2 -next] net: mediatek: remove set but not used variable 'status'
From: Stefan Roese @ 2019-08-26  7:27 UTC (permalink / raw)
  To: René van Dorst, Mao Wenan
  Cc: nbd, john, sean.wang, nelson.chang, davem, matthias.bgg,
	kernel-janitors, netdev, linux-arm-kernel, linux-mediatek,
	linux-kernel
In-Reply-To: <20190826071048.Horde.gwS9nzceYYiYGJLnJ6-x2hz@www.vdorst.com>

Hi!

On 26.08.19 09:10, René van Dorst wrote:
> Let's add Stefan to the conversation.
> He is the author of this commit.

Thanks Rene.
  
> Quoting Mao Wenan <maowenan@huawei.com>:
> 
>> Fixes gcc '-Wunused-but-set-variable' warning:
>> drivers/net/ethernet/mediatek/mtk_eth_soc.c: In function mtk_handle_irq:
>> drivers/net/ethernet/mediatek/mtk_eth_soc.c:1951:6: warning:
>> variable status set but not used [-Wunused-but-set-variable]
>>
>> Fixes: 296c9120752b ("net: ethernet: mediatek: Add MT7628/88 SoC support")
>> Signed-off-by: Mao Wenan <maowenan@huawei.com>
>> ---
>>   v2: change format of 'Fixes' tag.
>>   drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 --
>>   1 file changed, 2 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
>> b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
>> index 8ddbb8d..bb7d623 100644
>> --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
>> +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
>> @@ -1948,9 +1948,7 @@ static irqreturn_t mtk_handle_irq_tx(int irq,
>> void *_eth)
>>   static irqreturn_t mtk_handle_irq(int irq, void *_eth)
>>   {
>>   	struct mtk_eth *eth = _eth;
>> -	u32 status;
>>
>> -	status = mtk_r32(eth, MTK_PDMA_INT_STATUS);
> 
> Hi Stefan,
> 
> You added an extra MTK_PDMA_INT_STATUS read in mtk_handle_irq()
> Is that read necessary to work properly?

No, it's not needed. This read must have "slipped in" from some earlier
patch versions and I forgot to remove it later. Thanks for catching it.

Reviewed-by: Stefan Roese <sr@denx.de>

Thanks,
Stefan

^ permalink raw reply

* Re: Unable to create htb tc classes more than 64K
From: Toke Høiland-Jørgensen @ 2019-08-26  7:28 UTC (permalink / raw)
  To: Eric Dumazet, Cong Wang, Akshat Kakkar
  Cc: Anton Danilov, NetFilter, lartc, netdev
In-Reply-To: <9cbefe10-b172-ae2a-0ac7-d972468eb7a2@gmail.com>

Eric Dumazet <eric.dumazet@gmail.com> writes:

> On 8/25/19 7:52 PM, Cong Wang wrote:
>> On Wed, Aug 21, 2019 at 11:00 PM Akshat Kakkar <akshat.1984@gmail.com> wrote:
>>>
>>> On Thu, Aug 22, 2019 at 3:37 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>> I am using ipset +  iptables to classify and not filters. Besides, if
>>>>> tc is allowing me to define qdisc -> classes -> qdsic -> classes
>>>>> (1,2,3 ...) sort of structure (ie like the one shown in ascii tree)
>>>>> then how can those lowest child classes be actually used or consumed?
>>>>
>>>> Just install tc filters on the lower level too.
>>>
>>> If I understand correctly, you are saying,
>>> instead of :
>>> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
>>> 0x00000001 fw flowid 1:10
>>> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
>>> 0x00000002 fw flowid 1:20
>>> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
>>> 0x00000003 fw flowid 2:10
>>> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
>>> 0x00000004 fw flowid 2:20
>>>
>>>
>>> I should do this: (i.e. changing parent to just immediate qdisc)
>>> tc filter add dev eno2 parent 1: protocol ip prio 1 handle 0x00000001
>>> fw flowid 1:10
>>> tc filter add dev eno2 parent 1: protocol ip prio 1 handle 0x00000002
>>> fw flowid 1:20
>>> tc filter add dev eno2 parent 2: protocol ip prio 1 handle 0x00000003
>>> fw flowid 2:10
>>> tc filter add dev eno2 parent 2: protocol ip prio 1 handle 0x00000004
>>> fw flowid 2:20
>> 
>> 
>> Yes, this is what I meant.
>> 
>> 
>>>
>>> I tried this previously. But there is no change in the result.
>>> Behaviour is exactly same, i.e. I am still getting 100Mbps and not
>>> 100kbps or 300kbps
>>>
>>> Besides, as I mentioned previously I am using ipset + skbprio and not
>>> filters stuff. Filters I used just to test.
>>>
>>> ipset  -N foo hash:ip,mark skbinfo
>>>
>>> ipset -A foo 10.10.10.10, 0x0x00000001 skbprio 1:10
>>> ipset -A foo 10.10.10.20, 0x0x00000002 skbprio 1:20
>>> ipset -A foo 10.10.10.30, 0x0x00000003 skbprio 2:10
>>> ipset -A foo 10.10.10.40, 0x0x00000004 skbprio 2:20
>>>
>>> iptables -A POSTROUTING -j SET --map-set foo dst,dst --map-prio
>> 
>> Hmm..
>> 
>> I am not familiar with ipset, but it seems to save the skbprio into
>> skb->priority, so it doesn't need TC filter to classify it again.
>> 
>> I guess your packets might go to the direct queue of HTB, which
>> bypasses the token bucket. Can you dump the stats and check?
>
> With more than 64K 'classes' I suggest to use a single FQ qdisc [1], and
> an eBPF program using EDT model (Earliest Departure Time)
>
> The BPF program would perform the classification, then find a data structure
> based on the 'class', and then update/maintain class virtual times and skb->tstamp
>
> TBF = bpf_map_lookup_elem(&map, &classid);
>
> uint64_t now = bpf_ktime_get_ns();
> uint64_t time_to_send = max(TBF->time_to_send, now);
>
> time_to_send += (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC / TBF->rate;
> if (time_to_send > TBF->max_horizon) {
>     return TC_ACT_SHOT;
> }
> TBF->time_to_send = time_to_send;
> skb->tstamp = max(time_to_send, skb->tstamp);
> if (time_to_send - now > TBF->ecn_horizon)
>     bpf_skb_ecn_set_ce(skb);
> return TC_ACT_OK;
>
> tools/testing/selftests/bpf/progs/test_tc_edt.c shows something similar.
>
>
> [1]  MQ + FQ if the device is multi-queues.
>
>    Note that this setup scales very well on SMP, since we no longer are forced
>  to use a single HTB hierarchy (protected by a single spinlock)

Wow, this is very cool! Thanks for that walk-through, Eric :)

-Toke

^ permalink raw reply

* Re: Re: [PATCH net-next 0/1] Add BASE-T1 PHY support
From: Christian Herber @ 2019-08-26  7:57 UTC (permalink / raw)
  To: Heiner Kallweit, Andrew Lunn
  Cc: davem@davemloft.net, Florian Fainelli, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <1f50cdcf-200d-7c25-35ae-aee011a6a520@gmail.com>

On 24.08.2019 17:03, Heiner Kallweit wrote:
> 
> On 22.08.2019 09:18, Christian Herber wrote:
>> On 21.08.2019 20:57, Andrew Lunn wrote:
>>>
>>>> The current patch set IMO is a little bit hacky. I'm not 100% happy
>>>> with the implicit assumption that there can't be devices supporting
>>>> T1 and classic BaseT modes or fiber modes.
>>>
>>>> Andrew: Do you have an opinion on that?
>>>
>>> Hi Heiner
>>>
>>> I would also like cleaner integration. I doubt here is anything in the
>>> standard which says you cannot combine these modes. It is more a
>>> marketing question if anybody would build such a device. Maybe not
>>> directly into a vehicle, but you could imaging a mobile test device
>>> which uses T1 to talk to the car and T4 to connect to the garage
>>> network?
>>>
>>> So i don't think we should limit ourselves. phylib should provide a
>>> clean, simple set of helpers to perform standard operations for
>>> various modes. Drivers can make use of those helpers. That much should
>>> be clear. If we try to make genphy support them all simultaneously, is
>>> less clear.
>>>
>>>        Andrew
>>>
>>
>> If you want to go down this path, then i think we have to ask some more
>> questions. Clause 45 is a very scalable register scheme, it is not a
>> specific class of devices and will be extended and extended.
>>
>> Currently, the phy-c45.c supports 10/100/1000/2500/5000/10000 Mbps
>> consumer/enterprise PHYs. This is also an implicit assumption. The
>> register set (e.g. on auto-neg) used for this will also only support
>> these modes and nothing more, as it is done scaling.
>>
>> Currently not supported, but already present in IEEE 802.3:
>> - MultiGBASE-T (25/40 Gbps) (see e.g. MultiGBASE-T AN control 1 register)
>> - BASE-T1
>> - 10BASE-T1
>> - NGBASE-T1
>>
>> And surely there are some on the way or already there that I am not
>> aware of.
>>
>> To me, one architectural decision point is if you want to have generic
>> support for all C45 PHYs in one file, or if you want to split it by
>> device class. I went down the first path with my patch, as this is the
>> road gone also with the existing code.
>>
>> If you want to split BASE-T1, i think you will need one basic C45
>> library (genphy_c45_pma_read_abilities() is a good example of a function
>> that is not specific to a device class). On the other hand,
>> genphy_c45_pma_setup_forced() is not a generic function at this point as
>> it supports only a subset of devices managed in C45.
>>
>> I tend to agree with you that splitting is the best way to go in the
>> long run, but that also requires a split of the existing phy-c45.c into
>> two IMHO.
>>
> BASE-T1 seems to be based on Clause 45 (at least Clause 45 MDIO),
> but it's not fully compliant with Clause 45. Taking AN link status
> as an example: 45.2.7.2.7 states that link-up is signaled in bit 7.1.2.
> If BASE-T1 uses a different register, then it's not fully Clause 45
> compatible.

Clause 45 defines e.g. bit 7.1.2 just like it defines the BASE-T1 
auto-neg registers. Any bit that i have used in my patch is 100% 
standardized in IEEE 802.3-2018, Clause 45. By definition, BASE-T1 PHYs 
have to use the Clause 45 BASE-T1 registers, otherwise they are not IEEE 
compliant.

> Therefore also my question for the datasheet of an actual BASE-T1 PHY,
> as I would be curious whether it shadows the link-up bit from 7.513.2
> to 7.1.2 to be Clause 45 compliant. Definitely reading bit 7.513.2
> is nothing that belongs into a genphy_c45_ function.

For now, there is no such public data sheet. However, IEEE 802.3-2018 is 
public. This should be the basis for a generic driver. Datasheets are 
needed for the device specific drivers. If Linux cares to support 
BASE-T1, it should implement a driver that works with a standard 
compliant PHY and that can be done on the basis of IEEE.

> The extension to genphy_c45_pma_read_abilities() looks good to me,
> for the other parts I'd like to see first how real world BASE-T1 PHYs
> handle it. If they shadow the T1-specific bits to the Clause 45
> standard ones, we should be fine. Otherwise IMO we have to add
> separate T1 functions to phylib.
> 
> Heiner
> 

There is no requirement in IEEE to shadow the BASE-T1 registers into 
the "standard" ones. Thus, such an assumption should not be made for a 
generic driver, as it will quite certainly not work with all devices. 
Fyi, both registers are standard, just that the historically first ones 
are for classic 10/100/1000/10000 PHYs and BASE-T1 registers are for the 
single twisted pair PHYs.

^ permalink raw reply

* Re: [PATCH] net: intel: Cleanup e1000 - add space between }}
From: Jeff Kirsher @ 2019-08-26  8:03 UTC (permalink / raw)
  To: Forrest Fleming; +Cc: David S. Miller, intel-wired-lan, netdev, linux-kernel
In-Reply-To: <20190823191421.3318-1-ffleming@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 695 bytes --]

On Fri, 2019-08-23 at 19:14 +0000, Forrest Fleming wrote:
> suggested by checkpatch
> 
> Signed-off-by: Forrest Fleming <ffleming@gmail.com>
> ---
>  .../net/ethernet/intel/e1000/e1000_param.c    | 28 +++++++++----------
>  1 file changed, 14 insertions(+), 14 deletions(-)

While I do not see an issue with this change, I wonder how important it is
to make such a change.  Especially since most of the hardware supported by
this driver is not available for testing.  In addition, this is one
suggested change by checkpatch.pl that I personally do not agree with.

This is not a hard NAK, but you have to explain how this change makes the
code more readable before I consider it.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH] bridge:fragmented packets dropped by bridge
From: Jan Engelhardt @ 2019-08-26  7:59 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Rundong Ge, davem, kuznet, yoshfuji, netdev, pablo, kadlec, roopa,
	netfilter-devel, coreteam, bridge, nikolay, linux-kernel
In-Reply-To: <20190730123542.zrsrfvcy7t2n3d4g@breakpoint.cc>


On Tuesday 2019-07-30 14:35, Florian Westphal wrote:
>Rundong Ge <rdong.ge@gmail.com> wrote:
>> Given following setup:
>> -modprobe br_netfilter
>> -echo '1' > /proc/sys/net/bridge/bridge-nf-call-iptables
>> -brctl addbr br0
>> -brctl addif br0 enp2s0
>> -brctl addif br0 enp3s0
>> -brctl addif br0 enp6s0
>> -ifconfig enp2s0 mtu 1300
>> -ifconfig enp3s0 mtu 1500
>> -ifconfig enp6s0 mtu 1500
>> -ifconfig br0 up
>> 
>>                  multi-port
>> mtu1500 - mtu1500|bridge|1500 - mtu1500
>>   A                  |            B
>>                    mtu1300
>
>How can a bridge forward a frame from A/B to mtu1300?

There might be a misunderstanding here judging from the shortness of this
thread.

I understood it such that the bridge ports (eth0,eth1) have MTU 1500, yet br0
(in essence the third bridge port if you so wish) itself has MTU 1300.

Therefore, frame forwarding from eth0 to eth1 should succeed, since the
1300-byte MTU is only relevant if the bridge decides the packet needs to be
locally delivered.

^ permalink raw reply

* [PATCH v2 0/3] Add NETIF_F_HW_BR_CAP feature
From: Horatiu Vultur @ 2019-08-26  8:11 UTC (permalink / raw)
  To: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, andrew, f.fainelli, netdev, linux-kernel, bridge
  Cc: Horatiu Vultur

When a network port is added to a bridge, the port is put into
promisc mode. Some HW has bridge capabilities (it can learn, forward,
flood, etc. the frames), and its network driver disables promisc mode
when the port is added to the SW bridge.

This patch adds the feature NETIF_F_HW_BR_CAP so that the network ports
that have this feature will not be set in promisc mode when they are
added to a SW bridge.

In this way the HW that has bridge capabilities doesn't need to send all
the traffic to the CPU and can also implement the promisc mode and toggle
it using the command 'ip link set dev swp promisc on'.

v1 -> v2
  - rename feature to NETIF_F_HW_BR_CAP
  - add better description in the commit message and in the code
  - remove the check that all network driver have same netdev_ops and
    just check for the feature NETIF_F_HW_BR_CAP when setting the network
    port in promisc mode.

Horatiu Vultur (3):
  net: Add NETIF_HW_BR_CAP feature
  net: mscc: Use NETIF_F_HW_BR_CAP
  net: mscc: Implement promisc mode.

 drivers/net/ethernet/mscc/ocelot.c | 26 ++++++++++++++++++++++++--
 include/linux/netdev_features.h    |  6 ++++++
 net/bridge/br_if.c                 | 11 ++++++++++-
 net/core/ethtool.c                 |  1 +
 4 files changed, 41 insertions(+), 3 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH v2 3/3] net: mscc: Implement promisc mode.
From: Horatiu Vultur @ 2019-08-26  8:11 UTC (permalink / raw)
  To: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, andrew, f.fainelli, netdev, linux-kernel, bridge
  Cc: Horatiu Vultur
In-Reply-To: <1566807075-775-1-git-send-email-horatiu.vultur@microchip.com>

Before when a port was added to a bridge then the port was added in
promisc mode. But because of the patches:
commit e6300374f3be6 ("net: Add NETIF_HW_BR_CAP feature")
commit 764866d46cc81 ("net: mscc: Use NETIF_F_HW_BR_CAP")

The port does not need to be in promisc mode to be part of the bridge.
So it is possible to toggle the promisc mode of the port whether or not
it is part of the bridge.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 drivers/net/ethernet/mscc/ocelot.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 7d7c94b..8a18eef 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -691,6 +691,25 @@ static void ocelot_set_rx_mode(struct net_device *dev)
 	__dev_mc_sync(dev, ocelot_mc_sync, ocelot_mc_unsync);
 }
 
+static void ocelot_change_rx_flags(struct net_device *dev, int flags)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+	u32 val;
+
+	if (!(flags & IFF_PROMISC))
+		return;
+
+	val = ocelot_read_gix(ocelot, ANA_PORT_CPU_FWD_CFG,
+			      port->chip_port);
+	if (dev->flags & IFF_PROMISC)
+		val |= ANA_PORT_CPU_FWD_CFG_CPU_SRC_COPY_ENA;
+	else
+		val &= ~(ANA_PORT_CPU_FWD_CFG_CPU_SRC_COPY_ENA);
+
+	ocelot_write_gix(ocelot, val, ANA_PORT_CPU_FWD_CFG, port->chip_port);
+}
+
 static int ocelot_port_get_phys_port_name(struct net_device *dev,
 					  char *buf, size_t len)
 {
@@ -1070,6 +1089,7 @@ static const struct net_device_ops ocelot_port_netdev_ops = {
 	.ndo_stop			= ocelot_port_stop,
 	.ndo_start_xmit			= ocelot_port_xmit,
 	.ndo_set_rx_mode		= ocelot_set_rx_mode,
+	.ndo_change_rx_flags		= ocelot_change_rx_flags,
 	.ndo_get_phys_port_name		= ocelot_port_get_phys_port_name,
 	.ndo_set_mac_address		= ocelot_port_set_mac_address,
 	.ndo_get_stats64		= ocelot_get_stats64,
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 2/3] net: mscc: Use NETIF_F_HW_BR_CAP
From: Horatiu Vultur @ 2019-08-26  8:11 UTC (permalink / raw)
  To: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, andrew, f.fainelli, netdev, linux-kernel, bridge
  Cc: Horatiu Vultur
In-Reply-To: <1566807075-775-1-git-send-email-horatiu.vultur@microchip.com>

Enable NETIF_F_HW_BR_CAP feature for ocelot.
Because the HW can learn and flood the frames, so there is no need for SW
bridge to do this.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 drivers/net/ethernet/mscc/ocelot.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 4d1bce4..7d7c94b 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -2017,8 +2017,10 @@ int ocelot_probe_port(struct ocelot *ocelot, u8 port,
 	dev->ethtool_ops = &ocelot_ethtool_ops;
 
 	dev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_RXFCS |
-		NETIF_F_HW_TC;
-	dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_TC;
+		NETIF_F_HW_TC | NETIF_F_HW_BR_CAP;
+	dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_TC |
+		NETIF_F_HW_BR_CAP;
+	dev->priv_flags |= IFF_UNICAST_FLT;
 
 	memcpy(dev->dev_addr, ocelot->base_mac, ETH_ALEN);
 	dev->dev_addr[ETH_ALEN - 1] += port;
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 1/3] net: Add NETIF_HW_BR_CAP feature
From: Horatiu Vultur @ 2019-08-26  8:11 UTC (permalink / raw)
  To: roopa, nikolay, davem, UNGLinuxDriver, alexandre.belloni,
	allan.nielsen, andrew, f.fainelli, netdev, linux-kernel, bridge
  Cc: Horatiu Vultur
In-Reply-To: <1566807075-775-1-git-send-email-horatiu.vultur@microchip.com>

This patch adds a netdev feature to allow the SW bridge not to set all the
slave interfaces in promisc mode if the HW is capable of learning and
flooding the frames.

The current implementation puts all the bridge ports in promisc mode, even
if the HW has bridge capabilities (can learn and flood frames). All the
frames will then be copied to the CPU, even in cases where there is no
need for this.

For example in the following scenario:
+----------------------------------+
|               SW BR              |
+----------------------------------+
|      |                  |
|      |        +------------------+
|      |        |       HW BR      |
|      |        +------------------+
|      |          |            |
+      +          +            +
p1     p2         p3           p4

Case A: There is a SW bridge with the ports p1 and p2
Case B: There is a SW bridge with the ports p3 and p4.
Case C: There is a SW bridge with the ports p2, p3 and p4.

For case A, the HW can't do learning and flooding of the frames. Therefore
all the frames need to be copied to the CPU to allow the SW bridge to
decide what to do with the frame (forward, flood, copy to the upper layers,
etc..).

For case B, there is HW support to learn and flood the frames. In this case
there is no point to send all the frames to the CPU(except for frames that
need to go to CPU and flooded frames if flooding is enabled). Because the
HW will already forward the frame to the correct network port, then the
SW bridge will not have anything to do. It would just use CPU cycles and
then drop the frame. The reason for dropping the frame is that the network
driver will set the flag fwd_offload_mark and then SW bridge will skip all
the ports that have the same parent_id as the port that received the frame,
which is the case here.

For case C, there is HW support to learn and flood frames for ports p3 and
p4 while p2 doesn't have HW support. In this case the port p2 needs to be
in promisc mode to allow SW bridge to do the learning and flooding of the
frames, while ports p3 and p4 don't need to be in promisc mode.
The ports p3 and p4 need to make sure that the CPU is in the flood mask and
need to know which addresses can be accessed through the SW bridge so that
those frames can be sent to the CPU port. This allows the SW bridge to send
them to the correct network port.

A workaround for all these cases is not to set the network port in
promisc mode if it is a bridge port, which is the case for mlxsw. Or not
to implement it at all, which is the case for ocelot. But the disadvantage
of this approach is that the network bridge ports can not be set in promisc
mode if there is a need to monitor all the traffic on that port using the
command 'ip link set dev swp promisc on'. This patch adds also support for
this case.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 include/linux/netdev_features.h |  6 ++++++
 net/bridge/br_if.c              | 11 ++++++++++-
 net/core/ethtool.c              |  1 +
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 4b19c54..b5a3463 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -78,6 +78,11 @@ enum {
 	NETIF_F_HW_TLS_TX_BIT,		/* Hardware TLS TX offload */
 	NETIF_F_HW_TLS_RX_BIT,		/* Hardware TLS RX offload */

+	NETIF_F_HW_BR_CAP_BIT,		/* Hardware is capable to behave as a
+					 * bridge to learn and switch frames
+					 */
+
+
 	NETIF_F_GRO_HW_BIT,		/* Hardware Generic receive offload */
 	NETIF_F_HW_TLS_RECORD_BIT,	/* Offload TLS record */

@@ -150,6 +155,7 @@ enum {
 #define NETIF_F_GSO_UDP_L4	__NETIF_F(GSO_UDP_L4)
 #define NETIF_F_HW_TLS_TX	__NETIF_F(HW_TLS_TX)
 #define NETIF_F_HW_TLS_RX	__NETIF_F(HW_TLS_RX)
+#define NETIF_F_HW_BR_CAP	__NETIF_F(HW_BR_CAP)

 /* Finds the next feature with the highest number of the range of start till 0.
  */
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 4fe30b1..93bfc55 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -161,7 +161,16 @@ void br_manage_promisc(struct net_bridge *br)
 			    (br->auto_cnt == 1 && br_auto_port(p)))
 				br_port_clear_promisc(p);
 			else
-				br_port_set_promisc(p);
+				/* If the HW has bridge capabilities to learn
+				 * and flood the frames then there is no need
+				 * to copy all the frames to the SW to do the
+				 * same. Because the HW already switched the
+				 * frame and then there is nothing to do for
+				 * the SW bridge. The SW will just use CPU
+				 * and it would drop the frame.
+				 */
+				if (!(p->dev->features & NETIF_F_HW_BR_CAP))
+					br_port_set_promisc(p);
 		}
 	}
 }
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6288e69..10430fe 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -111,6 +111,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_HW_TLS_RECORD_BIT] =	"tls-hw-record",
 	[NETIF_F_HW_TLS_TX_BIT] =	 "tls-hw-tx-offload",
 	[NETIF_F_HW_TLS_RX_BIT] =	 "tls-hw-rx-offload",
+	[NETIF_F_HW_BR_CAP_BIT] =	 "bridge-capabilities-offload",
 };

 static const char
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH] net: Adding parameter detection in __ethtool_get_link_ksettings.
From: Eric Dumazet @ 2019-08-26  8:16 UTC (permalink / raw)
  To: Dongxu Liu, davem; +Cc: netdev, linux-kernel
In-Reply-To: <20190826072332.14736-1-liudongxu3@huawei.com>



On 8/26/19 9:23 AM, Dongxu Liu wrote:
> The __ethtool_get_link_ksettings symbol will be exported,
> and external users may use an illegal address.
> We should check the parameters before using them,
> otherwise the system will crash.
> 
> [ 8980.991134] BUG: unable to handle kernel NULL pointer dereference at           (null)
> [ 8980.993049] IP: [<ffffffff8155aca7>] __ethtool_get_link_ksettings+0x27/0x140
> [ 8980.994285] PGD 0
> [ 8980.995013] Oops: 0000 [#1] SMP
> [ 8980.995896] Modules linked in: sch_ingress ...
> [ 8981.013220] CPU: 3 PID: 25174 Comm: kworker/3:3 Tainted: G           O   ----V-------   3.10.0-327.36.58.4.x86_64 #1
> [ 8981.017667] Workqueue: events linkwatch_event
> [ 8981.018652] task: ffff8800a8348000 ti: ffff8800b045c000 task.ti: ffff8800b045c000
> [ 8981.020418] RIP: 0010:[<ffffffff8155aca7>]  [<ffffffff8155aca7>] __ethtool_get_link_ksettings+0x27/0x140
> [ 8981.022383] RSP: 0018:ffff8800b045fc88  EFLAGS: 00010202
> [ 8981.023453] RAX: 0000000000000000 RBX: ffff8800b045fcac RCX: 0000000000000000
> [ 8981.024726] RDX: ffff8800b658f600 RSI: ffff8800b045fcac RDI: ffff8802296e0000
> [ 8981.026000] RBP: ffff8800b045fc98 R08: 0000000000000000 R09: 0000000000000001
> [ 8981.027273] R10: 00000000000073e0 R11: 0000082b0cc8adea R12: ffff8802296e0000
> [ 8981.028561] R13: ffff8800b566e8c0 R14: ffff8800b658f600 R15: ffff8800b566e000
> [ 8981.029841] FS:  0000000000000000(0000) GS:ffff88023ed80000(0000) knlGS:0000000000000000
> [ 8981.031715] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 8981.032845] CR2: 0000000000000000 CR3: 00000000b39a9000 CR4: 00000000003407e0
> [ 8981.034137] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [ 8981.035427] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [ 8981.036702] Stack:
> [ 8981.037406]  ffff8800b658f600 0000000000009c40 ffff8800b045fce8 ffffffffa047a71d
> [ 8981.039238]  000000000000004d ffff8800b045fcc8 ffff8800b045fd28 ffffffff815cb198
> [ 8981.041070]  ffff8800b045fcd8 ffffffff810807e6 00000000e8212951 0000000000000001
> [ 8981.042910] Call Trace:
> [ 8981.043660]  [<ffffffffa047a71d>] bond_update_speed_duplex+0x3d/0x90 [bonding]
> [ 8981.045424]  [<ffffffff815cb198>] ? inetdev_event+0x38/0x530
> [ 8981.046554]  [<ffffffff810807e6>] ? put_online_cpus+0x56/0x80
> [ 8981.047688]  [<ffffffffa0480d67>] bond_netdev_event+0x137/0x360 [bonding]
> ...
> 
> Signed-off-by: Dongxu Liu <liudongxu3@huawei.com>
> ---
>  net/core/ethtool.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/net/core/ethtool.c b/net/core/ethtool.c
> index 6288e69..9a50b64 100644
> --- a/net/core/ethtool.c
> +++ b/net/core/ethtool.c
> @@ -545,6 +545,8 @@ int __ethtool_get_link_ksettings(struct net_device *dev,
>  {
>  	ASSERT_RTNL();
>  
> +	if (!dev || !dev->ethtool_ops)
> +		return -EOPNOTSUPP;

I do not believe dev can possibly be NULL at this point.

>  	if (!dev->ethtool_ops->get_link_ksettings)
>  		return -EOPNOTSUPP;
>  
> 

I tried to find an appropriate Fixes: tag.

It seems this particular bug was added either by

Fixes: 9856909c2abb ("net: bonding: use __ethtool_get_ksettings")

or generically in :

Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API")


^ permalink raw reply

* [PATCH net] xfrm: add NETDEV_UNREGISTER event process for xfrmi
From: Xin Long @ 2019-08-26  8:25 UTC (permalink / raw)
  To: network dev; +Cc: davem, Steffen Klassert

When creating a xfrmi dev, it always holds the phydev. The xfrmi dev should
be deleted when the phydev is being unregistered, so that the phydev can be
put on time. Otherwise the phydev's deleting will get stuck:

  # ip link add dummy10 type dummy
  # ip link add xfrmi10 type xfrm dev dummy10
  # ip link del dummy10

The last command blocks and dmesg shows:

  unregister_netdevice: waiting for dummy10 to become free. Usage count = 1

This patch fixes it by adding NETDEV_UNREGISTER event process for xfrmi.

Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces")
Reported-by: Xiumei Mu <xmu@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/xfrm/xfrm_interface.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index 74868f9..f3de1f5 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -877,6 +877,35 @@ static const struct xfrm_if_cb xfrm_if_cb = {
 	.decode_session =	xfrmi_decode_session,
 };
 
+static int xfrmi_event(struct notifier_block *this, unsigned long e, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct xfrmi_net *xfrmn = net_generic(dev_net(dev), xfrmi_net_id);
+	struct xfrm_if *xi;
+	LIST_HEAD(list);
+
+	if (e != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+
+	rcu_read_lock();
+	for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) {
+		if (xi->phydev != dev)
+			continue;
+		unregister_netdevice_queue(xi->dev, &list);
+	}
+	rcu_read_unlock();
+
+	if (list_empty(&list))
+		return NOTIFY_DONE;
+
+	unregister_netdevice_many(&list);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block xfrmi_notifier = {
+	.notifier_call  = xfrmi_event,
+};
+
 static int __init xfrmi_init(void)
 {
 	const char *msg;
@@ -906,6 +935,7 @@ static int __init xfrmi_init(void)
 		goto rtnl_link_failed;
 
 	xfrm_if_register_cb(&xfrm_if_cb);
+	register_netdevice_notifier(&xfrmi_notifier);
 
 	return err;
 
@@ -922,6 +952,7 @@ static int __init xfrmi_init(void)
 
 static void __exit xfrmi_fini(void)
 {
+	unregister_netdevice_notifier(&xfrmi_notifier);
 	xfrm_if_unregister_cb();
 	rtnl_link_unregister(&xfrmi_link_ops);
 	xfrmi4_fini();
-- 
2.1.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox