* [PATCH net-next 8/9] net: Remove SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT
From: Florian Fainelli @ 2019-02-13 22:06 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, David S. Miller, Ido Schimmel, open list,
open list:STAGING SUBSYSTEM, moderated list:ETHERNET BRIDGE, jiri,
andrew, vivien.didelot
In-Reply-To: <20190213220638.1552-1-f.fainelli@gmail.com>
Now that we have converted the bridge code and the drivers to check for
bridge port(s) flags at the time we try to set them, there is no need
for a get() -> set() sequence anymore and
SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT therefore becomes unused.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
.../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 11 +----------
drivers/net/ethernet/rocker/rocker_main.c | 14 +-------------
drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 10 +---------
include/net/switchdev.h | 2 --
net/dsa/slave.c | 10 +---------
5 files changed, 4 insertions(+), 43 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 7616eab50035..c11cf7fa4863 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -434,16 +434,7 @@ static void mlxsw_sp_bridge_vlan_put(struct mlxsw_sp_bridge_vlan *bridge_vlan)
static int mlxsw_sp_port_attr_get(struct net_device *dev,
struct switchdev_attr *attr)
{
- switch (attr->id) {
- case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT:
- attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD |
- BR_MCAST_FLOOD;
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- return 0;
+ return -EOPNOTSUPP;
}
static int
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 863a8b32e6e9..8e80301eae7b 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2057,19 +2057,7 @@ static const struct net_device_ops rocker_port_netdev_ops = {
static int rocker_port_attr_get(struct net_device *dev,
struct switchdev_attr *attr)
{
- const struct rocker_port *rocker_port = netdev_priv(dev);
- int err = 0;
-
- switch (attr->id) {
- case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT:
- err = rocker_world_port_attr_bridge_flags_support_get(rocker_port,
- &attr->u.brport_flags_support);
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- return err;
+ return -EOPNOTSUPP;
}
static int rocker_port_attr_set(struct net_device *dev,
diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
index f788a9458b89..5f58c7df67bb 100644
--- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
+++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
@@ -643,15 +643,7 @@ static void ethsw_teardown_irqs(struct fsl_mc_device *sw_dev)
static int swdev_port_attr_get(struct net_device *netdev,
struct switchdev_attr *attr)
{
- switch (attr->id) {
- case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT:
- attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD;
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- return 0;
+ return -EOPNOTSUPP;
}
static int port_attr_stp_state_set(struct net_device *netdev,
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index de72b0a3867f..0f352019ef99 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -45,7 +45,6 @@ enum switchdev_attr_id {
SWITCHDEV_ATTR_ID_UNDEFINED,
SWITCHDEV_ATTR_ID_PORT_STP_STATE,
SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS,
- SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT,
SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS,
SWITCHDEV_ATTR_ID_PORT_MROUTER,
SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
@@ -63,7 +62,6 @@ struct switchdev_attr {
union {
u8 stp_state; /* PORT_STP_STATE */
unsigned long brport_flags; /* PORT_{PRE}_BRIDGE_FLAGS */
- unsigned long brport_flags_support; /* PORT_BRIDGE_FLAGS_SUPPORT */
bool mrouter; /* PORT_MROUTER */
clock_t ageing_time; /* BRIDGE_AGEING_TIME */
bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 417388c9f1fa..a176d3ba3b7a 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -388,15 +388,7 @@ static int dsa_slave_get_port_parent_id(struct net_device *dev,
static int dsa_slave_port_attr_get(struct net_device *dev,
struct switchdev_attr *attr)
{
- switch (attr->id) {
- case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT:
- attr->u.brport_flags_support = 0;
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- return 0;
+ return -EOPNOTSUPP;
}
static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev,
--
2.17.1
^ permalink raw reply related
* [PATCH net-next 9/9] net: Get rid of switchdev_port_attr_get()
From: Florian Fainelli @ 2019-02-13 22:06 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, David S. Miller, Ido Schimmel, open list,
open list:STAGING SUBSYSTEM, moderated list:ETHERNET BRIDGE, jiri,
andrew, vivien.didelot
In-Reply-To: <20190213220638.1552-1-f.fainelli@gmail.com>
With the bridge no longer calling switchdev_port_attr_get() to obtain
the supported bridge port flags from a driver but instead trying to set
the bridge port flags directly and relying on the driver to reject
unsupported configurations, we can effectively get rid of
switchdev_port_attr_get() entirely since this was the only place where
it was called.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
Documentation/networking/switchdev.txt | 5 ++---
drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 7 -------
drivers/net/ethernet/rocker/rocker_main.c | 7 -------
drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 7 -------
include/net/switchdev.h | 8 --------
net/dsa/slave.c | 7 -------
6 files changed, 2 insertions(+), 39 deletions(-)
diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt
index ea90243340a9..327afe754230 100644
--- a/Documentation/networking/switchdev.txt
+++ b/Documentation/networking/switchdev.txt
@@ -233,9 +233,8 @@ the bridge's FDB. It's possible, but not optimal, to enable learning on the
device port and on the bridge port, and disable learning_sync.
To support learning and learning_sync port attributes, the driver implements
-switchdev op switchdev_port_attr_get/set for
-SWITCHDEV_ATTR_PORT_ID_BRIDGE_FLAGS. The driver should initialize the attributes
-to the hardware defaults.
+switchdev op switchdev_port_attr_set for SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS.
+The driver should initialize the attributes to the hardware defaults.
FDB Ageing
^^^^^^^^^^
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index c11cf7fa4863..5263abe6224e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -431,12 +431,6 @@ static void mlxsw_sp_bridge_vlan_put(struct mlxsw_sp_bridge_vlan *bridge_vlan)
mlxsw_sp_bridge_vlan_destroy(bridge_vlan);
}
-static int mlxsw_sp_port_attr_get(struct net_device *dev,
- struct switchdev_attr *attr)
-{
- return -EOPNOTSUPP;
-}
-
static int
mlxsw_sp_port_bridge_vlan_stp_set(struct mlxsw_sp_port *mlxsw_sp_port,
struct mlxsw_sp_bridge_vlan *bridge_vlan,
@@ -1937,7 +1931,6 @@ static struct mlxsw_sp_port *mlxsw_sp_lag_rep_port(struct mlxsw_sp *mlxsw_sp,
}
static const struct switchdev_ops mlxsw_sp_port_switchdev_ops = {
- .switchdev_port_attr_get = mlxsw_sp_port_attr_get,
.switchdev_port_attr_set = mlxsw_sp_port_attr_set,
};
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 8e80301eae7b..5d06c7edf696 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2054,12 +2054,6 @@ static const struct net_device_ops rocker_port_netdev_ops = {
* swdev interface
********************/
-static int rocker_port_attr_get(struct net_device *dev,
- struct switchdev_attr *attr)
-{
- return -EOPNOTSUPP;
-}
-
static int rocker_port_attr_set(struct net_device *dev,
const struct switchdev_attr *attr,
struct switchdev_trans *trans)
@@ -2135,7 +2129,6 @@ static int rocker_port_obj_del(struct net_device *dev,
}
static const struct switchdev_ops rocker_port_switchdev_ops = {
- .switchdev_port_attr_get = rocker_port_attr_get,
.switchdev_port_attr_set = rocker_port_attr_set,
};
diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
index 5f58c7df67bb..005c12c08df4 100644
--- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
+++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
@@ -640,12 +640,6 @@ static void ethsw_teardown_irqs(struct fsl_mc_device *sw_dev)
fsl_mc_free_irqs(sw_dev);
}
-static int swdev_port_attr_get(struct net_device *netdev,
- struct switchdev_attr *attr)
-{
- return -EOPNOTSUPP;
-}
-
static int port_attr_stp_state_set(struct net_device *netdev,
struct switchdev_trans *trans,
u8 state)
@@ -924,7 +918,6 @@ static int swdev_port_obj_del(struct net_device *netdev,
}
static const struct switchdev_ops ethsw_port_switchdev_ops = {
- .switchdev_port_attr_get = swdev_port_attr_get,
.switchdev_port_attr_set = swdev_port_attr_set,
};
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 0f352019ef99..45310ddf2d7e 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -179,8 +179,6 @@ switchdev_notifier_info_to_extack(const struct switchdev_notifier_info *info)
#ifdef CONFIG_NET_SWITCHDEV
void switchdev_deferred_process(void);
-int switchdev_port_attr_get(struct net_device *dev,
- struct switchdev_attr *attr);
int switchdev_port_attr_set(struct net_device *dev,
const struct switchdev_attr *attr);
int switchdev_port_obj_add(struct net_device *dev,
@@ -225,12 +223,6 @@ static inline void switchdev_deferred_process(void)
{
}
-static inline int switchdev_port_attr_get(struct net_device *dev,
- struct switchdev_attr *attr)
-{
- return -EOPNOTSUPP;
-}
-
static inline int switchdev_port_attr_set(struct net_device *dev,
const struct switchdev_attr *attr)
{
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a176d3ba3b7a..1258a0b7a158 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -385,12 +385,6 @@ static int dsa_slave_get_port_parent_id(struct net_device *dev,
return 0;
}
-static int dsa_slave_port_attr_get(struct net_device *dev,
- struct switchdev_attr *attr)
-{
- return -EOPNOTSUPP;
-}
-
static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev,
struct sk_buff *skb)
{
@@ -1057,7 +1051,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
};
static const struct switchdev_ops dsa_slave_switchdev_ops = {
- .switchdev_port_attr_get = dsa_slave_port_attr_get,
.switchdev_port_attr_set = dsa_slave_port_attr_set,
};
--
2.17.1
^ permalink raw reply related
* Re: [PATCH iproute2] iplink: document XDP subcommand to force the XDP mode.
From: Daniel Borkmann @ 2019-02-13 22:07 UTC (permalink / raw)
To: Matteo Croce, netdev; +Cc: David Ahern, Stephen Hemminger, Jakub Kicinski
In-Reply-To: <20190213144030.15160-1-mcroce@redhat.com>
On 02/13/2019 03:40 PM, Matteo Croce wrote:
> When attaching an eBPF program to a device, ip link can force the XDP mode
> by using the xdp{generic,drv,offload} keyword instead of just 'xdp'.
> Document this behaviour also in the help output.
>
> Signed-off-by: Matteo Croce <mcroce@redhat.com>
> Fixes: 14683814 ("bpf: add xdpdrv for requesting XDP driver mode")
> Fixes: 1b5e8094 ("bpf: allow requesting XDP HW offload")
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
^ permalink raw reply
* Re: [PATCH iproute2 net-next v2 3/4] ss: Buffer raw fields first, then render them as a table
From: Stefano Brivio @ 2019-02-13 22:20 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Eric Dumazet, netdev, Sabrina Dubroca, David Ahern
In-Reply-To: <20190213135534.01dacee5@shemminger-XPS-13-9360>
On Wed, 13 Feb 2019 13:55:34 -0800
Stephen Hemminger <stephen@networkplumber.org> wrote:
> What I would favor:
> * use big enough columns that for the common case everything lines up fine
> * if column is too wide just print that element wider (which is what print %Ns does)
This is very close to what was done before, but as soon as you mix,
say, UNIX sockets with TCP sockets, "big enough" columns typically make
output for TCP sockets unreadable.
With buffering, instead, I can decide that a line split is needed, and
keep fields aligned no matter what.
> and
> * add json output for programs that want to parse
> * use print_uint etc for that
Sure, I think we all agree with this, but it's not going to be quick to
implement (even though it's perhaps a bit easier with abstracted columns
and buffering). Eric reported a problem and I'm trying to fix it
quickly.
> The buffering patch (in iproute2-next) can/will be reverted.
I think it received generally good feedback (also from users, later on)
and this is the first report of a serious issue -- it's also an issue
which looks easy to fix (I'm half way through that by now).
By the way, this patch was merged in iproute2 more than one year ago
(December 2017, by you).
--
Stefano
^ permalink raw reply
* Re: [PATCH] mm: page_alloc: fix ref bias in page_frag_alloc() for 1-byte allocs
From: Alexander Duyck @ 2019-02-13 22:42 UTC (permalink / raw)
To: Jann Horn
Cc: linux-mm, Andrew Morton, LKML, Michal Hocko, Vlastimil Babka,
Pavel Tatashin, Oscar Salvador, Mel Gorman, Aaron Lu, Netdev,
Alexander Duyck
In-Reply-To: <20190213204157.12570-1-jannh@google.com>
On Wed, Feb 13, 2019 at 12:42 PM Jann Horn <jannh@google.com> wrote:
>
> The basic idea behind ->pagecnt_bias is: If we pre-allocate the maximum
> number of references that we might need to create in the fastpath later,
> the bump-allocation fastpath only has to modify the non-atomic bias value
> that tracks the number of extra references we hold instead of the atomic
> refcount. The maximum number of allocations we can serve (under the
> assumption that no allocation is made with size 0) is nc->size, so that's
> the bias used.
>
> However, even when all memory in the allocation has been given away, a
> reference to the page is still held; and in the `offset < 0` slowpath, the
> page may be reused if everyone else has dropped their references.
> This means that the necessary number of references is actually
> `nc->size+1`.
>
> Luckily, from a quick grep, it looks like the only path that can call
> page_frag_alloc(fragsz=1) is TAP with the IFF_NAPI_FRAGS flag, which
> requires CAP_NET_ADMIN in the init namespace and is only intended to be
> used for kernel testing and fuzzing.
Actually that has me somewhat concerned. I wouldn't be surprised if
most drivers expect the netdev_alloc_frags call to at least output an
SKB_DATA_ALIGN sized value.
We probably should update __netdev_alloc_frag and __napi_alloc_frag so
that they will pass fragsz through SKB_DATA_ALIGN.
> To test for this issue, put a `WARN_ON(page_ref_count(page) == 0)` in the
> `offset < 0` path, below the virt_to_page() call, and then repeatedly call
> writev() on a TAP device with IFF_TAP|IFF_NO_PI|IFF_NAPI_FRAGS|IFF_NAPI,
> with a vector consisting of 15 elements containing 1 byte each.
>
> Cc: stable@vger.kernel.org
> Signed-off-by: Jann Horn <jannh@google.com>
> ---
> mm/page_alloc.c | 8 ++++----
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 35fdde041f5c..46285d28e43b 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4675,11 +4675,11 @@ void *page_frag_alloc(struct page_frag_cache *nc,
> /* Even if we own the page, we do not use atomic_set().
> * This would break get_page_unless_zero() users.
> */
> - page_ref_add(page, size - 1);
> + page_ref_add(page, size);
>
> /* reset page count bias and offset to start of new frag */
> nc->pfmemalloc = page_is_pfmemalloc(page);
> - nc->pagecnt_bias = size;
> + nc->pagecnt_bias = size + 1;
> nc->offset = size;
> }
>
> @@ -4695,10 +4695,10 @@ void *page_frag_alloc(struct page_frag_cache *nc,
> size = nc->size;
> #endif
> /* OK, page count is 0, we can safely set it */
> - set_page_count(page, size);
> + set_page_count(page, size + 1);
>
> /* reset page count bias and offset to start of new frag */
> - nc->pagecnt_bias = size;
> + nc->pagecnt_bias = size + 1;
> offset = size - fragsz;
> }
If we already have to add a constant it might be better to just use
PAGE_FRAG_CACHE_MAX_SIZE + 1 in all these spots where you are having
to use "size + 1" instead of "size". That way we can avoid having to
add a constant to a register value and then program that value.
Instead, we can just assign the constant value right from the start.
^ permalink raw reply
* [PATCH V2 0/7] Add FOLL_LONGTERM to GUP fast and use it
From: ira.weiny @ 2019-02-13 23:04 UTC (permalink / raw)
To: linux-mips, linux-kernel, kvm-ppc, linuxppc-dev, linux-s390,
linux-sh, sparclinux, kvm, linux-fpga, dri-devel, linux-rdma,
linux-media, linux-scsi, devel, virtualization, netdev,
linux-fbdev, xen-devel, devel, linux-mm, ceph-devel, rds-devel
Cc: Ira Weiny, John Hubbard, David Hildenbrand, Cornelia Huck,
Yoshinori Sato, Rich Felker, David S. Miller, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Joerg Roedel, Wu Hao, Alan Tull,
Moritz Fischer, David Airlie, Daniel Vetter, Jason Gunthorpe,
Dennis Dalessandro, Christian Benvenuti, Mauro Carvalho Chehab,
Matt Porter, Alexandre Bounine, Kai Mäkisara,
James E.J. Bottomley, Martin K. Petersen, Rob Springer,
Todd Poynor, Ben Chan, Jens Wiklander, Alex Williamson,
Michael S. Tsirkin, Jason Wang, Bartlomiej Zolnierkiewicz,
Stefano Stabellini, Martin Brandenburg, Peter Zijlstra,
Alexander Viro, Andrew Morton, Michal Hocko, Kirill A. Shutemov
In-Reply-To: <20190211201643.7599-1-ira.weiny@intel.com>
From: Ira Weiny <ira.weiny@intel.com>
NOTE: This series depends on my clean up patch to remove the write parameter
from gup_fast_permitted()[1]
HFI1, qib, and mthca use get_user_pages_fast() due to its performance
advantages. These pages can be held for a significant time. But
get_user_pages_fast() does not protect against mapping of FS DAX pages.
Introduce FOLL_LONGTERM and use this flag in get_user_pages_fast() which
retains the performance while also adding the FS DAX checks. XDP has also
shown interest in using this functionality.[2]
In addition we change get_user_pages() to use the new FOLL_LONGTERM flag and
remove the specialized get_user_pages_longterm call.
[1] https://lkml.org/lkml/2019/2/11/237
[2] https://lkml.org/lkml/2019/2/11/1789
Ira Weiny (7):
mm/gup: Replace get_user_pages_longterm() with FOLL_LONGTERM
mm/gup: Change write parameter to flags in fast walk
mm/gup: Change GUP fast to use flags rather than a write 'bool'
mm/gup: Add FOLL_LONGTERM capability to GUP fast
IB/hfi1: Use the new FOLL_LONGTERM flag to get_user_pages_fast()
IB/qib: Use the new FOLL_LONGTERM flag to get_user_pages_fast()
IB/mthca: Use the new FOLL_LONGTERM flag to get_user_pages_fast()
arch/mips/mm/gup.c | 11 +-
arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 +-
arch/powerpc/kvm/e500_mmu.c | 2 +-
arch/powerpc/mm/mmu_context_iommu.c | 4 +-
arch/s390/kvm/interrupt.c | 2 +-
arch/s390/mm/gup.c | 12 +-
arch/sh/mm/gup.c | 11 +-
arch/sparc/mm/gup.c | 9 +-
arch/x86/kvm/paging_tmpl.h | 2 +-
arch/x86/kvm/svm.c | 2 +-
drivers/fpga/dfl-afu-dma-region.c | 2 +-
drivers/gpu/drm/via/via_dmablit.c | 3 +-
drivers/infiniband/core/umem.c | 5 +-
drivers/infiniband/hw/hfi1/user_pages.c | 5 +-
drivers/infiniband/hw/mthca/mthca_memfree.c | 3 +-
drivers/infiniband/hw/qib/qib_user_pages.c | 8 +-
drivers/infiniband/hw/qib/qib_user_sdma.c | 2 +-
drivers/infiniband/hw/usnic/usnic_uiom.c | 9 +-
drivers/media/v4l2-core/videobuf-dma-sg.c | 6 +-
drivers/misc/genwqe/card_utils.c | 2 +-
drivers/misc/vmw_vmci/vmci_host.c | 2 +-
drivers/misc/vmw_vmci/vmci_queue_pair.c | 6 +-
drivers/platform/goldfish/goldfish_pipe.c | 3 +-
drivers/rapidio/devices/rio_mport_cdev.c | 4 +-
drivers/sbus/char/oradax.c | 2 +-
drivers/scsi/st.c | 3 +-
drivers/staging/gasket/gasket_page_table.c | 4 +-
drivers/tee/tee_shm.c | 2 +-
drivers/vfio/vfio_iommu_spapr_tce.c | 3 +-
drivers/vfio/vfio_iommu_type1.c | 3 +-
drivers/vhost/vhost.c | 2 +-
drivers/video/fbdev/pvr2fb.c | 2 +-
drivers/virt/fsl_hypervisor.c | 2 +-
drivers/xen/gntdev.c | 2 +-
fs/orangefs/orangefs-bufmap.c | 2 +-
include/linux/mm.h | 17 +-
kernel/futex.c | 2 +-
lib/iov_iter.c | 7 +-
mm/gup.c | 220 ++++++++++++--------
mm/gup_benchmark.c | 5 +-
mm/util.c | 8 +-
net/ceph/pagevec.c | 2 +-
net/rds/info.c | 2 +-
net/rds/rdma.c | 3 +-
44 files changed, 232 insertions(+), 180 deletions(-)
--
2.20.1
^ permalink raw reply
* [PATCH V2 2/7] mm/gup: Change write parameter to flags in fast walk
From: ira.weiny @ 2019-02-13 23:04 UTC (permalink / raw)
To: linux-mips, linux-kernel, kvm-ppc, linuxppc-dev, linux-s390,
linux-sh, sparclinux, kvm, linux-fpga, dri-devel, linux-rdma,
linux-media, linux-scsi, devel, virtualization, netdev,
linux-fbdev, xen-devel, devel, linux-mm, ceph-devel, rds-devel
Cc: Ira Weiny, John Hubbard, David Hildenbrand, Cornelia Huck,
Yoshinori Sato, Rich Felker, David S. Miller, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Joerg Roedel, Wu Hao, Alan Tull,
Moritz Fischer, David Airlie, Daniel Vetter, Jason Gunthorpe,
Dennis Dalessandro, Christian Benvenuti, Mauro Carvalho Chehab,
Matt Porter, Alexandre Bounine, Kai Mäkisara,
James E.J. Bottomley, Martin K. Petersen, Rob Springer,
Todd Poynor, Ben Chan, Jens Wiklander, Alex Williamson,
Michael S. Tsirkin, Jason Wang, Bartlomiej Zolnierkiewicz,
Stefano Stabellini, Martin Brandenburg, Peter Zijlstra,
Alexander Viro, Andrew Morton, Michal Hocko, Kirill A. Shutemov
In-Reply-To: <20190213230455.5605-1-ira.weiny@intel.com>
From: Ira Weiny <ira.weiny@intel.com>
In order to support more options in the GUP fast walk, change
the write parameter to flags throughout the call stack.
This patch does not change functionality and passes FOLL_WRITE
where write was previously used.
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
mm/gup.c | 52 ++++++++++++++++++++++++++--------------------------
1 file changed, 26 insertions(+), 26 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c
index ee96eaff118c..681388236106 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1417,7 +1417,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -1435,7 +1435,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
if (pte_protnone(pte))
goto pte_unmap;
- if (!pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
if (pte_devmap(pte)) {
@@ -1487,7 +1487,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
return 0;
}
@@ -1570,12 +1570,12 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
#endif
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (!pmd_access_permitted(orig, write))
+ if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
if (pmd_devmap(orig))
@@ -1608,12 +1608,12 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (!pud_access_permitted(orig, write))
+ if (!pud_access_permitted(orig, flags & FOLL_WRITE))
return 0;
if (pud_devmap(orig))
@@ -1646,13 +1646,13 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
}
static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, int write,
+ unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
int refs;
struct page *head, *page;
- if (!pgd_access_permitted(orig, write))
+ if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
@@ -1683,7 +1683,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
}
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
@@ -1705,7 +1705,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (pmd_protnone(pmd))
return 0;
- if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+ if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
pages, nr))
return 0;
@@ -1715,9 +1715,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
* pmd format and THP pmd format
*/
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
- PMD_SHIFT, next, write, pages, nr))
+ PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
@@ -1725,7 +1725,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
}
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
@@ -1738,14 +1738,14 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
if (pud_none(pud))
return 0;
if (unlikely(pud_huge(pud))) {
- if (!gup_huge_pud(pud, pudp, addr, next, write,
+ if (!gup_huge_pud(pud, pudp, addr, next, flags,
pages, nr))
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
- PUD_SHIFT, next, write, pages, nr))
+ PUD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
@@ -1753,7 +1753,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
}
static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;
@@ -1768,9 +1768,9 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
BUILD_BUG_ON(p4d_huge(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, write, pages, nr))
+ P4D_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
+ } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
@@ -1778,7 +1778,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
}
static void gup_pgd_range(unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pgd_t *pgdp;
@@ -1791,14 +1791,14 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
if (pgd_none(pgd))
return;
if (unlikely(pgd_huge(pgd))) {
- if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
pages, nr))
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, write, pages, nr))
+ PGDIR_SHIFT, next, flags, pages, nr))
return;
- } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
+ } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -1852,7 +1852,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages)) {
local_irq_save(flags);
- gup_pgd_range(start, end, write, pages, &nr);
+ gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
local_irq_restore(flags);
}
@@ -1894,7 +1894,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages)) {
local_irq_disable();
- gup_pgd_range(addr, end, write, pages, &nr);
+ gup_pgd_range(addr, end, write ? FOLL_WRITE : 0, pages, &nr);
local_irq_enable();
ret = nr;
}
--
2.20.1
^ permalink raw reply related
* [PATCH V2 1/7] mm/gup: Replace get_user_pages_longterm() with FOLL_LONGTERM
From: ira.weiny @ 2019-02-13 23:04 UTC (permalink / raw)
To: linux-mips, linux-kernel, kvm-ppc, linuxppc-dev, linux-s390,
linux-sh, sparclinux, kvm, linux-fpga, dri-devel, linux-rdma,
linux-media, linux-scsi, devel, virtualization, netdev,
linux-fbdev, xen-devel, devel, linux-mm, ceph-devel, rds-devel
Cc: Ira Weiny, John Hubbard, David Hildenbrand, Cornelia Huck,
Yoshinori Sato, Rich Felker, David S. Miller, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Joerg Roedel, Wu Hao, Alan Tull,
Moritz Fischer, David Airlie, Daniel Vetter, Jason Gunthorpe,
Dennis Dalessandro, Christian Benvenuti, Mauro Carvalho Chehab,
Matt Porter, Alexandre Bounine, Kai Mäkisara,
James E.J. Bottomley, Martin K. Petersen, Rob Springer,
Todd Poynor, Ben Chan, Jens Wiklander, Alex Williamson,
Michael S. Tsirkin, Jason Wang, Bartlomiej Zolnierkiewicz,
Stefano Stabellini, Martin Brandenburg, Peter Zijlstra,
Alexander Viro, Andrew Morton, Michal Hocko, Kirill A. Shutemov
In-Reply-To: <20190213230455.5605-1-ira.weiny@intel.com>
From: Ira Weiny <ira.weiny@intel.com>
Rather than have a separate get_user_pages_longterm() call,
introduce FOLL_LONGTERM and change the longterm callers to use
it.
This patch does not change any functionality.
FOLL_LONGTERM can only be supported with get_user_pages() as it
requires vmas to determine if DAX is in use.
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
drivers/infiniband/core/umem.c | 5 +-
drivers/infiniband/hw/qib/qib_user_pages.c | 8 +-
drivers/infiniband/hw/usnic/usnic_uiom.c | 9 +-
drivers/media/v4l2-core/videobuf-dma-sg.c | 6 +-
drivers/vfio/vfio_iommu_type1.c | 3 +-
include/linux/mm.h | 13 +-
mm/gup.c | 138 ++++++++++++---------
mm/gup_benchmark.c | 5 +-
8 files changed, 101 insertions(+), 86 deletions(-)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index b69d3efa8712..120a40df91b4 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -185,10 +185,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
while (npages) {
down_read(&mm->mmap_sem);
- ret = get_user_pages_longterm(cur_base,
+ ret = get_user_pages(cur_base,
min_t(unsigned long, npages,
PAGE_SIZE / sizeof (struct page *)),
- gup_flags, page_list, vma_list);
+ gup_flags | FOLL_LONGTERM,
+ page_list, vma_list);
if (ret < 0) {
up_read(&mm->mmap_sem);
goto umem_release;
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index ef8bcf366ddc..1b9368261035 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages,
down_read(&current->mm->mmap_sem);
for (got = 0; got < num_pages; got += ret) {
- ret = get_user_pages_longterm(start_page + got * PAGE_SIZE,
- num_pages - got,
- FOLL_WRITE | FOLL_FORCE,
- p + got, NULL);
+ ret = get_user_pages(start_page + got * PAGE_SIZE,
+ num_pages - got,
+ FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE,
+ p + got, NULL);
if (ret < 0) {
up_read(&current->mm->mmap_sem);
goto bail_release;
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 06862a6af185..1d9a182ac163 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
ret = 0;
while (npages) {
- ret = get_user_pages_longterm(cur_base,
- min_t(unsigned long, npages,
- PAGE_SIZE / sizeof(struct page *)),
- gup_flags, page_list, NULL);
+ ret = get_user_pages(cur_base,
+ min_t(unsigned long, npages,
+ PAGE_SIZE / sizeof(struct page *)),
+ gup_flags | FOLL_LONGTERM,
+ page_list, NULL);
if (ret < 0)
goto out;
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 08929c087e27..870a2a526e0b 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
data, size, dma->nr_pages);
- err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages,
- flags, dma->pages, NULL);
+ err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
+ flags | FOLL_LONGTERM, dma->pages, NULL);
if (err != dma->nr_pages) {
dma->nr_pages = (err >= 0) ? err : 0;
- dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err,
+ dprintk(1, "get_user_pages: err=%d [%d]\n", err,
dma->nr_pages);
return err < 0 ? err : -EINVAL;
}
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 73652e21efec..1500bd0bb6da 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -351,7 +351,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
down_read(&mm->mmap_sem);
if (mm == current->mm) {
- ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas);
+ ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page,
+ vmas);
} else {
ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
vmas, NULL);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bb6408fe73..05a105d9d4c3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1536,18 +1536,6 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, int *locked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
-#ifdef CONFIG_FS_DAX
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas);
-#else
-static inline long get_user_pages_longterm(unsigned long start,
- unsigned long nr_pages, unsigned int gup_flags,
- struct page **pages, struct vm_area_struct **vmas)
-{
- return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
-}
-#endif /* CONFIG_FS_DAX */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages);
@@ -2615,6 +2603,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
+#define FOLL_LONGTERM 0x10000 /* mapping is intended for a long term pin */
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
diff --git a/mm/gup.c b/mm/gup.c
index b63e88eca31b..ee96eaff118c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1109,87 +1109,109 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL(get_user_pages_remote);
-/*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter. We also obviously don't pass
- * FOLL_REMOTE in here.
- */
-long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
-{
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
- pages, vmas, NULL,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages);
-
#ifdef CONFIG_FS_DAX
/*
- * This is the same as get_user_pages() in that it assumes we are
- * operating on the current task's mm, but it goes further to validate
- * that the vmas associated with the address range are suitable for
- * longterm elevated page reference counts. For example, filesystem-dax
- * mappings are subject to the lifetime enforced by the filesystem and
- * we need guarantees that longterm users like RDMA and V4L2 only
- * establish mappings that have a kernel enforced revocation mechanism.
+ * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
+ * allows us to process the FOLL_LONGTERM flag if present.
+ *
+ * __gup_longterm_locked() validates that the vmas associated with the address
+ * range are suitable for longterm elevated page reference counts. For example,
+ * filesystem-dax mappings are subject to the lifetime enforced by the
+ * filesystem and we need guarantees that longterm users like RDMA and V4L2
+ * only establish mappings that have a kernel enforced revocation mechanism.
*
* "longterm" == userspace controlled elevated page count lifetime.
* Contrast this to iov_iter_get_pages() usages which are transient.
*/
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas_arg)
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int flags)
{
- struct vm_area_struct **vmas = vmas_arg;
+ struct vm_area_struct **vmas_tmp = vmas;
struct vm_area_struct *vma_prev = NULL;
long rc, i;
- if (!pages)
- return -EINVAL;
-
- if (!vmas) {
- vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!vmas)
- return -ENOMEM;
+ if (flags & FOLL_LONGTERM) {
+ if (!pages)
+ return -EINVAL;
+
+ if (!vmas_tmp) {
+ vmas_tmp = kcalloc(nr_pages,
+ sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas_tmp)
+ return -ENOMEM;
+ }
}
- rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+ rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+ vmas_tmp, NULL, flags);
- for (i = 0; i < rc; i++) {
- struct vm_area_struct *vma = vmas[i];
+ if (flags & FOLL_LONGTERM) {
+ for (i = 0; i < rc; i++) {
+ struct vm_area_struct *vma = vmas_tmp[i];
- if (vma == vma_prev)
- continue;
+ if (vma == vma_prev)
+ continue;
- vma_prev = vma;
+ vma_prev = vma;
- if (vma_is_fsdax(vma))
- break;
- }
+ if (vma_is_fsdax(vma))
+ break;
+ }
- /*
- * Either get_user_pages() failed, or the vma validation
- * succeeded, in either case we don't need to put_page() before
- * returning.
- */
- if (i >= rc)
- goto out;
+ /*
+ * Either get_user_pages() failed, or the vma validation
+ * succeeded, in either case we don't need to put_page() before
+ * returning.
+ */
+ if (i >= rc)
+ goto out;
- for (i = 0; i < rc; i++)
- put_page(pages[i]);
- rc = -EOPNOTSUPP;
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
out:
- if (vmas != vmas_arg)
- kfree(vmas);
+ if (vmas_tmp != vmas)
+ kfree(vmas_tmp);
+ }
+
return rc;
}
-EXPORT_SYMBOL(get_user_pages_longterm);
+#else /* !CONFIG_FS_DAX */
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int flags)
+{
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ NULL, flags);
+}
#endif /* CONFIG_FS_DAX */
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter. We also obviously don't pass
+ * FOLL_REMOTE in here.
+ */
+long get_user_pages(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ pages, vmas, gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages);
+
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 5b42d3d4b60a..c898e2e0d1e4 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
pages + i);
break;
case GUP_LONGTERM_BENCHMARK:
- nr = get_user_pages_longterm(addr, nr, gup->flags & 1,
- pages + i, NULL);
+ nr = get_user_pages(addr, nr,
+ (gup->flags & 1) | FOLL_LONGTERM,
+ pages + i, NULL);
break;
case GUP_BENCHMARK:
nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
--
2.20.1
^ permalink raw reply related
* [PATCH V2 5/7] IB/hfi1: Use the new FOLL_LONGTERM flag to get_user_pages_fast()
From: ira.weiny @ 2019-02-13 23:04 UTC (permalink / raw)
To: linux-mips, linux-kernel, kvm-ppc, linuxppc-dev, linux-s390,
linux-sh, sparclinux, kvm, linux-fpga, dri-devel, linux-rdma,
linux-media, linux-scsi, devel, virtualization, netdev,
linux-fbdev, xen-devel, devel, linux-mm, ceph-devel, rds-devel
Cc: Ira Weiny, John Hubbard, David Hildenbrand, Cornelia Huck,
Yoshinori Sato, Rich Felker, David S. Miller, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Joerg Roedel, Wu Hao, Alan Tull,
Moritz Fischer, David Airlie, Daniel Vetter, Jason Gunthorpe,
Dennis Dalessandro, Christian Benvenuti, Mauro Carvalho Chehab,
Matt Porter, Alexandre Bounine, Kai Mäkisara,
James E.J. Bottomley, Martin K. Petersen, Rob Springer,
Todd Poynor, Ben Chan, Jens Wiklander, Alex Williamson,
Michael S. Tsirkin, Jason Wang, Bartlomiej Zolnierkiewicz,
Stefano Stabellini, Martin Brandenburg, Peter Zijlstra,
Alexander Viro, Andrew Morton, Michal Hocko, Kirill A. Shutemov
In-Reply-To: <20190213230455.5605-1-ira.weiny@intel.com>
From: Ira Weiny <ira.weiny@intel.com>
Use the new FOLL_LONGTERM to get_user_pages_fast() to protect against
FS DAX pages being mapped.
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
drivers/infiniband/hw/hfi1/user_pages.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index 78ccacaf97d0..6a7f9cd5a94e 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -104,9 +104,11 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np
bool writable, struct page **pages)
{
int ret;
+ unsigned int gup_flags = writable ? FOLL_WRITE : 0;
- ret = get_user_pages_fast(vaddr, npages, writable ? FOLL_WRITE : 0,
- pages);
+ gup_flags |= FOLL_LONGTERM;
+
+ ret = get_user_pages_fast(vaddr, npages, gup_flags, pages);
if (ret < 0)
return ret;
--
2.20.1
^ permalink raw reply related
* [PATCH V2 6/7] IB/qib: Use the new FOLL_LONGTERM flag to get_user_pages_fast()
From: ira.weiny @ 2019-02-13 23:04 UTC (permalink / raw)
To: linux-mips, linux-kernel, kvm-ppc, linuxppc-dev, linux-s390,
linux-sh, sparclinux, kvm, linux-fpga, dri-devel, linux-rdma,
linux-media, linux-scsi, devel, virtualization, netdev,
linux-fbdev, xen-devel, devel, linux-mm, ceph-devel, rds-devel
Cc: Ira Weiny, John Hubbard, David Hildenbrand, Cornelia Huck,
Yoshinori Sato, Rich Felker, David S. Miller, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Joerg Roedel, Wu Hao, Alan Tull,
Moritz Fischer, David Airlie, Daniel Vetter, Jason Gunthorpe,
Dennis Dalessandro, Christian Benvenuti, Mauro Carvalho Chehab,
Matt Porter, Alexandre Bounine, Kai Mäkisara,
James E.J. Bottomley, Martin K. Petersen, Rob Springer,
Todd Poynor, Ben Chan, Jens Wiklander, Alex Williamson,
Michael S. Tsirkin, Jason Wang, Bartlomiej Zolnierkiewicz,
Stefano Stabellini, Martin Brandenburg, Peter Zijlstra,
Alexander Viro, Andrew Morton, Michal Hocko, Kirill A. Shutemov
In-Reply-To: <20190213230455.5605-1-ira.weiny@intel.com>
From: Ira Weiny <ira.weiny@intel.com>
Use the new FOLL_LONGTERM to get_user_pages_fast() to protect against
FS DAX pages being mapped.
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
drivers/infiniband/hw/qib/qib_user_sdma.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
index 31c523b2a9f5..b53cc0240e02 100644
--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -673,7 +673,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
else
j = npages;
- ret = get_user_pages_fast(addr, j, 0, pages);
+ ret = get_user_pages_fast(addr, j, FOLL_LONGTERM, pages);
if (ret != j) {
i = 0;
j = ret;
--
2.20.1
^ permalink raw reply related
* [PATCH V2 7/7] IB/mthca: Use the new FOLL_LONGTERM flag to get_user_pages_fast()
From: ira.weiny @ 2019-02-13 23:04 UTC (permalink / raw)
To: linux-mips, linux-kernel, kvm-ppc, linuxppc-dev, linux-s390,
linux-sh, sparclinux, kvm, linux-fpga, dri-devel, linux-rdma,
linux-media, linux-scsi, devel, virtualization, netdev,
linux-fbdev, xen-devel, devel, linux-mm, ceph-devel, rds-devel
Cc: Ira Weiny, John Hubbard, David Hildenbrand, Cornelia Huck,
Yoshinori Sato, Rich Felker, David S. Miller, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Joerg Roedel, Wu Hao, Alan Tull,
Moritz Fischer, David Airlie, Daniel Vetter, Jason Gunthorpe,
Dennis Dalessandro, Christian Benvenuti, Mauro Carvalho Chehab,
Matt Porter, Alexandre Bounine, Kai Mäkisara,
James E.J. Bottomley, Martin K. Petersen, Rob Springer,
Todd Poynor, Ben Chan, Jens Wiklander, Alex Williamson,
Michael S. Tsirkin, Jason Wang, Bartlomiej Zolnierkiewicz,
Stefano Stabellini, Martin Brandenburg, Peter Zijlstra,
Alexander Viro, Andrew Morton, Michal Hocko, Kirill A. Shutemov
In-Reply-To: <20190213230455.5605-1-ira.weiny@intel.com>
From: Ira Weiny <ira.weiny@intel.com>
Use the new FOLL_LONGTERM to get_user_pages_fast() to protect against
FS DAX pages being mapped.
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
drivers/infiniband/hw/mthca/mthca_memfree.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index 112d2f38e0de..8ff0e90d7564 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -472,7 +472,8 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
goto out;
}
- ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages);
+ ret = get_user_pages_fast(uaddr & PAGE_MASK, 1,
+ FOLL_WRITE | FOLL_LONGTERM, pages);
if (ret < 0)
goto out;
--
2.20.1
^ permalink raw reply related
* [PATCH V2 3/7] mm/gup: Change GUP fast to use flags rather than a write 'bool'
From: ira.weiny @ 2019-02-13 23:04 UTC (permalink / raw)
To: linux-mips, linux-kernel, kvm-ppc, linuxppc-dev, linux-s390,
linux-sh, sparclinux, kvm, linux-fpga, dri-devel, linux-rdma,
linux-media, linux-scsi, devel, virtualization, netdev,
linux-fbdev, xen-devel, devel, linux-mm, ceph-devel, rds-devel
Cc: Ira Weiny, John Hubbard, David Hildenbrand, Cornelia Huck,
Yoshinori Sato, Rich Felker, David S. Miller, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Joerg Roedel, Wu Hao, Alan Tull,
Moritz Fischer, David Airlie, Daniel Vetter, Jason Gunthorpe,
Dennis Dalessandro, Christian Benvenuti, Mauro Carvalho Chehab,
Matt Porter, Alexandre Bounine, Kai Mäkisara,
James E.J. Bottomley, Martin K. Petersen, Rob Springer,
Todd Poynor, Ben Chan, Jens Wiklander, Alex Williamson,
Michael S. Tsirkin, Jason Wang, Bartlomiej Zolnierkiewicz,
Stefano Stabellini, Martin Brandenburg, Peter Zijlstra,
Alexander Viro, Andrew Morton, Michal Hocko, Kirill A. Shutemov
In-Reply-To: <20190213230455.5605-1-ira.weiny@intel.com>
From: Ira Weiny <ira.weiny@intel.com>
To facilitate additional options to get_user_pages_fast() change the
singular write parameter to be gup_flags.
This patch does not change any functionality. New functionality will
follow in subsequent patches.
Some of the get_user_pages_fast() call sites were unchanged because they
already passed FOLL_WRITE or 0 for the write parameter.
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
arch/mips/mm/gup.c | 11 ++++++-----
arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++--
arch/powerpc/kvm/e500_mmu.c | 2 +-
arch/powerpc/mm/mmu_context_iommu.c | 4 ++--
arch/s390/kvm/interrupt.c | 2 +-
arch/s390/mm/gup.c | 12 ++++++------
arch/sh/mm/gup.c | 11 ++++++-----
arch/sparc/mm/gup.c | 9 +++++----
arch/x86/kvm/paging_tmpl.h | 2 +-
arch/x86/kvm/svm.c | 2 +-
drivers/fpga/dfl-afu-dma-region.c | 2 +-
drivers/gpu/drm/via/via_dmablit.c | 3 ++-
drivers/infiniband/hw/hfi1/user_pages.c | 3 ++-
drivers/misc/genwqe/card_utils.c | 2 +-
drivers/misc/vmw_vmci/vmci_host.c | 2 +-
drivers/misc/vmw_vmci/vmci_queue_pair.c | 6 ++++--
drivers/platform/goldfish/goldfish_pipe.c | 3 ++-
drivers/rapidio/devices/rio_mport_cdev.c | 4 +++-
drivers/sbus/char/oradax.c | 2 +-
drivers/scsi/st.c | 3 ++-
drivers/staging/gasket/gasket_page_table.c | 4 ++--
drivers/tee/tee_shm.c | 2 +-
drivers/vfio/vfio_iommu_spapr_tce.c | 3 ++-
drivers/vhost/vhost.c | 2 +-
drivers/video/fbdev/pvr2fb.c | 2 +-
drivers/virt/fsl_hypervisor.c | 2 +-
drivers/xen/gntdev.c | 2 +-
fs/orangefs/orangefs-bufmap.c | 2 +-
include/linux/mm.h | 4 ++--
kernel/futex.c | 2 +-
lib/iov_iter.c | 7 +++++--
mm/gup.c | 10 +++++-----
mm/util.c | 8 ++++----
net/ceph/pagevec.c | 2 +-
net/rds/info.c | 2 +-
net/rds/rdma.c | 3 ++-
36 files changed, 81 insertions(+), 65 deletions(-)
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 0d14e0d8eacf..4c2b4483683c 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -235,7 +235,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -247,8 +247,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
@@ -273,7 +273,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+ pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
@@ -289,7 +290,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
pages += nr;
ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT,
- pages, write ? FOLL_WRITE : 0);
+ pages, gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index bd2dcfbf00cd..8fcb0a921e46 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -582,7 +582,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* If writing != 0, then the HPTE must allow writing, if we get here */
write_ok = writing;
hva = gfn_to_hva_memslot(memslot, gfn);
- npages = get_user_pages_fast(hva, 1, writing, pages);
+ npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages);
if (npages < 1) {
/* Check if it's an I/O mapping */
down_read(&current->mm->mmap_sem);
@@ -1175,7 +1175,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
goto err;
hva = gfn_to_hva_memslot(memslot, gfn);
- npages = get_user_pages_fast(hva, 1, 1, pages);
+ npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages);
if (npages < 1)
goto err;
page = pages[0];
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 24296f4cadc6..e0af53fd78c5 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -783,7 +783,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
if (!pages)
return -ENOMEM;
- ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
+ ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages);
if (ret < 0)
goto free_pages;
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index a712a650a8b6..acb0990c8364 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -190,7 +190,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
for (i = 0; i < entries; ++i) {
cur_ua = ua + (i << PAGE_SHIFT);
if (1 != get_user_pages_fast(cur_ua,
- 1/* pages */, 1/* iswrite */, &page)) {
+ 1/* pages */, FOLL_WRITE, &page)) {
ret = -EFAULT;
for (j = 0; j < i; ++j)
put_page(pfn_to_page(mem->hpas[j] >>
@@ -209,7 +209,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
if (mm_iommu_move_page_from_cma(page))
goto populate;
if (1 != get_user_pages_fast(cur_ua,
- 1/* pages */, 1/* iswrite */,
+ 1/* pages */, FOLL_WRITE,
&page)) {
ret = -EFAULT;
for (j = 0; j < i; ++j)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index fcb55b02990e..69d9366b966c 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2278,7 +2278,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
ret = -EFAULT;
goto out;
}
- ret = get_user_pages_fast(map->addr, 1, 1, &map->page);
+ ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page);
if (ret < 0)
goto out;
BUG_ON(ret != 1);
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 2809d11c7a28..0a6faf3d9960 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -265,7 +265,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -277,22 +277,22 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
int nr, ret;
might_sleep();
start &= PAGE_MASK;
- nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ nr = __get_user_pages_fast(start, nr_pages, gup_flags & FOLL_WRITE,
+ pages);
if (nr == nr_pages)
return nr;
/* Try to get the remaining pages with get_user_pages */
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
- write ? FOLL_WRITE : 0);
+ ret = get_user_pages_unlocked(start, nr_pages - nr, pages, gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0)
ret = (ret < 0) ? nr : ret + nr;
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c
index 3e27f6d1f1ec..277c882f7489 100644
--- a/arch/sh/mm/gup.c
+++ b/arch/sh/mm/gup.c
@@ -204,7 +204,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -216,8 +216,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
@@ -241,7 +241,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+ pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
@@ -261,7 +262,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT, pages,
- write ? FOLL_WRITE : 0);
+ gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index aee6dba83d0e..1e770a517d4a 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -245,8 +245,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
return nr;
}
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
@@ -303,7 +303,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+ pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
@@ -324,7 +325,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT, pages,
- write ? FOLL_WRITE : 0);
+ gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bdca39829bc..08715034e315 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -140,7 +140,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t *table;
struct page *page;
- npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
+ npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
/* Check if the user is doing something meaningless. */
if (unlikely(npages != 1))
return -EFAULT;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f13a3a24d360..173596a020cb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1803,7 +1803,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
return NULL;
/* Pin the user virtual address. */
- npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
goto err;
diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
index e18a786fc943..c438722bf4e1 100644
--- a/drivers/fpga/dfl-afu-dma-region.c
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -102,7 +102,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
goto unlock_vm;
}
- pinned = get_user_pages_fast(region->user_addr, npages, 1,
+ pinned = get_user_pages_fast(region->user_addr, npages, FOLL_WRITE,
region->pages);
if (pinned < 0) {
ret = pinned;
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index 345bda4494e1..0c8b09602910 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -239,7 +239,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer)
if (NULL == vsg->pages)
return -ENOMEM;
ret = get_user_pages_fast((unsigned long)xfer->mem_addr,
- vsg->num_pages, vsg->direction == DMA_FROM_DEVICE,
+ vsg->num_pages,
+ vsg->direction == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
vsg->pages);
if (ret != vsg->num_pages) {
if (ret < 0)
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index 24b592c6522e..78ccacaf97d0 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -105,7 +105,8 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np
{
int ret;
- ret = get_user_pages_fast(vaddr, npages, writable, pages);
+ ret = get_user_pages_fast(vaddr, npages, writable ? FOLL_WRITE : 0,
+ pages);
if (ret < 0)
return ret;
diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c
index 25265fd0fd6e..89cff9d1012b 100644
--- a/drivers/misc/genwqe/card_utils.c
+++ b/drivers/misc/genwqe/card_utils.c
@@ -603,7 +603,7 @@ int genwqe_user_vmap(struct genwqe_dev *cd, struct dma_mapping *m, void *uaddr,
/* pin user pages in memory */
rc = get_user_pages_fast(data & PAGE_MASK, /* page aligned addr */
m->nr_pages,
- m->write, /* readable/writable */
+ m->write ? FOLL_WRITE : 0, /* readable/writable */
m->page_list); /* ptrs to pages */
if (rc < 0)
goto fail_get_user_pages;
diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c
index 997f92543dd4..422d08da3244 100644
--- a/drivers/misc/vmw_vmci/vmci_host.c
+++ b/drivers/misc/vmw_vmci/vmci_host.c
@@ -242,7 +242,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context,
/*
* Lock physical page backing a given user VA.
*/
- retval = get_user_pages_fast(uva, 1, 1, &context->notify_page);
+ retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page);
if (retval != 1) {
context->notify_page = NULL;
return VMCI_ERROR_GENERIC;
diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index 264f4ed8eef2..c5396ee32e51 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -666,7 +666,8 @@ static int qp_host_get_user_memory(u64 produce_uva,
int err = VMCI_SUCCESS;
retval = get_user_pages_fast((uintptr_t) produce_uva,
- produce_q->kernel_if->num_pages, 1,
+ produce_q->kernel_if->num_pages,
+ FOLL_WRITE,
produce_q->kernel_if->u.h.header_page);
if (retval < (int)produce_q->kernel_if->num_pages) {
pr_debug("get_user_pages_fast(produce) failed (retval=%d)",
@@ -678,7 +679,8 @@ static int qp_host_get_user_memory(u64 produce_uva,
}
retval = get_user_pages_fast((uintptr_t) consume_uva,
- consume_q->kernel_if->num_pages, 1,
+ consume_q->kernel_if->num_pages,
+ FOLL_WRITE,
consume_q->kernel_if->u.h.header_page);
if (retval < (int)consume_q->kernel_if->num_pages) {
pr_debug("get_user_pages_fast(consume) failed (retval=%d)",
diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index 321bc673c417..cef0133aa47a 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -274,7 +274,8 @@ static int pin_user_pages(unsigned long first_page,
*iter_last_page_size = last_page_size;
}
- ret = get_user_pages_fast(first_page, requested_pages, !is_write,
+ ret = get_user_pages_fast(first_page, requested_pages,
+ !is_write ? FOLL_WRITE : 0,
pages);
if (ret <= 0)
return -EFAULT;
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index cbe467ff1aba..f681b3e9e970 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -868,7 +868,9 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
pinned = get_user_pages_fast(
(unsigned long)xfer->loc_addr & PAGE_MASK,
- nr_pages, dir == DMA_FROM_DEVICE, page_list);
+ nr_pages,
+ dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
+ page_list);
if (pinned != nr_pages) {
if (pinned < 0) {
diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c
index 6516bc3cb58b..790aa148670d 100644
--- a/drivers/sbus/char/oradax.c
+++ b/drivers/sbus/char/oradax.c
@@ -437,7 +437,7 @@ static int dax_lock_page(void *va, struct page **p)
dax_dbg("uva %p", va);
- ret = get_user_pages_fast((unsigned long)va, 1, 1, p);
+ ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p);
if (ret == 1) {
dax_dbg("locked page %p, for VA %p", *p, va);
return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 7ff22d3f03e3..871b25914c07 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4918,7 +4918,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp,
/* Try to fault in all of the necessary pages */
/* rw==READ means read from drive, write into memory area */
- res = get_user_pages_fast(uaddr, nr_pages, rw == READ, pages);
+ res = get_user_pages_fast(uaddr, nr_pages, rw == READ ? FOLL_WRITE : 0,
+ pages);
/* Errors and no page mapped should return here */
if (res < nr_pages)
diff --git a/drivers/staging/gasket/gasket_page_table.c b/drivers/staging/gasket/gasket_page_table.c
index 26755d9ca41d..f67fdf1d3817 100644
--- a/drivers/staging/gasket/gasket_page_table.c
+++ b/drivers/staging/gasket/gasket_page_table.c
@@ -486,8 +486,8 @@ static int gasket_perform_mapping(struct gasket_page_table *pg_tbl,
ptes[i].dma_addr = pg_tbl->coherent_pages[0].paddr +
off + i * PAGE_SIZE;
} else {
- ret = get_user_pages_fast(page_addr - offset, 1, 1,
- &page);
+ ret = get_user_pages_fast(page_addr - offset, 1,
+ FOLL_WRITE, &page);
if (ret <= 0) {
dev_err(pg_tbl->device,
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 0b9ab1d0dd45..49fd7312e2aa 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -273,7 +273,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
goto err;
}
- rc = get_user_pages_fast(start, num_pages, 1, shm->pages);
+ rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages);
if (rc > 0)
shm->num_pages = rc;
if (rc != num_pages) {
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index c424913324e3..a4b10bb4086b 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -532,7 +532,8 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
enum dma_data_direction direction = iommu_tce_direction(tce);
if (get_user_pages_fast(tce & PAGE_MASK, 1,
- direction != DMA_TO_DEVICE, &page) != 1)
+ direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
+ &page) != 1)
return -EFAULT;
*hpa = __pa((unsigned long) page_address(page));
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 24a129fcdd61..72685b1659ff 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1700,7 +1700,7 @@ static int set_bit_to_user(int nr, void __user *addr)
int bit = nr + (log % PAGE_SIZE) * 8;
int r;
- r = get_user_pages_fast(log, 1, 1, &page);
+ r = get_user_pages_fast(log, 1, FOLL_WRITE, &page);
if (r < 0)
return r;
BUG_ON(r != 1);
diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c
index 8a53d1de611d..41390c8e0f67 100644
--- a/drivers/video/fbdev/pvr2fb.c
+++ b/drivers/video/fbdev/pvr2fb.c
@@ -686,7 +686,7 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf,
if (!pages)
return -ENOMEM;
- ret = get_user_pages_fast((unsigned long)buf, nr_pages, true, pages);
+ ret = get_user_pages_fast((unsigned long)buf, nr_pages, FOLL_WRITE, pages);
if (ret < nr_pages) {
nr_pages = ret;
ret = -EINVAL;
diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c
index 8ba726e600e9..6446bcab4185 100644
--- a/drivers/virt/fsl_hypervisor.c
+++ b/drivers/virt/fsl_hypervisor.c
@@ -244,7 +244,7 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
/* Get the physical addresses of the source buffer */
num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset,
- num_pages, param.source != -1, pages);
+ num_pages, param.source != -1 ? FOLL_WRITE : 0, pages);
if (num_pinned != num_pages) {
/* get_user_pages() failed */
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 5efc5eee9544..7b47f1e6aab4 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -852,7 +852,7 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt,
unsigned long xen_pfn;
int ret;
- ret = get_user_pages_fast(addr, 1, writeable, &page);
+ ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page);
if (ret < 0)
return ret;
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 443bcd8c3c19..5a7c4fda682f 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -269,7 +269,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
/* map the pages */
ret = get_user_pages_fast((unsigned long)user_desc->ptr,
- bufmap->page_count, 1, bufmap->page_array);
+ bufmap->page_count, FOLL_WRITE, bufmap->page_array);
if (ret < 0)
return ret;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 05a105d9d4c3..8e1f3cd7482a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1537,8 +1537,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages);
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages);
/* Container for pinned pfns / pages */
struct frame_vector {
diff --git a/kernel/futex.c b/kernel/futex.c
index fdd312da0992..e10209946f8b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -546,7 +546,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
- err = get_user_pages_fast(address, 1, 1, &page);
+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
* and get read-only access.
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index be4bd627caf0..6dbae0692719 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1280,7 +1280,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
len = maxpages * PAGE_SIZE;
addr &= ~(PAGE_SIZE - 1);
n = DIV_ROUND_UP(len, PAGE_SIZE);
- res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages);
+ res = get_user_pages_fast(addr, n,
+ iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
+ pages);
if (unlikely(res < 0))
return res;
return (res == n ? len : res * PAGE_SIZE) - *start;
@@ -1361,7 +1363,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
p = get_pages_array(n);
if (!p)
return -ENOMEM;
- res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p);
+ res = get_user_pages_fast(addr, n,
+ iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
if (unlikely(res < 0)) {
kvfree(p);
return res;
diff --git a/mm/gup.c b/mm/gup.c
index 681388236106..6f32d36b3c5b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1863,7 +1863,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -1875,8 +1875,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
unsigned long addr, len, end;
int nr = 0, ret = 0;
@@ -1894,7 +1894,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages)) {
local_irq_disable();
- gup_pgd_range(addr, end, write ? FOLL_WRITE : 0, pages, &nr);
+ gup_pgd_range(addr, end, gup_flags, pages, &nr);
local_irq_enable();
ret = nr;
}
@@ -1905,7 +1905,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
pages += nr;
ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
- write ? FOLL_WRITE : 0);
+ gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/mm/util.c b/mm/util.c
index 1ea055138043..01ffe145c62b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -306,7 +306,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -327,10 +327,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* get_user_pages_fast simply falls back to get_user_pages.
*/
int __weak get_user_pages_fast(unsigned long start,
- int nr_pages, int write, struct page **pages)
+ int nr_pages, unsigned int gup_flags,
+ struct page **pages)
{
- return get_user_pages_unlocked(start, nr_pages, pages,
- write ? FOLL_WRITE : 0);
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index d3736f5bffec..74cafc0142ea 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -27,7 +27,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
while (got < num_pages) {
rc = get_user_pages_fast(
(unsigned long)data + ((unsigned long)got * PAGE_SIZE),
- num_pages - got, write_page, pages + got);
+ num_pages - got, write_page ? FOLL_WRITE : 0, pages + got);
if (rc < 0)
break;
BUG_ON(rc == 0);
diff --git a/net/rds/info.c b/net/rds/info.c
index e367a97a18c8..03f6fd56d237 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -193,7 +193,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
ret = -ENOMEM;
goto out;
}
- ret = get_user_pages_fast(start, nr_pages, 1, pages);
+ ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
if (ret != nr_pages) {
if (ret > 0)
nr_pages = ret;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 182ab8430594..b340ed4fc43a 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -158,7 +158,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
{
int ret;
- ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
+ ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0,
+ pages);
if (ret >= 0 && ret < nr_pages) {
while (ret--)
--
2.20.1
^ permalink raw reply related
* [PATCH V2 4/7] mm/gup: Add FOLL_LONGTERM capability to GUP fast
From: ira.weiny @ 2019-02-13 23:04 UTC (permalink / raw)
To: linux-mips, linux-kernel, kvm-ppc, linuxppc-dev, linux-s390,
linux-sh, sparclinux, kvm, linux-fpga, dri-devel, linux-rdma,
linux-media, linux-scsi, devel, virtualization, netdev,
linux-fbdev, xen-devel, devel, linux-mm, ceph-devel, rds-devel
Cc: Ira Weiny, John Hubbard, David Hildenbrand, Cornelia Huck,
Yoshinori Sato, Rich Felker, David S. Miller, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Joerg Roedel, Wu Hao, Alan Tull,
Moritz Fischer, David Airlie, Daniel Vetter, Jason Gunthorpe,
Dennis Dalessandro, Christian Benvenuti, Mauro Carvalho Chehab,
Matt Porter, Alexandre Bounine, Kai Mäkisara,
James E.J. Bottomley, Martin K. Petersen, Rob Springer,
Todd Poynor, Ben Chan, Jens Wiklander, Alex Williamson,
Michael S. Tsirkin, Jason Wang, Bartlomiej Zolnierkiewicz,
Stefano Stabellini, Martin Brandenburg, Peter Zijlstra,
Alexander Viro, Andrew Morton, Michal Hocko, Kirill A. Shutemov
In-Reply-To: <20190213230455.5605-1-ira.weiny@intel.com>
From: Ira Weiny <ira.weiny@intel.com>
DAX pages were previously unprotected from longterm pins when users
called get_user_pages_fast().
Use the new FOLL_LONGTERM flag to check for DEVMAP pages and fall
back to regular GUP processing if a DEVMAP page is encountered.
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
mm/gup.c | 24 +++++++++++++++++++++---
1 file changed, 21 insertions(+), 3 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c
index 6f32d36b3c5b..f7e759c523bb 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1439,6 +1439,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
if (pte_devmap(pte)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ goto pte_unmap;
+
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
@@ -1578,8 +1581,11 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
- if (pmd_devmap(orig))
+ if (pmd_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
+ }
refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1904,8 +1910,20 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
- gup_flags);
+ if (gup_flags & FOLL_LONGTERM) {
+ down_read(&current->mm->mmap_sem);
+ ret = __gup_longterm_locked(current, current->mm,
+ start, nr_pages - nr,
+ pages, NULL, gup_flags);
+ up_read(&current->mm->mmap_sem);
+ } else {
+ /*
+ * retain FAULT_FLAG_ALLOW_RETRY optimization if
+ * possible
+ */
+ ret = get_user_pages_unlocked(start, nr_pages - nr,
+ pages, gup_flags);
+ }
/* Have to be a bit careful with return values */
if (nr > 0) {
--
2.20.1
^ permalink raw reply related
* Re: [ISSUE][4.20.6] mlx5 and checksum failures
From: Saeed Mahameed @ 2019-02-13 23:36 UTC (permalink / raw)
To: ian.kumlien@gmail.com, saeedm@dev.mellanox.co.il
Cc: davem@davemloft.net, netdev@vger.kernel.org,
xiyou.wangcong@gmail.com
In-Reply-To: <CAA85sZtvnyJsa+jYFQsW+AhNfiZCk09YU7VcSRjztbe5ogtpkA@mail.gmail.com>
On Wed, 2019-02-13 at 13:04 +0100, Ian Kumlien wrote:
> One last update on this, 4.20.8 compiled with the same compiler works
> - I still suspect that it was fixed by:
> net/mlx5e: Force CHECKSUM_UNNECESSARY for short ethernet frames
>
> Anyway, we can forget about it now ;)
cool, nice to know.
Thanks for the update.
^ permalink raw reply
* Re: [PATCH iproute2 net-next v2 3/4] ss: Buffer raw fields first, then render them as a table
From: Phil Sutter @ 2019-02-13 23:39 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Stefano Brivio, Eric Dumazet, netdev, Sabrina Dubroca,
David Ahern
In-Reply-To: <20190213135534.01dacee5@shemminger-XPS-13-9360>
Hi Stephen,
On Wed, Feb 13, 2019 at 01:55:34PM -0800, Stephen Hemminger wrote:
> On Wed, 13 Feb 2019 22:17:16 +0100
> Stefano Brivio <sbrivio@redhat.com> wrote:
>
> > On Wed, 13 Feb 2019 09:31:03 -0800
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > > On 02/13/2019 12:37 AM, Stefano Brivio wrote:
> > > > On Tue, 12 Feb 2019 16:42:04 -0800
> > > > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > >
> > > >> I do not get it.
> > > >>
> > > >> "ss -emoi " uses almost 1KB per socket.
> > > >>
> > > >> 10,000,000 sockets -> we need about 10GB of memory ???
> > > >>
> > > >> This is a serious regression.
> > > >
> > > > I guess this is rather subjective: the worst case I considered back then
> > > > was the output of 'ss -tei0' (less than 500 bytes) for one million
> > > > sockets, which gives 500M of memory, which should in turn be fine on a
> > > > machine handling one million sockets.
> > > >
> > > > Now, if 'ss -emoi' on 10 million sockets is an actual use case (out of
> > > > curiosity: how are you going to process that output? Would JSON help?),
> > > > I see two easy options to solve this:
> > >
> > >
> > > ss -temoi | parser (written in shell or awk or whatever...)
> > >
> > > This is a use case, I just got bitten because using ss command
> > > actually OOM my container, while trying to debug a busy GFE.
> > >
> > > The host itself can have 10,000,000 TCP sockets, but usually sysadmin shells
> > > run in a container with no more than 500 MB available.
> > >
> > > Otherwise, it would be too easy for a buggy program to OOM the whole machine
> > > and have angry customers.
> > >
> > > >
> > > > 1. flush the output every time we reach a given buffer size (1M
> > > > perhaps). This might make the resulting blocks slightly unaligned,
> > > > with occasional loss of readability on lines occurring every 1k to
> > > > 10k sockets approximately, even though after 1k sockets column sizes
> > > > won't change much (it looks anyway better than the original), and I
> > > > don't expect anybody to actually scroll that output
> > > >
> > > > 2. add a switch for unbuffered output, but then you need to remember to
> > > > pass it manually, and the whole output would be as bad as the
> > > > original in case you need the switch.
> > > >
> > > > I'd rather go with 1., it's easy to implement (we already have partial
> > > > flushing with '--events') and it looks like a good compromise on
> > > > usability. Thoughts?
> > > >
> > >
> > > 1 seems fine, but a switch for 'please do not try to format' would be fine.
> > >
> > > I wonder why we try to 'format' when stdout is a pipe or a regular file .
> >
> > On a second thought: what about | less, or | grep [ports],
> > or > readable.log? I guess those might also be rather common use cases,
> > what do you think?
> >
> > I'm tempted to skip this for the moment and just go with option 1.
> >
>
> What I would favor:
> * use big enough columns that for the common case everything lines up fine
> * if column is too wide just print that element wider (which is what print %Ns does)
This is pretty much the situation Stefano attempted to improve, minus
scaling the columns to max terminal width. ss output formatting being
quirky and unreadable with either small or large terminals was the
number one reason I heard so far why people prefer netstat.
> and
> * add json output for programs that want to parse
> * use print_uint etc for that
For Eric's use-case, skipping any buffering and tabular output if stdout
is not a TTY suffices. In fact, iproute2 does this already for colored
output (see check_enable_color() for reference).
Adding JSON output support everywhere is a nice feature when it comes to
scripting, but it won't help console users. Unless you expect CLI
frontends to come turning that JSON into human-readable output.
IMHO, JSON output wouldn't even help in this case - unless Eric indeed
prefers to write/use a JSON parser for his analysis instead of something
along 'ss | grep'.
> The buffering patch (in iproute2-next) can/will be reverted.
It's not fair to claim that despite Stefano's commitment to fix the
reported issues. His ss output rewrite is there since v4.15.0 and
according to git history it needed only two fixes so far. I've had
one-liners which required more follow-ups than that! Also, we're still
discovering issues introduced by all the jsonify patches. Allowing for
people to get things right not the first time but after a few tries is
important. If you want to revert something, start with features which
have a fundamental design issue in the exact situation they tried to
improve, like the MSG_PEEK | MSG_TRUNC thing Hangbin and I wrote.
Thanks, Phil
^ permalink raw reply
* [pull request][net 0/4] Mellanox, mlx5 fixes 2019-02-13
From: Saeed Mahameed @ 2019-02-13 23:44 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, Saeed Mahameed
Hi Dave,
This series introduces some fixes to mlx5 driver.
For more information please see tag log below.
Please pull and let me know if there is any problem.
For -stable v4.19:
('net/mlx5e: XDP, fix redirect resources availability check')
Thanks,
Saeed.
---
The following changes since commit 2fdeee2549231b1f989f011bb18191f5660d3745:
team: avoid complex list operations in team_nl_cmd_options_set() (2019-02-12 14:19:27 -0500)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5-fixes-2019-02-13
for you to fetch changes up to 407e17b1a69a51ba9a512a04342da56c1f931df4:
net/mlx5e: XDP, fix redirect resources availability check (2019-02-13 15:40:51 -0800)
----------------------------------------------------------------
mlx5-fixes-2019-02-13
----------------------------------------------------------------
Huy Nguyen (1):
net/mlx5: No command allowed when command interface is not ready
Maria Pasechnik (1):
net/mlx5e: Fix NULL pointer derefernce in set channels error flow
Saeed Mahameed (1):
net/mlx5e: XDP, fix redirect resources availability check
Tariq Toukan (1):
net/mlx5: Fix a compilation warning in events.c
drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 18 ++++++++++++++++++
drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 +
drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 6 ++----
drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h | 17 +++++++++++++++++
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 7 ++++---
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 ++
drivers/net/ethernet/mellanox/mlx5/core/events.c | 17 +++++++++--------
drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 +
9 files changed, 55 insertions(+), 16 deletions(-)
^ permalink raw reply
* [net 1/4] net/mlx5e: Fix NULL pointer derefernce in set channels error flow
From: Saeed Mahameed @ 2019-02-13 23:44 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, Maria Pasechnik, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20190213234451.27029-1-saeedm@mellanox.com>
From: Maria Pasechnik <mariap@mellanox.com>
New channels are applied to the priv channels only after they
are successfully opened. Then, the indirection table should be built
according to the new number of channels.
Currently, such build is performed independently of whether the
channels opening is successful, and is not reverted on failure.
The bug was introduced by the removal of the rss params from the channels
struct and their move to the priv struct. That change made the channels and
the rss params independent of each other.
This causes a crash on a later point, when accessing rqn of a non
existing channel.
This patch fixes it by moving the indirection table build right before
switching the priv channels to new channels struct, after the new set of
channels was successfully opened.
Fixes: bbeb53b8b2c9 ("net/mlx5e: Move RSS params to a dedicated struct")
Signed-off-by: Maria Pasechnik <mariap@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 3bbccead2f63..47233b9a4f81 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -354,9 +354,6 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv,
new_channels.params = priv->channels.params;
new_channels.params.num_channels = count;
- if (!netif_is_rxfh_configured(priv->netdev))
- mlx5e_build_default_indir_rqt(priv->rss_params.indirection_rqt,
- MLX5E_INDIR_RQT_SIZE, count);
if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
priv->channels.params = new_channels.params;
@@ -372,6 +369,10 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv,
if (arfs_enabled)
mlx5e_arfs_disable(priv);
+ if (!netif_is_rxfh_configured(priv->netdev))
+ mlx5e_build_default_indir_rqt(priv->rss_params.indirection_rqt,
+ MLX5E_INDIR_RQT_SIZE, count);
+
/* Switch to new channels, set new parameters and close old ones */
mlx5e_switch_priv_channels(priv, &new_channels, NULL);
--
2.20.1
^ permalink raw reply related
* [net 2/4] net/mlx5: No command allowed when command interface is not ready
From: Saeed Mahameed @ 2019-02-13 23:44 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, Huy Nguyen, Daniel Jurgens, Saeed Mahameed
In-Reply-To: <20190213234451.27029-1-saeedm@mellanox.com>
From: Huy Nguyen <huyn@mellanox.com>
When EEH is injected and PCI bus stalls, mlx5's pci error detect
function is called to deactivate the command interface and tear down
the device. The issue is that there can be a thread that has already
passed the MLX5_DEVICE_STATE_INTERNAL_ERROR check; it will send the command
and get stuck in the wait_func.
Solution:
Add function mlx5_cmd_flush to disable command interface and clear all
the pending commands. When device state is set to
MLX5_DEVICE_STATE_INTERNAL_ERROR, call mlx5_cmd_flush to ensure all
pending threads waiting for firmware commands completion are terminated.
Fixes: c1d4d2e92ad6 ("net/mlx5: Avoid calling sleeping function by the health poll thread")
Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 18 ++++++++++++++++++
.../net/ethernet/mellanox/mlx5/core/health.c | 2 +-
.../ethernet/mellanox/mlx5/core/mlx5_core.h | 1 +
3 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 3e0fa8a8077b..e267ff93e8a8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -1583,6 +1583,24 @@ void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev)
spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
}
+void mlx5_cmd_flush(struct mlx5_core_dev *dev)
+{
+ struct mlx5_cmd *cmd = &dev->cmd;
+ int i;
+
+ for (i = 0; i < cmd->max_reg_cmds; i++)
+ while (down_trylock(&cmd->sem))
+ mlx5_cmd_trigger_completions(dev);
+
+ while (down_trylock(&cmd->pages_sem))
+ mlx5_cmd_trigger_completions(dev);
+
+ /* Unlock cmdif */
+ up(&cmd->pages_sem);
+ for (i = 0; i < cmd->max_reg_cmds; i++)
+ up(&cmd->sem);
+}
+
static int status_to_err(u8 status)
{
return status ? -1 : 0; /* TBD more meaningful codes */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 196c07383082..cb9fa3430c53 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -103,7 +103,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
mlx5_core_err(dev, "start\n");
if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) {
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
- mlx5_cmd_trigger_completions(dev);
+ mlx5_cmd_flush(dev);
}
mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 5300b0b6d836..4fdac020b795 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -126,6 +126,7 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev,
struct ptp_system_timestamp *sts);
void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev);
+void mlx5_cmd_flush(struct mlx5_core_dev *dev);
int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev);
void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev);
--
2.20.1
^ permalink raw reply related
* [net 3/4] net/mlx5: Fix a compilation warning in events.c
From: Saeed Mahameed @ 2019-02-13 23:44 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, Tariq Toukan, Mikhael Goikhman, Saeed Mahameed
In-Reply-To: <20190213234451.27029-1-saeedm@mellanox.com>
From: Tariq Toukan <tariqt@mellanox.com>
Eliminate the following compilation warning:
drivers/net/ethernet/mellanox/mlx5/core/events.c: warning: 'error_str'
may be used uninitialized in this function [-Wuninitialized]: => 238:3
Fixes: c2fb3db22d35 ("net/mlx5: Rework handling of port module events")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Mikhael Goikhman <migo@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../net/ethernet/mellanox/mlx5/core/events.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index fbc42b7252a9..503035469d2d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -211,11 +211,10 @@ static int port_module(struct notifier_block *nb, unsigned long type, void *data
enum port_module_event_status_type module_status;
enum port_module_event_error_type error_type;
struct mlx5_eqe_port_module *module_event_eqe;
- const char *status_str, *error_str;
+ const char *status_str;
u8 module_num;
module_event_eqe = &eqe->data.port_module;
- module_num = module_event_eqe->module;
module_status = module_event_eqe->module_status &
PORT_MODULE_EVENT_MODULE_STATUS_MASK;
error_type = module_event_eqe->error_type &
@@ -223,25 +222,27 @@ static int port_module(struct notifier_block *nb, unsigned long type, void *data
if (module_status < MLX5_MODULE_STATUS_NUM)
events->pme_stats.status_counters[module_status]++;
- status_str = mlx5_pme_status_to_string(module_status);
- if (module_status == MLX5_MODULE_STATUS_ERROR) {
+ if (module_status == MLX5_MODULE_STATUS_ERROR)
if (error_type < MLX5_MODULE_EVENT_ERROR_NUM)
events->pme_stats.error_counters[error_type]++;
- error_str = mlx5_pme_error_to_string(error_type);
- }
if (!printk_ratelimit())
return NOTIFY_OK;
- if (module_status == MLX5_MODULE_STATUS_ERROR)
+ module_num = module_event_eqe->module;
+ status_str = mlx5_pme_status_to_string(module_status);
+ if (module_status == MLX5_MODULE_STATUS_ERROR) {
+ const char *error_str = mlx5_pme_error_to_string(error_type);
+
mlx5_core_err(events->dev,
"Port module event[error]: module %u, %s, %s\n",
module_num, status_str, error_str);
- else
+ } else {
mlx5_core_info(events->dev,
"Port module event: module %u, %s\n",
module_num, status_str);
+ }
return NOTIFY_OK;
}
--
2.20.1
^ permalink raw reply related
* [net 4/4] net/mlx5e: XDP, fix redirect resources availability check
From: Saeed Mahameed @ 2019-02-13 23:44 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Saeed Mahameed, Toke Høiland-Jørgensen,
Tariq Toukan
In-Reply-To: <20190213234451.27029-1-saeedm@mellanox.com>
Currently the mlx5 driver creates xdp redirect hw queues unconditionally on
netdevice open. This is great until someone starts redirecting XDP traffic
via ndo_xdp_xmit on an mlx5 device and changes the device configuration at
the same time. This might cause crashes, since the other device's napi
is not aware of the mlx5 state change (resources un-availability).
To fix this we must synchronize with other devices napi's on the system.
Added a new flag under mlx5e_priv to determine whether XDP TX resources are
available, set/clear it up when necessary and use synchronize_rcu()
when the flag is turned off, so other napi's are in-sync with it, before
we actually cleanup the hw resources.
The flag is tested prior to committing to transmit on mlx5e_xdp_xmit, and
it is sufficient to determine if it is safe to transmit or not. The other
two internal flags (MLX5E_STATE_OPENED and MLX5E_SQ_STATE_ENABLED) become
unnecessary. Thus, they are removed from data path.
Fixes: 58b99ee3e3eb ("net/mlx5e: Add support for XDP_REDIRECT in device-out side")
Reported-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 +
.../net/ethernet/mellanox/mlx5/core/en/xdp.c | 6 ++----
.../net/ethernet/mellanox/mlx5/core/en/xdp.h | 17 +++++++++++++++++
.../net/ethernet/mellanox/mlx5/core/en_main.c | 2 ++
4 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 8fa8fdd30b85..448a92561567 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -657,6 +657,7 @@ struct mlx5e_channel_stats {
enum {
MLX5E_STATE_OPENED,
MLX5E_STATE_DESTROYING,
+ MLX5E_STATE_XDP_TX_ENABLED,
};
struct mlx5e_rqt {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 3740177eed09..03b2a9f9c589 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -365,7 +365,8 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
int sq_num;
int i;
- if (unlikely(!test_bit(MLX5E_STATE_OPENED, &priv->state)))
+ /* this flag is sufficient, no need to test internal sq state */
+ if (unlikely(!mlx5e_xdp_tx_is_enabled(priv)))
return -ENETDOWN;
if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
@@ -378,9 +379,6 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
sq = &priv->channels.c[sq_num]->xdpsq;
- if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)))
- return -ENETDOWN;
-
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
struct mlx5e_xdp_info xdpi;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
index 3a67cb3cd179..ee27a7c8cd87 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
@@ -50,6 +50,23 @@ void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq);
int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
u32 flags);
+static inline void mlx5e_xdp_tx_enable(struct mlx5e_priv *priv)
+{
+ set_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state);
+}
+
+static inline void mlx5e_xdp_tx_disable(struct mlx5e_priv *priv)
+{
+ clear_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state);
+ /* let other device's napi(s) see our new state */
+ synchronize_rcu();
+}
+
+static inline bool mlx5e_xdp_tx_is_enabled(struct mlx5e_priv *priv)
+{
+ return test_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state);
+}
+
static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_xdpsq *sq)
{
if (sq->doorbell_cseg) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 01819e5c9975..93e50ccd44c3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2938,6 +2938,7 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
mlx5e_build_tx2sq_maps(priv);
mlx5e_activate_channels(&priv->channels);
+ mlx5e_xdp_tx_enable(priv);
netif_tx_start_all_queues(priv->netdev);
if (mlx5e_is_vport_rep(priv))
@@ -2959,6 +2960,7 @@ void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv)
*/
netif_tx_stop_all_queues(priv->netdev);
netif_tx_disable(priv->netdev);
+ mlx5e_xdp_tx_disable(priv);
mlx5e_deactivate_channels(&priv->channels);
}
--
2.20.1
^ permalink raw reply related
* Re: [PATCH iproute2 net-next v2 3/4] ss: Buffer raw fields first, then render them as a table
From: David Ahern @ 2019-02-13 23:47 UTC (permalink / raw)
To: Phil Sutter, Stephen Hemminger, Stefano Brivio, Eric Dumazet,
netdev, Sabrina Dubroca
In-Reply-To: <20190213233950.GQ26388@orbyte.nwl.cc>
On 2/13/19 4:39 PM, Phil Sutter wrote:
>> What I would favor:
>> * use big enough columns that for the common case everything lines up fine
>> * if column is too wide just print that element wider (which is what print %Ns does)
>
> This is pretty much the situation Stefano attempted to improve, minus
> scaling the columns to max terminal width. ss output formatting being
> quirky and unreadable with either small or large terminals was the
> number one reason I heard so far why people prefer netstat.
+1.
prior to Stefano's change ss was a PITA trying to read in an xterm. I
for one would run the command and then have to adjust the terminal to
get it to display an actual readable format.
>
>> and
>> * add json output for programs that want to parse
>> * use print_uint etc for that
>
> For Eric's use-case, skipping any buffering and tabular output if stdout
> is not a TTY suffices. In fact, iproute2 does this already for colored
> output (see check_enable_color() for reference).
>
> Adding JSON output support everywhere is a nice feature when it comes to
> scripting, but it won't help console users. Unless you expect CLI
> frontends to come turning that JSON into human-readable output.
>
> IMHO, JSON output wouldn't even help in this case - unless Eric indeed
> prefers to write/use a JSON parser for his analysis instead of something
> along 'ss | grep'.
I agree. json has its uses, console/xterm for humans is not one and
piping into something like jq to selectively pick columns is not a user
friendly solution.
>
>> The buffering patch (in iproute2-next) can/will be reverted.
>
> It's not fair to claim that despite Stefano's commitment to fix the
> reported issues. His ss output rewrite is there since v4.15.0 and
> according to git history it needed only two fixes so far. I've had
> one-liners which required more follow-ups than that! Also, we're still
> discovering issues introduced by all the jsonify patches. Allowing for
> people to get things right not the first time but after a few tries is
> important. If you want to revert something, start with features which
> have a fundamental design issue in the exact situation they tried to
> improve, like the MSG_PEEK | MSG_TRUNC thing Hangbin and me wrote.
I was just looking at the overhead of that. While it is deceiving to see
twice as many recvmsg calls as you expect, the overhead of the peek in
reading 700k+ routes is on the order of 3% with the 32k min buffer size.
The true overhead of the dump functions for ip is the device index to
name mapping (just like the overhead of a batch is the name to index
mapping). I will send a v2 of my patches soon.
^ permalink raw reply
* Re: [pull request][net 0/4] Mellanox, mlx5 fixes 2019-02-13
From: David Miller @ 2019-02-14 0:10 UTC (permalink / raw)
To: saeedm; +Cc: netdev
In-Reply-To: <20190213234451.27029-1-saeedm@mellanox.com>
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Wed, 13 Feb 2019 15:44:47 -0800
> This series introduces some fixes to mlx5 driver.
> For more information please see tag log below.
>
> Please pull and let me know if there is any problem.
Pulled.
> For -stable v4.19:
> ('net/mlx5e: XDP, fix redirect resources availability check')
Queued up for -stable, thanks Saeed.
^ permalink raw reply
* Re: [PATCH 0/3] Netfilter/IPVS fixes for net
From: David Miller @ 2019-02-14 0:15 UTC (permalink / raw)
To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <20190213174758.17275-1-pablo@netfilter.org>
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 13 Feb 2019 18:47:55 +0100
> The following patchset contains Netfilter/IPVS fixes for net:
>
> 1) Missing structure initialization in ebtables causes splat with
> 32-bit user level on a 64-bit kernel, from Francesco Ruggeri.
>
> 2) Missing dependency on nf_defrag in IPVS IPv6 codebase, from
> Andrea Claudi.
>
> 3) Fix possible use-after-free from release path of target extensions.
>
> You can pull these changes from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git
Pulled, thanks Pablo.
^ permalink raw reply
* Re: [PATCH net-next 0/4] net: phy: Add 2.5G/5GBASET PHYs support
From: David Miller @ 2019-02-14 0:19 UTC (permalink / raw)
To: maxime.chevallier
Cc: netdev, linux-kernel, andrew, f.fainelli, hkallweit1, linux,
linux-arm-kernel, antoine.tenart, thomas.petazzoni,
gregory.clement, miquel.raynal, nadavh, stefanc, mw
In-Reply-To: <20190211142529.22885-1-maxime.chevallier@bootlin.com>
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Mon, 11 Feb 2019 15:25:25 +0100
> The 802.3bz standard defines 2 modes based on the NBASET alliance work
> that allow to use 2.5Gbps and 5Gbps speeds on Cat 5e, 6 and 7 cables.
>
> This series adds the necessary infrastructure to handle these modes with
> C45 PHYs. This series was originally part of a bigger one, that has
> seen 2 iterations [1] [2] that added support for these modes on Marvell
> Alaska PHYs.
>
> Following some discussions with Heiner and Andrew [3], we decided to
> split-out the generic parts so that we can work together on the
> following steps to get these modes fully working with Aquantia and
> Marvell PHYs.
>
> The first 3 patches are reworking some of the internal network phy
> infrastructure to handle the new modes in a more generic way.
>
> The 4th patch adds all the C45 register definitions and accesses that
> follow the 802.3bz standard to support 2.5GBASET and 5GBASET.
>
> [1] : https://lore.kernel.org/netdev/20190118152352.26417-1-maxime.chevallier@bootlin.com/
> [2] : https://lore.kernel.org/netdev/20190207094939.27369-1-maxime.chevallier@bootlin.com/
> [3] : https://lore.kernel.org/netdev/81c340ea-54b0-1abf-94af-b8dc4ee83e3a@gmail.com/
Series applied, thanks Maxime.
^ permalink raw reply
* [PATCH iproute2-next v2 1/3] ll_map: Add function to remove link cache entry by index
From: David Ahern @ 2019-02-14 0:22 UTC (permalink / raw)
To: stephen; +Cc: netdev, David Ahern
In-Reply-To: <20190214002249.31866-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Add ll_drop_by_index to remove an entry from the link cache.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/ll_map.h | 1 +
lib/ll_map.c | 14 ++++++++++++++
2 files changed, 15 insertions(+)
diff --git a/include/ll_map.h b/include/ll_map.h
index 511fe00b8567..4de1041e2746 100644
--- a/include/ll_map.h
+++ b/include/ll_map.h
@@ -9,6 +9,7 @@ unsigned ll_name_to_index(const char *name);
const char *ll_index_to_name(unsigned idx);
int ll_index_to_type(unsigned idx);
int ll_index_to_flags(unsigned idx);
+void ll_drop_by_index(unsigned index);
unsigned namehash(const char *str);
const char *ll_idx_n2a(unsigned int idx);
diff --git a/lib/ll_map.c b/lib/ll_map.c
index 1ab8ef0758ac..8e8a0b1e9c9d 100644
--- a/lib/ll_map.c
+++ b/lib/ll_map.c
@@ -210,6 +210,20 @@ unsigned ll_name_to_index(const char *name)
return idx;
}
+void ll_drop_by_index(unsigned index)
+{
+ struct ll_cache *im;
+
+ im = ll_get_by_index(index);
+ if (!im)
+ return;
+
+ hlist_del(&im->idx_hash);
+ hlist_del(&im->name_hash);
+
+ free(im);
+}
+
void ll_init_map(struct rtnl_handle *rth)
{
static int initialized;
--
2.11.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox