* Re: [PATCH 10/10] virtio: enable endian checks for sparse builds
From: Jason Wang @ 2016-12-07 5:27 UTC (permalink / raw)
To: Michael S. Tsirkin, linux-kernel
Cc: kvm, Neil Armstrong, David Airlie, linux-remoteproc, dri-devel,
virtualization, linux-s390, James E.J. Bottomley, Herbert Xu,
linux-scsi, v9fs-developer, Asias He, Arnd Bergmann, linux-kbuild,
Jens Axboe, Michal Marek, Stefan Hajnoczi, Matt Mackall,
Greg Kroah-Hartman, linux-crypto, netdev, David S. Miller
In-Reply-To: <1481038106-24899-11-git-send-email-mst@redhat.com>
On 2016年12月06日 23:41, Michael S. Tsirkin wrote:
> __CHECK_ENDIAN__ isn't on by default presumably because
> it triggers too many sparse warnings for correct code.
> But virtio is now clean of these warnings, and
> we want to keep it this way - enable this for
> sparse builds.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> It seems that there should be a better way to do it,
> but this works too.
Reviewed-by: Jason Wang <jasowang@redhat.com>
>
> drivers/block/Makefile | 1 +
> drivers/char/Makefile | 1 +
> drivers/char/hw_random/Makefile | 2 ++
> drivers/gpu/drm/virtio/Makefile | 1 +
> drivers/net/Makefile | 3 +++
> drivers/net/caif/Makefile | 1 +
> drivers/rpmsg/Makefile | 1 +
> drivers/s390/virtio/Makefile | 2 ++
> drivers/scsi/Makefile | 1 +
> drivers/vhost/Makefile | 1 +
> drivers/virtio/Makefile | 3 +++
> net/9p/Makefile | 1 +
> net/packet/Makefile | 1 +
> net/vmw_vsock/Makefile | 2 ++
> 14 files changed, 21 insertions(+)
>
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 1e9661e..597481c 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -27,6 +27,7 @@ obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o
> obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
> obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
> obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
> +CFLAGS_virtio_blk.o += -D__CHECK_ENDIAN__
> obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
>
> obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
> diff --git a/drivers/char/Makefile b/drivers/char/Makefile
> index 6e6c244..a99467d 100644
> --- a/drivers/char/Makefile
> +++ b/drivers/char/Makefile
> @@ -6,6 +6,7 @@ obj-y += mem.o random.o
> obj-$(CONFIG_TTY_PRINTK) += ttyprintk.o
> obj-y += misc.o
> obj-$(CONFIG_ATARI_DSP56K) += dsp56k.o
> +CFLAGS_virtio_console.o += -D__CHECK_ENDIAN__
> obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o
> obj-$(CONFIG_RAW_DRIVER) += raw.o
> obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
> diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile
> index 5f52b1e..a2b0931 100644
> --- a/drivers/char/hw_random/Makefile
> +++ b/drivers/char/hw_random/Makefile
> @@ -17,6 +17,8 @@ obj-$(CONFIG_HW_RANDOM_IXP4XX) += ixp4xx-rng.o
> obj-$(CONFIG_HW_RANDOM_OMAP) += omap-rng.o
> obj-$(CONFIG_HW_RANDOM_OMAP3_ROM) += omap3-rom-rng.o
> obj-$(CONFIG_HW_RANDOM_PASEMI) += pasemi-rng.o
> +CFLAGS_virtio_transport.o += -D__CHECK_ENDIAN__
> +CFLAGS_virtio-rng.o += -D__CHECK_ENDIAN__
> obj-$(CONFIG_HW_RANDOM_VIRTIO) += virtio-rng.o
> obj-$(CONFIG_HW_RANDOM_TX4939) += tx4939-rng.o
> obj-$(CONFIG_HW_RANDOM_MXC_RNGA) += mxc-rnga.o
> diff --git a/drivers/gpu/drm/virtio/Makefile b/drivers/gpu/drm/virtio/Makefile
> index 3fb8eac..1162366 100644
> --- a/drivers/gpu/drm/virtio/Makefile
> +++ b/drivers/gpu/drm/virtio/Makefile
> @@ -3,6 +3,7 @@
> # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
>
> ccflags-y := -Iinclude/drm
> +ccflags-y += -D__CHECK_ENDIAN__
>
> virtio-gpu-y := virtgpu_drv.o virtgpu_kms.o virtgpu_drm_bus.o virtgpu_gem.o \
> virtgpu_fb.o virtgpu_display.o virtgpu_vq.o virtgpu_ttm.o \
> diff --git a/drivers/net/Makefile b/drivers/net/Makefile
> index 7336cbd..3f587de 100644
> --- a/drivers/net/Makefile
> +++ b/drivers/net/Makefile
> @@ -12,6 +12,7 @@ obj-$(CONFIG_EQUALIZER) += eql.o
> obj-$(CONFIG_IFB) += ifb.o
> obj-$(CONFIG_MACSEC) += macsec.o
> obj-$(CONFIG_MACVLAN) += macvlan.o
> +CFLAGS_macvtap.o += -D__CHECK_ENDIAN__
> obj-$(CONFIG_MACVTAP) += macvtap.o
> obj-$(CONFIG_MII) += mii.o
> obj-$(CONFIG_MDIO) += mdio.o
> @@ -20,8 +21,10 @@ obj-$(CONFIG_NETCONSOLE) += netconsole.o
> obj-$(CONFIG_PHYLIB) += phy/
> obj-$(CONFIG_RIONET) += rionet.o
> obj-$(CONFIG_NET_TEAM) += team/
> +CFLAGS_tun.o += -D__CHECK_ENDIAN__
> obj-$(CONFIG_TUN) += tun.o
> obj-$(CONFIG_VETH) += veth.o
> +CFLAGS_virtio_net.o += -D__CHECK_ENDIAN__
> obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
> obj-$(CONFIG_VXLAN) += vxlan.o
> obj-$(CONFIG_GENEVE) += geneve.o
> diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile
> index 9bbd453..d1a922c 100644
> --- a/drivers/net/caif/Makefile
> +++ b/drivers/net/caif/Makefile
> @@ -12,3 +12,4 @@ obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
>
> # Virtio interface
> obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
> +CFLAGS_caif_virtio.o += -D__CHECK_ENDIAN__
> diff --git a/drivers/rpmsg/Makefile b/drivers/rpmsg/Makefile
> index ae9c913..23c8b66 100644
> --- a/drivers/rpmsg/Makefile
> +++ b/drivers/rpmsg/Makefile
> @@ -1,3 +1,4 @@
> obj-$(CONFIG_RPMSG) += rpmsg_core.o
> obj-$(CONFIG_RPMSG_QCOM_SMD) += qcom_smd.o
> obj-$(CONFIG_RPMSG_VIRTIO) += virtio_rpmsg_bus.o
> +CFLAGS_virtio_rpmsg_bus.o += -D__CHECK_ENDIAN__
> diff --git a/drivers/s390/virtio/Makefile b/drivers/s390/virtio/Makefile
> index df40692..270ada5 100644
> --- a/drivers/s390/virtio/Makefile
> +++ b/drivers/s390/virtio/Makefile
> @@ -6,6 +6,8 @@
> # it under the terms of the GNU General Public License (version 2 only)
> # as published by the Free Software Foundation.
>
> +CFLAGS_virtio_ccw.o += -D__CHECK_ENDIAN__
> +CFLAGS_kvm_virtio.o += -D__CHECK_ENDIAN__
> s390-virtio-objs := virtio_ccw.o
> ifdef CONFIG_S390_GUEST_OLD_TRANSPORT
> s390-virtio-objs += kvm_virtio.o
> diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
> index 38d938d..9f70d46 100644
> --- a/drivers/scsi/Makefile
> +++ b/drivers/scsi/Makefile
> @@ -135,6 +135,7 @@ obj-$(CONFIG_SCSI_BNX2_ISCSI) += libiscsi.o bnx2i/
> obj-$(CONFIG_BE2ISCSI) += libiscsi.o be2iscsi/
> obj-$(CONFIG_SCSI_ESAS2R) += esas2r/
> obj-$(CONFIG_SCSI_PMCRAID) += pmcraid.o
> +CFLAGS_virtio_scsi.o += -D__CHECK_ENDIAN__
> obj-$(CONFIG_SCSI_VIRTIO) += virtio_scsi.o
> obj-$(CONFIG_VMWARE_PVSCSI) += vmw_pvscsi.o
> obj-$(CONFIG_XEN_SCSI_FRONTEND) += xen-scsifront.o
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 6b012b9..619e2cd 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -1,3 +1,4 @@
> +ccflags-y := -D__CHECK_ENDIAN__
> obj-$(CONFIG_VHOST_NET) += vhost_net.o
> vhost_net-y := net.o
>
> diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
> index 41e30e3..d331f19 100644
> --- a/drivers/virtio/Makefile
> +++ b/drivers/virtio/Makefile
> @@ -1,3 +1,6 @@
> +#virtio must be kept clean wrt endian tags,
> +#otherwise we'll get to maintain broken host/guest ABIs
> +ccflags-y := -D__CHECK_ENDIAN__
> obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
> obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
> obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
> diff --git a/net/9p/Makefile b/net/9p/Makefile
> index a0874cc..acf1225 100644
> --- a/net/9p/Makefile
> +++ b/net/9p/Makefile
> @@ -11,6 +11,7 @@ obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o
> trans_fd.o \
> trans_common.o \
>
> +CFLAGS_trans_virtio.o += -D__CHECK_ENDIAN__
> 9pnet_virtio-objs := \
> trans_virtio.o \
>
> diff --git a/net/packet/Makefile b/net/packet/Makefile
> index 9df6134..a13bcb3 100644
> --- a/net/packet/Makefile
> +++ b/net/packet/Makefile
> @@ -2,6 +2,7 @@
> # Makefile for the packet AF.
> #
>
> +ccflags-y := -D__CHECK_ENDIAN__
> obj-$(CONFIG_PACKET) += af_packet.o
> obj-$(CONFIG_PACKET_DIAG) += af_packet_diag.o
> af_packet_diag-y += diag.o
> diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
> index bc27c70..a61eccb 100644
> --- a/net/vmw_vsock/Makefile
> +++ b/net/vmw_vsock/Makefile
> @@ -8,6 +8,8 @@ vsock-y += af_vsock.o vsock_addr.o
> vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
> vmci_transport_notify_qstate.o
>
> +CFLAGS_virtio_transport.o += -D__CHECK_ENDIAN__
> +CFLAGS_virtio_transport_common.o += -D__CHECK_ENDIAN__
> vmw_vsock_virtio_transport-y += virtio_transport.o
>
> vmw_vsock_virtio_transport_common-y += virtio_transport_common.o
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net-next v2 7/7] bnxt_en: Add interface to support RDMA driver.
From: Michael Chan @ 2016-12-07 5:26 UTC (permalink / raw)
To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>
Since the network driver and RDMA driver operate on the same PCI function,
we need to create an interface to allow the RDMA driver to share resources
with the network driver.
1. Create a new bnxt_en_dev struct which will be returned by
bnxt_ulp_probe() upon success. After that, all calls from the RDMA driver
to bnxt_en will pass a pointer to this struct.
2. This struct contains additional function pointers to register, request
msix, send fw messages, register for async events.
3. If the RDMA driver wants to enable RDMA on the function, it needs to
call the function pointer bnxt_register_device(). A ulp_ops structure
is passed for RCU protected upcalls from bnxt_en to the RDMA driver.
4. The RDMA driver can call firmware APIs using the bnxt_send_fw_msg()
function pointer.
5. 1 stats context is reserved when the RDMA driver registers. MSIX
and completion rings are reserved when the RDMA driver calls
bnxt_request_msix() function pointer.
6. When the RDMA driver calls bnxt_unregister_device(), all RDMA resources
will be cleaned up.
v2: Fixed 2 uninitialized variable warnings.
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/Makefile | 2 +-
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 41 ++-
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 6 +
drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 346 ++++++++++++++++++++++++++
drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 93 +++++++
5 files changed, 483 insertions(+), 5 deletions(-)
create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h
diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile
index b233a86..6082ed1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/Makefile
+++ b/drivers/net/ethernet/broadcom/bnxt/Makefile
@@ -1,3 +1,3 @@
obj-$(CONFIG_BNXT) += bnxt_en.o
-bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o
+bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index c782942..9608cb4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -52,6 +52,7 @@
#include "bnxt_hsi.h"
#include "bnxt.h"
+#include "bnxt_ulp.h"
#include "bnxt_sriov.h"
#include "bnxt_ethtool.h"
#include "bnxt_dcb.h"
@@ -1528,12 +1529,11 @@ static int bnxt_async_event_process(struct bnxt *bp,
set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event);
break;
default:
- netdev_err(bp->dev, "unhandled ASYNC event (id 0x%x)\n",
- event_id);
goto async_event_process_exit;
}
schedule_work(&bp->sp_task);
async_event_process_exit:
+ bnxt_ulp_async_events(bp, cmpl);
return 0;
}
@@ -3547,7 +3547,7 @@ static int bnxt_hwrm_vnic_ctx_alloc(struct bnxt *bp, u16 vnic_id, u16 ctx_idx)
return rc;
}
-static int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
+int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
{
unsigned int ring = 0, grp_idx;
struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id];
@@ -3595,6 +3595,9 @@ static int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
#endif
if ((bp->flags & BNXT_FLAG_STRIP_VLAN) || def_vlan)
req.flags |= cpu_to_le32(VNIC_CFG_REQ_FLAGS_VLAN_STRIP_MODE);
+ if (!vnic_id && bnxt_ulp_registered(bp->edev, BNXT_ROCE_ULP))
+ req.flags |=
+ cpu_to_le32(VNIC_CFG_REQ_FLAGS_ROCE_DUAL_VNIC_MODE);
return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
}
@@ -4842,6 +4845,16 @@ unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp)
return bp->pf.max_stat_ctxs;
}
+void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+ if (BNXT_VF(bp))
+ bp->vf.max_stat_ctxs = max;
+ else
+#endif
+ bp->pf.max_stat_ctxs = max;
+}
+
unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp)
{
#if defined(CONFIG_BNXT_SRIOV)
@@ -4851,6 +4864,16 @@ unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp)
return bp->pf.max_cp_rings;
}
+void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+ if (BNXT_VF(bp))
+ bp->vf.max_cp_rings = max;
+ else
+#endif
+ bp->pf.max_cp_rings = max;
+}
+
static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
{
#if defined(CONFIG_BNXT_SRIOV)
@@ -6767,6 +6790,8 @@ static void bnxt_remove_one(struct pci_dev *pdev)
pci_iounmap(pdev, bp->bar2);
pci_iounmap(pdev, bp->bar1);
pci_iounmap(pdev, bp->bar0);
+ kfree(bp->edev);
+ bp->edev = NULL;
free_netdev(dev);
pci_release_regions(pdev);
@@ -6936,6 +6961,7 @@ void bnxt_restore_pf_fw_resources(struct bnxt *bp)
{
ASSERT_RTNL();
bnxt_hwrm_func_qcaps(bp);
+ bnxt_subtract_ulp_resources(bp, BNXT_ROCE_ULP);
}
static void bnxt_parse_log_pcie_link(struct bnxt *bp)
@@ -7047,6 +7073,8 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
if (rc)
goto init_err;
+ bp->ulp_probe = bnxt_ulp_probe;
+
/* Get the MAX capabilities for this function */
rc = bnxt_hwrm_func_qcaps(bp);
if (rc) {
@@ -7144,12 +7172,15 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev,
pci_channel_state_t state)
{
struct net_device *netdev = pci_get_drvdata(pdev);
+ struct bnxt *bp = netdev_priv(netdev);
netdev_info(netdev, "PCI I/O error detected\n");
rtnl_lock();
netif_device_detach(netdev);
+ bnxt_ulp_stop(bp);
+
if (state == pci_channel_io_perm_failure) {
rtnl_unlock();
return PCI_ERS_RESULT_DISCONNECT;
@@ -7195,8 +7226,10 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev)
if (!err && netif_running(netdev))
err = bnxt_open(netdev);
- if (!err)
+ if (!err) {
result = PCI_ERS_RESULT_RECOVERED;
+ bnxt_ulp_start(bp);
+ }
}
if (result != PCI_ERS_RESULT_RECOVERED && netif_running(netdev))
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index eec2415..16defe9 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -972,6 +972,9 @@ struct bnxt {
#define BNXT_SINGLE_PF(bp) (BNXT_PF(bp) && !BNXT_NPAR(bp))
#define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0)
+ struct bnxt_en_dev *edev;
+ struct bnxt_en_dev * (*ulp_probe)(struct net_device *);
+
struct bnxt_napi **bnapi;
struct bnxt_rx_ring_info *rx_ring;
@@ -1242,9 +1245,12 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
int bmap_size);
+int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id);
int bnxt_hwrm_set_coal(struct bnxt *);
unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
+void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max);
unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
+void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max);
void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
void bnxt_tx_disable(struct bnxt *bp);
void bnxt_tx_enable(struct bnxt *bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
new file mode 100644
index 0000000..8b7464b
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
@@ -0,0 +1,346 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2016 Broadcom Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <asm/byteorder.h>
+#include <linux/bitmap.h>
+
+#include "bnxt_hsi.h"
+#include "bnxt.h"
+#include "bnxt_ulp.h"
+
+static int bnxt_register_dev(struct bnxt_en_dev *edev, int ulp_id,
+ struct bnxt_ulp_ops *ulp_ops, void *handle)
+{
+ struct net_device *dev = edev->net;
+ struct bnxt *bp = netdev_priv(dev);
+ struct bnxt_ulp *ulp;
+
+ ASSERT_RTNL();
+ if (ulp_id >= BNXT_MAX_ULP)
+ return -EINVAL;
+
+ ulp = &edev->ulp_tbl[ulp_id];
+ if (rcu_access_pointer(ulp->ulp_ops)) {
+ netdev_err(bp->dev, "ulp id %d already registered\n", ulp_id);
+ return -EBUSY;
+ }
+ if (ulp_id == BNXT_ROCE_ULP) {
+ unsigned int max_stat_ctxs;
+
+ max_stat_ctxs = bnxt_get_max_func_stat_ctxs(bp);
+ if (max_stat_ctxs <= BNXT_MIN_ROCE_STAT_CTXS ||
+ bp->num_stat_ctxs == max_stat_ctxs)
+ return -ENOMEM;
+ bnxt_set_max_func_stat_ctxs(bp, max_stat_ctxs -
+ BNXT_MIN_ROCE_STAT_CTXS);
+ }
+
+ atomic_set(&ulp->ref_count, 0);
+ ulp->handle = handle;
+ rcu_assign_pointer(ulp->ulp_ops, ulp_ops);
+
+ if (ulp_id == BNXT_ROCE_ULP) {
+ if (test_bit(BNXT_STATE_OPEN, &bp->state))
+ bnxt_hwrm_vnic_cfg(bp, 0);
+ }
+
+ return 0;
+}
+
+static int bnxt_unregister_dev(struct bnxt_en_dev *edev, int ulp_id)
+{
+ struct net_device *dev = edev->net;
+ struct bnxt *bp = netdev_priv(dev);
+ struct bnxt_ulp *ulp;
+ int i = 0;
+
+ ASSERT_RTNL();
+ if (ulp_id >= BNXT_MAX_ULP)
+ return -EINVAL;
+
+ ulp = &edev->ulp_tbl[ulp_id];
+ if (!rcu_access_pointer(ulp->ulp_ops)) {
+ netdev_err(bp->dev, "ulp id %d not registered\n", ulp_id);
+ return -EINVAL;
+ }
+ if (ulp_id == BNXT_ROCE_ULP) {
+ unsigned int max_stat_ctxs;
+
+ max_stat_ctxs = bnxt_get_max_func_stat_ctxs(bp);
+ bnxt_set_max_func_stat_ctxs(bp, max_stat_ctxs + 1);
+ }
+ if (ulp->max_async_event_id)
+ bnxt_hwrm_func_rgtr_async_events(bp, NULL, 0);
+
+ RCU_INIT_POINTER(ulp->ulp_ops, NULL);
+ synchronize_rcu();
+ ulp->max_async_event_id = 0;
+ ulp->async_events_bmap = NULL;
+ while (atomic_read(&ulp->ref_count) != 0 && i < 10) {
+ msleep(100);
+ i++;
+ }
+ return 0;
+}
+
+static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id,
+ struct bnxt_msix_entry *ent, int num_msix)
+{
+ struct net_device *dev = edev->net;
+ struct bnxt *bp = netdev_priv(dev);
+ int max_idx, max_cp_rings;
+ int avail_msix, i, idx;
+
+ ASSERT_RTNL();
+ if (ulp_id != BNXT_ROCE_ULP)
+ return -EINVAL;
+
+ if (!(bp->flags & BNXT_FLAG_USING_MSIX))
+ return -ENODEV;
+
+ max_cp_rings = bnxt_get_max_func_cp_rings(bp);
+ max_idx = min_t(int, bp->total_irqs, max_cp_rings);
+ avail_msix = max_idx - bp->cp_nr_rings;
+ if (!avail_msix)
+ return -ENOMEM;
+ if (avail_msix > num_msix)
+ avail_msix = num_msix;
+
+ idx = max_idx - avail_msix;
+ for (i = 0; i < avail_msix; i++) {
+ ent[i].vector = bp->irq_tbl[idx + i].vector;
+ ent[i].ring_idx = idx + i;
+ ent[i].db_offset = (idx + i) * 0x80;
+ }
+ bnxt_set_max_func_irqs(bp, max_idx - avail_msix);
+ bnxt_set_max_func_cp_rings(bp, max_cp_rings - avail_msix);
+ edev->ulp_tbl[ulp_id].msix_requested = avail_msix;
+ return avail_msix;
+}
+
+static int bnxt_free_msix_vecs(struct bnxt_en_dev *edev, int ulp_id)
+{
+ struct net_device *dev = edev->net;
+ struct bnxt *bp = netdev_priv(dev);
+ int max_cp_rings, msix_requested;
+
+ ASSERT_RTNL();
+ if (ulp_id != BNXT_ROCE_ULP)
+ return -EINVAL;
+
+ max_cp_rings = bnxt_get_max_func_cp_rings(bp);
+ msix_requested = edev->ulp_tbl[ulp_id].msix_requested;
+ bnxt_set_max_func_cp_rings(bp, max_cp_rings + msix_requested);
+ edev->ulp_tbl[ulp_id].msix_requested = 0;
+ bnxt_set_max_func_irqs(bp, bp->total_irqs);
+ return 0;
+}
+
+void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id)
+{
+ ASSERT_RTNL();
+ if (bnxt_ulp_registered(bp->edev, ulp_id)) {
+ struct bnxt_en_dev *edev = bp->edev;
+ unsigned int msix_req, max;
+
+ msix_req = edev->ulp_tbl[ulp_id].msix_requested;
+ max = bnxt_get_max_func_cp_rings(bp);
+ bnxt_set_max_func_cp_rings(bp, max - msix_req);
+ max = bnxt_get_max_func_stat_ctxs(bp);
+ bnxt_set_max_func_stat_ctxs(bp, max - 1);
+ }
+}
+
+static int bnxt_send_msg(struct bnxt_en_dev *edev, int ulp_id,
+ struct bnxt_fw_msg *fw_msg)
+{
+ struct net_device *dev = edev->net;
+ struct bnxt *bp = netdev_priv(dev);
+ struct input *req;
+ int rc;
+
+ mutex_lock(&bp->hwrm_cmd_lock);
+ req = fw_msg->msg;
+ req->resp_addr = cpu_to_le64(bp->hwrm_cmd_resp_dma_addr);
+ rc = _hwrm_send_message(bp, fw_msg->msg, fw_msg->msg_len,
+ fw_msg->timeout);
+ if (!rc) {
+ struct output *resp = bp->hwrm_cmd_resp_addr;
+ u32 len = le16_to_cpu(resp->resp_len);
+
+ if (fw_msg->resp_max_len < len)
+ len = fw_msg->resp_max_len;
+
+ memcpy(fw_msg->resp, resp, len);
+ }
+ mutex_unlock(&bp->hwrm_cmd_lock);
+ return rc;
+}
+
+static void bnxt_ulp_get(struct bnxt_ulp *ulp)
+{
+ atomic_inc(&ulp->ref_count);
+}
+
+static void bnxt_ulp_put(struct bnxt_ulp *ulp)
+{
+ atomic_dec(&ulp->ref_count);
+}
+
+void bnxt_ulp_stop(struct bnxt *bp)
+{
+ struct bnxt_en_dev *edev = bp->edev;
+ struct bnxt_ulp_ops *ops;
+ int i;
+
+ if (!edev)
+ return;
+
+ for (i = 0; i < BNXT_MAX_ULP; i++) {
+ struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
+
+ ops = rtnl_dereference(ulp->ulp_ops);
+ if (!ops || !ops->ulp_stop)
+ continue;
+ ops->ulp_stop(ulp->handle);
+ }
+}
+
+void bnxt_ulp_start(struct bnxt *bp)
+{
+ struct bnxt_en_dev *edev = bp->edev;
+ struct bnxt_ulp_ops *ops;
+ int i;
+
+ if (!edev)
+ return;
+
+ for (i = 0; i < BNXT_MAX_ULP; i++) {
+ struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
+
+ ops = rtnl_dereference(ulp->ulp_ops);
+ if (!ops || !ops->ulp_start)
+ continue;
+ ops->ulp_start(ulp->handle);
+ }
+}
+
+void bnxt_ulp_sriov_cfg(struct bnxt *bp, int num_vfs)
+{
+ struct bnxt_en_dev *edev = bp->edev;
+ struct bnxt_ulp_ops *ops;
+ int i;
+
+ if (!edev)
+ return;
+
+ for (i = 0; i < BNXT_MAX_ULP; i++) {
+ struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
+
+ rcu_read_lock();
+ ops = rcu_dereference(ulp->ulp_ops);
+ if (!ops || !ops->ulp_sriov_config) {
+ rcu_read_unlock();
+ continue;
+ }
+ bnxt_ulp_get(ulp);
+ rcu_read_unlock();
+ ops->ulp_sriov_config(ulp->handle, num_vfs);
+ bnxt_ulp_put(ulp);
+ }
+}
+
+void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl)
+{
+ u16 event_id = le16_to_cpu(cmpl->event_id);
+ struct bnxt_en_dev *edev = bp->edev;
+ struct bnxt_ulp_ops *ops;
+ int i;
+
+ if (!edev)
+ return;
+
+ rcu_read_lock();
+ for (i = 0; i < BNXT_MAX_ULP; i++) {
+ struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
+
+ ops = rcu_dereference(ulp->ulp_ops);
+ if (!ops || !ops->ulp_async_notifier)
+ continue;
+ if (!ulp->async_events_bmap ||
+ event_id > ulp->max_async_event_id)
+ continue;
+
+ /* Read max_async_event_id first before testing the bitmap. */
+ smp_rmb();
+ if (test_bit(event_id, ulp->async_events_bmap))
+ ops->ulp_async_notifier(ulp->handle, cmpl);
+ }
+ rcu_read_unlock();
+}
+
+static int bnxt_register_async_events(struct bnxt_en_dev *edev, int ulp_id,
+ unsigned long *events_bmap, u16 max_id)
+{
+ struct net_device *dev = edev->net;
+ struct bnxt *bp = netdev_priv(dev);
+ struct bnxt_ulp *ulp;
+
+ if (ulp_id >= BNXT_MAX_ULP)
+ return -EINVAL;
+
+ ulp = &edev->ulp_tbl[ulp_id];
+ ulp->async_events_bmap = events_bmap;
+ /* Make sure bnxt_ulp_async_events() sees this order */
+ smp_wmb();
+ ulp->max_async_event_id = max_id;
+ bnxt_hwrm_func_rgtr_async_events(bp, events_bmap, max_id + 1);
+ return 0;
+}
+
+static const struct bnxt_en_ops bnxt_en_ops_tbl = {
+ .bnxt_register_device = bnxt_register_dev,
+ .bnxt_unregister_device = bnxt_unregister_dev,
+ .bnxt_request_msix = bnxt_req_msix_vecs,
+ .bnxt_free_msix = bnxt_free_msix_vecs,
+ .bnxt_send_fw_msg = bnxt_send_msg,
+ .bnxt_register_fw_async_events = bnxt_register_async_events,
+};
+
+struct bnxt_en_dev *bnxt_ulp_probe(struct net_device *dev)
+{
+ struct bnxt *bp = netdev_priv(dev);
+ struct bnxt_en_dev *edev;
+
+ edev = bp->edev;
+ if (!edev) {
+ edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+ if (!edev)
+ return ERR_PTR(-ENOMEM);
+ edev->en_ops = &bnxt_en_ops_tbl;
+ if (bp->flags & BNXT_FLAG_ROCEV1_CAP)
+ edev->flags |= BNXT_EN_FLAG_ROCEV1_CAP;
+ if (bp->flags & BNXT_FLAG_ROCEV2_CAP)
+ edev->flags |= BNXT_EN_FLAG_ROCEV2_CAP;
+ edev->net = dev;
+ edev->pdev = bp->pdev;
+ bp->edev = edev;
+ }
+ return bp->edev;
+}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h
new file mode 100644
index 0000000..74f816e
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h
@@ -0,0 +1,93 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2016 Broadcom Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef BNXT_ULP_H
+#define BNXT_ULP_H
+
+#define BNXT_ROCE_ULP 0
+#define BNXT_OTHER_ULP 1
+#define BNXT_MAX_ULP 2
+
+#define BNXT_MIN_ROCE_CP_RINGS 2
+#define BNXT_MIN_ROCE_STAT_CTXS 1
+
+struct hwrm_async_event_cmpl;
+struct bnxt;
+
+struct bnxt_ulp_ops {
+ /* async_notifier() cannot sleep (in BH context) */
+ void (*ulp_async_notifier)(void *, struct hwrm_async_event_cmpl *);
+ void (*ulp_stop)(void *);
+ void (*ulp_start)(void *);
+ void (*ulp_sriov_config)(void *, int);
+};
+
+struct bnxt_msix_entry {
+ u32 vector;
+ u32 ring_idx;
+ u32 db_offset;
+};
+
+struct bnxt_fw_msg {
+ void *msg;
+ int msg_len;
+ void *resp;
+ int resp_max_len;
+ int timeout;
+};
+
+struct bnxt_ulp {
+ void *handle;
+ struct bnxt_ulp_ops __rcu *ulp_ops;
+ unsigned long *async_events_bmap;
+ u16 max_async_event_id;
+ u16 msix_requested;
+ atomic_t ref_count;
+};
+
+struct bnxt_en_dev {
+ struct net_device *net;
+ struct pci_dev *pdev;
+ u32 flags;
+ #define BNXT_EN_FLAG_ROCEV1_CAP 0x1
+ #define BNXT_EN_FLAG_ROCEV2_CAP 0x2
+ #define BNXT_EN_FLAG_ROCE_CAP (BNXT_EN_FLAG_ROCEV1_CAP | \
+ BNXT_EN_FLAG_ROCEV2_CAP)
+ const struct bnxt_en_ops *en_ops;
+ struct bnxt_ulp ulp_tbl[BNXT_MAX_ULP];
+};
+
+struct bnxt_en_ops {
+ int (*bnxt_register_device)(struct bnxt_en_dev *, int,
+ struct bnxt_ulp_ops *, void *);
+ int (*bnxt_unregister_device)(struct bnxt_en_dev *, int);
+ int (*bnxt_request_msix)(struct bnxt_en_dev *, int,
+ struct bnxt_msix_entry *, int);
+ int (*bnxt_free_msix)(struct bnxt_en_dev *, int);
+ int (*bnxt_send_fw_msg)(struct bnxt_en_dev *, int,
+ struct bnxt_fw_msg *);
+ int (*bnxt_register_fw_async_events)(struct bnxt_en_dev *, int,
+ unsigned long *, u16);
+};
+
+static inline bool bnxt_ulp_registered(struct bnxt_en_dev *edev, int ulp_id)
+{
+ if (edev && rcu_access_pointer(edev->ulp_tbl[ulp_id].ulp_ops))
+ return true;
+ return false;
+}
+
+void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id);
+void bnxt_ulp_stop(struct bnxt *bp);
+void bnxt_ulp_start(struct bnxt *bp);
+void bnxt_ulp_sriov_cfg(struct bnxt *bp, int num_vfs);
+void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl);
+struct bnxt_en_dev *bnxt_ulp_probe(struct net_device *dev);
+
+#endif
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v2 6/7] bnxt_en: Refactor the driver registration function with firmware.
From: Michael Chan @ 2016-12-07 5:26 UTC (permalink / raw)
To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>
The driver register function with firmware consists of passing version
information and registering for async events. To support the RDMA driver,
the async events that we need to register may change. Separate the
driver register function into 2 parts so that we can just update the
async events for the RDMA driver.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 34 ++++++++++++++++++++++++++-----
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 ++
2 files changed, 31 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 57285bd..c782942 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3117,27 +3117,46 @@ int hwrm_send_message_silent(struct bnxt *bp, void *msg, u32 msg_len,
return rc;
}
-static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
+int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
+ int bmap_size)
{
struct hwrm_func_drv_rgtr_input req = {0};
- int i;
DECLARE_BITMAP(async_events_bmap, 256);
u32 *events = (u32 *)async_events_bmap;
+ int i;
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1);
req.enables =
- cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE |
- FUNC_DRV_RGTR_REQ_ENABLES_VER |
- FUNC_DRV_RGTR_REQ_ENABLES_ASYNC_EVENT_FWD);
+ cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_ASYNC_EVENT_FWD);
memset(async_events_bmap, 0, sizeof(async_events_bmap));
for (i = 0; i < ARRAY_SIZE(bnxt_async_events_arr); i++)
__set_bit(bnxt_async_events_arr[i], async_events_bmap);
+ if (bmap && bmap_size) {
+ for (i = 0; i < bmap_size; i++) {
+ if (test_bit(i, bmap))
+ __set_bit(i, async_events_bmap);
+ }
+ }
+
for (i = 0; i < 8; i++)
req.async_event_fwd[i] |= cpu_to_le32(events[i]);
+ return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+}
+
+static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
+{
+ struct hwrm_func_drv_rgtr_input req = {0};
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1);
+
+ req.enables =
+ cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE |
+ FUNC_DRV_RGTR_REQ_ENABLES_VER);
+
req.os_type = cpu_to_le16(FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX);
req.ver_maj = DRV_VER_MAJ;
req.ver_min = DRV_VER_MIN;
@@ -3146,6 +3165,7 @@ static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
if (BNXT_PF(bp)) {
DECLARE_BITMAP(vf_req_snif_bmap, 256);
u32 *data = (u32 *)vf_req_snif_bmap;
+ int i;
memset(vf_req_snif_bmap, 0, sizeof(vf_req_snif_bmap));
for (i = 0; i < ARRAY_SIZE(bnxt_vf_req_snif); i++)
@@ -7023,6 +7043,10 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
if (rc)
goto init_err;
+ rc = bnxt_hwrm_func_rgtr_async_events(bp, NULL, 0);
+ if (rc)
+ goto init_err;
+
/* Get the MAX capabilities for this function */
rc = bnxt_hwrm_func_qcaps(bp);
if (rc) {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index d796836..eec2415 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1240,6 +1240,8 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
int _hwrm_send_message(struct bnxt *, void *, u32, int);
int hwrm_send_message(struct bnxt *, void *, u32, int);
int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
+int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
+ int bmap_size);
int bnxt_hwrm_set_coal(struct bnxt *);
unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v2 5/7] bnxt_en: Reserve RDMA resources by default.
From: Michael Chan @ 2016-12-07 5:26 UTC (permalink / raw)
To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>
If the device supports RDMA, we'll setup network default rings so that
there are enough minimum resources for RDMA, if possible. However, the
user can still increase network rings to the max if he wants. The actual
RDMA resources won't be reserved until the RDMA driver registers.
v2: Fix compile warning when BNXT_CONFIG_SRIOV is not set.
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 58 ++++++++++++++++++++++++++++++-
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 9 +++++
2 files changed, 66 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 52b8ad4..57285bd 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4166,6 +4166,11 @@ static int bnxt_hwrm_func_qcaps(struct bnxt *bp)
if (rc)
goto hwrm_func_qcaps_exit;
+ if (resp->flags & cpu_to_le32(FUNC_QCAPS_RESP_FLAGS_ROCE_V1_SUPPORTED))
+ bp->flags |= BNXT_FLAG_ROCEV1_CAP;
+ if (resp->flags & cpu_to_le32(FUNC_QCAPS_RESP_FLAGS_ROCE_V2_SUPPORTED))
+ bp->flags |= BNXT_FLAG_ROCEV2_CAP;
+
bp->tx_push_thresh = 0;
if (resp->flags &
cpu_to_le32(FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED))
@@ -4808,6 +4813,24 @@ static int bnxt_setup_int_mode(struct bnxt *bp)
return rc;
}
+unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+ if (BNXT_VF(bp))
+ return bp->vf.max_stat_ctxs;
+#endif
+ return bp->pf.max_stat_ctxs;
+}
+
+unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+ if (BNXT_VF(bp))
+ return bp->vf.max_cp_rings;
+#endif
+ return bp->pf.max_cp_rings;
+}
+
static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
{
#if defined(CONFIG_BNXT_SRIOV)
@@ -6832,6 +6855,39 @@ int bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, bool shared)
return bnxt_trim_rings(bp, max_rx, max_tx, cp, shared);
}
+static int bnxt_get_dflt_rings(struct bnxt *bp, int *max_rx, int *max_tx,
+ bool shared)
+{
+ int rc;
+
+ rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
+ if (rc)
+ return rc;
+
+ if (bp->flags & BNXT_FLAG_ROCE_CAP) {
+ int max_cp, max_stat, max_irq;
+
+ /* Reserve minimum resources for RoCE */
+ max_cp = bnxt_get_max_func_cp_rings(bp);
+ max_stat = bnxt_get_max_func_stat_ctxs(bp);
+ max_irq = bnxt_get_max_func_irqs(bp);
+ if (max_cp <= BNXT_MIN_ROCE_CP_RINGS ||
+ max_irq <= BNXT_MIN_ROCE_CP_RINGS ||
+ max_stat <= BNXT_MIN_ROCE_STAT_CTXS)
+ return 0;
+
+ max_cp -= BNXT_MIN_ROCE_CP_RINGS;
+ max_irq -= BNXT_MIN_ROCE_CP_RINGS;
+ max_stat -= BNXT_MIN_ROCE_STAT_CTXS;
+ max_cp = min_t(int, max_cp, max_irq);
+ max_cp = min_t(int, max_cp, max_stat);
+ rc = bnxt_trim_rings(bp, max_rx, max_tx, max_cp, shared);
+ if (rc)
+ rc = 0;
+ }
+ return rc;
+}
+
static int bnxt_set_dflt_rings(struct bnxt *bp)
{
int dflt_rings, max_rx_rings, max_tx_rings, rc;
@@ -6840,7 +6896,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp)
if (sh)
bp->flags |= BNXT_FLAG_SHARED_RINGS;
dflt_rings = netif_get_num_default_rss_queues();
- rc = bnxt_get_max_rings(bp, &max_rx_rings, &max_tx_rings, sh);
+ rc = bnxt_get_dflt_rings(bp, &max_rx_rings, &max_tx_rings, sh);
if (rc)
return rc;
bp->rx_nr_rings = min_t(int, dflt_rings, max_rx_rings);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 43a4b17..d796836 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -387,6 +387,9 @@ struct rx_tpa_end_cmp_ext {
#define DB_KEY_TX_PUSH (0x4 << 28)
#define DB_LONG_TX_PUSH (0x2 << 24)
+#define BNXT_MIN_ROCE_CP_RINGS 2
+#define BNXT_MIN_ROCE_STAT_CTXS 1
+
#define INVALID_HW_RING_ID ((u16)-1)
/* The hardware supports certain page sizes. Use the supported page sizes
@@ -953,6 +956,10 @@ struct bnxt {
#define BNXT_FLAG_PORT_STATS 0x400
#define BNXT_FLAG_UDP_RSS_CAP 0x800
#define BNXT_FLAG_EEE_CAP 0x1000
+ #define BNXT_FLAG_ROCEV1_CAP 0x8000
+ #define BNXT_FLAG_ROCEV2_CAP 0x10000
+ #define BNXT_FLAG_ROCE_CAP (BNXT_FLAG_ROCEV1_CAP | \
+ BNXT_FLAG_ROCEV2_CAP)
#define BNXT_FLAG_CHIP_NITRO_A0 0x1000000
#define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA | \
@@ -1234,6 +1241,8 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
int hwrm_send_message(struct bnxt *, void *, u32, int);
int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
int bnxt_hwrm_set_coal(struct bnxt *);
+unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
+unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
void bnxt_tx_disable(struct bnxt *bp);
void bnxt_tx_enable(struct bnxt *bp);
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v2 4/7] bnxt_en: Improve completion ring allocation for VFs.
From: Michael Chan @ 2016-12-07 5:26 UTC (permalink / raw)
To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>
All available remaining completion rings not used by the PF should be
made available for the VFs so that there are enough rings in the VF to
support RDMA. The earlier workaround code of capping the rings by the
statistics context is removed.
When SRIOV is disabled, call a new function bnxt_restore_pf_fw_resources()
to restore FW resources. Later on we need to add some logic to account
for RDMA resources.
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 +++++++-
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +-
drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 14 ++++----------
3 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 30b482b..52b8ad4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4152,7 +4152,7 @@ static int bnxt_hwrm_func_qcfg(struct bnxt *bp)
return rc;
}
-int bnxt_hwrm_func_qcaps(struct bnxt *bp)
+static int bnxt_hwrm_func_qcaps(struct bnxt *bp)
{
int rc = 0;
struct hwrm_func_qcaps_input req = {0};
@@ -6856,6 +6856,12 @@ static int bnxt_set_dflt_rings(struct bnxt *bp)
return rc;
}
+void bnxt_restore_pf_fw_resources(struct bnxt *bp)
+{
+ ASSERT_RTNL();
+ bnxt_hwrm_func_qcaps(bp);
+}
+
static void bnxt_parse_log_pcie_link(struct bnxt *bp)
{
enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 0ee2cc4..43a4b17 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1234,7 +1234,6 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
int hwrm_send_message(struct bnxt *, void *, u32, int);
int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
int bnxt_hwrm_set_coal(struct bnxt *);
-int bnxt_hwrm_func_qcaps(struct bnxt *);
void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
void bnxt_tx_disable(struct bnxt *bp);
void bnxt_tx_enable(struct bnxt *bp);
@@ -1245,4 +1244,5 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
int bnxt_close_nic(struct bnxt *, bool, bool);
int bnxt_setup_mq_tc(struct net_device *dev, u8 tc);
int bnxt_get_max_rings(struct bnxt *, int *, int *, bool);
+void bnxt_restore_pf_fw_resources(struct bnxt *bp);
#endif
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index bff626a..c696025 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -420,15 +420,7 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
/* Remaining rings are distributed equally amongs VF's for now */
- /* TODO: the following workaroud is needed to restrict total number
- * of vf_cp_rings not exceed number of HW ring groups. This WA should
- * be removed once new HWRM provides HW ring groups capability in
- * hwrm_func_qcap.
- */
- vf_cp_rings = min_t(u16, pf->max_cp_rings, pf->max_stat_ctxs);
- vf_cp_rings = (vf_cp_rings - bp->cp_nr_rings) / num_vfs;
- /* TODO: restore this logic below once the WA above is removed */
- /* vf_cp_rings = (pf->max_cp_rings - bp->cp_nr_rings) / num_vfs; */
+ vf_cp_rings = (pf->max_cp_rings - bp->cp_nr_rings) / num_vfs;
vf_stat_ctx = (pf->max_stat_ctxs - bp->num_stat_ctxs) / num_vfs;
if (bp->flags & BNXT_FLAG_AGG_RINGS)
vf_rx_rings = (pf->max_rx_rings - bp->rx_nr_rings * 2) /
@@ -590,7 +582,9 @@ void bnxt_sriov_disable(struct bnxt *bp)
bp->pf.active_vfs = 0;
/* Reclaim all resources for the PF. */
- bnxt_hwrm_func_qcaps(bp);
+ rtnl_lock();
+ bnxt_restore_pf_fw_resources(bp);
+ rtnl_unlock();
}
int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs)
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v2 3/7] bnxt_en: Move function reset to bnxt_init_one().
From: Michael Chan @ 2016-12-07 5:26 UTC (permalink / raw)
To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>
Now that MSIX is enabled in bnxt_init_one(), resources may be allocated by
the RDMA driver before the network device is opened. So we cannot do
function reset in bnxt_open() which will clear all the resources.
The proper place to do function reset now is in bnxt_init_one().
If we get AER, we'll do function reset as well.
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 25 ++++++-------------------
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 -
2 files changed, 6 insertions(+), 20 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index da302eb..30b482b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5613,22 +5613,7 @@ int bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
static int bnxt_open(struct net_device *dev)
{
struct bnxt *bp = netdev_priv(dev);
- int rc = 0;
- if (!test_bit(BNXT_STATE_FN_RST_DONE, &bp->state)) {
- rc = bnxt_hwrm_func_reset(bp);
- if (rc) {
- netdev_err(bp->dev, "hwrm chip reset failure rc: %x\n",
- rc);
- rc = -EBUSY;
- return rc;
- }
- /* Do func_reset during the 1st PF open only to prevent killing
- * the VFs when the PF is brought down and up.
- */
- if (BNXT_PF(bp))
- set_bit(BNXT_STATE_FN_RST_DONE, &bp->state);
- }
return __bnxt_open_nic(bp, true, true);
}
@@ -7028,6 +7013,10 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
if (rc)
goto init_err;
+ rc = bnxt_hwrm_func_reset(bp);
+ if (rc)
+ goto init_err;
+
rc = bnxt_init_int_mode(bp);
if (rc)
goto init_err;
@@ -7069,7 +7058,6 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev,
pci_channel_state_t state)
{
struct net_device *netdev = pci_get_drvdata(pdev);
- struct bnxt *bp = netdev_priv(netdev);
netdev_info(netdev, "PCI I/O error detected\n");
@@ -7084,8 +7072,6 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev,
if (netif_running(netdev))
bnxt_close(netdev);
- /* So that func_reset will be done during slot_reset */
- clear_bit(BNXT_STATE_FN_RST_DONE, &bp->state);
pci_disable_device(pdev);
rtnl_unlock();
@@ -7119,7 +7105,8 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev)
} else {
pci_set_master(pdev);
- if (netif_running(netdev))
+ err = bnxt_hwrm_func_reset(bp);
+ if (!err && netif_running(netdev))
err = bnxt_open(netdev);
if (!err)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 1461355..0ee2cc4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1021,7 +1021,6 @@ struct bnxt {
unsigned long state;
#define BNXT_STATE_OPEN 0
#define BNXT_STATE_IN_SP_TASK 1
-#define BNXT_STATE_FN_RST_DONE 2
struct bnxt_irq *irq_tbl;
int total_irqs;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v2 2/7] bnxt_en: Enable MSIX early in bnxt_init_one().
From: Michael Chan @ 2016-12-07 5:26 UTC (permalink / raw)
To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>
To better support the new RDMA driver, we need to move pci_enable_msix()
from bnxt_open() to bnxt_init_one(). This way, MSIX vectors are available
to the RDMA driver whether the network device is up or down.
Part of the existing bnxt_setup_int_mode() function is now refactored into
a new bnxt_init_int_mode(). bnxt_init_int_mode() is called during
bnxt_init_one() to enable MSIX. The remaining logic in
bnxt_setup_int_mode() to map the IRQs to the completion rings is called
during bnxt_open().
v2: Fixed compile warning when CONFIG_BNXT_SRIOV is not set.
Signed-off-by: Somnath Kotur <somnath.kotur@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 183 +++++++++++++++++++-----------
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 +
2 files changed, 115 insertions(+), 69 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 269e757..da302eb 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4743,6 +4743,80 @@ static int bnxt_trim_rings(struct bnxt *bp, int *rx, int *tx, int max,
return 0;
}
+static void bnxt_setup_msix(struct bnxt *bp)
+{
+ const int len = sizeof(bp->irq_tbl[0].name);
+ struct net_device *dev = bp->dev;
+ int tcs, i;
+
+ tcs = netdev_get_num_tc(dev);
+ if (tcs > 1) {
+ bp->tx_nr_rings_per_tc = bp->tx_nr_rings / tcs;
+ if (bp->tx_nr_rings_per_tc == 0) {
+ netdev_reset_tc(dev);
+ bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
+ } else {
+ int i, off, count;
+
+ bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tcs;
+ for (i = 0; i < tcs; i++) {
+ count = bp->tx_nr_rings_per_tc;
+ off = i * count;
+ netdev_set_tc_queue(dev, i, count, off);
+ }
+ }
+ }
+
+ for (i = 0; i < bp->cp_nr_rings; i++) {
+ char *attr;
+
+ if (bp->flags & BNXT_FLAG_SHARED_RINGS)
+ attr = "TxRx";
+ else if (i < bp->rx_nr_rings)
+ attr = "rx";
+ else
+ attr = "tx";
+
+ snprintf(bp->irq_tbl[i].name, len, "%s-%s-%d", dev->name, attr,
+ i);
+ bp->irq_tbl[i].handler = bnxt_msix;
+ }
+}
+
+static void bnxt_setup_inta(struct bnxt *bp)
+{
+ const int len = sizeof(bp->irq_tbl[0].name);
+
+ if (netdev_get_num_tc(bp->dev))
+ netdev_reset_tc(bp->dev);
+
+ snprintf(bp->irq_tbl[0].name, len, "%s-%s-%d", bp->dev->name, "TxRx",
+ 0);
+ bp->irq_tbl[0].handler = bnxt_inta;
+}
+
+static int bnxt_setup_int_mode(struct bnxt *bp)
+{
+ int rc;
+
+ if (bp->flags & BNXT_FLAG_USING_MSIX)
+ bnxt_setup_msix(bp);
+ else
+ bnxt_setup_inta(bp);
+
+ rc = bnxt_set_real_num_queues(bp);
+ return rc;
+}
+
+static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+ if (BNXT_VF(bp))
+ return bp->vf.max_irqs;
+#endif
+ return bp->pf.max_irqs;
+}
+
void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
{
#if defined(CONFIG_BNXT_SRIOV)
@@ -4753,16 +4827,12 @@ void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
bp->pf.max_irqs = max_irqs;
}
-static int bnxt_setup_msix(struct bnxt *bp)
+static int bnxt_init_msix(struct bnxt *bp)
{
- struct msix_entry *msix_ent;
- struct net_device *dev = bp->dev;
int i, total_vecs, rc = 0, min = 1;
- const int len = sizeof(bp->irq_tbl[0].name);
-
- bp->flags &= ~BNXT_FLAG_USING_MSIX;
- total_vecs = bp->cp_nr_rings;
+ struct msix_entry *msix_ent;
+ total_vecs = bnxt_get_max_func_irqs(bp);
msix_ent = kcalloc(total_vecs, sizeof(struct msix_entry), GFP_KERNEL);
if (!msix_ent)
return -ENOMEM;
@@ -4783,8 +4853,10 @@ static int bnxt_setup_msix(struct bnxt *bp)
bp->irq_tbl = kcalloc(total_vecs, sizeof(struct bnxt_irq), GFP_KERNEL);
if (bp->irq_tbl) {
- int tcs;
+ for (i = 0; i < total_vecs; i++)
+ bp->irq_tbl[i].vector = msix_ent[i].vector;
+ bp->total_irqs = total_vecs;
/* Trim rings based upon num of vectors allocated */
rc = bnxt_trim_rings(bp, &bp->rx_nr_rings, &bp->tx_nr_rings,
total_vecs, min == 1);
@@ -4792,43 +4864,10 @@ static int bnxt_setup_msix(struct bnxt *bp)
goto msix_setup_exit;
bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
- tcs = netdev_get_num_tc(dev);
- if (tcs > 1) {
- bp->tx_nr_rings_per_tc = bp->tx_nr_rings / tcs;
- if (bp->tx_nr_rings_per_tc == 0) {
- netdev_reset_tc(dev);
- bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
- } else {
- int i, off, count;
+ bp->cp_nr_rings = (min == 1) ?
+ max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) :
+ bp->tx_nr_rings + bp->rx_nr_rings;
- bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tcs;
- for (i = 0; i < tcs; i++) {
- count = bp->tx_nr_rings_per_tc;
- off = i * count;
- netdev_set_tc_queue(dev, i, count, off);
- }
- }
- }
- bp->cp_nr_rings = total_vecs;
-
- for (i = 0; i < bp->cp_nr_rings; i++) {
- char *attr;
-
- bp->irq_tbl[i].vector = msix_ent[i].vector;
- if (bp->flags & BNXT_FLAG_SHARED_RINGS)
- attr = "TxRx";
- else if (i < bp->rx_nr_rings)
- attr = "rx";
- else
- attr = "tx";
-
- snprintf(bp->irq_tbl[i].name, len,
- "%s-%s-%d", dev->name, attr, i);
- bp->irq_tbl[i].handler = bnxt_msix;
- }
- rc = bnxt_set_real_num_queues(bp);
- if (rc)
- goto msix_setup_exit;
} else {
rc = -ENOMEM;
goto msix_setup_exit;
@@ -4838,52 +4877,54 @@ static int bnxt_setup_msix(struct bnxt *bp)
return 0;
msix_setup_exit:
- netdev_err(bp->dev, "bnxt_setup_msix err: %x\n", rc);
+ netdev_err(bp->dev, "bnxt_init_msix err: %x\n", rc);
+ kfree(bp->irq_tbl);
+ bp->irq_tbl = NULL;
pci_disable_msix(bp->pdev);
kfree(msix_ent);
return rc;
}
-static int bnxt_setup_inta(struct bnxt *bp)
+static int bnxt_init_inta(struct bnxt *bp)
{
- int rc;
- const int len = sizeof(bp->irq_tbl[0].name);
-
- if (netdev_get_num_tc(bp->dev))
- netdev_reset_tc(bp->dev);
-
bp->irq_tbl = kcalloc(1, sizeof(struct bnxt_irq), GFP_KERNEL);
- if (!bp->irq_tbl) {
- rc = -ENOMEM;
- return rc;
- }
+ if (!bp->irq_tbl)
+ return -ENOMEM;
+
+ bp->total_irqs = 1;
bp->rx_nr_rings = 1;
bp->tx_nr_rings = 1;
bp->cp_nr_rings = 1;
bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
bp->flags |= BNXT_FLAG_SHARED_RINGS;
bp->irq_tbl[0].vector = bp->pdev->irq;
- snprintf(bp->irq_tbl[0].name, len,
- "%s-%s-%d", bp->dev->name, "TxRx", 0);
- bp->irq_tbl[0].handler = bnxt_inta;
- rc = bnxt_set_real_num_queues(bp);
- return rc;
+ return 0;
}
-static int bnxt_setup_int_mode(struct bnxt *bp)
+static int bnxt_init_int_mode(struct bnxt *bp)
{
int rc = 0;
if (bp->flags & BNXT_FLAG_MSIX_CAP)
- rc = bnxt_setup_msix(bp);
+ rc = bnxt_init_msix(bp);
if (!(bp->flags & BNXT_FLAG_USING_MSIX) && BNXT_PF(bp)) {
/* fallback to INTA */
- rc = bnxt_setup_inta(bp);
+ rc = bnxt_init_inta(bp);
}
return rc;
}
+static void bnxt_clear_int_mode(struct bnxt *bp)
+{
+ if (bp->flags & BNXT_FLAG_USING_MSIX)
+ pci_disable_msix(bp->pdev);
+
+ kfree(bp->irq_tbl);
+ bp->irq_tbl = NULL;
+ bp->flags &= ~BNXT_FLAG_USING_MSIX;
+}
+
static void bnxt_free_irq(struct bnxt *bp)
{
struct bnxt_irq *irq;
@@ -4902,10 +4943,6 @@ static void bnxt_free_irq(struct bnxt *bp)
free_irq(irq->vector, bp->bnapi[i]);
irq->requested = 0;
}
- if (bp->flags & BNXT_FLAG_USING_MSIX)
- pci_disable_msix(bp->pdev);
- kfree(bp->irq_tbl);
- bp->irq_tbl = NULL;
}
static int bnxt_request_irq(struct bnxt *bp)
@@ -6695,6 +6732,7 @@ static void bnxt_remove_one(struct pci_dev *pdev)
cancel_work_sync(&bp->sp_task);
bp->sp_event = 0;
+ bnxt_clear_int_mode(bp);
bnxt_hwrm_func_drv_unrgtr(bp);
bnxt_free_hwrm_resources(bp);
bnxt_dcb_free(bp);
@@ -6990,10 +7028,14 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
if (rc)
goto init_err;
- rc = register_netdev(dev);
+ rc = bnxt_init_int_mode(bp);
if (rc)
goto init_err;
+ rc = register_netdev(dev);
+ if (rc)
+ goto init_err_clr_int;
+
netdev_info(dev, "%s found at mem %lx, node addr %pM\n",
board_info[ent->driver_data].name,
(long)pci_resource_start(pdev, 0), dev->dev_addr);
@@ -7002,6 +7044,9 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
return 0;
+init_err_clr_int:
+ bnxt_clear_int_mode(bp);
+
init_err:
pci_iounmap(pdev, bp->bar0);
pci_release_regions(pdev);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 8327d0d..1461355 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1024,6 +1024,7 @@ struct bnxt {
#define BNXT_STATE_FN_RST_DONE 2
struct bnxt_irq *irq_tbl;
+ int total_irqs;
u8 mac_addr[ETH_ALEN];
#ifdef CONFIG_BNXT_DCB
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v2 1/7] bnxt_en: Add bnxt_set_max_func_irqs().
From: Michael Chan @ 2016-12-07 5:26 UTC (permalink / raw)
To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
In-Reply-To: <1481088381-30411-1-git-send-email-michael.chan@broadcom.com>
By refactoring existing code into this new function. The new function
will be used in subsequent patches.
v2: Fixed compile warning when CONFIG_BNXT_SRIOV is not set.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 17 +++++++++++------
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 +
2 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index e84613a..269e757 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4743,6 +4743,16 @@ static int bnxt_trim_rings(struct bnxt *bp, int *rx, int *tx, int max,
return 0;
}
+void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+ if (BNXT_VF(bp))
+ bp->vf.max_irqs = max_irqs;
+ else
+#endif
+ bp->pf.max_irqs = max_irqs;
+}
+
static int bnxt_setup_msix(struct bnxt *bp)
{
struct msix_entry *msix_ent;
@@ -6949,12 +6959,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
bnxt_set_tpa_flags(bp);
bnxt_set_ring_params(bp);
- if (BNXT_PF(bp))
- bp->pf.max_irqs = max_irqs;
-#if defined(CONFIG_BNXT_SRIOV)
- else
- bp->vf.max_irqs = max_irqs;
-#endif
+ bnxt_set_max_func_irqs(bp, max_irqs);
bnxt_set_dflt_rings(bp);
/* Default RSS hash cfg. */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index b4abc1b..8327d0d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1235,6 +1235,7 @@ static inline void bnxt_disable_poll(struct bnxt_napi *bnapi)
int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
int bnxt_hwrm_set_coal(struct bnxt *);
int bnxt_hwrm_func_qcaps(struct bnxt *);
+void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
void bnxt_tx_disable(struct bnxt *bp);
void bnxt_tx_enable(struct bnxt *bp);
int bnxt_hwrm_set_pause(struct bnxt *);
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v2 0/7] bnxt_en: Add interface to support RDMA driver.
From: Michael Chan @ 2016-12-07 5:26 UTC (permalink / raw)
To: davem; +Cc: netdev, selvin.xavier, somnath.kotur, dledford, linux-rdma
This series adds an interface to support a brand new RDMA driver bnxt_re.
The first step is to re-arrange some code so that pci_enable_msix() can
be called during pci probe. The purpose is to allow the RDMA driver to
initialize and stay initialized whether the netdev is up or down.
Then we make some changes to VF resource allocation so that there is
enough resources to support RDMA.
Finally the last patch adds a simple interface to allow the RDMA driver to
probe and register itself with any bnxt_en devices that support RDMA.
Once registered, the RDMA driver can request MSIX, send fw messages, and
receive some notifications.
v2: Fixed kbuild test robot warnings.
David, please consider this series for net-next. Thanks.
Michael Chan (7):
bnxt_en: Add bnxt_set_max_func_irqs().
bnxt_en: Enable MSIX early in bnxt_init_one().
bnxt_en: Move function reset to bnxt_init_one().
bnxt_en: Improve completion ring allocation for VFs.
bnxt_en: Reserve RDMA resources by default.
bnxt_en: Refactor the driver registration function with firmware.
bnxt_en: Add interface to support RDMA driver.
drivers/net/ethernet/broadcom/bnxt/Makefile | 2 +-
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 360 +++++++++++++++++-------
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 22 +-
drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 14 +-
drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 346 +++++++++++++++++++++++
drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 93 ++++++
6 files changed, 722 insertions(+), 115 deletions(-)
create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h
--
1.8.3.1
^ permalink raw reply
* Re: [PATCH] net: wireless: realtek: constify rate_control_ops structures
From: Jes Sorensen @ 2016-12-07 5:07 UTC (permalink / raw)
To: Larry Finger
Cc: Bhumika Goyal, julia.lawall, chaoming_li, kvalo, linux-wireless,
netdev, linux-kernel
In-Reply-To: <0f223291-734f-7658-57b7-18e962d15823@lwfinger.net>
Larry Finger <Larry.Finger@lwfinger.net> writes:
> On 12/02/2016 03:50 AM, Bhumika Goyal wrote:
>> The structures rate_control_ops are only passed as an argument to the
>> functions ieee80211_rate_control_{register/unregister}. This argument is
>> of type const, so rate_control_ops having this property can also be
>> declared as const.
>> Done using Coccinelle:
>>
>> @r1 disable optional_qualifier @
>> identifier i;
>> position p;
>> @@
>> static struct rate_control_ops i@p = {...};
>>
>> @ok1@
>> identifier r1.i;
>> position p;
>> @@
>> ieee80211_rate_control_register(&i@p)
>>
>> @ok2@
>> identifier r1.i;
>> position p;
>> @@
>> ieee80211_rate_control_unregister(&i@p)
>>
>> @bad@
>> position p!={r1.p,ok1.p,ok2.p};
>> identifier r1.i;
>> @@
>> i@p
>>
>> @depends on !bad disable optional_qualifier@
>> identifier r1.i;
>> @@
>> static
>> +const
>> struct rate_control_ops i={...};
>>
>> @depends on !bad disable optional_qualifier@
>> identifier r1.i;
>> @@
>> +const
>> struct rate_control_ops i;
>>
>> File size before:
>> text data bss dec hex filename
>> 1991 104 0 2095 82f wireless/realtek/rtlwifi/rc.o
>>
>> File size after:
>> text data bss dec hex filename
>> 2095 0 0 2095 wireless/realtek/rtlwifi/rc.o
>>
[snip]
> The content of your patch is OK; however, your subject is not. By
> convention, "net: wireless: realtek:" is assumed. We do, however,
> include "rtlwifi:" to indicate which part of
> drivers/net/wireless/realtek/ is referenced.
In addition, the first part of the description is useful and the file
size information is reasonable too, but ~20 lines of coccinelle scripts
in the commit message is rather pointless.
Jes
^ permalink raw reply
* Re: [PATCH] net: return value of skb_linearize should be handled in Linux kernel
From: Cong Wang @ 2016-12-07 5:02 UTC (permalink / raw)
To: Zhouyi Zhou
Cc: faisal.latif-ral2JQCrhuEAvxtiuMwx3w,
dledford-H+wXaHxf7aLQT0dZR+AlfA,
sean.hefty-ral2JQCrhuEAvxtiuMwx3w,
hal.rosenstock-Re5JQEeQqe8AvxtiuMwx3w, Jeff Kirsher,
QLogic-Storage-Upstream-h88ZbnxC6KDQT0dZR+AlfA,
jejb-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8, Martin K. Petersen,
jth-DgEjT+Ai2ygdnm+yROfE0A, jon.maloy-IzeFyvvaP7pWk0Htik3J/w,
ying.xue-CWA4WttNNZF54TAoqtyWWQ, David Miller,
linux-rdma-u79uwXL29TY76Z2rM5mHXA, LKML, intel-wired-lan,
Linux Kernel Network Developers,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
fcoe-devel-s9riP+hp16TNLxjTenLetw,
tipc-discussion-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <1481008233-16777-1-git-send-email-zhouzhouyi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
On Mon, Dec 5, 2016 at 11:10 PM, Zhouyi Zhou <zhouzhouyi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
> index 2a653ec..ab787cb 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
> @@ -490,7 +490,11 @@ int ixgbe_fcoe_ddp(struct ixgbe_adapter *adapter,
> */
> if ((fh->fh_r_ctl == FC_RCTL_DD_SOL_DATA) &&
> (fctl & FC_FC_END_SEQ)) {
> - skb_linearize(skb);
> + int err = 0;
> +
> + err = skb_linearize(skb);
> + if (err)
> + return err;
You can reuse 'rc' instead of adding 'err'.
> crc = (struct fcoe_crc_eof *)skb_put(skb, sizeof(*crc));
> crc->fcoe_eof = FC_EOF_T;
> }
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index fee1f29..4926d48 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -2173,8 +2173,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
> total_rx_bytes += ddp_bytes;
> total_rx_packets += DIV_ROUND_UP(ddp_bytes,
> mss);
> - }
> - if (!ddp_bytes) {
> + } else {
> dev_kfree_skb_any(skb);
> continue;
> }
This piece doesn't seem to be related.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH net] phy: Don't increment MDIO bus refcount unless it's a different owner
From: Florian Fainelli @ 2016-12-07 4:54 UTC (permalink / raw)
To: netdev; +Cc: johan, rmk+kernel, andrew, Florian Fainelli
Commit 3e3aaf649416 ("phy: fix mdiobus module safety") fixed the way we
dealt with MDIO bus module reference count, but sort of introduced a
regression in that, if an Ethernet driver registers its own MDIO bus
driver, as is common, we will end up with the Ethernet driver's
module->refnct set to 1, thus preventing this driver from any removal.
Fix this by comparing the network device's device driver owner against
the MDIO bus driver owner, and only if they are different, increment the
MDIO bus module refcount.
Fixes: 3e3aaf649416 ("phy: fix mdiobus module safety")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
Russell,
I verified this against the ethoc driver primarily (on a TS7300 board)
and bcmgenet.
Thanks!
drivers/net/phy/phy_device.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 1a4bf8acad78..c4ceb082e970 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -857,11 +857,17 @@ EXPORT_SYMBOL(phy_attached_print);
int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
u32 flags, phy_interface_t interface)
{
+ struct module *ndev_owner = dev->dev.parent->driver->owner;
struct mii_bus *bus = phydev->mdio.bus;
struct device *d = &phydev->mdio.dev;
int err;
- if (!try_module_get(bus->owner)) {
+ /* For Ethernet device drivers that register their own MDIO bus, we
+ * will have bus->owner match ndev_mod, so we do not want to increment
+ * our own module->refcnt here, otherwise we would not be able to
+ * unload later on.
+ */
+ if (ndev_owner != bus->owner && !try_module_get(bus->owner)) {
dev_err(&dev->dev, "failed to get the bus module\n");
return -EIO;
}
@@ -921,7 +927,8 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
error:
put_device(d);
- module_put(bus->owner);
+ if (ndev_owner != bus->owner)
+ module_put(bus->owner);
return err;
}
EXPORT_SYMBOL(phy_attach_direct);
@@ -971,6 +978,8 @@ EXPORT_SYMBOL(phy_attach);
*/
void phy_detach(struct phy_device *phydev)
{
+ struct net_device *dev = phydev->attached_dev;
+ struct module *ndev_owner = dev->dev.parent->driver->owner;
struct mii_bus *bus;
int i;
@@ -998,7 +1007,8 @@ void phy_detach(struct phy_device *phydev)
bus = phydev->mdio.bus;
put_device(&phydev->mdio.dev);
- module_put(bus->owner);
+ if (ndev_owner != bus->owner)
+ module_put(bus->owner);
}
EXPORT_SYMBOL(phy_detach);
--
2.9.3
^ permalink raw reply related
* Re: [PATCH 09/10] vsock/virtio: fix src/dst cid format
From: Jason Wang @ 2016-12-07 4:31 UTC (permalink / raw)
To: Michael S. Tsirkin, linux-kernel
Cc: stable, Stefan Hajnoczi, David S. Miller, kvm, virtualization,
netdev
In-Reply-To: <1481038106-24899-10-git-send-email-mst@redhat.com>
On 2016年12月06日 23:41, Michael S. Tsirkin wrote:
> These fields are 64 bit, using le32_to_cpu and friends
> on these will not do the right thing.
> Fix this up.
>
> Cc: stable@vger.kernel.org
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> net/vmw_vsock/virtio_transport_common.c | 14 +++++++-------
> 1 file changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index 6120384..22e99c4 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -606,9 +606,9 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
> return 0;
>
> pkt = virtio_transport_alloc_pkt(&info, 0,
> - le32_to_cpu(pkt->hdr.dst_cid),
> + le64_to_cpu(pkt->hdr.dst_cid),
> le32_to_cpu(pkt->hdr.dst_port),
> - le32_to_cpu(pkt->hdr.src_cid),
> + le64_to_cpu(pkt->hdr.src_cid),
> le32_to_cpu(pkt->hdr.src_port));
Looking at sockaddr_vm, svm_cid is "unsigned int", do we really want 64
bit here?
> if (!pkt)
> return -ENOMEM;
> @@ -823,7 +823,7 @@ virtio_transport_send_response(struct vsock_sock *vsk,
> struct virtio_vsock_pkt_info info = {
> .op = VIRTIO_VSOCK_OP_RESPONSE,
> .type = VIRTIO_VSOCK_TYPE_STREAM,
> - .remote_cid = le32_to_cpu(pkt->hdr.src_cid),
> + .remote_cid = le64_to_cpu(pkt->hdr.src_cid),
> .remote_port = le32_to_cpu(pkt->hdr.src_port),
> .reply = true,
> };
> @@ -863,9 +863,9 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
> child->sk_state = SS_CONNECTED;
>
> vchild = vsock_sk(child);
> - vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid),
> + vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid),
> le32_to_cpu(pkt->hdr.dst_port));
> - vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid),
> + vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid),
> le32_to_cpu(pkt->hdr.src_port));
>
> vsock_insert_connected(vchild);
> @@ -904,9 +904,9 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
> struct sock *sk;
> bool space_available;
>
> - vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid),
> + vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid),
> le32_to_cpu(pkt->hdr.src_port));
> - vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid),
> + vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid),
> le32_to_cpu(pkt->hdr.dst_port));
>
> trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,
^ permalink raw reply
* [PATCH net-next 1/1] driver: macvlan: Remove the rcu member of macvlan_port
From: fgao @ 2016-12-07 4:23 UTC (permalink / raw)
To: davem, kaber, netdev, gfree.wind; +Cc: Gao Feng
From: Gao Feng <fgao@ikuai8.com>
When free macvlan_port in macvlan_port_destroy, it is safe to free
directly because netdev_rx_handler_unregister could enforce one
grace period.
So it is unnecessary to use kfree_rcu for macvlan_port.
Signed-off-by: Gao Feng <fgao@ikuai8.com>
---
drivers/net/macvlan.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 3c0a171..20b3fdf2 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -43,7 +43,6 @@ struct macvlan_port {
struct net_device *dev;
struct hlist_head vlan_hash[MACVLAN_HASH_SIZE];
struct list_head vlans;
- struct rcu_head rcu;
struct sk_buff_head bc_queue;
struct work_struct bc_work;
bool passthru;
@@ -1151,7 +1150,7 @@ static void macvlan_port_destroy(struct net_device *dev)
cancel_work_sync(&port->bc_work);
__skb_queue_purge(&port->bc_queue);
- kfree_rcu(port, rcu);
+ kfree(port);
}
static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[])
--
1.9.1
^ permalink raw reply related
* Re: [PATCH 08/10] vsock/virtio: mark an internal function static
From: Jason Wang @ 2016-12-07 4:21 UTC (permalink / raw)
To: Michael S. Tsirkin, linux-kernel
Cc: netdev, virtualization, David S. Miller, Stefan Hajnoczi, kvm
In-Reply-To: <1481038106-24899-9-git-send-email-mst@redhat.com>
On 2016年12月06日 23:41, Michael S. Tsirkin wrote:
> virtio_transport_alloc_pkt is only used locally, make it static.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> net/vmw_vsock/virtio_transport_common.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> index a53b3a1..6120384 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -32,7 +32,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
> return container_of(t, struct virtio_transport, transport);
> }
>
> -struct virtio_vsock_pkt *
> +static struct virtio_vsock_pkt *
> virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
> size_t len,
> u32 src_cid,
Git grep shows it was used by tracing.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH 07/10] vsock/virtio: add a missing __le annotation
From: Jason Wang @ 2016-12-07 4:19 UTC (permalink / raw)
To: Michael S. Tsirkin, linux-kernel
Cc: netdev, virtualization, David S. Miller, Stefan Hajnoczi, kvm
In-Reply-To: <1481038106-24899-8-git-send-email-mst@redhat.com>
On 2016年12月06日 23:40, Michael S. Tsirkin wrote:
> guest cid is read from config space, therefore it's in little endian
> format and is treated as such, annotate it accordingly.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> net/vmw_vsock/virtio_transport.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
> index 936d7ee..90096b9 100644
> --- a/net/vmw_vsock/virtio_transport.c
> +++ b/net/vmw_vsock/virtio_transport.c
> @@ -336,7 +336,7 @@ static void virtio_vsock_reset_sock(struct sock *sk)
> static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock)
> {
> struct virtio_device *vdev = vsock->vdev;
> - u64 guest_cid;
> + __le64 guest_cid;
>
> vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid),
> &guest_cid, sizeof(guest_cid));
Reviewed-by: Jason Wang <jasowang@redhat.com>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH 06/10] vhost: add missing __user annotations
From: Jason Wang @ 2016-12-07 4:17 UTC (permalink / raw)
To: Michael S. Tsirkin, linux-kernel; +Cc: netdev, kvm, virtualization
In-Reply-To: <1481038106-24899-7-git-send-email-mst@redhat.com>
On 2016年12月06日 23:40, Michael S. Tsirkin wrote:
> Several vhost functions were missing __user annotations
> on pointers, causing sparse warnings. Fix this up.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/vhost/vhost.c | 10 +++++-----
> 1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 7331ef3..ba7db68 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -719,7 +719,7 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
> static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
> struct iovec iov[], int iov_size, int access);
>
> -static int vhost_copy_to_user(struct vhost_virtqueue *vq, void *to,
> +static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
> const void *from, unsigned size)
> {
> int ret;
> @@ -749,7 +749,7 @@ static int vhost_copy_to_user(struct vhost_virtqueue *vq, void *to,
> }
>
> static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
> - void *from, unsigned size)
> + void __user *from, unsigned size)
> {
> int ret;
>
> @@ -783,7 +783,7 @@ static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
> }
>
> static void __user *__vhost_get_user(struct vhost_virtqueue *vq,
> - void *addr, unsigned size)
> + void __user *addr, unsigned size)
> {
> int ret;
>
> @@ -934,8 +934,8 @@ static int umem_access_ok(u64 uaddr, u64 size, int access)
> return 0;
> }
>
> -int vhost_process_iotlb_msg(struct vhost_dev *dev,
> - struct vhost_iotlb_msg *msg)
> +static int vhost_process_iotlb_msg(struct vhost_dev *dev,
> + struct vhost_iotlb_msg *msg)
> {
> int ret = 0;
>
Patch looks good but this looks like another static conversion not
__user annotations.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH 05/10] vhost: make interval tree static inline
From: Jason Wang @ 2016-12-07 4:16 UTC (permalink / raw)
To: Michael S. Tsirkin, linux-kernel; +Cc: netdev, kvm, virtualization
In-Reply-To: <1481038106-24899-6-git-send-email-mst@redhat.com>
On 2016年12月06日 23:40, Michael S. Tsirkin wrote:
> vhost_umem_interval_tree is only used locally within vhost.c, mark it
> static. As some functions generated go unused, this triggers warnings
> unless we also mark it inline.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/vhost/vhost.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index c6f2d89..7331ef3 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -49,7 +49,7 @@ enum {
>
> INTERVAL_TREE_DEFINE(struct vhost_umem_node,
> rb, __u64, __subtree_last,
> - START, LAST, , vhost_umem_interval_tree);
> + START, LAST, static inline, vhost_umem_interval_tree);
>
> #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
> static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
Reviewed-by: Jason Wang <jasowang@redhat.com>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* linux-next: manual merge of the staging tree with the net-next tree
From: Stephen Rothwell @ 2016-12-07 4:04 UTC (permalink / raw)
To: Greg KH, David Miller, Networking; +Cc: linux-next, linux-kernel, Jarod Wilson
Hi Greg,
Today's linux-next merge of the staging tree got a conflict in:
drivers/staging/slicoss/slicoss.c
between commit:
a52ad514fdf3 ("net: deprecate eth_change_mtu, remove usage")
from the net-next tree and commit:
0af72df267f2 ("staging: slicoss: remove the staging driver")
from the staging tree.
I fixed it up (I just removed the file) and can carry the fix as
necessary. This is now fixed as far as linux-next is concerned, but any
non trivial conflicts should be mentioned to your upstream maintainer
when your tree is submitted for merging. You may also want to consider
cooperating with the maintainer of the conflicting tree to minimise any
particularly complex conflicts.
--
Cheers,
Stephen Rothwell
^ permalink raw reply
* [net-next][PATCH v2 10/18] RDS: IB: track and log active side endpoint in connection
From: Santosh Shilimkar @ 2016-12-07 4:01 UTC (permalink / raw)
To: netdev, davem; +Cc: linux-kernel, santosh.shilimkar
In-Reply-To: <1481083316-11648-1-git-send-email-santosh.shilimkar@oracle.com>
Useful to know the active and passive end points in a
RDS IB connection.
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
net/rds/ib.h | 3 +++
net/rds/ib_cm.c | 11 +++++++----
2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/net/rds/ib.h b/net/rds/ib.h
index f14c26d..97e7696 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -181,6 +181,9 @@ struct rds_ib_connection {
/* Batched completions */
unsigned int i_unsignaled_wrs;
+
+ /* Endpoint role in connection */
+ int i_active_side;
};
/* This assumes that atomic_t is at least 32 bits */
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 3002acf..4d1bf04 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -120,16 +120,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_conn_destroy(conn);
return;
} else {
- pr_notice("RDS/IB: connected <%pI4,%pI4> version %u.%u%s\n",
+ pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
+ ic->i_active_side ? "Active" : "Passive",
&conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version),
ic->i_flowctl ? ", flow control" : "");
}
- /*
- * Init rings and fill recv. this needs to wait until protocol negotiation
- * is complete, since ring layout is different from 3.0 to 3.1.
+ /* Init rings and fill recv. this needs to wait until protocol
+ * negotiation is complete, since ring layout is different
+ * from 3.1 to 4.1.
*/
rds_ib_send_init_ring(ic);
rds_ib_recv_init_ring(ic);
@@ -685,6 +686,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
if (ic->i_cm_id == cm_id)
ret = 0;
}
+ ic->i_active_side = true;
return ret;
}
@@ -859,6 +861,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
ic->i_sends = NULL;
vfree(ic->i_recvs);
ic->i_recvs = NULL;
+ ic->i_active_side = false;
}
int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
--
1.9.1
^ permalink raw reply related
* [net-next][PATCH v2 08/18] RDS: IB: split the mr registration and invalidation path
From: Santosh Shilimkar @ 2016-12-07 4:01 UTC (permalink / raw)
To: netdev, davem; +Cc: linux-kernel, santosh.shilimkar
In-Reply-To: <1481083316-11648-1-git-send-email-santosh.shilimkar@oracle.com>
MR invalidation in RDS is done in background thread and not in
data path like registration. So break the dependency between them
which helps to remove the performance bottleneck.
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
net/rds/ib.h | 4 +++-
net/rds/ib_cm.c | 9 +++++++--
net/rds/ib_frmr.c | 11 ++++++-----
3 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/net/rds/ib.h b/net/rds/ib.h
index f4e8121..f14c26d 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -14,7 +14,8 @@
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
-#define RDS_IB_DEFAULT_FR_WR 512
+#define RDS_IB_DEFAULT_FR_WR 256
+#define RDS_IB_DEFAULT_FR_INV_WR 256
#define RDS_IB_DEFAULT_RETRY_COUNT 1
@@ -125,6 +126,7 @@ struct rds_ib_connection {
/* To control the number of wrs from fastreg */
atomic_t i_fastreg_wrs;
+ atomic_t i_fastunreg_wrs;
/* interrupt handling */
struct tasklet_struct i_send_tasklet;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index b9da1e5..3002acf 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -382,7 +382,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
* completion queue and send queue. This extra space is used for FRMR
* registration and invalidation work requests
*/
- fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+ fr_queue_space = rds_ibdev->use_fastreg ?
+ (RDS_IB_DEFAULT_FR_WR + 1) +
+ (RDS_IB_DEFAULT_FR_INV_WR + 1)
+ : 0;
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
@@ -444,6 +447,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
attr.send_cq = ic->i_send_cq;
attr.recv_cq = ic->i_recv_cq;
atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
+ atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR);
/*
* XXX this can fail if max_*_wr is too large? Are we supposed
@@ -766,7 +770,8 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
wait_event(rds_ib_ring_empty_wait,
rds_ib_ring_empty(&ic->i_recv_ring) &&
(atomic_read(&ic->i_signaled_sends) == 0) &&
- (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
+ (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) &&
+ (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR));
tasklet_kill(&ic->i_send_tasklet);
tasklet_kill(&ic->i_recv_tasklet);
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 66b3d62..48332a6 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -241,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
if (frmr->fr_state != FRMR_IS_INUSE)
goto out;
- while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
- atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) {
+ atomic_inc(&ibmr->ic->i_fastunreg_wrs);
cpu_relax();
}
@@ -261,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
if (unlikely(ret)) {
frmr->fr_state = FRMR_IS_STALE;
frmr->fr_inv = false;
- atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ atomic_inc(&ibmr->ic->i_fastunreg_wrs);
pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
goto out;
}
@@ -289,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
if (frmr->fr_inv) {
frmr->fr_state = FRMR_IS_FREE;
frmr->fr_inv = false;
+ atomic_inc(&ic->i_fastreg_wrs);
+ } else {
+ atomic_inc(&ic->i_fastunreg_wrs);
}
-
- atomic_inc(&ic->i_fastreg_wrs);
}
void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
--
1.9.1
^ permalink raw reply related
* [net-next][PATCH v2 18/18] RDS: IB: add missing connection cache usage info
From: Santosh Shilimkar @ 2016-12-07 4:01 UTC (permalink / raw)
To: netdev, davem; +Cc: linux-kernel, santosh.shilimkar
In-Reply-To: <1481083316-11648-1-git-send-email-santosh.shilimkar@oracle.com>
rds-tools already support it.
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
include/uapi/linux/rds.h | 1 +
net/rds/ib.c | 1 +
2 files changed, 2 insertions(+)
diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 3833113..410ae3c 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -183,6 +183,7 @@ struct rds_info_rdma_connection {
uint32_t max_send_sge;
uint32_t rdma_mr_max;
uint32_t rdma_mr_size;
+ uint32_t cache_allocs;
};
/* RDS message Receive Path Latency points */
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 8d70884..b5e2699 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -313,6 +313,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
iinfo->max_send_wr = ic->i_send_ring.w_nr;
iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo->max_send_sge = rds_ibdev->max_sge;
+ iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs);
rds_ib_get_mr_info(rds_ibdev, iinfo);
}
return 1;
--
1.9.1
^ permalink raw reply related
* [net-next][PATCH v2 17/18] RDS: add receive message trace used by application
From: Santosh Shilimkar @ 2016-12-07 4:01 UTC (permalink / raw)
To: netdev, davem; +Cc: linux-kernel, santosh.shilimkar
In-Reply-To: <1481083316-11648-1-git-send-email-santosh.shilimkar@oracle.com>
Socket option to tap receive path latency in various stages
in nano seconds. It can be enabled on selective sockets using
using SO_RDS_MSG_RXPATH_LATENCY socket option. RDS will return
the data to application with RDS_CMSG_RXPATH_LATENCY in defined
format. Scope is left to add more trace points for future
without need of change in the interface.
Reviewed-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
include/uapi/linux/rds.h | 33 +++++++++++++++++++++++++++++++++
net/rds/af_rds.c | 28 ++++++++++++++++++++++++++++
net/rds/ib_recv.c | 4 ++++
net/rds/rds.h | 10 ++++++++++
net/rds/recv.c | 32 +++++++++++++++++++++++++++++---
net/rds/tcp_recv.c | 5 +++++
6 files changed, 109 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 0f9265c..3833113 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -52,6 +52,13 @@
#define RDS_GET_MR_FOR_DEST 7
#define SO_RDS_TRANSPORT 8
+/* Socket option to tap receive path latency
+ * SO_RDS: SO_RDS_MSG_RXPATH_LATENCY
+ * Format used struct rds_rx_trace_so
+ */
+#define SO_RDS_MSG_RXPATH_LATENCY 10
+
+
/* supported values for SO_RDS_TRANSPORT */
#define RDS_TRANS_IB 0
#define RDS_TRANS_IWARP 1
@@ -77,6 +84,12 @@
* the same as for the GET_MR setsockopt.
* RDS_CMSG_RDMA_STATUS (recvmsg)
* Returns the status of a completed RDMA operation.
+ * RDS_CMSG_RXPATH_LATENCY(recvmsg)
+ * Returns rds message latencies in various stages of receive
+ * path in nS. Its set per socket using SO_RDS_MSG_RXPATH_LATENCY
+ * socket option. Legitimate points are defined in
+ * enum rds_message_rxpath_latency. More points can be added in
+ * future. CSMG format is struct rds_cmsg_rx_trace.
*/
#define RDS_CMSG_RDMA_ARGS 1
#define RDS_CMSG_RDMA_DEST 2
@@ -87,6 +100,7 @@
#define RDS_CMSG_ATOMIC_CSWP 7
#define RDS_CMSG_MASKED_ATOMIC_FADD 8
#define RDS_CMSG_MASKED_ATOMIC_CSWP 9
+#define RDS_CMSG_RXPATH_LATENCY 11
#define RDS_INFO_FIRST 10000
#define RDS_INFO_COUNTERS 10000
@@ -171,6 +185,25 @@ struct rds_info_rdma_connection {
uint32_t rdma_mr_size;
};
+/* RDS message Receive Path Latency points */
+enum rds_message_rxpath_latency {
+ RDS_MSG_RX_HDR_TO_DGRAM_START = 0,
+ RDS_MSG_RX_DGRAM_REASSEMBLE,
+ RDS_MSG_RX_DGRAM_DELIVERED,
+ RDS_MSG_RX_DGRAM_TRACE_MAX
+};
+
+struct rds_rx_trace_so {
+ u8 rx_traces;
+ u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX];
+};
+
+struct rds_cmsg_rx_trace {
+ u8 rx_traces;
+ u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX];
+ u64 rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
+};
+
/*
* Congestion monitoring.
* Congestion control in RDS happens at the host connection
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 2ac1e61..fd821740 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -298,6 +298,30 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
return 0;
}
+static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
+ int optlen)
+{
+ struct rds_rx_trace_so trace;
+ int i;
+
+ if (optlen != sizeof(struct rds_rx_trace_so))
+ return -EFAULT;
+
+ if (copy_from_user(&trace, optval, sizeof(trace)))
+ return -EFAULT;
+
+ rs->rs_rx_traces = trace.rx_traces;
+ for (i = 0; i < rs->rs_rx_traces; i++) {
+ if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
+ rs->rs_rx_traces = 0;
+ return -EFAULT;
+ }
+ rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
+ }
+
+ return 0;
+}
+
static int rds_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
@@ -338,6 +362,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
release_sock(sock->sk);
break;
+ case SO_RDS_MSG_RXPATH_LATENCY:
+ ret = rds_recv_track_latency(rs, optval, optlen);
+ break;
default:
ret = -ENOPROTOOPT;
}
@@ -484,6 +511,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
INIT_LIST_HEAD(&rs->rs_cong_list);
spin_lock_init(&rs->rs_rdma_lock);
rs->rs_rdma_keys = RB_ROOT;
+ rs->rs_rx_traces = 0;
spin_lock_bh(&rds_sock_lock);
list_add_tail(&rs->rs_item, &rds_sock_list);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 4b0f126..e10624a 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -911,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
ic->i_ibinc = ibinc;
hdr = &ibinc->ii_inc.i_hdr;
+ ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+ local_clock();
memcpy(hdr, ihdr, sizeof(*hdr));
ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+ ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+ local_clock();
rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
ic->i_recv_data_rem, hdr->h_flags);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index f713194..07fff73 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -253,6 +253,11 @@ struct rds_ext_header_rdma_dest {
#define RDS_EXTHDR_GEN_NUM 6
#define __RDS_EXTHDR_MAX 16 /* for now */
+#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
+#define RDS_MSG_RX_HDR 0
+#define RDS_MSG_RX_START 1
+#define RDS_MSG_RX_END 2
+#define RDS_MSG_RX_CMSG 3
struct rds_incoming {
atomic_t i_refcount;
@@ -265,6 +270,7 @@ struct rds_incoming {
rds_rdma_cookie_t i_rdma_cookie;
struct timeval i_rx_tstamp;
+ u64 i_rx_lat_trace[RDS_RX_MAX_TRACES];
};
struct rds_mr {
@@ -575,6 +581,10 @@ struct rds_sock {
unsigned char rs_recverr,
rs_cong_monitor;
u32 rs_hash_initval;
+
+ /* Socket receive path trace points*/
+ u8 rs_rx_traces;
+ u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
};
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
diff --git a/net/rds/recv.c b/net/rds/recv.c
index ba19eee..8b7e7b7 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -43,6 +43,8 @@
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr)
{
+ int i;
+
atomic_set(&inc->i_refcount, 1);
INIT_LIST_HEAD(&inc->i_item);
inc->i_conn = conn;
@@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
inc->i_rdma_cookie = 0;
inc->i_rx_tstamp.tv_sec = 0;
inc->i_rx_tstamp.tv_usec = 0;
+
+ for (i = 0; i < RDS_RX_MAX_TRACES; i++)
+ inc->i_rx_lat_trace[i] = 0;
}
EXPORT_SYMBOL_GPL(rds_inc_init);
@@ -373,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
if (sock_flag(sk, SOCK_RCVTSTAMP))
do_gettimeofday(&inc->i_rx_tstamp);
rds_inc_addref(inc);
+ inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
list_add_tail(&inc->i_item, &rs->rs_recv_queue);
__rds_wake_sk_sleep(sk);
} else {
@@ -534,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
if (ret)
- return ret;
+ goto out;
}
if ((inc->i_rx_tstamp.tv_sec != 0) &&
@@ -543,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
sizeof(struct timeval),
&inc->i_rx_tstamp);
if (ret)
- return ret;
+ goto out;
}
- return 0;
+ if (rs->rs_rx_traces) {
+ struct rds_cmsg_rx_trace t;
+ int i, j;
+
+ inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
+ t.rx_traces = rs->rs_rx_traces;
+ for (i = 0; i < rs->rs_rx_traces; i++) {
+ j = rs->rs_rx_trace[i];
+ t.rx_trace_pos[i] = j;
+ t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] -
+ inc->i_rx_lat_trace[j];
+ }
+
+ ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY,
+ sizeof(t), &t);
+ if (ret)
+ goto out;
+ }
+
+out:
+ return ret;
}
int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index ad4892e..e006ef8 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
rdsdebug("alloced tinc %p\n", tinc);
rds_inc_path_init(&tinc->ti_inc, cp,
cp->cp_conn->c_faddr);
+ tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
+ local_clock();
+
/*
* XXX * we might be able to use the __ variants when
* we've already serialized at a higher level.
@@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
/* could be 0 for a 0 len message */
tc->t_tinc_data_rem =
be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
+ tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
+ local_clock();
}
}
--
1.9.1
^ permalink raw reply related
* [net-next][PATCH v2 16/18] RDS: make message size limit compliant with spec
From: Santosh Shilimkar @ 2016-12-07 4:01 UTC (permalink / raw)
To: netdev, davem; +Cc: linux-kernel, santosh.shilimkar
In-Reply-To: <1481083316-11648-1-git-send-email-santosh.shilimkar@oracle.com>
From: Avinash Repaka <avinash.repaka@oracle.com>
RDS support max message size as 1M but the code doesn't check this
in all cases. Patch fixes it for RDMA & non-RDMA and RDS MR size
and its enforced irrespective of underlying transport.
Signed-off-by: Avinash Repaka <avinash.repaka@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
net/rds/rdma.c | 9 ++++++++-
net/rds/rds.h | 3 +++
net/rds/send.c | 31 +++++++++++++++++++++++++++++++
3 files changed, 42 insertions(+), 1 deletion(-)
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index dd508e0..6e8db33 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -40,7 +40,6 @@
/*
* XXX
* - build with sparse
- * - should we limit the size of a mr region? let transport return failure?
* - should we detect duplicate keys on a socket? hmm.
* - an rdma is an mlock, apply rlimit?
*/
@@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
goto out;
}
+ /* Restrict the size of mr irrespective of underlying transport
+ * To account for unaligned mr regions, subtract one from nr_pages
+ */
+ if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
args->vec.addr, args->vec.bytes, nr_pages);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 8ccd5a9..f713194 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...)
#define RDS_FRAG_SHIFT 12
#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
+/* Used to limit both RDMA and non-RDMA RDS message to 1MB */
+#define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20))
+
#define RDS_CONG_MAP_BYTES (65536 / 8)
#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
diff --git a/net/rds/send.c b/net/rds/send.c
index 45e025b..5cc6403 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -994,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
return hash;
}
+static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
+{
+ struct rds_rdma_args *args;
+ struct cmsghdr *cmsg;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_RDS)
+ continue;
+
+ if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
+ args = CMSG_DATA(cmsg);
+ *rdma_bytes += args->remote_vec.bytes;
+ }
+ }
+ return 0;
+}
+
int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
{
struct sock *sk = sock->sk;
@@ -1008,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
int nonblock = msg->msg_flags & MSG_DONTWAIT;
long timeo = sock_sndtimeo(sk, nonblock);
struct rds_conn_path *cpath;
+ size_t total_payload_len = payload_len, rdma_payload_len = 0;
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
@@ -1040,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
}
release_sock(sk);
+ ret = rds_rdma_bytes(msg, &rdma_payload_len);
+ if (ret)
+ goto out;
+
+ total_payload_len += rdma_payload_len;
+ if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
if (payload_len > rds_sk_sndbuf(rs)) {
ret = -EMSGSIZE;
goto out;
--
1.9.1
^ permalink raw reply related
* [net-next][PATCH v2 15/18] RDS: add stat for socket recv memory usage
From: Santosh Shilimkar @ 2016-12-07 4:01 UTC (permalink / raw)
To: netdev, davem; +Cc: linux-kernel, santosh.shilimkar
In-Reply-To: <1481083316-11648-1-git-send-email-santosh.shilimkar@oracle.com>
From: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Tracks the receive side memory added to scokets and removed from sockets.
Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
net/rds/rds.h | 3 +++
net/rds/recv.c | 4 ++++
2 files changed, 7 insertions(+)
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0bb8213..8ccd5a9 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -631,6 +631,9 @@ struct rds_statistics {
uint64_t s_cong_update_received;
uint64_t s_cong_send_error;
uint64_t s_cong_send_blocked;
+ uint64_t s_recv_bytes_added_to_socket;
+ uint64_t s_recv_bytes_removed_from_socket;
+
};
/* af_rds.c */
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 9d0666e..ba19eee 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -94,6 +94,10 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
return;
rs->rs_rcv_bytes += delta;
+ if (delta > 0)
+ rds_stats_add(s_recv_bytes_added_to_socket, delta);
+ else
+ rds_stats_add(s_recv_bytes_removed_from_socket, -delta);
now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
--
1.9.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox