* [PATCH net-next 03/14] bnxt_en: Improve the IRQ disable sequence during shutdown.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
The IRQ is disabled by writing to the completion ring doorbell. This
should be done before the hardware completion ring is freed for correctness.
The current code disables IRQs after all the completion rings are freed.
Fix it by calling bnxt_disable_int_sync() before freeing the completion
rings. Rearrange the code to avoid forward declaration.
Signed-off-by: Michael Chan <michael.chan@broadocm.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 89 ++++++++++++++++---------------
1 file changed, 46 insertions(+), 43 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 3fbc842..277573b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2953,6 +2953,45 @@ static int bnxt_alloc_mem(struct bnxt *bp, bool irq_re_init)
return rc;
}
+static void bnxt_disable_int(struct bnxt *bp)
+{
+ int i;
+
+ if (!bp->bnapi)
+ return;
+
+ for (i = 0; i < bp->cp_nr_rings; i++) {
+ struct bnxt_napi *bnapi = bp->bnapi[i];
+ struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
+
+ BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
+ }
+}
+
+static void bnxt_disable_int_sync(struct bnxt *bp)
+{
+ int i;
+
+ atomic_inc(&bp->intr_sem);
+
+ bnxt_disable_int(bp);
+ for (i = 0; i < bp->cp_nr_rings; i++)
+ synchronize_irq(bp->irq_tbl[i].vector);
+}
+
+static void bnxt_enable_int(struct bnxt *bp)
+{
+ int i;
+
+ atomic_set(&bp->intr_sem, 0);
+ for (i = 0; i < bp->cp_nr_rings; i++) {
+ struct bnxt_napi *bnapi = bp->bnapi[i];
+ struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
+
+ BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons);
+ }
+}
+
void bnxt_hwrm_cmd_hdr_init(struct bnxt *bp, void *request, u16 req_type,
u16 cmpl_ring, u16 target_id)
{
@@ -3937,6 +3976,12 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path)
}
}
+ /* The completion rings are about to be freed. After that the
+ * IRQ doorbell will not work anymore. So we need to disable
+ * IRQ here.
+ */
+ bnxt_disable_int_sync(bp);
+
for (i = 0; i < bp->cp_nr_rings; i++) {
struct bnxt_napi *bnapi = bp->bnapi[i];
struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
@@ -4658,34 +4703,6 @@ static int bnxt_init_nic(struct bnxt *bp, bool irq_re_init)
return bnxt_init_chip(bp, irq_re_init);
}
-static void bnxt_disable_int(struct bnxt *bp)
-{
- int i;
-
- if (!bp->bnapi)
- return;
-
- for (i = 0; i < bp->cp_nr_rings; i++) {
- struct bnxt_napi *bnapi = bp->bnapi[i];
- struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
-
- BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
- }
-}
-
-static void bnxt_enable_int(struct bnxt *bp)
-{
- int i;
-
- atomic_set(&bp->intr_sem, 0);
- for (i = 0; i < bp->cp_nr_rings; i++) {
- struct bnxt_napi *bnapi = bp->bnapi[i];
- struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
-
- BNXT_CP_DB_REARM(cpr->cp_doorbell, cpr->cp_raw_cons);
- }
-}
-
static int bnxt_set_real_num_queues(struct bnxt *bp)
{
int rc;
@@ -5640,19 +5657,6 @@ static int bnxt_open(struct net_device *dev)
return __bnxt_open_nic(bp, true, true);
}
-static void bnxt_disable_int_sync(struct bnxt *bp)
-{
- int i;
-
- atomic_inc(&bp->intr_sem);
- if (!netif_running(bp->dev))
- return;
-
- bnxt_disable_int(bp);
- for (i = 0; i < bp->cp_nr_rings; i++)
- synchronize_irq(bp->irq_tbl[i].vector);
-}
-
int bnxt_close_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
{
int rc = 0;
@@ -5674,13 +5678,12 @@ int bnxt_close_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
while (test_bit(BNXT_STATE_IN_SP_TASK, &bp->state))
msleep(20);
- /* Flush rings before disabling interrupts */
+ /* Flush rings and and disable interrupts */
bnxt_shutdown_nic(bp, irq_re_init);
/* TODO CHIMP_FW: Link/PHY related cleanup if (link_re_init) */
bnxt_disable_napi(bp);
- bnxt_disable_int_sync(bp);
del_timer_sync(&bp->timer);
bnxt_free_skbs(bp);
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 04/14] bnxt_en: Fix and clarify link_info->advertising.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
The advertising field is closely related to the auto_link_speeds field.
The former is the user setting while the latter is the firmware setting.
Both should be u16. We should use the advertising field in
bnxt_get_link_ksettings because the auto_link_speeds field may not
be updated with the latest from the firmware yet.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +-
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 ++--
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 5 +++--
3 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 277573b..4a8059f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5363,7 +5363,7 @@ static void bnxt_hwrm_set_link_common(struct bnxt *bp,
{
u8 autoneg = bp->link_info.autoneg;
u16 fw_link_speed = bp->link_info.req_link_speed;
- u32 advertising = bp->link_info.advertising;
+ u16 advertising = bp->link_info.advertising;
if (autoneg & BNXT_AUTONEG_SPEED) {
req->auto_mode |=
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index fddc316..0eb6401 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -828,7 +828,7 @@ struct bnxt_link_info {
#define BNXT_LINK_SPEED_40GB PORT_PHY_QCFG_RESP_LINK_SPEED_40GB
#define BNXT_LINK_SPEED_50GB PORT_PHY_QCFG_RESP_LINK_SPEED_50GB
u16 support_speeds;
- u16 auto_link_speeds;
+ u16 auto_link_speeds; /* fw adv setting */
#define BNXT_LINK_SPEED_MSK_100MB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100MB
#define BNXT_LINK_SPEED_MSK_1GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_1GB
#define BNXT_LINK_SPEED_MSK_2GB PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2GB
@@ -851,7 +851,7 @@ struct bnxt_link_info {
u8 req_duplex;
u8 req_flow_ctrl;
u16 req_link_speed;
- u32 advertising;
+ u16 advertising; /* user adv setting */
bool force_link_chng;
/* a copy of phy_qcfg output used to report link
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 784aa77..1cfa7a6 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -893,7 +893,7 @@ u32 _bnxt_fw_to_ethtool_adv_spds(u16 fw_speeds, u8 fw_pause)
static void bnxt_fw_to_ethtool_advertised_spds(struct bnxt_link_info *link_info,
struct ethtool_link_ksettings *lk_ksettings)
{
- u16 fw_speeds = link_info->auto_link_speeds;
+ u16 fw_speeds = link_info->advertising;
u8 fw_pause = 0;
if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)
@@ -1090,8 +1090,9 @@ static int bnxt_set_link_ksettings(struct net_device *dev,
struct bnxt *bp = netdev_priv(dev);
struct bnxt_link_info *link_info = &bp->link_info;
const struct ethtool_link_settings *base = &lk_ksettings->base;
- u32 speed, fw_advertising = 0;
bool set_pause = false;
+ u16 fw_advertising = 0;
+ u32 speed;
int rc = 0;
if (!BNXT_SINGLE_PF(bp))
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 05/14] bnxt_en: Refactor TPA code path.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
Call tcp_gro_complete() in the common code path instead of the chip-
specific method. The newer 5731x method is missing the call.
Signed-off-by: Michael Chan <michael.chan@broadcmo.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 4a8059f..0654c3f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1127,7 +1127,6 @@ static struct sk_buff *bnxt_gro_func_5730x(struct bnxt_tpa_info *tpa_info,
dev_kfree_skb_any(skb);
return NULL;
}
- tcp_gro_complete(skb);
if (nw_off) { /* tunnel */
struct udphdr *uh = NULL;
@@ -1177,6 +1176,8 @@ static inline struct sk_buff *bnxt_gro_skb(struct bnxt *bp,
RX_TPA_END_CMP_PAYLOAD_OFFSET) >>
RX_TPA_END_CMP_PAYLOAD_OFFSET_SHIFT;
skb = bp->gro_func(tpa_info, payload_off, TPA_END_GRO_TS(tpa_end), skb);
+ if (likely(skb))
+ tcp_gro_complete(skb);
#endif
return skb;
}
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 06/14] bnxt_en: Add function to get vnic capability.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
The new vnic RSS capability will enhance NTUPLE support, to be added
in subsequent patches.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 22 +++++++++++++++++
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 +
drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 34 +++++++++++++++++++++++++++
3 files changed, 57 insertions(+)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0654c3f..9168326 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3665,6 +3665,27 @@ static int bnxt_hwrm_vnic_alloc(struct bnxt *bp, u16 vnic_id,
return rc;
}
+static int bnxt_hwrm_vnic_qcaps(struct bnxt *bp)
+{
+ struct hwrm_vnic_qcaps_output *resp = bp->hwrm_cmd_resp_addr;
+ struct hwrm_vnic_qcaps_input req = {0};
+ int rc;
+
+ if (bp->hwrm_spec_code < 0x10600)
+ return 0;
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_VNIC_QCAPS, -1, -1);
+ mutex_lock(&bp->hwrm_cmd_lock);
+ rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ if (!rc) {
+ if (resp->flags &
+ cpu_to_le32(VNIC_QCAPS_RESP_FLAGS_RSS_DFLT_CR_CAP))
+ bp->flags |= BNXT_FLAG_NEW_RSS_CAP;
+ }
+ mutex_unlock(&bp->hwrm_cmd_lock);
+ return rc;
+}
+
static int bnxt_hwrm_ring_grp_alloc(struct bnxt *bp)
{
u16 i;
@@ -7070,6 +7091,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6;
}
+ bnxt_hwrm_vnic_qcaps(bp);
if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp)) {
dev->hw_features |= NETIF_F_NTUPLE;
if (bnxt_rfs_capable(bp)) {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 0eb6401..80bf1ab 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -944,6 +944,7 @@ struct bnxt {
#define BNXT_FLAG_PORT_STATS 0x400
#define BNXT_FLAG_UDP_RSS_CAP 0x800
#define BNXT_FLAG_EEE_CAP 0x1000
+ #define BNXT_FLAG_NEW_RSS_CAP 0x2000
#define BNXT_FLAG_ROCEV1_CAP 0x8000
#define BNXT_FLAG_ROCEV2_CAP 0x10000
#define BNXT_FLAG_ROCE_CAP (BNXT_FLAG_ROCEV1_CAP | \
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
index 2ddfa51..d0d49ed 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
@@ -2797,6 +2797,40 @@ struct hwrm_vnic_cfg_output {
u8 valid;
};
+/* hwrm_vnic_qcaps */
+/* Input (24 bytes) */
+struct hwrm_vnic_qcaps_input {
+ __le16 req_type;
+ __le16 cmpl_ring;
+ __le16 seq_id;
+ __le16 target_id;
+ __le64 resp_addr;
+ __le32 enables;
+ __le32 unused_0;
+};
+
+/* Output (24 bytes) */
+struct hwrm_vnic_qcaps_output {
+ __le16 error_code;
+ __le16 req_type;
+ __le16 seq_id;
+ __le16 resp_len;
+ __le16 mru;
+ u8 unused_0;
+ u8 unused_1;
+ __le32 flags;
+ #define VNIC_QCAPS_RESP_FLAGS_VLAN_STRIP_CAP 0x2UL
+ #define VNIC_QCAPS_RESP_FLAGS_BD_STALL_CAP 0x4UL
+ #define VNIC_QCAPS_RESP_FLAGS_ROCE_DUAL_VNIC_CAP 0x8UL
+ #define VNIC_QCAPS_RESP_FLAGS_ROCE_ONLY_VNIC_CAP 0x10UL
+ #define VNIC_QCAPS_RESP_FLAGS_RSS_DFLT_CR_CAP 0x20UL
+ __le32 unused_2;
+ u8 unused_3;
+ u8 unused_4;
+ u8 unused_5;
+ u8 valid;
+};
+
/* hwrm_vnic_tpa_cfg */
/* Input (40 bytes) */
struct hwrm_vnic_tpa_cfg_input {
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 07/14] bnxt_en: Refactor code that determines RFS capability.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
Add function bnxt_rfs_supported() that determines if the chip supports
RFS. Refactor the existing function bnxt_rfs_capable() that determines
if run-time conditions support RFS.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 38 +++++++++++++++++++++++++++----
1 file changed, 33 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 9168326..f7ea99f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4835,6 +4835,24 @@ static int bnxt_setup_int_mode(struct bnxt *bp)
return rc;
}
+static unsigned int bnxt_get_max_func_rss_ctxs(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+ if (BNXT_VF(bp))
+ return bp->vf.max_rsscos_ctxs;
+#endif
+ return bp->pf.max_rsscos_ctxs;
+}
+
+static unsigned int bnxt_get_max_func_vnics(struct bnxt *bp)
+{
+#if defined(CONFIG_BNXT_SRIOV)
+ if (BNXT_VF(bp))
+ return bp->vf.max_vnics;
+#endif
+ return bp->pf.max_vnics;
+}
+
unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp)
{
#if defined(CONFIG_BNXT_SRIOV)
@@ -5962,20 +5980,30 @@ static int bnxt_cfg_rx_mode(struct bnxt *bp)
return rc;
}
+/* If the chip and firmware supports RFS */
+static bool bnxt_rfs_supported(struct bnxt *bp)
+{
+ if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp))
+ return true;
+ return false;
+}
+
+/* If runtime conditions support RFS */
static bool bnxt_rfs_capable(struct bnxt *bp)
{
#ifdef CONFIG_RFS_ACCEL
- struct bnxt_pf_info *pf = &bp->pf;
- int vnics;
+ int vnics, max_vnics, max_rss_ctxs;
if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_MSIX_CAP))
return false;
vnics = 1 + bp->rx_nr_rings;
- if (vnics > pf->max_rsscos_ctxs || vnics > pf->max_vnics) {
+ max_vnics = bnxt_get_max_func_vnics(bp);
+ max_rss_ctxs = bnxt_get_max_func_rss_ctxs(bp);
+ if (vnics > max_vnics || vnics > max_rss_ctxs) {
netdev_warn(bp->dev,
"Not enough resources to support NTUPLE filters, enough resources for up to %d rx rings\n",
- min(pf->max_rsscos_ctxs - 1, pf->max_vnics - 1));
+ min(max_rss_ctxs - 1, max_vnics - 1));
return false;
}
@@ -7092,7 +7120,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
}
bnxt_hwrm_vnic_qcaps(bp);
- if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp)) {
+ if (bnxt_rfs_supported(bp)) {
dev->hw_features |= NETIF_F_NTUPLE;
if (bnxt_rfs_capable(bp)) {
bp->flags |= BNXT_FLAG_RFS;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 08/14] bnxt_en: Add new hardware RFS mode.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
The existing hardware RFS mode uses one hardware RSS context block
per ring just to calculate the RSS hash. This is very wasteful and
prevents VF functions from using it. The new hardware mode shares
the same hardware RSS context for RSS placement and RFS steering.
This allows VFs to enable RFS.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 27 ++++++++++++++++++++++++++-
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 +
2 files changed, 27 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index f7ea99f..0ca530e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2630,6 +2630,10 @@ static int bnxt_alloc_vnic_attributes(struct bnxt *bp)
goto out;
}
+ if ((bp->flags & BNXT_FLAG_NEW_RSS_CAP) &&
+ !(vnic->flags & BNXT_VNIC_RSS_FLAG))
+ continue;
+
/* Allocate rss table and hash key */
vnic->rss_table = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
&vnic->rss_table_dma_addr,
@@ -3562,6 +3566,12 @@ int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
req.rss_rule = cpu_to_le16(vnic->fw_rss_cos_lb_ctx[0]);
req.enables |= cpu_to_le32(VNIC_CFG_REQ_ENABLES_RSS_RULE |
VNIC_CFG_REQ_ENABLES_MRU);
+ } else if (vnic->flags & BNXT_VNIC_RFS_NEW_RSS_FLAG) {
+ req.rss_rule =
+ cpu_to_le16(bp->vnic_info[0].fw_rss_cos_lb_ctx[0]);
+ req.enables |= cpu_to_le32(VNIC_CFG_REQ_ENABLES_RSS_RULE |
+ VNIC_CFG_REQ_ENABLES_MRU);
+ req.flags |= cpu_to_le32(VNIC_CFG_REQ_FLAGS_RSS_DFLT_CR_MODE);
} else {
req.rss_rule = cpu_to_le16(0xffff);
}
@@ -4490,8 +4500,12 @@ static void bnxt_hwrm_resource_free(struct bnxt *bp, bool close_path,
static int bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id)
{
+ struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id];
int rc;
+ if (vnic->flags & BNXT_VNIC_RFS_NEW_RSS_FLAG)
+ goto skip_rss_ctx;
+
/* allocate context for vnic */
rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic_id, 0);
if (rc) {
@@ -4511,6 +4525,7 @@ static int bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id)
bp->rsscos_nr_ctxs++;
}
+skip_rss_ctx:
/* configure default vnic, ring grp */
rc = bnxt_hwrm_vnic_cfg(bp, vnic_id);
if (rc) {
@@ -4545,13 +4560,17 @@ static int bnxt_alloc_rfs_vnics(struct bnxt *bp)
int i, rc = 0;
for (i = 0; i < bp->rx_nr_rings; i++) {
+ struct bnxt_vnic_info *vnic;
u16 vnic_id = i + 1;
u16 ring_id = i;
if (vnic_id >= bp->nr_vnics)
break;
- bp->vnic_info[vnic_id].flags |= BNXT_VNIC_RFS_FLAG;
+ vnic = &bp->vnic_info[vnic_id];
+ vnic->flags |= BNXT_VNIC_RFS_FLAG;
+ if (bp->flags & BNXT_FLAG_NEW_RSS_CAP)
+ vnic->flags |= BNXT_VNIC_RFS_NEW_RSS_FLAG;
rc = bnxt_hwrm_vnic_alloc(bp, vnic_id, ring_id, 1);
if (rc) {
netdev_err(bp->dev, "hwrm vnic %d alloc failure rc: %x\n",
@@ -5985,6 +6004,8 @@ static bool bnxt_rfs_supported(struct bnxt *bp)
{
if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp))
return true;
+ if (bp->flags & BNXT_FLAG_NEW_RSS_CAP)
+ return true;
return false;
}
@@ -6000,6 +6021,10 @@ static bool bnxt_rfs_capable(struct bnxt *bp)
vnics = 1 + bp->rx_nr_rings;
max_vnics = bnxt_get_max_func_vnics(bp);
max_rss_ctxs = bnxt_get_max_func_rss_ctxs(bp);
+
+ /* RSS contexts not a limiting factor */
+ if (bp->flags & BNXT_FLAG_NEW_RSS_CAP)
+ max_rss_ctxs = max_vnics;
if (vnics > max_vnics || vnics > max_rss_ctxs) {
netdev_warn(bp->dev,
"Not enough resources to support NTUPLE filters, enough resources for up to %d rx rings\n",
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 80bf1ab..75803e5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -708,6 +708,7 @@ struct bnxt_vnic_info {
#define BNXT_VNIC_RFS_FLAG 2
#define BNXT_VNIC_MCAST_FLAG 4
#define BNXT_VNIC_UCAST_FLAG 8
+#define BNXT_VNIC_RFS_NEW_RSS_FLAG 0x10
};
#if defined(CONFIG_BNXT_SRIOV)
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 10/14] bnxt_en: Add IPV6 hardware RFS support.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
Accept ipv6 flows in .ndo_rx_flow_steer() and support ETHTOOL_GRXCLSRULE
ipv6 flows.
Signed-off-by: Michael Chan <michael.chan@broadocm.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 32 +++++++++++---
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 53 +++++++++++++++++------
2 files changed, 66 insertions(+), 19 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0ca530e..4d478e7 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3316,10 +3316,26 @@ static int bnxt_hwrm_cfa_ntuple_filter_alloc(struct bnxt *bp,
req.ip_addr_type = CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4;
req.ip_protocol = keys->basic.ip_proto;
- req.src_ipaddr[0] = keys->addrs.v4addrs.src;
- req.src_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
- req.dst_ipaddr[0] = keys->addrs.v4addrs.dst;
- req.dst_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+ if (keys->basic.n_proto == htons(ETH_P_IPV6)) {
+ int i;
+
+ req.ethertype = htons(ETH_P_IPV6);
+ req.ip_addr_type =
+ CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6;
+ *(struct in6_addr *)&req.src_ipaddr[0] =
+ keys->addrs.v6addrs.src;
+ *(struct in6_addr *)&req.dst_ipaddr[0] =
+ keys->addrs.v6addrs.dst;
+ for (i = 0; i < 4; i++) {
+ req.src_ipaddr_mask[i] = cpu_to_be32(0xffffffff);
+ req.dst_ipaddr_mask[i] = cpu_to_be32(0xffffffff);
+ }
+ } else {
+ req.src_ipaddr[0] = keys->addrs.v4addrs.src;
+ req.src_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+ req.dst_ipaddr[0] = keys->addrs.v4addrs.dst;
+ req.dst_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+ }
req.src_port = keys->ports.src;
req.src_port_mask = cpu_to_be16(0xffff);
@@ -6588,12 +6604,18 @@ static int bnxt_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
goto err_free;
}
- if ((fkeys->basic.n_proto != htons(ETH_P_IP)) ||
+ if ((fkeys->basic.n_proto != htons(ETH_P_IP) &&
+ fkeys->basic.n_proto != htons(ETH_P_IPV6)) ||
((fkeys->basic.ip_proto != IPPROTO_TCP) &&
(fkeys->basic.ip_proto != IPPROTO_UDP))) {
rc = -EPROTONOSUPPORT;
goto err_free;
}
+ if (fkeys->basic.n_proto == htons(ETH_P_IPV6) &&
+ bp->hwrm_spec_code < 0x10601) {
+ rc = -EPROTONOSUPPORT;
+ goto err_free;
+ }
memcpy(new_fltr->dst_mac_addr, eth->h_dest, ETH_ALEN);
memcpy(new_fltr->src_mac_addr, eth->h_source, ETH_ALEN);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 1cfa7a6..e6b1196 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -524,24 +524,49 @@ static int bnxt_grxclsrule(struct bnxt *bp, struct ethtool_rxnfc *cmd)
fltr_found:
fkeys = &fltr->fkeys;
- if (fkeys->basic.ip_proto == IPPROTO_TCP)
- fs->flow_type = TCP_V4_FLOW;
- else if (fkeys->basic.ip_proto == IPPROTO_UDP)
- fs->flow_type = UDP_V4_FLOW;
- else
- goto fltr_err;
+ if (fkeys->basic.n_proto == htons(ETH_P_IP)) {
+ if (fkeys->basic.ip_proto == IPPROTO_TCP)
+ fs->flow_type = TCP_V4_FLOW;
+ else if (fkeys->basic.ip_proto == IPPROTO_UDP)
+ fs->flow_type = UDP_V4_FLOW;
+ else
+ goto fltr_err;
+
+ fs->h_u.tcp_ip4_spec.ip4src = fkeys->addrs.v4addrs.src;
+ fs->m_u.tcp_ip4_spec.ip4src = cpu_to_be32(~0);
+
+ fs->h_u.tcp_ip4_spec.ip4dst = fkeys->addrs.v4addrs.dst;
+ fs->m_u.tcp_ip4_spec.ip4dst = cpu_to_be32(~0);
- fs->h_u.tcp_ip4_spec.ip4src = fkeys->addrs.v4addrs.src;
- fs->m_u.tcp_ip4_spec.ip4src = cpu_to_be32(~0);
+ fs->h_u.tcp_ip4_spec.psrc = fkeys->ports.src;
+ fs->m_u.tcp_ip4_spec.psrc = cpu_to_be16(~0);
- fs->h_u.tcp_ip4_spec.ip4dst = fkeys->addrs.v4addrs.dst;
- fs->m_u.tcp_ip4_spec.ip4dst = cpu_to_be32(~0);
+ fs->h_u.tcp_ip4_spec.pdst = fkeys->ports.dst;
+ fs->m_u.tcp_ip4_spec.pdst = cpu_to_be16(~0);
+ } else {
+ int i;
- fs->h_u.tcp_ip4_spec.psrc = fkeys->ports.src;
- fs->m_u.tcp_ip4_spec.psrc = cpu_to_be16(~0);
+ if (fkeys->basic.ip_proto == IPPROTO_TCP)
+ fs->flow_type = TCP_V6_FLOW;
+ else if (fkeys->basic.ip_proto == IPPROTO_UDP)
+ fs->flow_type = UDP_V6_FLOW;
+ else
+ goto fltr_err;
+
+ *(struct in6_addr *)&fs->h_u.tcp_ip6_spec.ip6src[0] =
+ fkeys->addrs.v6addrs.src;
+ *(struct in6_addr *)&fs->h_u.tcp_ip6_spec.ip6dst[0] =
+ fkeys->addrs.v6addrs.dst;
+ for (i = 0; i < 4; i++) {
+ fs->m_u.tcp_ip6_spec.ip6src[i] = cpu_to_be32(~0);
+ fs->m_u.tcp_ip6_spec.ip6dst[i] = cpu_to_be32(~0);
+ }
+ fs->h_u.tcp_ip6_spec.psrc = fkeys->ports.src;
+ fs->m_u.tcp_ip6_spec.psrc = cpu_to_be16(~0);
- fs->h_u.tcp_ip4_spec.pdst = fkeys->ports.dst;
- fs->m_u.tcp_ip4_spec.pdst = cpu_to_be16(~0);
+ fs->h_u.tcp_ip6_spec.pdst = fkeys->ports.dst;
+ fs->m_u.tcp_ip6_spec.pdst = cpu_to_be16(~0);
+ }
fs->ring_cookie = fltr->rxq;
rc = 0;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 11/14] bnxt_en: Implement new scheme to reserve tx rings.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
In order to properly support TX rate limiting in SRIOV VF functions or
NPAR functions, firmware needs better control over tx ring allocations.
The new scheme requires the driver to reserve the number of tx rings
and to query to see if the requested number of tx rings is reserved.
The driver will use the new scheme when the firmware interface spec is
1.6.1 or newer.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 59 ++++++++++++++++++++++-
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 15 ++++++
drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 10 +++-
4 files changed, 83 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 4d478e7..338dbd0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4045,6 +4045,50 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path)
}
}
+/* Caller must hold bp->hwrm_cmd_lock */
+int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings)
+{
+ struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr;
+ struct hwrm_func_qcfg_input req = {0};
+ int rc;
+
+ if (bp->hwrm_spec_code < 0x10601)
+ return 0;
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1);
+ req.fid = cpu_to_le16(fid);
+ rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ if (!rc)
+ *tx_rings = le16_to_cpu(resp->alloc_tx_rings);
+
+ return rc;
+}
+
+int bnxt_hwrm_reserve_tx_rings(struct bnxt *bp, int *tx_rings)
+{
+ struct hwrm_func_cfg_input req = {0};
+ int rc;
+
+ if (bp->hwrm_spec_code < 0x10601)
+ return 0;
+
+ if (BNXT_VF(bp))
+ return 0;
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
+ req.fid = cpu_to_le16(0xffff);
+ req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS);
+ req.num_tx_rings = cpu_to_le16(*tx_rings);
+ rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ if (rc)
+ return rc;
+
+ mutex_lock(&bp->hwrm_cmd_lock);
+ rc = __bnxt_hwrm_get_tx_rings(bp, 0xffff, tx_rings);
+ mutex_unlock(&bp->hwrm_cmd_lock);
+ return rc;
+}
+
static void bnxt_hwrm_set_coal_params(struct bnxt *bp, u32 max_bufs,
u32 buf_tmrs, u16 flags,
struct hwrm_ring_cmpl_ring_cfg_aggint_params_input *req)
@@ -6509,10 +6553,16 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc)
sh = true;
if (tc) {
- int max_rx_rings, max_tx_rings, rc;
+ int max_rx_rings, max_tx_rings, req_tx_rings, rsv_tx_rings, rc;
+ req_tx_rings = bp->tx_nr_rings_per_tc * tc;
rc = bnxt_get_max_rings(bp, &max_rx_rings, &max_tx_rings, sh);
- if (rc || bp->tx_nr_rings_per_tc * tc > max_tx_rings)
+ if (rc || req_tx_rings > max_tx_rings)
+ return -ENOMEM;
+
+ rsv_tx_rings = req_tx_rings;
+ if (bnxt_hwrm_reserve_tx_rings(bp, &rsv_tx_rings) ||
+ rsv_tx_rings < req_tx_rings)
return -ENOMEM;
}
@@ -7000,6 +7050,11 @@ static int bnxt_set_dflt_rings(struct bnxt *bp)
return rc;
bp->rx_nr_rings = min_t(int, dflt_rings, max_rx_rings);
bp->tx_nr_rings_per_tc = min_t(int, dflt_rings, max_tx_rings);
+
+ rc = bnxt_hwrm_reserve_tx_rings(bp, &bp->tx_nr_rings_per_tc);
+ if (rc)
+ netdev_warn(bp->dev, "Unable to reserve tx rings\n");
+
bp->tx_nr_rings = bp->tx_nr_rings_per_tc;
bp->cp_nr_rings = sh ? max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) :
bp->tx_nr_rings + bp->rx_nr_rings;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 75803e5..d174729d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1149,6 +1149,8 @@ struct bnxt {
int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
int bmap_size);
int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id);
+int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings);
+int bnxt_hwrm_reserve_tx_rings(struct bnxt *bp, int *tx_rings);
int bnxt_hwrm_set_coal(struct bnxt *);
unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index e6b1196..dd21be4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -388,6 +388,7 @@ static int bnxt_set_channels(struct net_device *dev,
{
struct bnxt *bp = netdev_priv(dev);
int max_rx_rings, max_tx_rings, tcs;
+ int req_tx_rings, rsv_tx_rings;
u32 rc = 0;
bool sh = false;
@@ -423,6 +424,20 @@ static int bnxt_set_channels(struct net_device *dev,
channel->tx_count > max_tx_rings))
return -ENOMEM;
+ req_tx_rings = sh ? channel->combined_count : channel->tx_count;
+ req_tx_rings = min_t(int, req_tx_rings, max_tx_rings);
+ if (tcs > 1)
+ req_tx_rings *= tcs;
+
+ rsv_tx_rings = req_tx_rings;
+ if (bnxt_hwrm_reserve_tx_rings(bp, &rsv_tx_rings))
+ return -ENOMEM;
+
+ if (rsv_tx_rings < req_tx_rings) {
+ netdev_warn(dev, "Unable to allocate the requested tx rings\n");
+ return -ENOMEM;
+ }
+
if (netif_running(dev)) {
if (BNXT_PF(bp)) {
/* TODO CHIMP_FW: Send message to all VF's
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index 0c9f6c1..64ef0e5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -416,6 +416,7 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
u16 vf_ring_grps;
struct hwrm_func_cfg_input req = {0};
struct bnxt_pf_info *pf = &bp->pf;
+ int total_vf_tx_rings = 0;
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
@@ -460,6 +461,8 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
mutex_lock(&bp->hwrm_cmd_lock);
for (i = 0; i < num_vfs; i++) {
+ int vf_tx_rsvd = vf_tx_rings;
+
req.fid = cpu_to_le16(pf->first_vf_id + i);
rc = _hwrm_send_message(bp, &req, sizeof(req),
HWRM_CMD_TIMEOUT);
@@ -467,10 +470,15 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
break;
pf->active_vfs = i + 1;
pf->vf[i].fw_fid = le16_to_cpu(req.fid);
+ rc = __bnxt_hwrm_get_tx_rings(bp, pf->vf[i].fw_fid,
+ &vf_tx_rsvd);
+ if (rc)
+ break;
+ total_vf_tx_rings += vf_tx_rsvd;
}
mutex_unlock(&bp->hwrm_cmd_lock);
if (!rc) {
- pf->max_tx_rings -= vf_tx_rings * num_vfs;
+ pf->max_tx_rings -= total_vf_tx_rings;
pf->max_rx_rings -= vf_rx_rings * num_vfs;
pf->max_hw_ring_grps -= vf_ring_grps * num_vfs;
pf->max_cp_rings -= vf_cp_rings * num_vfs;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 12/14] bnxt_en: Set default completion ring for async events.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
With the added support for the bnxt_re RDMA driver, both drivers can be
allocating completion rings in any order. The firmware does not know
which completion ring should be receiving async events. Add an
extra step to tell firmware the completion ring number for receiving
async events after bnxt_en allocates the completion rings.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 338dbd0..1f54a7a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3858,6 +3858,30 @@ static int hwrm_ring_alloc_send_msg(struct bnxt *bp,
return rc;
}
+static int bnxt_hwrm_set_async_event_cr(struct bnxt *bp, int idx)
+{
+ int rc;
+
+ if (BNXT_PF(bp)) {
+ struct hwrm_func_cfg_input req = {0};
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
+ req.fid = cpu_to_le16(0xffff);
+ req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_ASYNC_EVENT_CR);
+ req.async_event_cr = cpu_to_le16(idx);
+ rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ } else {
+ struct hwrm_func_vf_cfg_input req = {0};
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_CFG, -1, -1);
+ req.enables =
+ cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_ASYNC_EVENT_CR);
+ req.async_event_cr = cpu_to_le16(idx);
+ rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ }
+ return rc;
+}
+
static int bnxt_hwrm_ring_alloc(struct bnxt *bp)
{
int i, rc = 0;
@@ -3874,6 +3898,12 @@ static int bnxt_hwrm_ring_alloc(struct bnxt *bp)
goto err_out;
BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
bp->grp_info[i].cp_fw_ring_id = ring->fw_ring_id;
+
+ if (!i) {
+ rc = bnxt_hwrm_set_async_event_cr(bp, ring->fw_ring_id);
+ if (rc)
+ netdev_warn(bp->dev, "Failed to set async event completion ring.\n");
+ }
}
for (i = 0; i < bp->tx_nr_rings; i++) {
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 13/14] bnxt_en: Handle no aggregation ring gracefully.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
The current code assumes that we will always have at least 2 rx rings, 1
will be used as an aggregation ring for TPA and jumbo page placements.
However, it is possible, especially on a VF, that there is only 1 rx
ring available. In this scenario, the current code will fail to initialize.
To handle it, we need to properly set up only 1 ring without aggregation.
Set a new flag BNXT_FLAG_NO_AGG_RINGS for this condition and add logic to
set up the chip to place RX data linearly into a single buffer per packet.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 25 +++++++++++++++++++++----
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 +
2 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1f54a7a..98e9484 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2496,7 +2496,7 @@ void bnxt_set_ring_params(struct bnxt *bp)
agg_factor = min_t(u32, 4, 65536 / BNXT_RX_PAGE_SIZE);
bp->flags &= ~BNXT_FLAG_JUMBO;
- if (rx_space > PAGE_SIZE) {
+ if (rx_space > PAGE_SIZE && !(bp->flags & BNXT_FLAG_NO_AGG_RINGS)) {
u32 jumbo_factor;
bp->flags |= BNXT_FLAG_JUMBO;
@@ -6174,6 +6174,9 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features)
if (features & NETIF_F_LRO)
flags |= BNXT_FLAG_LRO;
+ if (bp->flags & BNXT_FLAG_NO_AGG_RINGS)
+ flags &= ~BNXT_FLAG_TPA;
+
if (features & NETIF_F_HW_VLAN_CTAG_RX)
flags |= BNXT_FLAG_STRIP_VLAN;
@@ -7040,8 +7043,17 @@ static int bnxt_get_dflt_rings(struct bnxt *bp, int *max_rx, int *max_tx,
int rc;
rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
- if (rc)
- return rc;
+ if (rc && (bp->flags & BNXT_FLAG_AGG_RINGS)) {
+ /* Not enough rings, try disabling agg rings. */
+ bp->flags &= ~BNXT_FLAG_AGG_RINGS;
+ rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
+ if (rc)
+ return rc;
+ bp->flags |= BNXT_FLAG_NO_AGG_RINGS;
+ bp->dev->hw_features &= ~NETIF_F_LRO;
+ bp->dev->features &= ~NETIF_F_LRO;
+ bnxt_set_ring_params(bp);
+ }
if (bp->flags & BNXT_FLAG_ROCE_CAP) {
int max_cp, max_stat, max_irq;
@@ -7236,7 +7248,12 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
bnxt_set_tpa_flags(bp);
bnxt_set_ring_params(bp);
bnxt_set_max_func_irqs(bp, max_irqs);
- bnxt_set_dflt_rings(bp);
+ rc = bnxt_set_dflt_rings(bp);
+ if (rc) {
+ netdev_err(bp->dev, "Not enough rings available.\n");
+ rc = -ENOMEM;
+ goto init_err;
+ }
/* Default RSS hash cfg. */
bp->rss_hash_cfg = VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4 |
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index d174729d..f6b9b1c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -950,6 +950,7 @@ struct bnxt {
#define BNXT_FLAG_ROCEV2_CAP 0x10000
#define BNXT_FLAG_ROCE_CAP (BNXT_FLAG_ROCEV1_CAP | \
BNXT_FLAG_ROCEV2_CAP)
+ #define BNXT_FLAG_NO_AGG_RINGS 0x20000
#define BNXT_FLAG_CHIP_NITRO_A0 0x1000000
#define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA | \
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 09/14] bnxt_en: Assign additional vnics to VFs.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
Assign additional vnics to VFs whenever possible so that NTUPLE can be
supported on the VFs.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index c696025..0c9f6c1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -429,6 +429,8 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
vf_rx_rings = (pf->max_rx_rings - bp->rx_nr_rings) / num_vfs;
vf_ring_grps = (bp->pf.max_hw_ring_grps - bp->rx_nr_rings) / num_vfs;
vf_tx_rings = (pf->max_tx_rings - bp->tx_nr_rings) / num_vfs;
+ vf_vnics = (pf->max_vnics - bp->nr_vnics) / num_vfs;
+ vf_vnics = min_t(u16, vf_vnics, vf_rx_rings);
req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_MTU |
FUNC_CFG_REQ_ENABLES_MRU |
@@ -451,7 +453,6 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
req.num_rx_rings = cpu_to_le16(vf_rx_rings);
req.num_hw_ring_grps = cpu_to_le16(vf_ring_grps);
req.num_l2_ctxs = cpu_to_le16(4);
- vf_vnics = 1;
req.num_vnics = cpu_to_le16(vf_vnics);
/* FIXME spec currently uses 1 bit for stats ctx */
@@ -506,6 +507,8 @@ static int bnxt_sriov_enable(struct bnxt *bp, int *num_vfs)
min_rx_rings)
rx_ok = 1;
}
+ if (bp->pf.max_vnics - bp->nr_vnics < min_rx_rings)
+ rx_ok = 0;
if (bp->pf.max_tx_rings - bp->tx_nr_rings >= min_tx_rings)
tx_ok = 1;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 14/14] MAINTAINERS: Add bnxt_en maintainer info.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
MAINTAINERS | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/MAINTAINERS b/MAINTAINERS
index cfff2c9..11904a9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2605,6 +2605,12 @@ L: netdev@vger.kernel.org
S: Supported
F: drivers/net/ethernet/broadcom/bnx2x/
+BROADCOM BNXT_EN 50 GIGABIT ETHERNET DRIVER
+M: Michael Chan <michael.chan@broadcom.com>
+L: netdev@vger.kernel.org
+S: Supported
+F: drivers/net/ethernet/broadcom/bnxt/
+
BROADCOM BCM281XX/BCM11XXX/BCM216XX ARM ARCHITECTURE
M: Florian Fainelli <f.fainelli@gmail.com>
M: Ray Jui <rjui@broadcom.com>
--
1.8.3.1
^ permalink raw reply related
* [PATCH v1 0/2] bpf: add longest prefix match map
From: Daniel Mack @ 2016-12-29 17:28 UTC (permalink / raw)
To: ast; +Cc: dh.herrmann, daniel, netdev, davem, Daniel Mack
This patch set adds a longest prefix match algorithm that can be used
to match IP addresses to a stored set of ranges. It is exposed as a
bpf map type.
Internally, data is stored in an unbalanced tree of nodes that has a
maximum height of n, where n is the prefixlen the trie was created
with.
Not that this has nothing to do with fib or fib6 and is in no way meant
to replace or share code with it. It's rather a much simpler
implementation that is specifically written with bpf maps in mind.
Patch 1/2 adds the implementation, and 2/2 an extensive test suite.
Feedback is much appreciated.
Thanks,
Daniel
Changelog:
rfc -> v1:
* Add __rcu pointer annotations to make sparse happy
* Fold _lpm_trie_find_target_node() into its only caller
* Fix some minor documentation issues
Daniel Mack (1):
bpf: add a longest prefix match trie map implementation
David Herrmann (1):
bpf: Add tests for the lpm trie map
include/uapi/linux/bpf.h | 7 +
kernel/bpf/Makefile | 2 +-
kernel/bpf/lpm_trie.c | 468 +++++++++++++++++++++++++++++
tools/testing/selftests/bpf/.gitignore | 1 +
tools/testing/selftests/bpf/Makefile | 4 +-
tools/testing/selftests/bpf/test_lpm_map.c | 348 +++++++++++++++++++++
6 files changed, 827 insertions(+), 3 deletions(-)
create mode 100644 kernel/bpf/lpm_trie.c
create mode 100644 tools/testing/selftests/bpf/test_lpm_map.c
--
2.9.3
^ permalink raw reply
* [PATCH v1 2/2] bpf: Add tests for the lpm trie map
From: Daniel Mack @ 2016-12-29 17:28 UTC (permalink / raw)
To: ast; +Cc: dh.herrmann, daniel, netdev, davem, Daniel Mack
In-Reply-To: <20161229172855.14910-1-daniel@zonque.org>
From: David Herrmann <dh.herrmann@gmail.com>
The first part of this program runs randomized tests against the
lpm-bpf-map. It implements a "Trivial Longest Prefix Match" (tlpm)
based on simple, linear, single linked lists. The implementation
should be pretty straightforward.
Based on tlpm, this inserts randomized data into bpf-lpm-maps and
verifies the trie-based bpf-map implementation behaves the same way
as tlpm.
The second part uses 'real world' IPv4 and IPv6 addresses and tests
the trie with those.
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: Daniel Mack <daniel@zonque.org>
---
tools/testing/selftests/bpf/.gitignore | 1 +
tools/testing/selftests/bpf/Makefile | 4 +-
tools/testing/selftests/bpf/test_lpm_map.c | 348 +++++++++++++++++++++++++++++
3 files changed, 351 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/selftests/bpf/test_lpm_map.c
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 071431b..d3b1c9b 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -1,3 +1,4 @@
test_verifier
test_maps
test_lru_map
+test_lpm_map
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7a5f245..064a3e5 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,8 +1,8 @@
CFLAGS += -Wall -O2 -I../../../../usr/include
-test_objs = test_verifier test_maps test_lru_map
+test_objs = test_verifier test_maps test_lru_map test_lpm_map
-TEST_PROGS := test_verifier test_maps test_lru_map test_kmod.sh
+TEST_PROGS := test_verifier test_maps test_lru_map test_lpm_map test_kmod.sh
TEST_FILES := $(test_objs)
all: $(test_objs)
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c
new file mode 100644
index 0000000..08db750
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -0,0 +1,348 @@
+/*
+ * Randomized tests for eBPF longest-prefix-match maps
+ *
+ * This program runs randomized tests against the lpm-bpf-map. It implements a
+ * "Trivial Longest Prefix Match" (tlpm) based on simple, linear, singly linked
+ * lists. The implementation should be pretty straightforward.
+ *
+ * Based on tlpm, this inserts randomized data into bpf-lpm-maps and verifies
+ * the trie-based bpf-map implementation behaves the same way as tlpm.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/bpf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+
+#include "bpf_sys.h"
+#include "bpf_util.h"
+
+struct tlpm_node {
+ struct tlpm_node *next;
+ size_t n_bits;
+ uint8_t key[];
+};
+
+static struct tlpm_node *tlpm_add(struct tlpm_node *list,
+ const uint8_t *key,
+ size_t n_bits)
+{
+ struct tlpm_node *node;
+ size_t n;
+
+ /* add new entry with @key/@n_bits to @list and return new head */
+
+ n = (n_bits + 7) / 8;
+ node = malloc(sizeof(*node) + n);
+ assert(node);
+
+ node->next = list;
+ node->n_bits = n_bits;
+ memcpy(node->key, key, n);
+
+ return node;
+}
+
+static void tlpm_clear(struct tlpm_node *list)
+{
+ struct tlpm_node *node;
+
+ /* free all entries in @list */
+
+ while ((node = list)) {
+ list = list->next;
+ free(node);
+ }
+}
+
+static struct tlpm_node *tlpm_match(struct tlpm_node *list,
+ const uint8_t *key,
+ size_t n_bits)
+{
+ struct tlpm_node *best = NULL;
+ size_t i;
+
+ /*
+ * Perform longest prefix-match on @key/@n_bits. That is, iterate all
+ * entries and match each prefix against @key. Remember the "best"
+ * entry we find (i.e., the longest prefix that matches) and return it
+ * to the caller when done.
+ */
+
+ for ( ; list; list = list->next) {
+ for (i = 0; i < n_bits && i < list->n_bits; ++i) {
+ if ((key[i / 8] & (1 << (7 - i % 8))) !=
+ (list->key[i / 8] & (1 << (7 - i % 8))))
+ break;
+ }
+
+ if (i >= list->n_bits) {
+ if (!best || i > best->n_bits)
+ best = list;
+ }
+ }
+
+ return best;
+}
+
+static void test_lpm_basic(void)
+{
+ struct tlpm_node *list = NULL, *t1, *t2;
+
+ /* very basic, static tests to verify tlpm works as expected */
+
+ assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+
+ t1 = list = tlpm_add(list, (uint8_t[]){ 0xff }, 8);
+ assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+ assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+ assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0x00 }, 16));
+ assert(!tlpm_match(list, (uint8_t[]){ 0x7f }, 8));
+ assert(!tlpm_match(list, (uint8_t[]){ 0xfe }, 8));
+ assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 7));
+
+ t2 = list = tlpm_add(list, (uint8_t[]){ 0xff, 0xff }, 16);
+ assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+ assert(t2 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+ assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 15));
+ assert(!tlpm_match(list, (uint8_t[]){ 0x7f, 0xff }, 16));
+
+ tlpm_clear(list);
+}
+
+static void test_lpm_order(void)
+{
+ struct tlpm_node *t1, *t2, *l1 = NULL, *l2 = NULL;
+ size_t i, j;
+
+ /*
+ * Verify the tlpm implementation works correctly regardless of the
+ * order of entries. Insert a random set of entries into @l1, and copy
+ * the same data in reverse order into @l2. Then verify a lookup of
+ * random keys will yield the same result in both sets.
+ */
+
+ for (i = 0; i < (1 << 12); ++i)
+ l1 = tlpm_add(l1, (uint8_t[]){
+ rand() % 0xff,
+ rand() % 0xff,
+ }, rand() % 16 + 1);
+
+ for (t1 = l1; t1; t1 = t1->next)
+ l2 = tlpm_add(l2, t1->key, t1->n_bits);
+
+ for (i = 0; i < (1 << 8); ++i) {
+ uint8_t key[] = { rand() % 0xff, rand() % 0xff };
+
+ t1 = tlpm_match(l1, key, 16);
+ t2 = tlpm_match(l2, key, 16);
+
+ assert(!t1 == !t2);
+ if (t1) {
+ assert(t1->n_bits == t2->n_bits);
+ for (j = 0; j < t1->n_bits; ++j)
+ assert((t1->key[j / 8] & (1 << (7 - j % 8))) ==
+ (t2->key[j / 8] & (1 << (7 - j % 8))));
+ }
+ }
+
+ tlpm_clear(l1);
+ tlpm_clear(l2);
+}
+
+static void test_lpm_map(void)
+{
+ size_t i, j, n_matches, n_nodes, n_lookups;
+ struct tlpm_node *t, *list = NULL;
+ struct bpf_lpm_trie_key *key;
+ uint8_t value[8] = {};
+ int r, map;
+
+ /*
+ * Compare behavior of tlpm vs. bpf-lpm. Create a randomized set of
+ * prefixes and insert it into both tlpm and bpf-lpm. Then run some
+ * randomized lookups and verify both maps return the same result.
+ */
+
+ n_matches = 0;
+ n_nodes = 1 << 8;
+ n_lookups = 1 << 16;
+
+ key = alloca(sizeof(*key) + 4);
+ memset(key, 0, sizeof(*key) + 4);
+
+ map = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+ sizeof(*key) + 4,
+ sizeof(value),
+ 4096,
+ 0);
+ assert(map >= 0);
+
+ for (i = 0; i < n_nodes; ++i) {
+ value[0] = rand() & 0xff;
+ value[1] = rand() & 0xff;
+ value[2] = rand() & 0xff;
+ value[3] = rand() & 0xff;
+ value[4] = rand() % 33;
+
+ list = tlpm_add(list, value, value[4]);
+
+ key->prefixlen = value[4];
+ memcpy(key->data, value, 4);
+ r = bpf_map_update(map, key, value, 0);
+ assert(!r);
+ }
+
+ for (i = 0; i < n_lookups; ++i) {
+ uint8_t data[] = {
+ rand() % 0xff,
+ rand() % 0xff,
+ rand() % 0xff,
+ rand() % 0xff
+ };
+
+ t = tlpm_match(list, data, 32);
+
+ key->prefixlen = 32;
+ memcpy(key->data, data, 4);
+ r = bpf_map_lookup(map, key, value);
+ assert(!r || errno == ENOENT);
+ assert(!t == !!r);
+
+ if (t) {
+ ++n_matches;
+ assert(t->n_bits == value[4]);
+ for (j = 0; j < t->n_bits; ++j)
+ assert((t->key[j / 8] & (1 << (7 - j % 8))) ==
+ (value[j / 8] & (1 << (7 - j % 8))));
+ }
+ }
+
+ close(map);
+ tlpm_clear(list);
+
+ /*
+ * With 255 random nodes in the map, we are pretty likely to match
+ * something on every lookup. For statistics, use this:
+ *
+ * printf(" nodes: %zu\n"
+ * "lookups: %zu\n"
+ * "matches: %zu\n", n_nodes, n_lookups, n_matches);
+ */
+}
+
+/* Test the implementation with some 'real world' examples */
+
+static void test_lpm_ipaddr(void)
+{
+ struct bpf_lpm_trie_key *key_ipv4;
+ struct bpf_lpm_trie_key *key_ipv6;
+ size_t key_size_ipv4;
+ size_t key_size_ipv6;
+ int map_fd_ipv4;
+ int map_fd_ipv6;
+ __u64 value;
+
+ key_size_ipv4 = sizeof(*key_ipv4) + sizeof(__u32);
+ key_size_ipv6 = sizeof(*key_ipv6) + sizeof(__u32) * 4;
+ key_ipv4 = alloca(key_size_ipv4);
+ key_ipv6 = alloca(key_size_ipv6);
+
+ map_fd_ipv4 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+ key_size_ipv4, sizeof(value),
+ 100, 0);
+ assert(map_fd_ipv4 >= 0);
+
+ map_fd_ipv6 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+ key_size_ipv6, sizeof(value),
+ 100, 0);
+ assert(map_fd_ipv6 >= 0);
+
+ /* Fill data some IPv4 and IPv6 address ranges */
+ value = 1;
+ key_ipv4->prefixlen = 16;
+ inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+ assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+ value = 2;
+ key_ipv4->prefixlen = 24;
+ inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+ assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+ value = 3;
+ key_ipv4->prefixlen = 24;
+ inet_pton(AF_INET, "192.168.128.0", key_ipv4->data);
+ assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+ value = 5;
+ key_ipv4->prefixlen = 24;
+ inet_pton(AF_INET, "192.168.1.0", key_ipv4->data);
+ assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+ value = 4;
+ key_ipv4->prefixlen = 23;
+ inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+ assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+ value = 0xdeadbeef;
+ key_ipv6->prefixlen = 64;
+ inet_pton(AF_INET6, "2a00:1450:4001:814::200e", key_ipv6->data);
+ assert(bpf_map_update(map_fd_ipv6, key_ipv6, &value, 0) == 0);
+
+ /* Set tprefixlen to maximum for lookups */
+ key_ipv4->prefixlen = 32;
+ key_ipv6->prefixlen = 128;
+
+ /* Test some lookups that should come back with a value */
+ inet_pton(AF_INET, "192.168.128.23", key_ipv4->data);
+ assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == 0);
+ assert(value == 3);
+
+ inet_pton(AF_INET, "192.168.0.1", key_ipv4->data);
+ assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == 0);
+ assert(value == 2);
+
+ inet_pton(AF_INET6, "2a00:1450:4001:814::", key_ipv6->data);
+ assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == 0);
+ assert(value == 0xdeadbeef);
+
+ inet_pton(AF_INET6, "2a00:1450:4001:814::1", key_ipv6->data);
+ assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == 0);
+ assert(value == 0xdeadbeef);
+
+ /* Test some lookups that should not match any entry */
+ inet_pton(AF_INET, "10.0.0.1", key_ipv4->data);
+ assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == -1 &&
+ errno == ENOENT);
+
+ inet_pton(AF_INET, "11.11.11.11", key_ipv4->data);
+ assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == -1 &&
+ errno == ENOENT);
+
+ inet_pton(AF_INET6, "2a00:ffff::", key_ipv6->data);
+ assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == -1 &&
+ errno == ENOENT);
+
+ close(map_fd_ipv4);
+ close(map_fd_ipv6);
+}
+
+int main(void)
+{
+ /* we want predictable, pseudo random tests */
+ srand(0xf00ba1);
+
+ test_lpm_basic();
+ test_lpm_order();
+ test_lpm_map();
+ test_lpm_ipaddr();
+
+ printf("test_lpm: OK\n");
+ return 0;
+}
--
2.9.3
^ permalink raw reply related
* [PATCH v1 1/2] bpf: add a longest prefix match trie map implementation
From: Daniel Mack @ 2016-12-29 17:28 UTC (permalink / raw)
To: ast; +Cc: dh.herrmann, daniel, netdev, davem, Daniel Mack
In-Reply-To: <20161229172855.14910-1-daniel@zonque.org>
This trie implements a longest prefix match algorithm that can be used
to match IP addresses to a stored set of ranges.
Internally, data is stored in an unbalanced trie of nodes that has a
maximum height of n, where n is the prefixlen the trie was created
with.
Tries may be created with prefix lengths that are multiples of 8, in
the range from 8 to 2048. The key used for lookup and update operations
is a struct bpf_lpm_trie_key, and the value is a uint64_t.
The code carries more information about the internal implementation.
Signed-off-by: Daniel Mack <daniel@zonque.org>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
---
include/uapi/linux/bpf.h | 7 +
kernel/bpf/Makefile | 2 +-
kernel/bpf/lpm_trie.c | 468 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 476 insertions(+), 1 deletion(-)
create mode 100644 kernel/bpf/lpm_trie.c
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0eb0e87..d564277 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -63,6 +63,12 @@ struct bpf_insn {
__s32 imm; /* signed immediate constant */
};
+/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
+struct bpf_lpm_trie_key {
+ __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */
+ __u8 data[0]; /* Arbitrary size */
+};
+
/* BPF syscall commands, see bpf(2) man-page for details. */
enum bpf_cmd {
BPF_MAP_CREATE,
@@ -89,6 +95,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_CGROUP_ARRAY,
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ BPF_MAP_TYPE_LPM_TRIE,
};
enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1276474..e1ce4f4 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
new file mode 100644
index 0000000..8b6a61d
--- /dev/null
+++ b/kernel/bpf/lpm_trie.c
@@ -0,0 +1,468 @@
+/*
+ * Longest prefix match list implementation
+ *
+ * Copyright (c) 2016 Daniel Mack
+ * Copyright (c) 2016 David Herrmann
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <net/ipv6.h>
+
+/* Intermediate node */
+#define LPM_TREE_NODE_FLAG_IM BIT(0)
+
+struct lpm_trie_node;
+
+struct lpm_trie_node {
+ struct rcu_head rcu;
+ struct lpm_trie_node __rcu *child[2];
+ u32 prefixlen;
+ u32 flags;
+ u64 value;
+ u8 data[0];
+};
+
+struct lpm_trie {
+ struct bpf_map map;
+ struct lpm_trie_node __rcu *root;
+ size_t n_entries;
+ size_t max_prefixlen;
+ size_t data_size;
+ spinlock_t lock;
+};
+
+/*
+ * This trie implements a longest prefix match algorithm that can be used to
+ * match IP addresses to a stored set of ranges.
+ *
+ * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
+ * interpreted as big endian, so data[0] stores the most significant byte.
+ *
+ * Match ranges are internally stored in instances of struct lpm_trie_node
+ * which each contain their prefix length as well as two pointers that may
+ * lead to more nodes containing more specific matches. Each node also stores
+ * a value that is defined by and returned to userspace via the update_elem
+ * and lookup functions.
+ *
+ * For instance, let's start with a trie that was created with a prefix length
+ * of 32, so it can be used for IPv4 addresses, and one single element that
+ * matches 192.168.0.0/16. The data array would hence contain
+ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
+ * stick to IP-address notation for readability though.
+ *
+ * As the trie is empty initially, the new node (1) will be places as root
+ * node, denoted as (R) in the example below. As there are no other node, both
+ * child pointers are %NULL.
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ *
+ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). In child index depends on the next bit
+ * that is outside of that (1) matches, and that bit is 0, so (2) will be
+ * child[0] of (1):
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * |
+ * +----------------+
+ * | (2) |
+ * | 192.168.0.0/24 |
+ * | value: 2 |
+ * | [0] [1] |
+ * +----------------+
+ *
+ * The child[1] slot of (1) could be filled with another node which has bit #17
+ * (the next bit after the ones that (1) matches on) set to 1. For instance,
+ * 192.168.128.0/24:
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * | |
+ * +----------------+ +------------------+
+ * | (2) | | (3) |
+ * | 192.168.0.0/24 | | 192.168.128.0/24 |
+ * | value: 2 | | value: 3 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +------------------+
+ *
+ * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
+ * it, node (1) is looked at first, and because (4) of the semantics laid out
+ * above (bit #17 is 0), it would normally be attached to (1) as child[0].
+ * However, that slot is already allocated, so a new node is needed in between.
+ * That node is does not have a value attached to it and it will never be
+ * returned to users as result of a lookup. It is only there to differenciate
+ * the traversal further. It will get a prefix as wide as necessary to
+ * distinguish its two children:
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * | |
+ * +----------------+ +------------------+
+ * | (4) (I) | | (3) |
+ * | 192.168.0.0/23 | | 192.168.128.0/24 |
+ * | value: --- | | value: 3 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +------------------+
+ * | |
+ * +----------------+ +----------------+
+ * | (2) | | (5) |
+ * | 192.168.0.0/24 | | 192.168.1.0/24 |
+ * | value: 2 | | value: 5 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +----------------+
+ *
+ * 192.168.1.1/32 would be a child of (5) etc.
+ *
+ * An intermediate node will be turned into a 'real' node on demand. In the
+ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
+ *
+ * A fully populated trie would have a height of 32 nodes, as the trie was
+ * created with a prefix length of 32.
+ *
+ * The lookup starts at the root node. If the current node matches and if there
+ * is a child that can be used to become more specific, the trie is traversed
+ * downwards. The last node in the traversal that is a non-intermediate one is
+ * returned.
+ */
+
+static inline int extract_bit(const u8 *data, size_t index)
+{
+ return !!(data[index / 8] & (1 << (7 - (index % 8))));
+}
+
+/**
+ * longest_prefix_match() - determine the longest prefix
+ * @trie: The trie to get internal sizes from
+ * @node: The node to operate on
+ * @key: The key to compare to @node
+ *
+ * Determine the longest prefix of @node that matches the bits in @key.
+ */
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key *key)
+{
+ size_t prefixlen = 0;
+ int i;
+
+ for (i = 0; i < trie->data_size; i++) {
+ size_t b;
+
+ b = 8 - fls(node->data[i] ^ key->data[i]);
+ prefixlen += b;
+
+ if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
+ return min(node->prefixlen, key->prefixlen);
+
+ if (b < 8)
+ break;
+ }
+
+ return prefixlen;
+}
+
+/* Called from syscall or from eBPF program */
+static void *trie_lookup_elem(struct bpf_map *map, void *_key)
+{
+ struct lpm_trie_node *node, *found = NULL;
+ struct bpf_lpm_trie_key *key = _key;
+ struct lpm_trie *trie =
+ container_of(map, struct lpm_trie, map);
+
+ /* Start walking the trie from the root node ... */
+
+ for (node = rcu_dereference(trie->root); node;) {
+ unsigned int next_bit;
+ size_t matchlen;
+
+ /*
+ * Determine the longest prefix of @node that matches @key.
+ * If it's the maximum possible prefix for this trie, we have
+ * an exact match and can return it directly.
+ */
+ matchlen = longest_prefix_match(trie, node, key);
+ if (matchlen == trie->max_prefixlen)
+ return &node->value;
+
+ /*
+ * If the number of bits that match is smaller than the prefix
+ * length of @node, bail out and return the node we have seen
+ * last in the traversal (ie, the parent).
+ */
+ if (matchlen < node->prefixlen)
+ break;
+
+ /*
+ * Consider this node as return candidate unless it is an
+ * artificially added intermediate one
+ */
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ found = node;
+
+ /*
+ * If the node match is fully satisfied, let's see if we can
+ * become more specific. Determine the next bit in the key and
+ * traverse down.
+ */
+ next_bit = extract_bit(key->data, node->prefixlen);
+ node = rcu_dereference(node->child[next_bit]);
+ }
+
+ return found ? &found->value : NULL;
+}
+
+static struct lpm_trie_node *lpm_trie_node_alloc(size_t data_size)
+{
+ return kmalloc(sizeof(struct lpm_trie_node) + data_size,
+ GFP_ATOMIC | __GFP_NOWARN);
+}
+
+/* Called from syscall or from eBPF program */
+static int trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *node, *im_node, *new_node = NULL;
+ struct lpm_trie_node __rcu **slot;
+ struct bpf_lpm_trie_key *key = _key;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ spin_lock(&trie->lock);
+
+ /* Allocate and fill a new node */
+
+ if (trie->n_entries == trie->map.max_entries) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ new_node = lpm_trie_node_alloc(trie->data_size);
+ if (!new_node) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ trie->n_entries++;
+ new_node->value = *(u64 *) value;
+ new_node->prefixlen = key->prefixlen;
+ new_node->flags = 0;
+ new_node->child[0] = NULL;
+ new_node->child[1] = NULL;
+ memcpy(new_node->data, key->data, trie->data_size);
+
+ /*
+ * Now find a slot to attach the new node. To do that, walk the tree
+ * from the root match as many bits as possible for each node until we
+ * either find an empty slot or a slot that needs to be replaced by an
+ * intermediate node.
+ */
+ slot = &trie->root;
+
+ while ((node = rcu_dereference_protected(*slot,
+ lockdep_is_held(&trie->lock)))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen ||
+ node->prefixlen == trie->max_prefixlen)
+ break;
+
+ next_bit = extract_bit(key->data, node->prefixlen);
+ slot = &node->child[next_bit];
+ }
+
+ /*
+ * If the slot is empty (a free child pointer or an empty root),
+ * simply assign the @new_node to that slot and be done.
+ */
+ if (!node) {
+ rcu_assign_pointer(*slot, new_node);
+ goto out;
+ }
+
+ /*
+ * If the slot we picked already exists, replace it with @new_node
+ * which already has the correct data array and value set.
+ */
+ if (node->prefixlen == matchlen) {
+ new_node->child[0] = node->child[0];
+ new_node->child[1] = node->child[1];
+
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ trie->n_entries--;
+
+ rcu_assign_pointer(*slot, new_node);
+ kfree_rcu(node, rcu);
+
+ goto out;
+ }
+
+ /*
+ * If the new node matches the prefix completely, it must be an
+ * inserted as an ancestor. Simply insert it between @node and @*slot.
+ */
+ if (matchlen == key->prefixlen) {
+ next_bit = extract_bit(node->data, matchlen);
+ rcu_assign_pointer(new_node->child[next_bit], node);
+ rcu_assign_pointer(*slot, new_node);
+ goto out;
+ }
+
+ im_node = lpm_trie_node_alloc(trie->data_size);
+ if (!im_node) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ im_node->prefixlen = matchlen;
+ im_node->flags |= LPM_TREE_NODE_FLAG_IM;
+ memcpy(im_node->data, node->data, trie->data_size);
+
+ /* Now determine which child to install in which slot */
+ if (extract_bit(key->data, matchlen)) {
+ rcu_assign_pointer(im_node->child[0], node);
+ rcu_assign_pointer(im_node->child[1], new_node);
+ } else {
+ rcu_assign_pointer(im_node->child[0], new_node);
+ rcu_assign_pointer(im_node->child[1], node);
+ }
+
+ /* Finally, assign the intermediate node to the determined spot */
+ rcu_assign_pointer(*slot, im_node);
+
+out:
+ if (ret) {
+ if (new_node)
+ trie->n_entries--;
+
+ kfree(new_node);
+ kfree(im_node);
+ }
+
+ spin_unlock(&trie->lock);
+
+ return ret;
+}
+
+static struct bpf_map *trie_alloc(union bpf_attr *attr)
+{
+ struct lpm_trie *trie;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->map_flags ||
+ attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 ||
+ attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 ||
+ attr->value_size != sizeof(u64))
+ return ERR_PTR(-EINVAL);
+
+ trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+ if (!trie)
+ return NULL;
+
+ /* copy mandatory map attributes */
+ trie->map.map_type = attr->map_type;
+ trie->map.key_size = attr->key_size;
+ trie->map.value_size = attr->value_size;
+ trie->map.max_entries = attr->max_entries;
+ trie->data_size = attr->key_size -
+ offsetof(struct bpf_lpm_trie_key, data);
+ trie->max_prefixlen = trie->data_size * 8;
+
+ spin_lock_init(&trie->lock);
+
+ return &trie->map;
+}
+
+static void trie_free(struct bpf_map *map)
+{
+ struct lpm_trie_node __rcu **slot;
+ struct lpm_trie_node *node;
+ struct lpm_trie *trie =
+ container_of(map, struct lpm_trie, map);
+
+ spin_lock(&trie->lock);
+
+ /*
+ * Always start at the root and walk down to a node that has no
+ * children. Then free that node, nullify its pointer in the parent,
+ * then start over.
+ */
+
+ for (;;) {
+ slot = &trie->root;
+
+ for (;;) {
+ node = rcu_dereference_protected(*slot,
+ lockdep_is_held(&trie->lock));
+ if (!node)
+ goto out;
+
+ if (node->child[0]) {
+ slot = &node->child[0];
+ continue;
+ }
+
+ if (node->child[1]) {
+ slot = &node->child[1];
+ continue;
+ }
+
+ kfree(node);
+ rcu_assign_pointer(*slot, NULL);
+ break;
+ }
+ }
+
+out:
+ spin_unlock(&trie->lock);
+}
+
+static const struct bpf_map_ops trie_ops = {
+ .map_alloc = trie_alloc,
+ .map_free = trie_free,
+ .map_lookup_elem = trie_lookup_elem,
+ .map_update_elem = trie_update_elem,
+};
+
+static struct bpf_map_type_list trie_type __read_mostly = {
+ .ops = &trie_ops,
+ .type = BPF_MAP_TYPE_LPM_TRIE,
+};
+
+static int __init register_trie_map(void)
+{
+ bpf_register_map_type(&trie_type);
+ return 0;
+}
+late_initcall(register_trie_map);
--
2.9.3
^ permalink raw reply related
* RE: [Open-FCoE] [PATCH RFC net-next 1/5] qed: Add support for hardware offloaded FCoE.
From: Mintz, Yuval @ 2016-12-29 17:28 UTC (permalink / raw)
To: Hannes Reinecke, Dupuis, Chad, martin.petersen@oracle.com
Cc: fcoe-devel@open-fcoe.org, netdev@vger.kernel.org,
Dept-Eng QLogic Storage Upstream, linux-scsi@vger.kernel.org
In-Reply-To: <371bc2ac-c630-0e95-29aa-3fa16fe0c764@suse.de>
> > +struct fcoe_tstorm_fcoe_task_st_ctx_read_write {
> > + union fcoe_cleanup_addr_exp_ro_union
> cleanup_addr_exp_ro_union;
> > + __le16 flags;
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_MASK
> 0x7
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_SHIFT 0
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_SHIFT
> 3
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_SHIFT 4
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_SHIFT 5
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_SHIFT
> 6
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_MAS
> K 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_SHIFT
> 7
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_MASK
> 0x3
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_SHIFT 8
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_MASK
> 0x3F
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_SHIFT
> 10
>
> A very odd way of defining a bitfield ...
> Why not use a 'normal' bitfield here?
This is the format of our generated firmware HSI; We already have
Thousands of definitions using this same format.
^ permalink raw reply
* [PATCH net-next] sctp: refactor sctp_datamsg_from_user
From: Marcelo Ricardo Leitner @ 2016-12-29 17:53 UTC (permalink / raw)
To: netdev; +Cc: linux-sctp, Neil Horman, Vlad Yasevich
This patch refactors sctp_datamsg_from_user() in an attempt to make it
better to read and avoid code duplication for handling the last
fragment.
It also avoids doing division and remaining operations. Even though, it
should still operate similarly as before this patch.
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
net/sctp/chunk.c | 107 +++++++++++++++++--------------------------------------
1 file changed, 32 insertions(+), 75 deletions(-)
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 615f0ddd41dfb1ff46a9d4e564716de8e7b60ea6..e3621cb4827fadb5f5cb41ebe8455dfa3300a765 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -165,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
struct sctp_sndrcvinfo *sinfo,
struct iov_iter *from)
{
- int max, whole, i, offset, over, err;
- int len, first_len;
- int max_data;
+ size_t len, first_len, max_data, remaining;
+ size_t msg_len = iov_iter_count(from);
+ struct list_head *pos, *temp;
struct sctp_chunk *chunk;
struct sctp_datamsg *msg;
- struct list_head *pos, *temp;
- size_t msg_len = iov_iter_count(from);
- __u8 frag;
+ int err;
msg = sctp_datamsg_new(GFP_KERNEL);
if (!msg)
@@ -185,7 +183,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
(SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) ||
!SCTP_PR_POLICY(sinfo->sinfo_flags)))
msg->expires_at = jiffies +
- msecs_to_jiffies(sinfo->sinfo_timetolive);
+ msecs_to_jiffies(sinfo->sinfo_timetolive);
/* This is the biggest possible DATA chunk that can fit into
* the packet
@@ -195,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
max_data = SCTP_TRUNC4(max_data);
- max = asoc->frag_point;
/* If the the peer requested that we authenticate DATA chunks
* we need to account for bundling of the AUTH chunks along with
* DATA.
@@ -208,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
hmac_desc->hmac_len);
}
- /* Now, check if we need to reduce our max */
- if (max > max_data)
- max = max_data;
+ /* Check what's our max considering the above */
+ max_data = min_t(size_t, max_data, asoc->frag_point);
- whole = 0;
- first_len = max;
+ /* Set first_len and then account for possible bundles on first frag */
+ first_len = max_data;
/* Check to see if we have a pending SACK and try to let it be bundled
* with this message. Do this if we don't have any data queued already.
@@ -224,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
asoc->outqueue.out_qlen == 0 &&
list_empty(&asoc->outqueue.retransmit) &&
- msg_len > max)
- max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
+ msg_len > max_data)
+ first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
/* Encourage Cookie-ECHO bundling. */
if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
- max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
-
- /* Now that we adjusted completely, reset first_len */
- if (first_len > max_data)
- first_len = max_data;
+ first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
/* Account for a different sized first fragment */
if (msg_len >= first_len) {
- msg_len -= first_len;
- whole = 1;
msg->can_delay = 0;
- }
-
- /* How many full sized? How many bytes leftover? */
- whole += msg_len / max;
- over = msg_len % max;
- offset = 0;
-
- if ((whole > 1) || (whole && over))
SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
+ } else {
+ /* Which may be the only one... */
+ first_len = msg_len;
+ }
- /* Create chunks for all the full sized DATA chunks. */
- for (i = 0, len = first_len; i < whole; i++) {
- frag = SCTP_DATA_MIDDLE_FRAG;
+ /* Create chunks for all DATA chunks. */
+ for (remaining = msg_len; remaining; remaining -= len) {
+ u8 frag = SCTP_DATA_MIDDLE_FRAG;
- if (0 == i)
+ if (remaining == msg_len) {
+ /* First frag, which may also be the last */
frag |= SCTP_DATA_FIRST_FRAG;
+ len = first_len;
+ } else {
+ /* Middle frags */
+ len = max_data;
+ }
- if ((i == (whole - 1)) && !over) {
+ if (len >= remaining) {
+ /* Last frag, which may also be the first */
+ len = remaining;
frag |= SCTP_DATA_LAST_FRAG;
/* The application requests to set the I-bit of the
@@ -271,7 +265,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
0, GFP_KERNEL);
-
if (!chunk) {
err = -ENOMEM;
goto errout;
@@ -282,45 +275,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
goto errout_chunk_free;
/* Put the chunk->skb back into the form expected by send. */
- __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
- - (__u8 *)chunk->skb->data);
-
- sctp_datamsg_assign(msg, chunk);
- list_add_tail(&chunk->frag_list, &msg->chunks);
-
- /* The first chunk, the first chunk was likely short
- * to allow bundling, so reset to full size.
- */
- if (0 == i)
- len = max;
- }
-
- /* .. now the leftover bytes. */
- if (over) {
- if (!whole)
- frag = SCTP_DATA_NOT_FRAG;
- else
- frag = SCTP_DATA_LAST_FRAG;
-
- if ((sinfo->sinfo_flags & SCTP_EOF) ||
- (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
- frag |= SCTP_DATA_SACK_IMM;
-
- chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
- 0, GFP_KERNEL);
-
- if (!chunk) {
- err = -ENOMEM;
- goto errout;
- }
-
- err = sctp_user_addto_chunk(chunk, over, from);
-
- /* Put the chunk->skb back into the form expected by send. */
- __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
- - (__u8 *)chunk->skb->data);
- if (err < 0)
- goto errout_chunk_free;
+ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr -
+ chunk->skb->data);
sctp_datamsg_assign(msg, chunk);
list_add_tail(&chunk->frag_list, &msg->chunks);
@@ -338,6 +294,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
sctp_chunk_free(chunk);
}
sctp_datamsg_put(msg);
+
return ERR_PTR(err);
}
--
2.9.3
^ permalink raw reply related
* Re: [PATCH iproute2 net-next] tc: flower: support matching flags
From: Stephen Hemminger @ 2016-12-29 18:43 UTC (permalink / raw)
To: Paul Blakey
Cc: netdev, David S. Miller, Hadar Hen Zion, Or Gerlitz, Roi Dayan
In-Reply-To: <1482930409-55059-1-git-send-email-paulb@mellanox.com>
On Wed, 28 Dec 2016 15:06:49 +0200
Paul Blakey <paulb@mellanox.com> wrote:
> Enhance flower to support matching on flags.
>
> The 1st flag allows to match on whether the packet is
> an IP fragment.
>
> Example:
>
> # add a flower filter that will drop fragmented packets
> # (bit 0 of control flags)
> tc filter add dev ens4f0 protocol ip parent ffff: \
> flower \
> src_mac e4:1d:2d:fd:8b:01 \
> dst_mac e4:1d:2d:fd:8b:02 \
> indev ens4f0 \
> matching_flags 0x1/0x1 \
> action drop
>
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
> Reviewed-by: Roi Dayan <roid@mellanox.com>
Applied. Had to manually fixup merge conflicts with other flower changes.
^ permalink raw reply
* Re: [PATCH iproute2] fix typo in ip-xfrm man page, rmd610 -> rmd160
From: Stephen Hemminger @ 2016-12-29 18:44 UTC (permalink / raw)
To: Alexey Kodanev; +Cc: netdev, Vasily Isaenko
In-Reply-To: <1482490996-17048-1-git-send-email-alexey.kodanev@oracle.com>
On Fri, 23 Dec 2016 14:03:16 +0300
Alexey Kodanev <alexey.kodanev@oracle.com> wrote:
> Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Applied.
^ permalink raw reply
* Re: [PATCH] tc: add missing limits.h header
From: Stephen Hemminger @ 2016-12-29 18:44 UTC (permalink / raw)
To: Baruch Siach; +Cc: netdev
In-Reply-To: <c18609c5d906bbba8176138cfafd88c4a38c8190.1482432767.git.baruch@tkos.co.il>
On Thu, 22 Dec 2016 20:52:48 +0200
Baruch Siach <baruch@tkos.co.il> wrote:
> This fixes under musl build issues like:
>
> f_matchall.c: In function ‘matchall_parse_opt’:
> f_matchall.c:48:12: error: ‘LONG_MIN’ undeclared (first use in this function)
> if (h == LONG_MIN || h == LONG_MAX) {
> ^
> f_matchall.c:48:12: note: each undeclared identifier is reported only once for each function it appears in
> f_matchall.c:48:29: error: ‘LONG_MAX’ undeclared (first use in this function)
> if (h == LONG_MIN || h == LONG_MAX) {
> ^
>
> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
Sure, applied
^ permalink raw reply
* Re: [PATCH net] rtnl: stats - add missing netlink message size checks
From: David Miller @ 2016-12-29 19:06 UTC (permalink / raw)
To: minipli; +Cc: netdev, roopa
In-Reply-To: <1482943935-18052-1-git-send-email-minipli@googlemail.com>
From: Mathias Krause <minipli@googlemail.com>
Date: Wed, 28 Dec 2016 17:52:15 +0100
> We miss to check if the netlink message is actually big enough to contain
> a struct if_stats_msg.
>
> Add a check to prevent userland from sending us short messages that would
> make us access memory beyond the end of the message.
>
> Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump...")
> Signed-off-by: Mathias Krause <minipli@googlemail.com>
Looks good, applied and queued up for -stable.
^ permalink raw reply
* Re: [PATCH v2] net: fix incorrect original ingress device index in PKTINFO
From: David Miller @ 2016-12-29 19:08 UTC (permalink / raw)
To: asuka.com; +Cc: kuznet, jmorris, yoshfuji, kaber, dsa, netdev, linux-kernel
In-Reply-To: <1483001104-17614-1-git-send-email-asuka.com@163.com>
From: Wei Zhang <asuka.com@163.com>
Date: Thu, 29 Dec 2016 16:45:04 +0800
> When we send a packet for our own local address on a non-loopback
> interface (e.g. eth0), due to the change had been introduced from
> commit 0b922b7a829c ("net: original ingress device index in PKTINFO"), the
> original ingress device index would be set as the loopback interface.
> However, the packet should be considered as if it is being arrived via the
> sending interface (eth0), otherwise it would break the expectation of the
> userspace application (e.g. the DHCPRELEASE message from dhcp_release
> binary would be ignored by the dnsmasq daemon, since it come from lo which
> is not the interface dnsmasq bind to)
>
> Fixes: 0b922b7a829c ("net: original ingress device index in PKTINFO")
> Acked-by: David Ahern <dsa@cumulusnetworks.com>
> Signed-off-by: Wei Zhang <asuka.com@163.com>
Applied and queued up for -stable.
^ permalink raw reply
* Re: [PATCH v4] net: dev_weight: TX/RX orthogonality
From: David Miller @ 2016-12-29 19:08 UTC (permalink / raw)
To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel
In-Reply-To: <1483005521-27799-1-git-send-email-matthias.tafelmeier@gmx.net>
From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Date: Thu, 29 Dec 2016 10:58:41 +0100
> Oftenly, introducing side effects on packet processing on the other half
> of the stack by adjusting one of TX/RX via sysctl is not desirable.
> There are cases of demand for asymmetric, orthogonal configurability.
>
> This holds true especially for nodes where RPS for RFS usage on top is
> configured and therefore use the 'old dev_weight'. This is quite a
> common base configuration setup nowadays, even with NICs of superior processing
> support (e.g. aRFS).
>
> A good example use case are nodes acting as noSQL data bases with a
> large number of tiny requests and rather fewer but large packets as responses.
> It's affordable to have large budget and rx dev_weights for the
> requests. But as a side effect having this large a number on TX
> processed in one run can overwhelm drivers.
>
> This patch therefore introduces an independent configurability via sysctl to
> userland.
This is missing a signoff.
^ permalink raw reply
* Re: [PATCH net 0/5] mlx4 misc fixes
From: David Miller @ 2016-12-29 19:18 UTC (permalink / raw)
To: tariqt; +Cc: netdev, eranbe
In-Reply-To: <1483029433-3624-1-git-send-email-tariqt@mellanox.com>
From: Tariq Toukan <tariqt@mellanox.com>
Date: Thu, 29 Dec 2016 18:37:08 +0200
> This patchset contains several bug fixes from the team to the
> mlx4 Eth and Core drivers.
>
> Series generated against net commit:
> 60133867f1f1 'net: wan: slic_ds26522: fix spelling mistake: "configurated" -> "configured"'
Series applied, thank you.
^ permalink raw reply
* [PATCH v4] net: dev_weight: TX/RX orthogonality
From: Matthias Tafelmeier @ 2016-12-29 19:23 UTC (permalink / raw)
To: netdev; +Cc: hagen, fw, edumazet, daniel
In-Reply-To: <20161229.140854.1456743873101323068.davem@davemloft.net>
Oftenly, introducing side effects on packet processing on the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases of demand for asymmetric, orthogonal configurability.
This holds true especially for nodes where RPS for RFS usage on top is
configured and therefore use the 'old dev_weight'. This is quite a
common base configuration setup nowadays, even with NICs of superior processing
support (e.g. aRFS).
A good example use case are nodes acting as noSQL data bases with a
large number of tiny requests and rather fewer but large packets as responses.
It's affordable to have large budget and rx dev_weights for the
requests. But as a side effect having this large a number on TX
processed in one run can overwhelm drivers.
This patch therefore introduces an independent configurability via sysctl to
userland.
Signed-off-by: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
---
Documentation/sysctl/net.txt | 21 +++++++++++++++++++++
include/linux/netdevice.h | 4 ++++
net/core/dev.c | 6 +++++-
net/core/sysctl_net_core.c | 31 ++++++++++++++++++++++++++++++-
net/sched/sch_generic.c | 2 +-
5 files changed, 61 insertions(+), 3 deletions(-)
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7..53cef32 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
it's a Per-CPU variable.
Default: 64
+dev_weight_rx_bias
+--------------
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
+of the driver for the per softirq cycle netdev_budget. This parameter influences
+the proportion of the configured netdev_budget that is spent on RPS based packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
+on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--------------
+
+Scales the maximum number of packets that can be processed during a TX softirq cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
default_qdisc
--------------
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..ecd78b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,10 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
extern int netdev_max_backlog;
extern int netdev_tstamp_prequeue;
extern int weight_p;
+extern int dev_weight_rx_bias;
+extern int dev_weight_tx_bias;
+extern int dev_rx_weight;
+extern int dev_tx_weight;
bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..f2fe98b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3428,6 +3428,10 @@ EXPORT_SYMBOL(netdev_max_backlog);
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64; /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = weight_p;
+int dev_tx_weight __read_mostly = weight_p;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4837,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
net_rps_action_and_irq_enable(sd);
}
- napi->weight = weight_p;
+ napi->weight = dev_rx_weight;
while (again) {
struct sk_buff *skb;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..698ddd7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
}
#endif
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret != 0)
+ return ret;
+
+ dev_rx_weight = weight_p * dev_weight_rx_bias;
+ dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+ return ret;
+}
+
static int proc_do_rss_key(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = {
.data = &weight_p,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "dev_weight_rx_bias",
+ .data = &dev_weight_rx_bias,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "dev_weight_tx_bias",
+ .data = &dev_weight_tx_bias,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
},
{
.procname = "netdev_max_backlog",
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e..b052b27 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
void __qdisc_run(struct Qdisc *q)
{
- int quota = weight_p;
+ int quota = dev_tx_weight;
int packets;
while (qdisc_restart(q, &packets)) {
--
2.7.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox