* [PATCH net-next 10/14] bnxt_en: Add IPV6 hardware RFS support.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
Accept ipv6 flows in .ndo_rx_flow_steer() and support ETHTOOL_GRXCLSRULE
ipv6 flows.
Signed-off-by: Michael Chan <michael.chan@broadocm.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 32 +++++++++++---
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 53 +++++++++++++++++------
2 files changed, 66 insertions(+), 19 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0ca530e..4d478e7 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3316,10 +3316,26 @@ static int bnxt_hwrm_cfa_ntuple_filter_alloc(struct bnxt *bp,
req.ip_addr_type = CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4;
req.ip_protocol = keys->basic.ip_proto;
- req.src_ipaddr[0] = keys->addrs.v4addrs.src;
- req.src_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
- req.dst_ipaddr[0] = keys->addrs.v4addrs.dst;
- req.dst_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+ if (keys->basic.n_proto == htons(ETH_P_IPV6)) {
+ int i;
+
+ req.ethertype = htons(ETH_P_IPV6);
+ req.ip_addr_type =
+ CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6;
+ *(struct in6_addr *)&req.src_ipaddr[0] =
+ keys->addrs.v6addrs.src;
+ *(struct in6_addr *)&req.dst_ipaddr[0] =
+ keys->addrs.v6addrs.dst;
+ for (i = 0; i < 4; i++) {
+ req.src_ipaddr_mask[i] = cpu_to_be32(0xffffffff);
+ req.dst_ipaddr_mask[i] = cpu_to_be32(0xffffffff);
+ }
+ } else {
+ req.src_ipaddr[0] = keys->addrs.v4addrs.src;
+ req.src_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+ req.dst_ipaddr[0] = keys->addrs.v4addrs.dst;
+ req.dst_ipaddr_mask[0] = cpu_to_be32(0xffffffff);
+ }
req.src_port = keys->ports.src;
req.src_port_mask = cpu_to_be16(0xffff);
@@ -6588,12 +6604,18 @@ static int bnxt_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
goto err_free;
}
- if ((fkeys->basic.n_proto != htons(ETH_P_IP)) ||
+ if ((fkeys->basic.n_proto != htons(ETH_P_IP) &&
+ fkeys->basic.n_proto != htons(ETH_P_IPV6)) ||
((fkeys->basic.ip_proto != IPPROTO_TCP) &&
(fkeys->basic.ip_proto != IPPROTO_UDP))) {
rc = -EPROTONOSUPPORT;
goto err_free;
}
+ if (fkeys->basic.n_proto == htons(ETH_P_IPV6) &&
+ bp->hwrm_spec_code < 0x10601) {
+ rc = -EPROTONOSUPPORT;
+ goto err_free;
+ }
memcpy(new_fltr->dst_mac_addr, eth->h_dest, ETH_ALEN);
memcpy(new_fltr->src_mac_addr, eth->h_source, ETH_ALEN);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 1cfa7a6..e6b1196 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -524,24 +524,49 @@ static int bnxt_grxclsrule(struct bnxt *bp, struct ethtool_rxnfc *cmd)
fltr_found:
fkeys = &fltr->fkeys;
- if (fkeys->basic.ip_proto == IPPROTO_TCP)
- fs->flow_type = TCP_V4_FLOW;
- else if (fkeys->basic.ip_proto == IPPROTO_UDP)
- fs->flow_type = UDP_V4_FLOW;
- else
- goto fltr_err;
+ if (fkeys->basic.n_proto == htons(ETH_P_IP)) {
+ if (fkeys->basic.ip_proto == IPPROTO_TCP)
+ fs->flow_type = TCP_V4_FLOW;
+ else if (fkeys->basic.ip_proto == IPPROTO_UDP)
+ fs->flow_type = UDP_V4_FLOW;
+ else
+ goto fltr_err;
+
+ fs->h_u.tcp_ip4_spec.ip4src = fkeys->addrs.v4addrs.src;
+ fs->m_u.tcp_ip4_spec.ip4src = cpu_to_be32(~0);
+
+ fs->h_u.tcp_ip4_spec.ip4dst = fkeys->addrs.v4addrs.dst;
+ fs->m_u.tcp_ip4_spec.ip4dst = cpu_to_be32(~0);
- fs->h_u.tcp_ip4_spec.ip4src = fkeys->addrs.v4addrs.src;
- fs->m_u.tcp_ip4_spec.ip4src = cpu_to_be32(~0);
+ fs->h_u.tcp_ip4_spec.psrc = fkeys->ports.src;
+ fs->m_u.tcp_ip4_spec.psrc = cpu_to_be16(~0);
- fs->h_u.tcp_ip4_spec.ip4dst = fkeys->addrs.v4addrs.dst;
- fs->m_u.tcp_ip4_spec.ip4dst = cpu_to_be32(~0);
+ fs->h_u.tcp_ip4_spec.pdst = fkeys->ports.dst;
+ fs->m_u.tcp_ip4_spec.pdst = cpu_to_be16(~0);
+ } else {
+ int i;
- fs->h_u.tcp_ip4_spec.psrc = fkeys->ports.src;
- fs->m_u.tcp_ip4_spec.psrc = cpu_to_be16(~0);
+ if (fkeys->basic.ip_proto == IPPROTO_TCP)
+ fs->flow_type = TCP_V6_FLOW;
+ else if (fkeys->basic.ip_proto == IPPROTO_UDP)
+ fs->flow_type = UDP_V6_FLOW;
+ else
+ goto fltr_err;
+
+ *(struct in6_addr *)&fs->h_u.tcp_ip6_spec.ip6src[0] =
+ fkeys->addrs.v6addrs.src;
+ *(struct in6_addr *)&fs->h_u.tcp_ip6_spec.ip6dst[0] =
+ fkeys->addrs.v6addrs.dst;
+ for (i = 0; i < 4; i++) {
+ fs->m_u.tcp_ip6_spec.ip6src[i] = cpu_to_be32(~0);
+ fs->m_u.tcp_ip6_spec.ip6dst[i] = cpu_to_be32(~0);
+ }
+ fs->h_u.tcp_ip6_spec.psrc = fkeys->ports.src;
+ fs->m_u.tcp_ip6_spec.psrc = cpu_to_be16(~0);
- fs->h_u.tcp_ip4_spec.pdst = fkeys->ports.dst;
- fs->m_u.tcp_ip4_spec.pdst = cpu_to_be16(~0);
+ fs->h_u.tcp_ip6_spec.pdst = fkeys->ports.dst;
+ fs->m_u.tcp_ip6_spec.pdst = cpu_to_be16(~0);
+ }
fs->ring_cookie = fltr->rxq;
rc = 0;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 11/14] bnxt_en: Implement new scheme to reserve tx rings.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
In order to properly support TX rate limiting in SRIOV VF functions or
NPAR functions, firmware needs better control over tx ring allocations.
The new scheme requires the driver to reserve the number of tx rings
and to query to see if the requested number of tx rings is reserved.
The driver will use the new scheme when the firmware interface spec is
1.6.1 or newer.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 59 ++++++++++++++++++++++-
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 15 ++++++
drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 10 +++-
4 files changed, 83 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 4d478e7..338dbd0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4045,6 +4045,50 @@ static void bnxt_hwrm_ring_free(struct bnxt *bp, bool close_path)
}
}
+/* Caller must hold bp->hwrm_cmd_lock */
+int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings)
+{
+ struct hwrm_func_qcfg_output *resp = bp->hwrm_cmd_resp_addr;
+ struct hwrm_func_qcfg_input req = {0};
+ int rc;
+
+ if (bp->hwrm_spec_code < 0x10601)
+ return 0;
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_QCFG, -1, -1);
+ req.fid = cpu_to_le16(fid);
+ rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ if (!rc)
+ *tx_rings = le16_to_cpu(resp->alloc_tx_rings);
+
+ return rc;
+}
+
+int bnxt_hwrm_reserve_tx_rings(struct bnxt *bp, int *tx_rings)
+{
+ struct hwrm_func_cfg_input req = {0};
+ int rc;
+
+ if (bp->hwrm_spec_code < 0x10601)
+ return 0;
+
+ if (BNXT_VF(bp))
+ return 0;
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
+ req.fid = cpu_to_le16(0xffff);
+ req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS);
+ req.num_tx_rings = cpu_to_le16(*tx_rings);
+ rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ if (rc)
+ return rc;
+
+ mutex_lock(&bp->hwrm_cmd_lock);
+ rc = __bnxt_hwrm_get_tx_rings(bp, 0xffff, tx_rings);
+ mutex_unlock(&bp->hwrm_cmd_lock);
+ return rc;
+}
+
static void bnxt_hwrm_set_coal_params(struct bnxt *bp, u32 max_bufs,
u32 buf_tmrs, u16 flags,
struct hwrm_ring_cmpl_ring_cfg_aggint_params_input *req)
@@ -6509,10 +6553,16 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc)
sh = true;
if (tc) {
- int max_rx_rings, max_tx_rings, rc;
+ int max_rx_rings, max_tx_rings, req_tx_rings, rsv_tx_rings, rc;
+ req_tx_rings = bp->tx_nr_rings_per_tc * tc;
rc = bnxt_get_max_rings(bp, &max_rx_rings, &max_tx_rings, sh);
- if (rc || bp->tx_nr_rings_per_tc * tc > max_tx_rings)
+ if (rc || req_tx_rings > max_tx_rings)
+ return -ENOMEM;
+
+ rsv_tx_rings = req_tx_rings;
+ if (bnxt_hwrm_reserve_tx_rings(bp, &rsv_tx_rings) ||
+ rsv_tx_rings < req_tx_rings)
return -ENOMEM;
}
@@ -7000,6 +7050,11 @@ static int bnxt_set_dflt_rings(struct bnxt *bp)
return rc;
bp->rx_nr_rings = min_t(int, dflt_rings, max_rx_rings);
bp->tx_nr_rings_per_tc = min_t(int, dflt_rings, max_tx_rings);
+
+ rc = bnxt_hwrm_reserve_tx_rings(bp, &bp->tx_nr_rings_per_tc);
+ if (rc)
+ netdev_warn(bp->dev, "Unable to reserve tx rings\n");
+
bp->tx_nr_rings = bp->tx_nr_rings_per_tc;
bp->cp_nr_rings = sh ? max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) :
bp->tx_nr_rings + bp->rx_nr_rings;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 75803e5..d174729d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1149,6 +1149,8 @@ struct bnxt {
int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
int bmap_size);
int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id);
+int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings);
+int bnxt_hwrm_reserve_tx_rings(struct bnxt *bp, int *tx_rings);
int bnxt_hwrm_set_coal(struct bnxt *);
unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index e6b1196..dd21be4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -388,6 +388,7 @@ static int bnxt_set_channels(struct net_device *dev,
{
struct bnxt *bp = netdev_priv(dev);
int max_rx_rings, max_tx_rings, tcs;
+ int req_tx_rings, rsv_tx_rings;
u32 rc = 0;
bool sh = false;
@@ -423,6 +424,20 @@ static int bnxt_set_channels(struct net_device *dev,
channel->tx_count > max_tx_rings))
return -ENOMEM;
+ req_tx_rings = sh ? channel->combined_count : channel->tx_count;
+ req_tx_rings = min_t(int, req_tx_rings, max_tx_rings);
+ if (tcs > 1)
+ req_tx_rings *= tcs;
+
+ rsv_tx_rings = req_tx_rings;
+ if (bnxt_hwrm_reserve_tx_rings(bp, &rsv_tx_rings))
+ return -ENOMEM;
+
+ if (rsv_tx_rings < req_tx_rings) {
+ netdev_warn(dev, "Unable to allocate the requested tx rings\n");
+ return -ENOMEM;
+ }
+
if (netif_running(dev)) {
if (BNXT_PF(bp)) {
/* TODO CHIMP_FW: Send message to all VF's
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index 0c9f6c1..64ef0e5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -416,6 +416,7 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
u16 vf_ring_grps;
struct hwrm_func_cfg_input req = {0};
struct bnxt_pf_info *pf = &bp->pf;
+ int total_vf_tx_rings = 0;
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
@@ -460,6 +461,8 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
mutex_lock(&bp->hwrm_cmd_lock);
for (i = 0; i < num_vfs; i++) {
+ int vf_tx_rsvd = vf_tx_rings;
+
req.fid = cpu_to_le16(pf->first_vf_id + i);
rc = _hwrm_send_message(bp, &req, sizeof(req),
HWRM_CMD_TIMEOUT);
@@ -467,10 +470,15 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
break;
pf->active_vfs = i + 1;
pf->vf[i].fw_fid = le16_to_cpu(req.fid);
+ rc = __bnxt_hwrm_get_tx_rings(bp, pf->vf[i].fw_fid,
+ &vf_tx_rsvd);
+ if (rc)
+ break;
+ total_vf_tx_rings += vf_tx_rsvd;
}
mutex_unlock(&bp->hwrm_cmd_lock);
if (!rc) {
- pf->max_tx_rings -= vf_tx_rings * num_vfs;
+ pf->max_tx_rings -= total_vf_tx_rings;
pf->max_rx_rings -= vf_rx_rings * num_vfs;
pf->max_hw_ring_grps -= vf_ring_grps * num_vfs;
pf->max_cp_rings -= vf_cp_rings * num_vfs;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 12/14] bnxt_en: Set default completion ring for async events.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
With the added support for the bnxt_re RDMA driver, both drivers can be
allocating completion rings in any order. The firmware does not know
which completion ring should be receiving async events. Add an
extra step to tell firmware the completion ring number for receiving
async events after bnxt_en allocates the completion rings.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 338dbd0..1f54a7a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3858,6 +3858,30 @@ static int hwrm_ring_alloc_send_msg(struct bnxt *bp,
return rc;
}
+static int bnxt_hwrm_set_async_event_cr(struct bnxt *bp, int idx)
+{
+ int rc;
+
+ if (BNXT_PF(bp)) {
+ struct hwrm_func_cfg_input req = {0};
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
+ req.fid = cpu_to_le16(0xffff);
+ req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_ASYNC_EVENT_CR);
+ req.async_event_cr = cpu_to_le16(idx);
+ rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ } else {
+ struct hwrm_func_vf_cfg_input req = {0};
+
+ bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_VF_CFG, -1, -1);
+ req.enables =
+ cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_ASYNC_EVENT_CR);
+ req.async_event_cr = cpu_to_le16(idx);
+ rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+ }
+ return rc;
+}
+
static int bnxt_hwrm_ring_alloc(struct bnxt *bp)
{
int i, rc = 0;
@@ -3874,6 +3898,12 @@ static int bnxt_hwrm_ring_alloc(struct bnxt *bp)
goto err_out;
BNXT_CP_DB(cpr->cp_doorbell, cpr->cp_raw_cons);
bp->grp_info[i].cp_fw_ring_id = ring->fw_ring_id;
+
+ if (!i) {
+ rc = bnxt_hwrm_set_async_event_cr(bp, ring->fw_ring_id);
+ if (rc)
+ netdev_warn(bp->dev, "Failed to set async event completion ring.\n");
+ }
}
for (i = 0; i < bp->tx_nr_rings; i++) {
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 13/14] bnxt_en: Handle no aggregation ring gracefully.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
The current code assumes that we will always have at least 2 rx rings, 1
will be used as an aggregation ring for TPA and jumbo page placements.
However, it is possible, especially on a VF, that there is only 1 rx
ring available. In this scenario, the current code will fail to initialize.
To handle it, we need to properly set up only 1 ring without aggregation.
Set a new flag BNXT_FLAG_NO_AGG_RINGS for this condition and add logic to
set up the chip to place RX data linearly into a single buffer per packet.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 25 +++++++++++++++++++++----
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 +
2 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1f54a7a..98e9484 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2496,7 +2496,7 @@ void bnxt_set_ring_params(struct bnxt *bp)
agg_factor = min_t(u32, 4, 65536 / BNXT_RX_PAGE_SIZE);
bp->flags &= ~BNXT_FLAG_JUMBO;
- if (rx_space > PAGE_SIZE) {
+ if (rx_space > PAGE_SIZE && !(bp->flags & BNXT_FLAG_NO_AGG_RINGS)) {
u32 jumbo_factor;
bp->flags |= BNXT_FLAG_JUMBO;
@@ -6174,6 +6174,9 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features)
if (features & NETIF_F_LRO)
flags |= BNXT_FLAG_LRO;
+ if (bp->flags & BNXT_FLAG_NO_AGG_RINGS)
+ flags &= ~BNXT_FLAG_TPA;
+
if (features & NETIF_F_HW_VLAN_CTAG_RX)
flags |= BNXT_FLAG_STRIP_VLAN;
@@ -7040,8 +7043,17 @@ static int bnxt_get_dflt_rings(struct bnxt *bp, int *max_rx, int *max_tx,
int rc;
rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
- if (rc)
- return rc;
+ if (rc && (bp->flags & BNXT_FLAG_AGG_RINGS)) {
+ /* Not enough rings, try disabling agg rings. */
+ bp->flags &= ~BNXT_FLAG_AGG_RINGS;
+ rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
+ if (rc)
+ return rc;
+ bp->flags |= BNXT_FLAG_NO_AGG_RINGS;
+ bp->dev->hw_features &= ~NETIF_F_LRO;
+ bp->dev->features &= ~NETIF_F_LRO;
+ bnxt_set_ring_params(bp);
+ }
if (bp->flags & BNXT_FLAG_ROCE_CAP) {
int max_cp, max_stat, max_irq;
@@ -7236,7 +7248,12 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
bnxt_set_tpa_flags(bp);
bnxt_set_ring_params(bp);
bnxt_set_max_func_irqs(bp, max_irqs);
- bnxt_set_dflt_rings(bp);
+ rc = bnxt_set_dflt_rings(bp);
+ if (rc) {
+ netdev_err(bp->dev, "Not enough rings available.\n");
+ rc = -ENOMEM;
+ goto init_err;
+ }
/* Default RSS hash cfg. */
bp->rss_hash_cfg = VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4 |
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index d174729d..f6b9b1c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -950,6 +950,7 @@ struct bnxt {
#define BNXT_FLAG_ROCEV2_CAP 0x10000
#define BNXT_FLAG_ROCE_CAP (BNXT_FLAG_ROCEV1_CAP | \
BNXT_FLAG_ROCEV2_CAP)
+ #define BNXT_FLAG_NO_AGG_RINGS 0x20000
#define BNXT_FLAG_CHIP_NITRO_A0 0x1000000
#define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA | \
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 09/14] bnxt_en: Assign additional vnics to VFs.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
Assign additional vnics to VFs whenever possible so that NTUPLE can be
supported on the VFs.
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index c696025..0c9f6c1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -429,6 +429,8 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
vf_rx_rings = (pf->max_rx_rings - bp->rx_nr_rings) / num_vfs;
vf_ring_grps = (bp->pf.max_hw_ring_grps - bp->rx_nr_rings) / num_vfs;
vf_tx_rings = (pf->max_tx_rings - bp->tx_nr_rings) / num_vfs;
+ vf_vnics = (pf->max_vnics - bp->nr_vnics) / num_vfs;
+ vf_vnics = min_t(u16, vf_vnics, vf_rx_rings);
req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_MTU |
FUNC_CFG_REQ_ENABLES_MRU |
@@ -451,7 +453,6 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
req.num_rx_rings = cpu_to_le16(vf_rx_rings);
req.num_hw_ring_grps = cpu_to_le16(vf_ring_grps);
req.num_l2_ctxs = cpu_to_le16(4);
- vf_vnics = 1;
req.num_vnics = cpu_to_le16(vf_vnics);
/* FIXME spec currently uses 1 bit for stats ctx */
@@ -506,6 +507,8 @@ static int bnxt_sriov_enable(struct bnxt *bp, int *num_vfs)
min_rx_rings)
rx_ok = 1;
}
+ if (bp->pf.max_vnics - bp->nr_vnics < min_rx_rings)
+ rx_ok = 0;
if (bp->pf.max_tx_rings - bp->tx_nr_rings >= min_tx_rings)
tx_ok = 1;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next 14/14] MAINTAINERS: Add bnxt_en maintainer info.
From: Michael Chan @ 2016-12-29 17:13 UTC (permalink / raw)
To: davem; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
MAINTAINERS | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/MAINTAINERS b/MAINTAINERS
index cfff2c9..11904a9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2605,6 +2605,12 @@ L: netdev@vger.kernel.org
S: Supported
F: drivers/net/ethernet/broadcom/bnx2x/
+BROADCOM BNXT_EN 50 GIGABIT ETHERNET DRIVER
+M: Michael Chan <michael.chan@broadcom.com>
+L: netdev@vger.kernel.org
+S: Supported
+F: drivers/net/ethernet/broadcom/bnxt/
+
BROADCOM BCM281XX/BCM11XXX/BCM216XX ARM ARCHITECTURE
M: Florian Fainelli <f.fainelli@gmail.com>
M: Ray Jui <rjui@broadcom.com>
--
1.8.3.1
^ permalink raw reply related
* [PATCH v1 0/2] bpf: add longest prefix match map
From: Daniel Mack @ 2016-12-29 17:28 UTC (permalink / raw)
To: ast; +Cc: dh.herrmann, daniel, netdev, davem, Daniel Mack
This patch set adds a longest prefix match algorithm that can be used
to match IP addresses to a stored set of ranges. It is exposed as a
bpf map type.
Internally, data is stored in an unbalanced tree of nodes that has a
maximum height of n, where n is the prefixlen the trie was created
with.
Not that this has nothing to do with fib or fib6 and is in no way meant
to replace or share code with it. It's rather a much simpler
implementation that is specifically written with bpf maps in mind.
Patch 1/2 adds the implementation, and 2/2 an extensive test suite.
Feedback is much appreciated.
Thanks,
Daniel
Changelog:
rfc -> v1:
* Add __rcu pointer annotations to make sparse happy
* Fold _lpm_trie_find_target_node() into its only caller
* Fix some minor documentation issues
Daniel Mack (1):
bpf: add a longest prefix match trie map implementation
David Herrmann (1):
bpf: Add tests for the lpm trie map
include/uapi/linux/bpf.h | 7 +
kernel/bpf/Makefile | 2 +-
kernel/bpf/lpm_trie.c | 468 +++++++++++++++++++++++++++++
tools/testing/selftests/bpf/.gitignore | 1 +
tools/testing/selftests/bpf/Makefile | 4 +-
tools/testing/selftests/bpf/test_lpm_map.c | 348 +++++++++++++++++++++
6 files changed, 827 insertions(+), 3 deletions(-)
create mode 100644 kernel/bpf/lpm_trie.c
create mode 100644 tools/testing/selftests/bpf/test_lpm_map.c
--
2.9.3
^ permalink raw reply
* [PATCH v1 2/2] bpf: Add tests for the lpm trie map
From: Daniel Mack @ 2016-12-29 17:28 UTC (permalink / raw)
To: ast; +Cc: dh.herrmann, daniel, netdev, davem, Daniel Mack
In-Reply-To: <20161229172855.14910-1-daniel@zonque.org>
From: David Herrmann <dh.herrmann@gmail.com>
The first part of this program runs randomized tests against the
lpm-bpf-map. It implements a "Trivial Longest Prefix Match" (tlpm)
based on simple, linear, single linked lists. The implementation
should be pretty straightforward.
Based on tlpm, this inserts randomized data into bpf-lpm-maps and
verifies the trie-based bpf-map implementation behaves the same way
as tlpm.
The second part uses 'real world' IPv4 and IPv6 addresses and tests
the trie with those.
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: Daniel Mack <daniel@zonque.org>
---
tools/testing/selftests/bpf/.gitignore | 1 +
tools/testing/selftests/bpf/Makefile | 4 +-
tools/testing/selftests/bpf/test_lpm_map.c | 348 +++++++++++++++++++++++++++++
3 files changed, 351 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/selftests/bpf/test_lpm_map.c
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 071431b..d3b1c9b 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -1,3 +1,4 @@
test_verifier
test_maps
test_lru_map
+test_lpm_map
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7a5f245..064a3e5 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -1,8 +1,8 @@
CFLAGS += -Wall -O2 -I../../../../usr/include
-test_objs = test_verifier test_maps test_lru_map
+test_objs = test_verifier test_maps test_lru_map test_lpm_map
-TEST_PROGS := test_verifier test_maps test_lru_map test_kmod.sh
+TEST_PROGS := test_verifier test_maps test_lru_map test_lpm_map test_kmod.sh
TEST_FILES := $(test_objs)
all: $(test_objs)
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c
new file mode 100644
index 0000000..08db750
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -0,0 +1,348 @@
+/*
+ * Randomized tests for eBPF longest-prefix-match maps
+ *
+ * This program runs randomized tests against the lpm-bpf-map. It implements a
+ * "Trivial Longest Prefix Match" (tlpm) based on simple, linear, singly linked
+ * lists. The implementation should be pretty straightforward.
+ *
+ * Based on tlpm, this inserts randomized data into bpf-lpm-maps and verifies
+ * the trie-based bpf-map implementation behaves the same way as tlpm.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/bpf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+
+#include "bpf_sys.h"
+#include "bpf_util.h"
+
+struct tlpm_node {
+ struct tlpm_node *next;
+ size_t n_bits;
+ uint8_t key[];
+};
+
+static struct tlpm_node *tlpm_add(struct tlpm_node *list,
+ const uint8_t *key,
+ size_t n_bits)
+{
+ struct tlpm_node *node;
+ size_t n;
+
+ /* add new entry with @key/@n_bits to @list and return new head */
+
+ n = (n_bits + 7) / 8;
+ node = malloc(sizeof(*node) + n);
+ assert(node);
+
+ node->next = list;
+ node->n_bits = n_bits;
+ memcpy(node->key, key, n);
+
+ return node;
+}
+
+static void tlpm_clear(struct tlpm_node *list)
+{
+ struct tlpm_node *node;
+
+ /* free all entries in @list */
+
+ while ((node = list)) {
+ list = list->next;
+ free(node);
+ }
+}
+
+static struct tlpm_node *tlpm_match(struct tlpm_node *list,
+ const uint8_t *key,
+ size_t n_bits)
+{
+ struct tlpm_node *best = NULL;
+ size_t i;
+
+ /*
+ * Perform longest prefix-match on @key/@n_bits. That is, iterate all
+ * entries and match each prefix against @key. Remember the "best"
+ * entry we find (i.e., the longest prefix that matches) and return it
+ * to the caller when done.
+ */
+
+ for ( ; list; list = list->next) {
+ for (i = 0; i < n_bits && i < list->n_bits; ++i) {
+ if ((key[i / 8] & (1 << (7 - i % 8))) !=
+ (list->key[i / 8] & (1 << (7 - i % 8))))
+ break;
+ }
+
+ if (i >= list->n_bits) {
+ if (!best || i > best->n_bits)
+ best = list;
+ }
+ }
+
+ return best;
+}
+
+static void test_lpm_basic(void)
+{
+ struct tlpm_node *list = NULL, *t1, *t2;
+
+ /* very basic, static tests to verify tlpm works as expected */
+
+ assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+
+ t1 = list = tlpm_add(list, (uint8_t[]){ 0xff }, 8);
+ assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+ assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+ assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0x00 }, 16));
+ assert(!tlpm_match(list, (uint8_t[]){ 0x7f }, 8));
+ assert(!tlpm_match(list, (uint8_t[]){ 0xfe }, 8));
+ assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 7));
+
+ t2 = list = tlpm_add(list, (uint8_t[]){ 0xff, 0xff }, 16);
+ assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+ assert(t2 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+ assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 15));
+ assert(!tlpm_match(list, (uint8_t[]){ 0x7f, 0xff }, 16));
+
+ tlpm_clear(list);
+}
+
+static void test_lpm_order(void)
+{
+ struct tlpm_node *t1, *t2, *l1 = NULL, *l2 = NULL;
+ size_t i, j;
+
+ /*
+ * Verify the tlpm implementation works correctly regardless of the
+ * order of entries. Insert a random set of entries into @l1, and copy
+ * the same data in reverse order into @l2. Then verify a lookup of
+ * random keys will yield the same result in both sets.
+ */
+
+ for (i = 0; i < (1 << 12); ++i)
+ l1 = tlpm_add(l1, (uint8_t[]){
+ rand() % 0xff,
+ rand() % 0xff,
+ }, rand() % 16 + 1);
+
+ for (t1 = l1; t1; t1 = t1->next)
+ l2 = tlpm_add(l2, t1->key, t1->n_bits);
+
+ for (i = 0; i < (1 << 8); ++i) {
+ uint8_t key[] = { rand() % 0xff, rand() % 0xff };
+
+ t1 = tlpm_match(l1, key, 16);
+ t2 = tlpm_match(l2, key, 16);
+
+ assert(!t1 == !t2);
+ if (t1) {
+ assert(t1->n_bits == t2->n_bits);
+ for (j = 0; j < t1->n_bits; ++j)
+ assert((t1->key[j / 8] & (1 << (7 - j % 8))) ==
+ (t2->key[j / 8] & (1 << (7 - j % 8))));
+ }
+ }
+
+ tlpm_clear(l1);
+ tlpm_clear(l2);
+}
+
+static void test_lpm_map(void)
+{
+ size_t i, j, n_matches, n_nodes, n_lookups;
+ struct tlpm_node *t, *list = NULL;
+ struct bpf_lpm_trie_key *key;
+ uint8_t value[8] = {};
+ int r, map;
+
+ /*
+ * Compare behavior of tlpm vs. bpf-lpm. Create a randomized set of
+ * prefixes and insert it into both tlpm and bpf-lpm. Then run some
+ * randomized lookups and verify both maps return the same result.
+ */
+
+ n_matches = 0;
+ n_nodes = 1 << 8;
+ n_lookups = 1 << 16;
+
+ key = alloca(sizeof(*key) + 4);
+ memset(key, 0, sizeof(*key) + 4);
+
+ map = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+ sizeof(*key) + 4,
+ sizeof(value),
+ 4096,
+ 0);
+ assert(map >= 0);
+
+ for (i = 0; i < n_nodes; ++i) {
+ value[0] = rand() & 0xff;
+ value[1] = rand() & 0xff;
+ value[2] = rand() & 0xff;
+ value[3] = rand() & 0xff;
+ value[4] = rand() % 33;
+
+ list = tlpm_add(list, value, value[4]);
+
+ key->prefixlen = value[4];
+ memcpy(key->data, value, 4);
+ r = bpf_map_update(map, key, value, 0);
+ assert(!r);
+ }
+
+ for (i = 0; i < n_lookups; ++i) {
+ uint8_t data[] = {
+ rand() % 0xff,
+ rand() % 0xff,
+ rand() % 0xff,
+ rand() % 0xff
+ };
+
+ t = tlpm_match(list, data, 32);
+
+ key->prefixlen = 32;
+ memcpy(key->data, data, 4);
+ r = bpf_map_lookup(map, key, value);
+ assert(!r || errno == ENOENT);
+ assert(!t == !!r);
+
+ if (t) {
+ ++n_matches;
+ assert(t->n_bits == value[4]);
+ for (j = 0; j < t->n_bits; ++j)
+ assert((t->key[j / 8] & (1 << (7 - j % 8))) ==
+ (value[j / 8] & (1 << (7 - j % 8))));
+ }
+ }
+
+ close(map);
+ tlpm_clear(list);
+
+ /*
+ * With 255 random nodes in the map, we are pretty likely to match
+ * something on every lookup. For statistics, use this:
+ *
+ * printf(" nodes: %zu\n"
+ * "lookups: %zu\n"
+ * "matches: %zu\n", n_nodes, n_lookups, n_matches);
+ */
+}
+
+/* Test the implementation with some 'real world' examples */
+
+static void test_lpm_ipaddr(void)
+{
+ struct bpf_lpm_trie_key *key_ipv4;
+ struct bpf_lpm_trie_key *key_ipv6;
+ size_t key_size_ipv4;
+ size_t key_size_ipv6;
+ int map_fd_ipv4;
+ int map_fd_ipv6;
+ __u64 value;
+
+ key_size_ipv4 = sizeof(*key_ipv4) + sizeof(__u32);
+ key_size_ipv6 = sizeof(*key_ipv6) + sizeof(__u32) * 4;
+ key_ipv4 = alloca(key_size_ipv4);
+ key_ipv6 = alloca(key_size_ipv6);
+
+ map_fd_ipv4 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+ key_size_ipv4, sizeof(value),
+ 100, 0);
+ assert(map_fd_ipv4 >= 0);
+
+ map_fd_ipv6 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE,
+ key_size_ipv6, sizeof(value),
+ 100, 0);
+ assert(map_fd_ipv6 >= 0);
+
+ /* Fill data some IPv4 and IPv6 address ranges */
+ value = 1;
+ key_ipv4->prefixlen = 16;
+ inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+ assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+ value = 2;
+ key_ipv4->prefixlen = 24;
+ inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+ assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+ value = 3;
+ key_ipv4->prefixlen = 24;
+ inet_pton(AF_INET, "192.168.128.0", key_ipv4->data);
+ assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+ value = 5;
+ key_ipv4->prefixlen = 24;
+ inet_pton(AF_INET, "192.168.1.0", key_ipv4->data);
+ assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+ value = 4;
+ key_ipv4->prefixlen = 23;
+ inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+ assert(bpf_map_update(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+ value = 0xdeadbeef;
+ key_ipv6->prefixlen = 64;
+ inet_pton(AF_INET6, "2a00:1450:4001:814::200e", key_ipv6->data);
+ assert(bpf_map_update(map_fd_ipv6, key_ipv6, &value, 0) == 0);
+
+ /* Set tprefixlen to maximum for lookups */
+ key_ipv4->prefixlen = 32;
+ key_ipv6->prefixlen = 128;
+
+ /* Test some lookups that should come back with a value */
+ inet_pton(AF_INET, "192.168.128.23", key_ipv4->data);
+ assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == 0);
+ assert(value == 3);
+
+ inet_pton(AF_INET, "192.168.0.1", key_ipv4->data);
+ assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == 0);
+ assert(value == 2);
+
+ inet_pton(AF_INET6, "2a00:1450:4001:814::", key_ipv6->data);
+ assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == 0);
+ assert(value == 0xdeadbeef);
+
+ inet_pton(AF_INET6, "2a00:1450:4001:814::1", key_ipv6->data);
+ assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == 0);
+ assert(value == 0xdeadbeef);
+
+ /* Test some lookups that should not match any entry */
+ inet_pton(AF_INET, "10.0.0.1", key_ipv4->data);
+ assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == -1 &&
+ errno == ENOENT);
+
+ inet_pton(AF_INET, "11.11.11.11", key_ipv4->data);
+ assert(bpf_map_lookup(map_fd_ipv4, key_ipv4, &value) == -1 &&
+ errno == ENOENT);
+
+ inet_pton(AF_INET6, "2a00:ffff::", key_ipv6->data);
+ assert(bpf_map_lookup(map_fd_ipv6, key_ipv6, &value) == -1 &&
+ errno == ENOENT);
+
+ close(map_fd_ipv4);
+ close(map_fd_ipv6);
+}
+
+int main(void)
+{
+ /* we want predictable, pseudo random tests */
+ srand(0xf00ba1);
+
+ test_lpm_basic();
+ test_lpm_order();
+ test_lpm_map();
+ test_lpm_ipaddr();
+
+ printf("test_lpm: OK\n");
+ return 0;
+}
--
2.9.3
^ permalink raw reply related
* [PATCH v1 1/2] bpf: add a longest prefix match trie map implementation
From: Daniel Mack @ 2016-12-29 17:28 UTC (permalink / raw)
To: ast; +Cc: dh.herrmann, daniel, netdev, davem, Daniel Mack
In-Reply-To: <20161229172855.14910-1-daniel@zonque.org>
This trie implements a longest prefix match algorithm that can be used
to match IP addresses to a stored set of ranges.
Internally, data is stored in an unbalanced trie of nodes that has a
maximum height of n, where n is the prefixlen the trie was created
with.
Tries may be created with prefix lengths that are multiples of 8, in
the range from 8 to 2048. The key used for lookup and update operations
is a struct bpf_lpm_trie_key, and the value is a uint64_t.
The code carries more information about the internal implementation.
Signed-off-by: Daniel Mack <daniel@zonque.org>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
---
include/uapi/linux/bpf.h | 7 +
kernel/bpf/Makefile | 2 +-
kernel/bpf/lpm_trie.c | 468 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 476 insertions(+), 1 deletion(-)
create mode 100644 kernel/bpf/lpm_trie.c
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0eb0e87..d564277 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -63,6 +63,12 @@ struct bpf_insn {
__s32 imm; /* signed immediate constant */
};
+/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
+struct bpf_lpm_trie_key {
+ __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */
+ __u8 data[0]; /* Arbitrary size */
+};
+
/* BPF syscall commands, see bpf(2) man-page for details. */
enum bpf_cmd {
BPF_MAP_CREATE,
@@ -89,6 +95,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_CGROUP_ARRAY,
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ BPF_MAP_TYPE_LPM_TRIE,
};
enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1276474..e1ce4f4 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
new file mode 100644
index 0000000..8b6a61d
--- /dev/null
+++ b/kernel/bpf/lpm_trie.c
@@ -0,0 +1,468 @@
+/*
+ * Longest prefix match list implementation
+ *
+ * Copyright (c) 2016 Daniel Mack
+ * Copyright (c) 2016 David Herrmann
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <net/ipv6.h>
+
+/* Intermediate node */
+#define LPM_TREE_NODE_FLAG_IM BIT(0)
+
+struct lpm_trie_node;
+
+struct lpm_trie_node {
+ struct rcu_head rcu;
+ struct lpm_trie_node __rcu *child[2];
+ u32 prefixlen;
+ u32 flags;
+ u64 value;
+ u8 data[0];
+};
+
+struct lpm_trie {
+ struct bpf_map map;
+ struct lpm_trie_node __rcu *root;
+ size_t n_entries;
+ size_t max_prefixlen;
+ size_t data_size;
+ spinlock_t lock;
+};
+
+/*
+ * This trie implements a longest prefix match algorithm that can be used to
+ * match IP addresses to a stored set of ranges.
+ *
+ * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
+ * interpreted as big endian, so data[0] stores the most significant byte.
+ *
+ * Match ranges are internally stored in instances of struct lpm_trie_node
+ * which each contain their prefix length as well as two pointers that may
+ * lead to more nodes containing more specific matches. Each node also stores
+ * a value that is defined by and returned to userspace via the update_elem
+ * and lookup functions.
+ *
+ * For instance, let's start with a trie that was created with a prefix length
+ * of 32, so it can be used for IPv4 addresses, and one single element that
+ * matches 192.168.0.0/16. The data array would hence contain
+ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
+ * stick to IP-address notation for readability though.
+ *
+ * As the trie is empty initially, the new node (1) will be places as root
+ * node, denoted as (R) in the example below. As there are no other node, both
+ * child pointers are %NULL.
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ *
+ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). In child index depends on the next bit
+ * that is outside of that (1) matches, and that bit is 0, so (2) will be
+ * child[0] of (1):
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * |
+ * +----------------+
+ * | (2) |
+ * | 192.168.0.0/24 |
+ * | value: 2 |
+ * | [0] [1] |
+ * +----------------+
+ *
+ * The child[1] slot of (1) could be filled with another node which has bit #17
+ * (the next bit after the ones that (1) matches on) set to 1. For instance,
+ * 192.168.128.0/24:
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * | |
+ * +----------------+ +------------------+
+ * | (2) | | (3) |
+ * | 192.168.0.0/24 | | 192.168.128.0/24 |
+ * | value: 2 | | value: 3 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +------------------+
+ *
+ * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
+ * it, node (1) is looked at first, and because (4) of the semantics laid out
+ * above (bit #17 is 0), it would normally be attached to (1) as child[0].
+ * However, that slot is already allocated, so a new node is needed in between.
+ * That node is does not have a value attached to it and it will never be
+ * returned to users as result of a lookup. It is only there to differenciate
+ * the traversal further. It will get a prefix as wide as necessary to
+ * distinguish its two children:
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * | |
+ * +----------------+ +------------------+
+ * | (4) (I) | | (3) |
+ * | 192.168.0.0/23 | | 192.168.128.0/24 |
+ * | value: --- | | value: 3 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +------------------+
+ * | |
+ * +----------------+ +----------------+
+ * | (2) | | (5) |
+ * | 192.168.0.0/24 | | 192.168.1.0/24 |
+ * | value: 2 | | value: 5 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +----------------+
+ *
+ * 192.168.1.1/32 would be a child of (5) etc.
+ *
+ * An intermediate node will be turned into a 'real' node on demand. In the
+ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
+ *
+ * A fully populated trie would have a height of 32 nodes, as the trie was
+ * created with a prefix length of 32.
+ *
+ * The lookup starts at the root node. If the current node matches and if there
+ * is a child that can be used to become more specific, the trie is traversed
+ * downwards. The last node in the traversal that is a non-intermediate one is
+ * returned.
+ */
+
+static inline int extract_bit(const u8 *data, size_t index)
+{
+ return !!(data[index / 8] & (1 << (7 - (index % 8))));
+}
+
+/**
+ * longest_prefix_match() - determine the longest prefix
+ * @trie: The trie to get internal sizes from
+ * @node: The node to operate on
+ * @key: The key to compare to @node
+ *
+ * Determine the longest prefix of @node that matches the bits in @key.
+ */
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key *key)
+{
+ size_t prefixlen = 0;
+ int i;
+
+ for (i = 0; i < trie->data_size; i++) {
+ size_t b;
+
+ b = 8 - fls(node->data[i] ^ key->data[i]);
+ prefixlen += b;
+
+ if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
+ return min(node->prefixlen, key->prefixlen);
+
+ if (b < 8)
+ break;
+ }
+
+ return prefixlen;
+}
+
+/* Called from syscall or from eBPF program */
+static void *trie_lookup_elem(struct bpf_map *map, void *_key)
+{
+ struct lpm_trie_node *node, *found = NULL;
+ struct bpf_lpm_trie_key *key = _key;
+ struct lpm_trie *trie =
+ container_of(map, struct lpm_trie, map);
+
+ /* Start walking the trie from the root node ... */
+
+ for (node = rcu_dereference(trie->root); node;) {
+ unsigned int next_bit;
+ size_t matchlen;
+
+ /*
+ * Determine the longest prefix of @node that matches @key.
+ * If it's the maximum possible prefix for this trie, we have
+ * an exact match and can return it directly.
+ */
+ matchlen = longest_prefix_match(trie, node, key);
+ if (matchlen == trie->max_prefixlen)
+ return &node->value;
+
+ /*
+ * If the number of bits that match is smaller than the prefix
+ * length of @node, bail out and return the node we have seen
+ * last in the traversal (ie, the parent).
+ */
+ if (matchlen < node->prefixlen)
+ break;
+
+ /*
+ * Consider this node as return candidate unless it is an
+ * artificially added intermediate one
+ */
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ found = node;
+
+ /*
+ * If the node match is fully satisfied, let's see if we can
+ * become more specific. Determine the next bit in the key and
+ * traverse down.
+ */
+ next_bit = extract_bit(key->data, node->prefixlen);
+ node = rcu_dereference(node->child[next_bit]);
+ }
+
+ return found ? &found->value : NULL;
+}
+
+static struct lpm_trie_node *lpm_trie_node_alloc(size_t data_size)
+{
+ return kmalloc(sizeof(struct lpm_trie_node) + data_size,
+ GFP_ATOMIC | __GFP_NOWARN);
+}
+
+/* Called from syscall or from eBPF program */
+static int trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *node, *im_node, *new_node = NULL;
+ struct lpm_trie_node __rcu **slot;
+ struct bpf_lpm_trie_key *key = _key;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ spin_lock(&trie->lock);
+
+ /* Allocate and fill a new node */
+
+ if (trie->n_entries == trie->map.max_entries) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ new_node = lpm_trie_node_alloc(trie->data_size);
+ if (!new_node) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ trie->n_entries++;
+ new_node->value = *(u64 *) value;
+ new_node->prefixlen = key->prefixlen;
+ new_node->flags = 0;
+ new_node->child[0] = NULL;
+ new_node->child[1] = NULL;
+ memcpy(new_node->data, key->data, trie->data_size);
+
+ /*
+ * Now find a slot to attach the new node. To do that, walk the tree
+ * from the root match as many bits as possible for each node until we
+ * either find an empty slot or a slot that needs to be replaced by an
+ * intermediate node.
+ */
+ slot = &trie->root;
+
+ while ((node = rcu_dereference_protected(*slot,
+ lockdep_is_held(&trie->lock)))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen ||
+ node->prefixlen == trie->max_prefixlen)
+ break;
+
+ next_bit = extract_bit(key->data, node->prefixlen);
+ slot = &node->child[next_bit];
+ }
+
+ /*
+ * If the slot is empty (a free child pointer or an empty root),
+ * simply assign the @new_node to that slot and be done.
+ */
+ if (!node) {
+ rcu_assign_pointer(*slot, new_node);
+ goto out;
+ }
+
+ /*
+ * If the slot we picked already exists, replace it with @new_node
+ * which already has the correct data array and value set.
+ */
+ if (node->prefixlen == matchlen) {
+ new_node->child[0] = node->child[0];
+ new_node->child[1] = node->child[1];
+
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ trie->n_entries--;
+
+ rcu_assign_pointer(*slot, new_node);
+ kfree_rcu(node, rcu);
+
+ goto out;
+ }
+
+ /*
+ * If the new node matches the prefix completely, it must be an
+ * inserted as an ancestor. Simply insert it between @node and @*slot.
+ */
+ if (matchlen == key->prefixlen) {
+ next_bit = extract_bit(node->data, matchlen);
+ rcu_assign_pointer(new_node->child[next_bit], node);
+ rcu_assign_pointer(*slot, new_node);
+ goto out;
+ }
+
+ im_node = lpm_trie_node_alloc(trie->data_size);
+ if (!im_node) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ im_node->prefixlen = matchlen;
+ im_node->flags |= LPM_TREE_NODE_FLAG_IM;
+ memcpy(im_node->data, node->data, trie->data_size);
+
+ /* Now determine which child to install in which slot */
+ if (extract_bit(key->data, matchlen)) {
+ rcu_assign_pointer(im_node->child[0], node);
+ rcu_assign_pointer(im_node->child[1], new_node);
+ } else {
+ rcu_assign_pointer(im_node->child[0], new_node);
+ rcu_assign_pointer(im_node->child[1], node);
+ }
+
+ /* Finally, assign the intermediate node to the determined spot */
+ rcu_assign_pointer(*slot, im_node);
+
+out:
+ if (ret) {
+ if (new_node)
+ trie->n_entries--;
+
+ kfree(new_node);
+ kfree(im_node);
+ }
+
+ spin_unlock(&trie->lock);
+
+ return ret;
+}
+
+static struct bpf_map *trie_alloc(union bpf_attr *attr)
+{
+ struct lpm_trie *trie;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->map_flags ||
+ attr->key_size < sizeof(struct bpf_lpm_trie_key) + 1 ||
+ attr->key_size > sizeof(struct bpf_lpm_trie_key) + 256 ||
+ attr->value_size != sizeof(u64))
+ return ERR_PTR(-EINVAL);
+
+ trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+ if (!trie)
+ return NULL;
+
+ /* copy mandatory map attributes */
+ trie->map.map_type = attr->map_type;
+ trie->map.key_size = attr->key_size;
+ trie->map.value_size = attr->value_size;
+ trie->map.max_entries = attr->max_entries;
+ trie->data_size = attr->key_size -
+ offsetof(struct bpf_lpm_trie_key, data);
+ trie->max_prefixlen = trie->data_size * 8;
+
+ spin_lock_init(&trie->lock);
+
+ return &trie->map;
+}
+
+static void trie_free(struct bpf_map *map)
+{
+ struct lpm_trie_node __rcu **slot;
+ struct lpm_trie_node *node;
+ struct lpm_trie *trie =
+ container_of(map, struct lpm_trie, map);
+
+ spin_lock(&trie->lock);
+
+ /*
+ * Always start at the root and walk down to a node that has no
+ * children. Then free that node, nullify its pointer in the parent,
+ * then start over.
+ */
+
+ for (;;) {
+ slot = &trie->root;
+
+ for (;;) {
+ node = rcu_dereference_protected(*slot,
+ lockdep_is_held(&trie->lock));
+ if (!node)
+ goto out;
+
+ if (node->child[0]) {
+ slot = &node->child[0];
+ continue;
+ }
+
+ if (node->child[1]) {
+ slot = &node->child[1];
+ continue;
+ }
+
+ kfree(node);
+ rcu_assign_pointer(*slot, NULL);
+ break;
+ }
+ }
+
+out:
+ spin_unlock(&trie->lock);
+}
+
+static const struct bpf_map_ops trie_ops = {
+ .map_alloc = trie_alloc,
+ .map_free = trie_free,
+ .map_lookup_elem = trie_lookup_elem,
+ .map_update_elem = trie_update_elem,
+};
+
+static struct bpf_map_type_list trie_type __read_mostly = {
+ .ops = &trie_ops,
+ .type = BPF_MAP_TYPE_LPM_TRIE,
+};
+
+static int __init register_trie_map(void)
+{
+ bpf_register_map_type(&trie_type);
+ return 0;
+}
+late_initcall(register_trie_map);
--
2.9.3
^ permalink raw reply related
* RE: [Open-FCoE] [PATCH RFC net-next 1/5] qed: Add support for hardware offloaded FCoE.
From: Mintz, Yuval @ 2016-12-29 17:28 UTC (permalink / raw)
To: Hannes Reinecke, Dupuis, Chad, martin.petersen@oracle.com
Cc: fcoe-devel@open-fcoe.org, netdev@vger.kernel.org,
Dept-Eng QLogic Storage Upstream, linux-scsi@vger.kernel.org
In-Reply-To: <371bc2ac-c630-0e95-29aa-3fa16fe0c764@suse.de>
> > +struct fcoe_tstorm_fcoe_task_st_ctx_read_write {
> > + union fcoe_cleanup_addr_exp_ro_union
> cleanup_addr_exp_ro_union;
> > + __le16 flags;
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_MASK
> 0x7
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_SHIFT 0
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_SHIFT
> 3
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_SHIFT 4
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_SHIFT 5
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_MASK
> 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_SHIFT
> 6
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_MAS
> K 0x1
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_SHIFT
> 7
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_MASK
> 0x3
> > +#define
> FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_SHIFT 8
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_MASK
> 0x3F
> > +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_SHIFT
> 10
>
> A very odd way of defining a bitfield ...
> Why not use a 'normal' bitfield here?
This is the format of our generated firmware HSI; We already have
Thousands of definitions using this same format.
^ permalink raw reply
* [PATCH net-next] sctp: refactor sctp_datamsg_from_user
From: Marcelo Ricardo Leitner @ 2016-12-29 17:53 UTC (permalink / raw)
To: netdev; +Cc: linux-sctp, Neil Horman, Vlad Yasevich
This patch refactors sctp_datamsg_from_user() in an attempt to make it
better to read and avoid code duplication for handling the last
fragment.
It also avoids doing division and remaining operations. Even though, it
should still operate similarly as before this patch.
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
net/sctp/chunk.c | 107 +++++++++++++++++--------------------------------------
1 file changed, 32 insertions(+), 75 deletions(-)
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 615f0ddd41dfb1ff46a9d4e564716de8e7b60ea6..e3621cb4827fadb5f5cb41ebe8455dfa3300a765 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -165,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
struct sctp_sndrcvinfo *sinfo,
struct iov_iter *from)
{
- int max, whole, i, offset, over, err;
- int len, first_len;
- int max_data;
+ size_t len, first_len, max_data, remaining;
+ size_t msg_len = iov_iter_count(from);
+ struct list_head *pos, *temp;
struct sctp_chunk *chunk;
struct sctp_datamsg *msg;
- struct list_head *pos, *temp;
- size_t msg_len = iov_iter_count(from);
- __u8 frag;
+ int err;
msg = sctp_datamsg_new(GFP_KERNEL);
if (!msg)
@@ -185,7 +183,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
(SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) ||
!SCTP_PR_POLICY(sinfo->sinfo_flags)))
msg->expires_at = jiffies +
- msecs_to_jiffies(sinfo->sinfo_timetolive);
+ msecs_to_jiffies(sinfo->sinfo_timetolive);
/* This is the biggest possible DATA chunk that can fit into
* the packet
@@ -195,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
max_data = SCTP_TRUNC4(max_data);
- max = asoc->frag_point;
/* If the the peer requested that we authenticate DATA chunks
* we need to account for bundling of the AUTH chunks along with
* DATA.
@@ -208,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
hmac_desc->hmac_len);
}
- /* Now, check if we need to reduce our max */
- if (max > max_data)
- max = max_data;
+ /* Check what's our max considering the above */
+ max_data = min_t(size_t, max_data, asoc->frag_point);
- whole = 0;
- first_len = max;
+ /* Set first_len and then account for possible bundles on first frag */
+ first_len = max_data;
/* Check to see if we have a pending SACK and try to let it be bundled
* with this message. Do this if we don't have any data queued already.
@@ -224,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
asoc->outqueue.out_qlen == 0 &&
list_empty(&asoc->outqueue.retransmit) &&
- msg_len > max)
- max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
+ msg_len > max_data)
+ first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
/* Encourage Cookie-ECHO bundling. */
if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
- max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
-
- /* Now that we adjusted completely, reset first_len */
- if (first_len > max_data)
- first_len = max_data;
+ first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
/* Account for a different sized first fragment */
if (msg_len >= first_len) {
- msg_len -= first_len;
- whole = 1;
msg->can_delay = 0;
- }
-
- /* How many full sized? How many bytes leftover? */
- whole += msg_len / max;
- over = msg_len % max;
- offset = 0;
-
- if ((whole > 1) || (whole && over))
SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
+ } else {
+ /* Which may be the only one... */
+ first_len = msg_len;
+ }
- /* Create chunks for all the full sized DATA chunks. */
- for (i = 0, len = first_len; i < whole; i++) {
- frag = SCTP_DATA_MIDDLE_FRAG;
+ /* Create chunks for all DATA chunks. */
+ for (remaining = msg_len; remaining; remaining -= len) {
+ u8 frag = SCTP_DATA_MIDDLE_FRAG;
- if (0 == i)
+ if (remaining == msg_len) {
+ /* First frag, which may also be the last */
frag |= SCTP_DATA_FIRST_FRAG;
+ len = first_len;
+ } else {
+ /* Middle frags */
+ len = max_data;
+ }
- if ((i == (whole - 1)) && !over) {
+ if (len >= remaining) {
+ /* Last frag, which may also be the first */
+ len = remaining;
frag |= SCTP_DATA_LAST_FRAG;
/* The application requests to set the I-bit of the
@@ -271,7 +265,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
0, GFP_KERNEL);
-
if (!chunk) {
err = -ENOMEM;
goto errout;
@@ -282,45 +275,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
goto errout_chunk_free;
/* Put the chunk->skb back into the form expected by send. */
- __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
- - (__u8 *)chunk->skb->data);
-
- sctp_datamsg_assign(msg, chunk);
- list_add_tail(&chunk->frag_list, &msg->chunks);
-
- /* The first chunk, the first chunk was likely short
- * to allow bundling, so reset to full size.
- */
- if (0 == i)
- len = max;
- }
-
- /* .. now the leftover bytes. */
- if (over) {
- if (!whole)
- frag = SCTP_DATA_NOT_FRAG;
- else
- frag = SCTP_DATA_LAST_FRAG;
-
- if ((sinfo->sinfo_flags & SCTP_EOF) ||
- (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
- frag |= SCTP_DATA_SACK_IMM;
-
- chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
- 0, GFP_KERNEL);
-
- if (!chunk) {
- err = -ENOMEM;
- goto errout;
- }
-
- err = sctp_user_addto_chunk(chunk, over, from);
-
- /* Put the chunk->skb back into the form expected by send. */
- __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
- - (__u8 *)chunk->skb->data);
- if (err < 0)
- goto errout_chunk_free;
+ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr -
+ chunk->skb->data);
sctp_datamsg_assign(msg, chunk);
list_add_tail(&chunk->frag_list, &msg->chunks);
@@ -338,6 +294,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
sctp_chunk_free(chunk);
}
sctp_datamsg_put(msg);
+
return ERR_PTR(err);
}
--
2.9.3
^ permalink raw reply related
* Re: [PATCH iproute2 net-next] tc: flower: support matching flags
From: Stephen Hemminger @ 2016-12-29 18:43 UTC (permalink / raw)
To: Paul Blakey
Cc: netdev, David S. Miller, Hadar Hen Zion, Or Gerlitz, Roi Dayan
In-Reply-To: <1482930409-55059-1-git-send-email-paulb@mellanox.com>
On Wed, 28 Dec 2016 15:06:49 +0200
Paul Blakey <paulb@mellanox.com> wrote:
> Enhance flower to support matching on flags.
>
> The 1st flag allows to match on whether the packet is
> an IP fragment.
>
> Example:
>
> # add a flower filter that will drop fragmented packets
> # (bit 0 of control flags)
> tc filter add dev ens4f0 protocol ip parent ffff: \
> flower \
> src_mac e4:1d:2d:fd:8b:01 \
> dst_mac e4:1d:2d:fd:8b:02 \
> indev ens4f0 \
> matching_flags 0x1/0x1 \
> action drop
>
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
> Reviewed-by: Roi Dayan <roid@mellanox.com>
Applied. Had to manually fixup merge conflicts with other flower changes.
^ permalink raw reply
* Re: [PATCH iproute2] fix typo in ip-xfrm man page, rmd610 -> rmd160
From: Stephen Hemminger @ 2016-12-29 18:44 UTC (permalink / raw)
To: Alexey Kodanev; +Cc: netdev, Vasily Isaenko
In-Reply-To: <1482490996-17048-1-git-send-email-alexey.kodanev@oracle.com>
On Fri, 23 Dec 2016 14:03:16 +0300
Alexey Kodanev <alexey.kodanev@oracle.com> wrote:
> Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Applied.
^ permalink raw reply
* Re: [PATCH] tc: add missing limits.h header
From: Stephen Hemminger @ 2016-12-29 18:44 UTC (permalink / raw)
To: Baruch Siach; +Cc: netdev
In-Reply-To: <c18609c5d906bbba8176138cfafd88c4a38c8190.1482432767.git.baruch@tkos.co.il>
On Thu, 22 Dec 2016 20:52:48 +0200
Baruch Siach <baruch@tkos.co.il> wrote:
> This fixes under musl build issues like:
>
> f_matchall.c: In function ‘matchall_parse_opt’:
> f_matchall.c:48:12: error: ‘LONG_MIN’ undeclared (first use in this function)
> if (h == LONG_MIN || h == LONG_MAX) {
> ^
> f_matchall.c:48:12: note: each undeclared identifier is reported only once for each function it appears in
> f_matchall.c:48:29: error: ‘LONG_MAX’ undeclared (first use in this function)
> if (h == LONG_MIN || h == LONG_MAX) {
> ^
>
> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
Sure, applied
^ permalink raw reply
* Re: [PATCH net] rtnl: stats - add missing netlink message size checks
From: David Miller @ 2016-12-29 19:06 UTC (permalink / raw)
To: minipli; +Cc: netdev, roopa
In-Reply-To: <1482943935-18052-1-git-send-email-minipli@googlemail.com>
From: Mathias Krause <minipli@googlemail.com>
Date: Wed, 28 Dec 2016 17:52:15 +0100
> We miss to check if the netlink message is actually big enough to contain
> a struct if_stats_msg.
>
> Add a check to prevent userland from sending us short messages that would
> make us access memory beyond the end of the message.
>
> Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump...")
> Signed-off-by: Mathias Krause <minipli@googlemail.com>
Looks good, applied and queued up for -stable.
^ permalink raw reply
* Re: [PATCH v2] net: fix incorrect original ingress device index in PKTINFO
From: David Miller @ 2016-12-29 19:08 UTC (permalink / raw)
To: asuka.com; +Cc: kuznet, jmorris, yoshfuji, kaber, dsa, netdev, linux-kernel
In-Reply-To: <1483001104-17614-1-git-send-email-asuka.com@163.com>
From: Wei Zhang <asuka.com@163.com>
Date: Thu, 29 Dec 2016 16:45:04 +0800
> When we send a packet for our own local address on a non-loopback
> interface (e.g. eth0), due to the change had been introduced from
> commit 0b922b7a829c ("net: original ingress device index in PKTINFO"), the
> original ingress device index would be set as the loopback interface.
> However, the packet should be considered as if it is being arrived via the
> sending interface (eth0), otherwise it would break the expectation of the
> userspace application (e.g. the DHCPRELEASE message from dhcp_release
> binary would be ignored by the dnsmasq daemon, since it come from lo which
> is not the interface dnsmasq bind to)
>
> Fixes: 0b922b7a829c ("net: original ingress device index in PKTINFO")
> Acked-by: David Ahern <dsa@cumulusnetworks.com>
> Signed-off-by: Wei Zhang <asuka.com@163.com>
Applied and queued up for -stable.
^ permalink raw reply
* Re: [PATCH v4] net: dev_weight: TX/RX orthogonality
From: David Miller @ 2016-12-29 19:08 UTC (permalink / raw)
To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel
In-Reply-To: <1483005521-27799-1-git-send-email-matthias.tafelmeier@gmx.net>
From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Date: Thu, 29 Dec 2016 10:58:41 +0100
> Oftenly, introducing side effects on packet processing on the other half
> of the stack by adjusting one of TX/RX via sysctl is not desirable.
> There are cases of demand for asymmetric, orthogonal configurability.
>
> This holds true especially for nodes where RPS for RFS usage on top is
> configured and therefore use the 'old dev_weight'. This is quite a
> common base configuration setup nowadays, even with NICs of superior processing
> support (e.g. aRFS).
>
> A good example use case are nodes acting as noSQL data bases with a
> large number of tiny requests and rather fewer but large packets as responses.
> It's affordable to have large budget and rx dev_weights for the
> requests. But as a side effect having this large a number on TX
> processed in one run can overwhelm drivers.
>
> This patch therefore introduces an independent configurability via sysctl to
> userland.
This is missing a signoff.
^ permalink raw reply
* Re: [PATCH net 0/5] mlx4 misc fixes
From: David Miller @ 2016-12-29 19:18 UTC (permalink / raw)
To: tariqt; +Cc: netdev, eranbe
In-Reply-To: <1483029433-3624-1-git-send-email-tariqt@mellanox.com>
From: Tariq Toukan <tariqt@mellanox.com>
Date: Thu, 29 Dec 2016 18:37:08 +0200
> This patchset contains several bug fixes from the team to the
> mlx4 Eth and Core drivers.
>
> Series generated against net commit:
> 60133867f1f1 'net: wan: slic_ds26522: fix spelling mistake: "configurated" -> "configured"'
Series applied, thank you.
^ permalink raw reply
* [PATCH v4] net: dev_weight: TX/RX orthogonality
From: Matthias Tafelmeier @ 2016-12-29 19:23 UTC (permalink / raw)
To: netdev; +Cc: hagen, fw, edumazet, daniel
In-Reply-To: <20161229.140854.1456743873101323068.davem@davemloft.net>
Oftenly, introducing side effects on packet processing on the other half
of the stack by adjusting one of TX/RX via sysctl is not desirable.
There are cases of demand for asymmetric, orthogonal configurability.
This holds true especially for nodes where RPS for RFS usage on top is
configured and therefore use the 'old dev_weight'. This is quite a
common base configuration setup nowadays, even with NICs of superior processing
support (e.g. aRFS).
A good example use case are nodes acting as noSQL data bases with a
large number of tiny requests and rather fewer but large packets as responses.
It's affordable to have large budget and rx dev_weights for the
requests. But as a side effect having this large a number on TX
processed in one run can overwhelm drivers.
This patch therefore introduces an independent configurability via sysctl to
userland.
Signed-off-by: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
---
Documentation/sysctl/net.txt | 21 +++++++++++++++++++++
include/linux/netdevice.h | 4 ++++
net/core/dev.c | 6 +++++-
net/core/sysctl_net_core.c | 31 ++++++++++++++++++++++++++++++-
net/sched/sch_generic.c | 2 +-
5 files changed, 61 insertions(+), 3 deletions(-)
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index f0480f7..53cef32 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -61,6 +61,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
it's a Per-CPU variable.
Default: 64
+dev_weight_rx_bias
+--------------
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
+of the driver for the per softirq cycle netdev_budget. This parameter influences
+the proportion of the configured netdev_budget that is spent on RPS based packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
+on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
+Default: 1
+
+dev_weight_tx_bias
+--------------
+
+Scales the maximum number of packets that can be processed during a TX softirq cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+Default: 1
+
default_qdisc
--------------
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f742..ecd78b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3795,6 +3795,10 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
extern int netdev_max_backlog;
extern int netdev_tstamp_prequeue;
extern int weight_p;
+extern int dev_weight_rx_bias;
+extern int dev_weight_tx_bias;
+extern int dev_rx_weight;
+extern int dev_tx_weight;
bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8db5a0b..f2fe98b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3428,6 +3428,10 @@ EXPORT_SYMBOL(netdev_max_backlog);
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64; /* old backlog weight */
+int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
+int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
+int dev_rx_weight __read_mostly = weight_p;
+int dev_tx_weight __read_mostly = weight_p;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4833,7 +4837,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
net_rps_action_and_irq_enable(sd);
}
- napi->weight = weight_p;
+ napi->weight = dev_rx_weight;
while (again) {
struct sk_buff *skb;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2a46e40..698ddd7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write,
}
#endif
+static int proc_do_dev_weight(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret != 0)
+ return ret;
+
+ dev_rx_weight = weight_p * dev_weight_rx_bias;
+ dev_tx_weight = weight_p * dev_weight_tx_bias;
+
+ return ret;
+}
+
static int proc_do_rss_key(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = {
.data = &weight_p,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "dev_weight_rx_bias",
+ .data = &dev_weight_rx_bias,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
+ },
+ {
+ .procname = "dev_weight_tx_bias",
+ .data = &dev_weight_tx_bias,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_dev_weight,
},
{
.procname = "netdev_max_backlog",
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6eb9c8e..b052b27 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets)
void __qdisc_run(struct Qdisc *q)
{
- int quota = weight_p;
+ int quota = dev_tx_weight;
int packets;
while (qdisc_restart(q, &packets)) {
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net-next 00/14] bnxt_en: updates for net-next.
From: David Miller @ 2016-12-29 19:42 UTC (permalink / raw)
To: michael.chan; +Cc: netdev
In-Reply-To: <1483031624-20076-1-git-send-email-michael.chan@broadcom.com>
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 29 Dec 2016 12:13:30 -0500
> This patch series for net-next contains cleanups, new features and minor
> fixes. The driver specific busy polling code is removed to use busy
> polling support in core networking. Hardware RFS support is enhanced with
> added ipv6 flows support and VF support. A new scheme to allocate TX
> rings from the firmware is implemented for newer chips and firmware. Plus
> some misc. cleanups, minor fixes, and to add the maintainer entry. Please
> review.
Looks good, series applied, thanks Michael.
^ permalink raw reply
* Re: [PATCH net-next] sctp: refactor sctp_datamsg_from_user
From: David Miller @ 2016-12-29 19:44 UTC (permalink / raw)
To: marcelo.leitner; +Cc: netdev, linux-sctp, nhorman, vyasevich
In-Reply-To: <ba007bd30f61decbdae5c23541cbdb397c030b68.1483033905.git.marcelo.leitner@gmail.com>
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 29 Dec 2016 15:53:28 -0200
> This patch refactors sctp_datamsg_from_user() in an attempt to make it
> better to read and avoid code duplication for handling the last
> fragment.
>
> It also avoids doing division and remaining operations. Even though, it
> should still operate similarly as before this patch.
>
> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Applied.
^ permalink raw reply
* Re: [PATCH v4] net: dev_weight: TX/RX orthogonality
From: David Miller @ 2016-12-29 19:44 UTC (permalink / raw)
To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel
In-Reply-To: <1483039398-28017-1-git-send-email-matthias.tafelmeier@gmx.net>
From: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Date: Thu, 29 Dec 2016 20:23:18 +0100
> Oftenly, introducing side effects on packet processing on the other half
> of the stack by adjusting one of TX/RX via sysctl is not desirable.
> There are cases of demand for asymmetric, orthogonal configurability.
>
> This holds true especially for nodes where RPS for RFS usage on top is
> configured and therefore use the 'old dev_weight'. This is quite a
> common base configuration setup nowadays, even with NICs of superior processing
> support (e.g. aRFS).
>
> A good example use case are nodes acting as noSQL data bases with a
> large number of tiny requests and rather fewer but large packets as responses.
> It's affordable to have large budget and rx dev_weights for the
> requests. But as a side effect having this large a number on TX
> processed in one run can overwhelm drivers.
>
> This patch therefore introduces an independent configurability via sysctl to
> userland.
>
> Signed-off-by: Matthias Tafelmeier <matthias.tafelmeier@gmx.net>
Applied.
^ permalink raw reply
* Re: [PATCH v4] net: dev_weight: TX/RX orthogonality
From: David Miller @ 2016-12-29 19:45 UTC (permalink / raw)
To: matthias.tafelmeier; +Cc: netdev, hagen, fw, edumazet, daniel
In-Reply-To: <20161229.144456.2187353715676829840.davem@davemloft.net>
Actually, reverted, you didn't even build test this:
net/core/dev.c:3433:35: error: initializer element is not constant
int dev_rx_weight __read_mostly = weight_p;
^~~~~~~~
net/core/dev.c:3434:35: error: initializer element is not constant
int dev_tx_weight __read_mostly = weight_p;
^~~~~~~~
^ permalink raw reply
* Re: [PATCH v4] net: dev_weight: TX/RX orthogonality
From: Matthias Tafelmeier @ 2016-12-29 19:53 UTC (permalink / raw)
To: David Miller; +Cc: netdev, hagen, fw, edumazet, daniel
In-Reply-To: <20161229.144555.1740958763290967121.davem@davemloft.net>
[-- Attachment #1.1.1: Type: text/plain, Size: 435 bytes --]
> Actually, reverted, you didn't even build test this:
>
> net/core/dev.c:3433:35: error: initializer element is not constant
> int dev_rx_weight __read_mostly = weight_p;
> ^~~~~~~~
> net/core/dev.c:3434:35: error: initializer element is not constant
> int dev_tx_weight __read_mostly = weight_p;
> ^~~~~~~~
Thought I would have ... let me check.
[-- Attachment #1.1.2: 0x8ADF343B.asc --]
[-- Type: application/pgp-keys, Size: 4806 bytes --]
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 538 bytes --]
^ permalink raw reply
* [PATCH] wlcore: fix spelling mistake in wl1271_warning
From: Colin King @ 2016-12-29 20:14 UTC (permalink / raw)
To: Kalle Valo, Shahar Patury, Guy Mishol, linux-wireless, netdev
Cc: linux-kernel
From: Colin Ian King <colin.king@canonical.com>
trivial fix to spelling mistake of function name in wl1271_warning,
should be dynamic_ps_timeout instead of dyanmic_ps_timeout.
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
drivers/net/wireless/ti/wlcore/debugfs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/wireless/ti/wlcore/debugfs.c b/drivers/net/wireless/ti/wlcore/debugfs.c
index 7f672f6..58e148d 100644
--- a/drivers/net/wireless/ti/wlcore/debugfs.c
+++ b/drivers/net/wireless/ti/wlcore/debugfs.c
@@ -281,7 +281,7 @@ static ssize_t dynamic_ps_timeout_write(struct file *file,
}
if (value < 1 || value > 65535) {
- wl1271_warning("dyanmic_ps_timeout is not in valid range");
+ wl1271_warning("dynamic_ps_timeout is not in valid range");
return -ERANGE;
}
--
2.10.2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox