* [net-next v2 02/11] ixgbe: add XDP support for pass and drop actions
From: Jeff Kirsher @ 2017-04-30 3:08 UTC (permalink / raw)
To: davem; +Cc: John Fastabend, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20170430030810.56415-1-jeffrey.t.kirsher@intel.com>
From: John Fastabend <john.r.fastabend@intel.com>
Basic XDP drop support for ixgbe. Uses READ_ONCE/xchg semantics on XDP
programs instead of RCU primitives as suggested by Daniel Borkmann and
Alex Duyck.
v2: fix the build issues seen w/ XDP when page sizes are larger than 4K
and made minor fixes based on feedback from Jakub Kicinski
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/ixgbe/ixgbe.h | 4 +-
drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 4 +-
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 169 +++++++++++++++++++----
3 files changed, 148 insertions(+), 29 deletions(-)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 656ca8f69768..cb14813b0080 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -318,6 +318,7 @@ struct ixgbe_ring {
struct ixgbe_ring *next; /* pointer to next ring in q_vector */
struct ixgbe_q_vector *q_vector; /* backpointer to host q_vector */
struct net_device *netdev; /* netdev ring belongs to */
+ struct bpf_prog *xdp_prog;
struct device *dev; /* device for DMA mapping */
struct ixgbe_fwd_adapter *l2_accel_priv;
void *desc; /* descriptor ring memory */
@@ -555,6 +556,7 @@ struct ixgbe_adapter {
unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
/* OS defined structs */
struct net_device *netdev;
+ struct bpf_prog *xdp_prog;
struct pci_dev *pdev;
unsigned long state;
@@ -835,7 +837,7 @@ void ixgbe_down(struct ixgbe_adapter *adapter);
void ixgbe_reinit_locked(struct ixgbe_adapter *adapter);
void ixgbe_reset(struct ixgbe_adapter *adapter);
void ixgbe_set_ethtool_ops(struct net_device *netdev);
-int ixgbe_setup_rx_resources(struct ixgbe_ring *);
+int ixgbe_setup_rx_resources(struct ixgbe_adapter *, struct ixgbe_ring *);
int ixgbe_setup_tx_resources(struct ixgbe_ring *);
void ixgbe_free_rx_resources(struct ixgbe_ring *);
void ixgbe_free_tx_resources(struct ixgbe_ring *);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 59730ede4746..79a126d9e091 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -1128,7 +1128,7 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
sizeof(struct ixgbe_ring));
temp_ring[i].count = new_rx_count;
- err = ixgbe_setup_rx_resources(&temp_ring[i]);
+ err = ixgbe_setup_rx_resources(adapter, &temp_ring[i]);
if (err) {
while (i) {
i--;
@@ -1761,7 +1761,7 @@ static int ixgbe_setup_desc_rings(struct ixgbe_adapter *adapter)
rx_ring->netdev = adapter->netdev;
rx_ring->reg_idx = adapter->rx_ring[0]->reg_idx;
- err = ixgbe_setup_rx_resources(rx_ring);
+ err = ixgbe_setup_rx_resources(adapter, rx_ring);
if (err) {
ret_val = 4;
goto err_nomem;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index afff2ca7f8c0..99b5357c3e00 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -49,6 +49,9 @@
#include <linux/if_macvlan.h>
#include <linux/if_bridge.h>
#include <linux/prefetch.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <linux/atomic.h>
#include <scsi/fc/fc_fcoe.h>
#include <net/udp_tunnel.h>
#include <net/pkt_cls.h>
@@ -1855,6 +1858,10 @@ static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring,
* @rx_desc: pointer to the EOP Rx descriptor
* @skb: pointer to current skb being fixed
*
+ * Check if the skb is valid in the XDP case it will be an error pointer.
+ * Return true in this case to abort processing and advance to next
+ * descriptor.
+ *
* Check for corrupted packet headers caused by senders on the local L2
* embedded NIC switch not setting up their Tx Descriptors right. These
* should be very rare.
@@ -1873,6 +1880,10 @@ static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring,
{
struct net_device *netdev = rx_ring->netdev;
+ /* XDP packets use error pointer so abort at this point */
+ if (IS_ERR(skb))
+ return true;
+
/* verify that the packet does not have any known errors */
if (unlikely(ixgbe_test_staterr(rx_desc,
IXGBE_RXDADV_ERR_FRAME_ERR_MASK) &&
@@ -2048,7 +2059,7 @@ static void ixgbe_put_rx_buffer(struct ixgbe_ring *rx_ring,
/* hand second half of page back to the ring */
ixgbe_reuse_rx_page(rx_ring, rx_buffer);
} else {
- if (IXGBE_CB(skb)->dma == rx_buffer->dma) {
+ if (!IS_ERR(skb) && IXGBE_CB(skb)->dma == rx_buffer->dma) {
/* the page has been released from the ring */
IXGBE_CB(skb)->page_released = true;
} else {
@@ -2069,21 +2080,22 @@ static void ixgbe_put_rx_buffer(struct ixgbe_ring *rx_ring,
static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
struct ixgbe_rx_buffer *rx_buffer,
- union ixgbe_adv_rx_desc *rx_desc,
- unsigned int size)
+ struct xdp_buff *xdp,
+ union ixgbe_adv_rx_desc *rx_desc)
{
- void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
+ unsigned int size = xdp->data_end - xdp->data;
#if (PAGE_SIZE < 8192)
unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
#else
- unsigned int truesize = SKB_DATA_ALIGN(size);
+ unsigned int truesize = SKB_DATA_ALIGN(xdp->data_end -
+ xdp->data_hard_start);
#endif
struct sk_buff *skb;
/* prefetch first cache line of first page */
- prefetch(va);
+ prefetch(xdp->data);
#if L1_CACHE_BYTES < 128
- prefetch(va + L1_CACHE_BYTES);
+ prefetch(xdp->data + L1_CACHE_BYTES);
#endif
/* allocate a skb to store the frags */
@@ -2096,7 +2108,7 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
IXGBE_CB(skb)->dma = rx_buffer->dma;
skb_add_rx_frag(skb, 0, rx_buffer->page,
- rx_buffer->page_offset,
+ xdp->data - page_address(rx_buffer->page),
size, truesize);
#if (PAGE_SIZE < 8192)
rx_buffer->page_offset ^= truesize;
@@ -2104,7 +2116,8 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
rx_buffer->page_offset += truesize;
#endif
} else {
- memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
+ memcpy(__skb_put(skb, size),
+ xdp->data, ALIGN(size, sizeof(long)));
rx_buffer->pagecnt_bias++;
}
@@ -2113,32 +2126,32 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
struct ixgbe_rx_buffer *rx_buffer,
- union ixgbe_adv_rx_desc *rx_desc,
- unsigned int size)
+ struct xdp_buff *xdp,
+ union ixgbe_adv_rx_desc *rx_desc)
{
- void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
#if (PAGE_SIZE < 8192)
unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
#else
unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
- SKB_DATA_ALIGN(IXGBE_SKB_PAD + size);
+ SKB_DATA_ALIGN(xdp->data_end -
+ xdp->data_hard_start);
#endif
struct sk_buff *skb;
/* prefetch first cache line of first page */
- prefetch(va);
+ prefetch(xdp->data);
#if L1_CACHE_BYTES < 128
- prefetch(va + L1_CACHE_BYTES);
+ prefetch(xdp->data + L1_CACHE_BYTES);
#endif
- /* build an skb around the page buffer */
- skb = build_skb(va - IXGBE_SKB_PAD, truesize);
+ /* build an skb to around the page buffer */
+ skb = build_skb(xdp->data_hard_start, truesize);
if (unlikely(!skb))
return NULL;
/* update pointers within the skb to store the data */
- skb_reserve(skb, IXGBE_SKB_PAD);
- __skb_put(skb, size);
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ __skb_put(skb, xdp->data_end - xdp->data);
/* record DMA address if this is the start of a chain of buffers */
if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
@@ -2154,6 +2167,41 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
return skb;
}
+#define IXGBE_XDP_PASS 0
+#define IXGBE_XDP_CONSUMED 1
+
+static struct sk_buff *ixgbe_run_xdp(struct ixgbe_ring *rx_ring,
+ struct xdp_buff *xdp)
+{
+ int result = IXGBE_XDP_PASS;
+ struct bpf_prog *xdp_prog;
+ u32 act;
+
+ rcu_read_lock();
+ xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+
+ if (!xdp_prog)
+ goto xdp_out;
+
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
+ switch (act) {
+ case XDP_PASS:
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ case XDP_TX:
+ case XDP_ABORTED:
+ trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
+ /* fallthrough -- handle aborts by dropping packet */
+ case XDP_DROP:
+ result = IXGBE_XDP_CONSUMED;
+ break;
+ }
+xdp_out:
+ rcu_read_unlock();
+ return ERR_PTR(-result);
+}
+
/**
* ixgbe_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
* @q_vector: structure containing interrupt and ring information
@@ -2183,6 +2231,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
union ixgbe_adv_rx_desc *rx_desc;
struct ixgbe_rx_buffer *rx_buffer;
struct sk_buff *skb;
+ struct xdp_buff xdp;
unsigned int size;
/* return some buffers to hardware, one at a time is too slow */
@@ -2205,14 +2254,29 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
rx_buffer = ixgbe_get_rx_buffer(rx_ring, rx_desc, &skb, size);
/* retrieve a buffer from the ring */
- if (skb)
+ if (!skb) {
+ xdp.data = page_address(rx_buffer->page) +
+ rx_buffer->page_offset;
+ xdp.data_hard_start = xdp.data -
+ ixgbe_rx_offset(rx_ring);
+ xdp.data_end = xdp.data + size;
+
+ skb = ixgbe_run_xdp(rx_ring, &xdp);
+ }
+
+ if (IS_ERR(skb)) {
+ total_rx_packets++;
+ total_rx_bytes += size;
+ rx_buffer->pagecnt_bias++;
+ } else if (skb) {
ixgbe_add_rx_frag(rx_ring, rx_buffer, skb, size);
- else if (ring_uses_build_skb(rx_ring))
+ } else if (ring_uses_build_skb(rx_ring)) {
skb = ixgbe_build_skb(rx_ring, rx_buffer,
- rx_desc, size);
- else
+ &xdp, rx_desc);
+ } else {
skb = ixgbe_construct_skb(rx_ring, rx_buffer,
- rx_desc, size);
+ &xdp, rx_desc);
+ }
/* exit if we failed to retrieve a buffer */
if (!skb) {
@@ -6073,7 +6137,8 @@ static int ixgbe_setup_all_tx_resources(struct ixgbe_adapter *adapter)
*
* Returns 0 on success, negative on failure
**/
-int ixgbe_setup_rx_resources(struct ixgbe_ring *rx_ring)
+int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
+ struct ixgbe_ring *rx_ring)
{
struct device *dev = rx_ring->dev;
int orig_node = dev_to_node(dev);
@@ -6112,6 +6177,8 @@ int ixgbe_setup_rx_resources(struct ixgbe_ring *rx_ring)
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
+ rx_ring->xdp_prog = adapter->xdp_prog;
+
return 0;
err:
vfree(rx_ring->rx_buffer_info);
@@ -6135,7 +6202,7 @@ static int ixgbe_setup_all_rx_resources(struct ixgbe_adapter *adapter)
int i, err = 0;
for (i = 0; i < adapter->num_rx_queues; i++) {
- err = ixgbe_setup_rx_resources(adapter->rx_ring[i]);
+ err = ixgbe_setup_rx_resources(adapter, adapter->rx_ring[i]);
if (!err)
continue;
@@ -6203,6 +6270,7 @@ void ixgbe_free_rx_resources(struct ixgbe_ring *rx_ring)
{
ixgbe_clean_rx_ring(rx_ring);
+ rx_ring->xdp_prog = NULL;
vfree(rx_ring->rx_buffer_info);
rx_ring->rx_buffer_info = NULL;
@@ -9468,6 +9536,54 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev,
return features;
}
+static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
+{
+ int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+ struct ixgbe_adapter *adapter = netdev_priv(dev);
+ struct bpf_prog *old_prog;
+
+ if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED)
+ return -EINVAL;
+
+ if (adapter->flags & IXGBE_FLAG_DCB_ENABLED)
+ return -EINVAL;
+
+ /* verify ixgbe ring attributes are sufficient for XDP */
+ for (i = 0; i < adapter->num_rx_queues; i++) {
+ struct ixgbe_ring *ring = adapter->rx_ring[i];
+
+ if (ring_is_rsc_enabled(ring))
+ return -EINVAL;
+
+ if (frame_size > ixgbe_rx_bufsz(ring))
+ return -EINVAL;
+ }
+
+ old_prog = xchg(&adapter->xdp_prog, prog);
+ for (i = 0; i < adapter->num_rx_queues; i++)
+ xchg(&adapter->rx_ring[i]->xdp_prog, adapter->xdp_prog);
+
+ if (old_prog)
+ bpf_prog_put(old_prog);
+
+ return 0;
+}
+
+static int ixgbe_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+ struct ixgbe_adapter *adapter = netdev_priv(dev);
+
+ switch (xdp->command) {
+ case XDP_SETUP_PROG:
+ return ixgbe_xdp_setup(dev, xdp->prog);
+ case XDP_QUERY_PROG:
+ xdp->prog_attached = !!(adapter->xdp_prog);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_open = ixgbe_open,
.ndo_stop = ixgbe_close,
@@ -9513,6 +9629,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_udp_tunnel_add = ixgbe_add_udp_tunnel_port,
.ndo_udp_tunnel_del = ixgbe_del_udp_tunnel_port,
.ndo_features_check = ixgbe_features_check,
+ .ndo_xdp = ixgbe_xdp,
};
/**
--
2.12.2
^ permalink raw reply related
* [net-next v2 01/11] ixgbe: Acquire PHY semaphore before device reset
From: Jeff Kirsher @ 2017-04-30 3:08 UTC (permalink / raw)
To: davem; +Cc: Paul Greenwalt, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20170430030810.56415-1-jeffrey.t.kirsher@intel.com>
From: Paul Greenwalt <paul.greenwalt@intel.com>
A recent firmware change fixed an issue to acquire the PHY semaphore before
accessing PHY registers. This led to a case where SW can issue a device
reset clearing the MDIO registers. This patch makes SW acquire the PHY
semaphore before issuing a device reset.
Signed-off-by: Paul Greenwalt <paul.greenwalt@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c | 8 ++++++++
drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 8 ++++++++
2 files changed, 16 insertions(+)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c
index 84a467a8ed3d..6ea0d6a5fb90 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c
@@ -95,6 +95,7 @@ s32 ixgbe_reset_hw_X540(struct ixgbe_hw *hw)
{
s32 status;
u32 ctrl, i;
+ u32 swfw_mask = hw->phy.phy_semaphore_mask;
/* Call adapter stop to disable tx/rx and clear interrupts */
status = hw->mac.ops.stop_adapter(hw);
@@ -105,10 +106,17 @@ s32 ixgbe_reset_hw_X540(struct ixgbe_hw *hw)
ixgbe_clear_tx_pending(hw);
mac_reset_top:
+ status = hw->mac.ops.acquire_swfw_sync(hw, swfw_mask);
+ if (status) {
+ hw_dbg(hw, "semaphore failed with %d", status);
+ return IXGBE_ERR_SWFW_SYNC;
+ }
+
ctrl = IXGBE_CTRL_RST;
ctrl |= IXGBE_READ_REG(hw, IXGBE_CTRL);
IXGBE_WRITE_REG(hw, IXGBE_CTRL, ctrl);
IXGBE_WRITE_FLUSH(hw);
+ hw->mac.ops.release_swfw_sync(hw, swfw_mask);
usleep_range(1000, 1200);
/* Poll for reset bit to self-clear indicating reset is complete */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index 2658394599e4..58d3bcaca2b9 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -3318,6 +3318,7 @@ static s32 ixgbe_reset_hw_X550em(struct ixgbe_hw *hw)
u32 ctrl = 0;
u32 i;
bool link_up = false;
+ u32 swfw_mask = hw->phy.phy_semaphore_mask;
/* Call adapter stop to disable Tx/Rx and clear interrupts */
status = hw->mac.ops.stop_adapter(hw);
@@ -3363,9 +3364,16 @@ static s32 ixgbe_reset_hw_X550em(struct ixgbe_hw *hw)
ctrl = IXGBE_CTRL_RST;
}
+ status = hw->mac.ops.acquire_swfw_sync(hw, swfw_mask);
+ if (status) {
+ hw_dbg(hw, "semaphore failed with %d", status);
+ return IXGBE_ERR_SWFW_SYNC;
+ }
+
ctrl |= IXGBE_READ_REG(hw, IXGBE_CTRL);
IXGBE_WRITE_REG(hw, IXGBE_CTRL, ctrl);
IXGBE_WRITE_FLUSH(hw);
+ hw->mac.ops.release_swfw_sync(hw, swfw_mask);
usleep_range(1000, 1200);
/* Poll for reset bit to self-clear meaning reset is complete */
--
2.12.2
^ permalink raw reply related
* [net-next v2 00/11][pull request] 10GbE Intel Wired LAN Driver Updates 2017-04-29
From: Jeff Kirsher @ 2017-04-30 3:07 UTC (permalink / raw)
To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann, jogreene
This series contains updates to ixgbe and ixgbevf only, most notable is
the addition of XDP support to our 10GbE drivers.
Paul fixes ixgbe to acquire the PHY semaphore before accessing PHY
registers when issuing a device reset.
John adds XDP support (yeah!) for ixgbe.
Emil fixes an issue by flushing the MACVLAN filters on VF reset to avoid
conflicts with other VFs that may end up using the same MAC address. Also
fixed a bug where ethtool -S displayed some empty fields for ixgbevf
because it was using ixgbe_stats instead ixgbevf_stats for
IXGBEVF_QUEUE_STATS_LEN.
Tony adds the ability to specify a zero MAC address in order to clear the
VF's MAC address from the RAR table. Also adds support for a new
1000Base-T device based on x550EM_X MAC type. Fixed an issue where the
RSS key specified by the user would be over-written with a pre-existing
value, so change the rss_key to a pointer so we can check to see if the
key has a value set before attempting to set it. Fixed the logic for
mailbox support for getting RETA and RSS values, which are only supported
by 82599 and x540 devices.
v2: fixed up patches #2 and #3 based on feedback from Jakub and to
address build issues when page sizes are larger than 4k
The following are changes since commit 4c042a80c1776477d8f13d0b214809a2eaedf01d:
Merge branch 'bpf-misc-next'
and are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 10GbE
Emil Tantilov (2):
ixgbe: clean macvlan MAC filter table on VF reset
ixgbevf: fix size of queue stats length
John Fastabend (3):
ixgbe: add XDP support for pass and drop actions
ixgbe: add support for XDP_TX action
ixgbe: delay tail write to every 'n' packets
Paul Greenwalt (2):
ixgbe: Acquire PHY semaphore before device reset
ixgbe: Add 1000Base-T device based on X550EM_X MAC
Tony Nguyen (4):
ixgbe: Allow setting zero MAC address for VF
ixgbe: Check for RSS key before setting value
ixgbevf: Fix errors in retrieving RETA and RSS from PF
ixgbevf: Check for RSS key before setting value
drivers/net/ethernet/intel/ixgbe/ixgbe.h | 27 +-
drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 33 +-
drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c | 75 +++-
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 477 +++++++++++++++++++---
drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 135 +++---
drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 1 +
drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c | 8 +
drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 53 ++-
drivers/net/ethernet/intel/ixgbevf/ethtool.c | 5 +-
drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 2 +-
drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 33 +-
drivers/net/ethernet/intel/ixgbevf/vf.c | 6 +-
12 files changed, 701 insertions(+), 154 deletions(-)
--
2.12.2
^ permalink raw reply
* Re: [PATCH v3 binutils] Add BPF support to binutils...
From: David Miller @ 2017-04-30 2:37 UTC (permalink / raw)
To: ast; +Cc: daniel, aconole, netdev, xdp-newbies
In-Reply-To: <20170429.222450.1300920007783667009.davem@davemloft.net>
From: David Miller <davem@davemloft.net>
Date: Sat, 29 Apr 2017 22:24:50 -0400 (EDT)
> Some of your bugs should be fixed by this patch below, I'll add
> test cases soon:
Ok, here are all the local changes in my tree. I made the relocs
match LLVM and I fixed some dwarf debugging stuff.
With this we are also down to one test case failure under binutils/
and it's something weird with merging 64-bit notes which I should be
able to fix soon.
I can fix these bugs fast, keep reporting.
BTW, should I just remove tailcall from the opcode table altogether?
diff --git a/binutils/readelf.c b/binutils/readelf.c
index 6c67d98..3931a3a 100644
--- a/binutils/readelf.c
+++ b/binutils/readelf.c
@@ -95,6 +95,7 @@
#include "elf/arm.h"
#include "elf/avr.h"
#include "elf/bfin.h"
+#include "elf/bpf.h"
#include "elf/cr16.h"
#include "elf/cris.h"
#include "elf/crx.h"
@@ -746,6 +747,7 @@ guess_is_rela (unsigned int e_machine)
case EM_AVR:
case EM_AVR_OLD:
case EM_BLACKFIN:
+ case EM_BPF:
case EM_CR16:
case EM_CRIS:
case EM_CRX:
@@ -1458,6 +1460,10 @@ dump_relocations (FILE * file,
rtype = elf_bfin_reloc_type (type);
break;
+ case EM_BPF:
+ rtype = elf_bpf_reloc_type (type);
+ break;
+
case EM_CYGNUS_MEP:
rtype = elf_mep_reloc_type (type);
break;
diff --git a/gas/config/tc-bpf.c b/gas/config/tc-bpf.c
index 0ba2afa..36393b7 100644
--- a/gas/config/tc-bpf.c
+++ b/gas/config/tc-bpf.c
@@ -288,6 +288,14 @@ md_assemble (char *str ATTRIBUTE_UNUSED)
switch (*args)
{
case '+':
+ if (*s == '+')
+ {
+ ++s;
+ continue;
+ }
+ if (*s == '-')
+ continue;
+ break;
case ',':
case '[':
case ']':
@@ -494,6 +502,9 @@ md_apply_fix (fixS *fixP, valueT *valP ATTRIBUTE_UNUSED, segT segment ATTRIBUTE_
offsetT val = * (offsetT *) valP;
gas_assert (fixP->fx_r_type < BFD_RELOC_UNUSED);
+
+ fixP->fx_addnumber = val; /* Remember value for emit_reloc. */
+
/* If this is a data relocation, just output VAL. */
if (fixP->fx_r_type == BFD_RELOC_8)
diff --git a/include/elf/bpf.h b/include/elf/bpf.h
index 5019b11..77463e3 100644
--- a/include/elf/bpf.h
+++ b/include/elf/bpf.h
@@ -26,14 +26,14 @@
/* Relocation types. */
START_RELOC_NUMBERS (elf_bpf_reloc_type)
RELOC_NUMBER (R_BPF_NONE, 0)
- RELOC_NUMBER (R_BPF_INSN_16, 1)
- RELOC_NUMBER (R_BPF_INSN_32, 2)
- RELOC_NUMBER (R_BPF_INSN_64, 3)
- RELOC_NUMBER (R_BPF_WDISP16, 4)
- RELOC_NUMBER (R_BPF_DATA_8, 5)
- RELOC_NUMBER (R_BPF_DATA_16, 6)
- RELOC_NUMBER (R_BPF_DATA_32, 7)
- RELOC_NUMBER (R_BPF_DATA_64, 8)
+ RELOC_NUMBER (R_BPF_DATA_64, 1)
+ RELOC_NUMBER (R_BPF_INSN_16, 2)
+ RELOC_NUMBER (R_BPF_INSN_32, 3)
+ RELOC_NUMBER (R_BPF_INSN_64, 4)
+ RELOC_NUMBER (R_BPF_WDISP16, 5)
+ RELOC_NUMBER (R_BPF_DATA_8, 6)
+ RELOC_NUMBER (R_BPF_DATA_16, 7)
+ RELOC_NUMBER (R_BPF_DATA_32, 10)
END_RELOC_NUMBERS (R_BPF_max)
#endif /* _ELF_BPF_H */
diff --git a/opcodes/bpf-dis.c b/opcodes/bpf-dis.c
index 92e29af..39656bf 100644
--- a/opcodes/bpf-dis.c
+++ b/opcodes/bpf-dis.c
@@ -49,7 +49,7 @@ print_insn_bpf (bfd_vma memaddr, disassemble_info *info)
bpf_opcode_hash *op;
int code, dest, src;
bfd_byte buffer[8];
- unsigned short off;
+ signed short off;
int status, ret;
signed int imm;
@@ -78,7 +78,7 @@ print_insn_bpf (bfd_vma memaddr, disassemble_info *info)
else
{
getword = bfd_getl32;
- gethalf = bfd_getl32;
+ gethalf = bfd_getl16;
}
code = buffer[0];
@@ -128,7 +128,7 @@ print_insn_bpf (bfd_vma memaddr, disassemble_info *info)
(*info->fprintf_func) (stream, "%d", imm);
break;
case 'O':
- (*info->fprintf_func) (stream, "%d", off);
+ (*info->fprintf_func) (stream, "%d", (int) off);
break;
case 'L':
info->target = memaddr + ((off - 1) * 8);
^ permalink raw reply related
* Re: prog ID and next steps. Was: [RFC net-next 0/2] Introduce bpf_prog ID and iteration
From: Hannes Frederic Sowa @ 2017-04-30 2:36 UTC (permalink / raw)
To: Alexei Starovoitov, Martin KaFai Lau, netdev
Cc: Daniel Borkmann, kernel-team, David S. Miller,
Jesper Dangaard Brouer, John Fastabend, Thomas Graf
In-Reply-To: <b3dbda71-df4c-7e92-76aa-2c2252266cbc@fb.com>
Hi,
just quickly, because I am on a run:
On Sun, Apr 30, 2017, at 04:06, Alexei Starovoitov wrote:
> On 4/28/17 2:13 PM, Hannes Frederic Sowa wrote:
> >
> > Let's assume the following program with a constant key lookup and
> > different tables:
> >
> > action = bpf_map_lookup_elem(&actions, 0);
> > if (!*action)
> > return XDP_DROP;
> > else
> > return bpf_redirect(skb->ifindex, 0);
> >
> > It does something completely different depending on the map being used.
> > That is the reason why I see it makes sense to be specific which program
> > gets used if you try to analyze a program interactively.
>
> Instead of two exactly the same programs accessing different
> maps it could have been one program accessing one map.
> The end result is the same.
> Depending on contents of the map and the input bytes of the packet
> the program will do arbitrary decisions. I still don't see how printing
> prog ID can help debugging.
Because during live inspection/debugging you get the relationship chain
in order:
n bpf-programs with same tag : loaded at m hooks : using z maps
You trace bpf_redirect and get back only the bpf tag.
You inspect all hooks and get back a bpf program identified by either or
all of filedescriptor, tag or prog_id. This doesn't solve the ambiguity
which program just actually was traced. Thus you have to consider all of
the programs. Each program can reference different maps.
So you extract all bpf programs and attributes and take the union of all
referenced maps, which you actually would have to inspect. If you had
the prog_id available, you could clearly figure out which maps are
referenced by the traced program.
I do see this as a prolem, because lot's of policy scripts might be
absolutely the same (thus also same tag) but might use different maps in
the end. We might not be able to correlate traces back to the maps.
Imagine a lot of bpf-lwt programs loaded, all the same, doing decisions
based on labels in the maps. It would be nice to see which specific map
also caused the behavior and to trace it back to user space and where
and why it was generated.
> >> The programs gets unloaded too and this 'perf record' and stack
> >> traces come from the past, hence the need for stable prog_tag.
> >
> > perf only stores addresses in perf.data. That said, if the program isn't
> > loaded, it won't give you any tag. If another program is reusing the
> > same address, if will give you any other random name for the function in
> > the calltrace.
>
> This issue is well understood. We discussed it with Arnaldo and
> the plan is to emit PERF_RECORD_MMAP that will contain the address
> of JITed bpf program, so perf.data will contain all the info necessary
> for later analysis.
I think you meant to say "will contain the *name*" of the JITed bpf
program?
If so, that is cool.
> > If you store the perf script output or have kallsyms handy, certainly, yes.
>
> right now 'perf script' doesn't work for short lived programs either.
> The stack trace IPs may be already gone from kallsyms.
> PERF_RECORD_MMAP is the right solution to that as well.
Absolutely ack.
> > Most of the time I was debugging interactively. Developers would
> > probably also enjoy to have a way to trace the program to the exact
> > identity. I have no problem keeping the tag in place and append just the
> > prog_id for the specific reason that the program might be loaded
> > multiple times with different tags in place. I was concerned about the
> > space for function names in kallsyms.
>
> developers enjoy 'exact identity of the program' with program tag.
> Dynamic prog ID doesn't provide additional info to the user.
> Like when you write brand new kernel module, do you care what
> order it's loaded in lsmod ?
> Or if system crashed, do you care that veth0 had ifindex 10 or 20?
> Or PID of segfaulted program was 3256 ?
I do care which data structures my kernel module or program used,
because I might want to reference exactly those for debugging. ;)
> > I have to think more about it. Maybe there is a way to achieve both
> > without too much hassle.
>
> It seems we agree on 80+% of topics discussed in this thread, so
> I suggest we implement, review and land these key pieces and later
> resolve the contentious points. By that time we will likely
> see each other arguments in different light :)
I agree with that. Maybe better ideas come up.
I was a bit pushy here, because I fear that kallsym name changes could
have uapi impact.
Thanks,
Hannes
^ permalink raw reply
* Re: [PATCH v3 binutils] Add BPF support to binutils...
From: David Miller @ 2017-04-30 2:24 UTC (permalink / raw)
To: ast; +Cc: daniel, aconole, netdev, xdp-newbies
In-Reply-To: <9d5e3a87-15d0-01d7-6272-fa8d2bf0d076@fb.com>
From: Alexei Starovoitov <ast@fb.com>
Date: Sat, 29 Apr 2017 17:48:43 -0700
> $ bld/binutils/objdump -S test.o
>
> test.o: file format elf64-bpfbe
>
> Disassembly of section .text:
>
> 0000000000000000 <bpf_prog1>:
> 0: 18 10 00 00 83 98 47 39 ldimm64 r1, 590618314553
> 8: 00 00 00 00 00 00 00 89
> 10: 7b a1 ff f8 00 00 00 00 stdw [r10+65528], r1
> 18: 79 1a ff f8 00 00 00 00 lddw r1, [r10+65528]
> 20: 07 10 00 00 8f ff 00 02 add r1, -1879113726
> 28: 79 01 00 00 00 00 00 00 lddw r0, [r1+0]
> 30: 95 00 00 00 00 00 00 00 exit
>
> looks good except negative offsets are reported as large positive.
Some of your bugs should be fixed by this patch below, I'll add
test cases soon:
diff --git a/gas/config/tc-bpf.c b/gas/config/tc-bpf.c
index 0ba2afa..36393b7 100644
--- a/gas/config/tc-bpf.c
+++ b/gas/config/tc-bpf.c
@@ -288,6 +288,14 @@ md_assemble (char *str ATTRIBUTE_UNUSED)
switch (*args)
{
case '+':
+ if (*s == '+')
+ {
+ ++s;
+ continue;
+ }
+ if (*s == '-')
+ continue;
+ break;
case ',':
case '[':
case ']':
diff --git a/opcodes/bpf-dis.c b/opcodes/bpf-dis.c
index 92e29af..39656bf 100644
--- a/opcodes/bpf-dis.c
+++ b/opcodes/bpf-dis.c
@@ -49,7 +49,7 @@ print_insn_bpf (bfd_vma memaddr, disassemble_info *info)
bpf_opcode_hash *op;
int code, dest, src;
bfd_byte buffer[8];
- unsigned short off;
+ signed short off;
int status, ret;
signed int imm;
@@ -78,7 +78,7 @@ print_insn_bpf (bfd_vma memaddr, disassemble_info *info)
else
{
getword = bfd_getl32;
- gethalf = bfd_getl32;
+ gethalf = bfd_getl16;
}
code = buffer[0];
@@ -128,7 +128,7 @@ print_insn_bpf (bfd_vma memaddr, disassemble_info *info)
(*info->fprintf_func) (stream, "%d", imm);
break;
case 'O':
- (*info->fprintf_func) (stream, "%d", off);
+ (*info->fprintf_func) (stream, "%d", (int) off);
break;
case 'L':
info->target = memaddr + ((off - 1) * 8);
^ permalink raw reply related
* Re: [PATCH v3 binutils] Add BPF support to binutils...
From: Alexei Starovoitov @ 2017-04-30 2:24 UTC (permalink / raw)
To: David Miller; +Cc: daniel, aconole, netdev, xdp-newbies
In-Reply-To: <20170429.221315.1605574480939856975.davem@davemloft.net>
On 4/29/17 7:13 PM, David Miller wrote:
> From: Alexei Starovoitov <ast@fb.com>
> Date: Sat, 29 Apr 2017 17:48:43 -0700
>
>> /w/binutils-gdb/bld/binutils/objdump: invalid relocation type 10
>> /w/binutils-gdb/bld/binutils/objdump: Dwarf Error: found address size
>
> I discussed this in another email, the relocation numbers I used in
> binutils do not match what is in LLVM currently.
>
> In fact, I thought you guys weren't using relocations in any capacity
> at all so just picked things from scratch :-)
yeah :) will reply in the other thread.
Too many public and internal discussions in the last week.
Weekend is the only time to reduce the backlog :)
> Please use "--target=bpf-elf"
Thanks. That worked. Built the whole thing :)
objdump behaves the same way.
When compiled by clang with '-g'
(gdb) x/10i bpf_prog1
0x0 <clang version 5.0.0 (trunk 301652) (llvm/trunk 301662)>:
ldimm64 r0, 590618314553
0x10 <clang version 5.0.0 (trunk 301652) (llvm/trunk 301662)+16>:
stdw [r1+65528], r10
0x18 <clang version 5.0.0 (trunk 301652) (llvm/trunk 301662)+24>:
lddw r10, [r1+65528]
0x20 <clang version 5.0.0 (trunk 301652) (llvm/trunk 301662)+32>:
add r0, -1879113726
0x28 <clang version 5.0.0 (trunk 301652) (llvm/trunk 301662)+40>:
lddw r1, [r0+0]
0x30 <clang version 5.0.0 (trunk 301652) (llvm/trunk 301662)+48>: exit
0x38: Cannot access memory at address 0x38
Even without -g the last line 'Cannot access' is printed.
It seems gdb miscalculates the total func size?
The printing of 'clang version...' is due to '-g'.
Without -g it looks good:
(gdb) x/10i bpf_prog1
0x0 <bpf_prog1>: ldimm64 r1, 590618314553
0x10 <bpf_prog1+16>: stdw [r10+65528], r1
Btw, I'm using this C file for testing:
int bpf_prog1(void *ign)
{
volatile unsigned long t = 0x8983984739ull;
return *(unsigned long *)((0xffffffff8fff0002ull) + t);
}
There was a bug in llvm backend with imm overflow which
was recently fixed.
^ permalink raw reply
* Re: [PATCH v3 binutils] Add BPF support to binutils...
From: David Miller @ 2017-04-30 2:13 UTC (permalink / raw)
To: ast; +Cc: daniel, aconole, netdev, xdp-newbies
In-Reply-To: <9d5e3a87-15d0-01d7-6272-fa8d2bf0d076@fb.com>
From: Alexei Starovoitov <ast@fb.com>
Date: Sat, 29 Apr 2017 17:48:43 -0700
> /w/binutils-gdb/bld/binutils/objdump: invalid relocation type 10
> /w/binutils-gdb/bld/binutils/objdump: Dwarf Error: found address size
I discussed this in another email, the relocation numbers I used in
binutils do not match what is in LLVM currently.
In fact, I thought you guys weren't using relocations in any capacity
at all so just picked things from scratch :-)
^ permalink raw reply
* Re: [PATCH v3 binutils] Add BPF support to binutils...
From: David Miller @ 2017-04-30 2:11 UTC (permalink / raw)
To: ast; +Cc: daniel, aconole, netdev, xdp-newbies
In-Reply-To: <9d5e3a87-15d0-01d7-6272-fa8d2bf0d076@fb.com>
From: Alexei Starovoitov <ast@fb.com>
Date: Sat, 29 Apr 2017 17:48:43 -0700
> On 4/28/17 1:33 PM, David Miller wrote:
>> New in this version:
>>
>> 1) All the relocation work I posted earlier today.
>> 2) Teach readelf about a few bpf relocs as needed
>> 3) Add a 'nop' instruction which facilitates the gas
>> testsuite. I used "mov r0,r0"
>>
>> The whole gas testsuite passes now. :-) But that just
>> means we have to add more tests I guess....
>>
>> Signed-off-by: David S. Miller <davem@davemloft.net>
>
> it seems by default bpf target is not enabled,
> so I tried to build it with:
> ../configure --enable-targets=bpf,x86
Please use "--target=bpf-elf"
^ permalink raw reply
* Re: prog ID and next steps. Was: [RFC net-next 0/2] Introduce bpf_prog ID and iteration
From: Alexei Starovoitov @ 2017-04-30 2:06 UTC (permalink / raw)
To: Hannes Frederic Sowa, Martin KaFai Lau, netdev
Cc: Daniel Borkmann, kernel-team, David S. Miller,
Jesper Dangaard Brouer, John Fastabend, Thomas Graf
In-Reply-To: <cc962035-4374-b6c1-b350-c4dda9bf095c@stressinduktion.org>
On 4/28/17 2:13 PM, Hannes Frederic Sowa wrote:
>
> Let's assume the following program with a constant key lookup and
> different tables:
>
> action = bpf_map_lookup_elem(&actions, 0);
> if (!*action)
> return XDP_DROP;
> else
> return bpf_redirect(skb->ifindex, 0);
>
> It does something completely different depending on the map being used.
> That is the reason why I see it makes sense to be specific which program
> gets used if you try to analyze a program interactively.
Instead of two exactly the same programs accessing different
maps it could have been one program accessing one map.
The end result is the same.
Depending on contents of the map and the input bytes of the packet
the program will do arbitrary decisions. I still don't see how printing
prog ID can help debugging.
>> The programs gets unloaded too and this 'perf record' and stack
>> traces come from the past, hence the need for stable prog_tag.
>
> perf only stores addresses in perf.data. That said, if the program isn't
> loaded, it won't give you any tag. If another program is reusing the
> same address, if will give you any other random name for the function in
> the calltrace.
This issue is well understood. We discussed it with Arnaldo and
the plan is to emit PERF_RECORD_MMAP that will contain the address
of JITed bpf program, so perf.data will contain all the info necessary
for later analysis.
> If you store the perf script output or have kallsyms handy, certainly, yes.
right now 'perf script' doesn't work for short lived programs either.
The stack trace IPs may be already gone from kallsyms.
PERF_RECORD_MMAP is the right solution to that as well.
> Most of the time I was debugging interactively. Developers would
> probably also enjoy to have a way to trace the program to the exact
> identity. I have no problem keeping the tag in place and append just the
> prog_id for the specific reason that the program might be loaded
> multiple times with different tags in place. I was concerned about the
> space for function names in kallsyms.
developers enjoy 'exact identity of the program' with program tag.
Dynamic prog ID doesn't provide additional info to the user.
Like when you write brand new kernel module, do you care what
order it's loaded in lsmod ?
Or if system crashed, do you care that veth0 had ifindex 10 or 20?
Or PID of segfaulted program was 3256 ?
> I have to think more about it. Maybe there is a way to achieve both
> without too much hassle.
It seems we agree on 80+% of topics discussed in this thread, so
I suggest we implement, review and land these key pieces and later
resolve the contentious points. By that time we will likely
see each other arguments in different light :)
^ permalink raw reply
* Re: xdp_redirect ifindex vs port. Was: best API for returning/setting egress port?
From: Alexei Starovoitov @ 2017-04-30 1:35 UTC (permalink / raw)
To: Hannes Frederic Sowa, John Fastabend, Jesper Dangaard Brouer,
Andy Gospodarek
Cc: Alexei Starovoitov, Daniel Borkmann, Daniel Borkmann,
netdev@vger.kernel.org, xdp-newbies@vger.kernel.org
In-Reply-To: <dd965e88-b9fd-4d58-4272-07588b842252@stressinduktion.org>
On 4/28/17 12:43 PM, Hannes Frederic Sowa wrote:
> On 28.04.2017 07:30, Alexei Starovoitov wrote:
>> On 4/27/17 10:06 PM, John Fastabend wrote:
>>> That is more or less what I was thinking as well. The other question
>>> I have though is should we have a bpf_redirect() call for the simple
>>> case where I use the ifindex directly. This will be helpful for taking
>>> existing programs from tc_cls into xdp. I think it makes sense to have
>>> both bpf_tx_allports(), bpf_tx_port(), and bpf_redirect().
>>
>> I think so too.
>> Once netdevice is stored into netdev_array map the netdevice is pinned
>> and we need to figure out what to do if somebody tries to delete it.
>> Should we add a new netlink notifier that this netdev's refcnt is
>> almost zero and it's only in netdev_array(s) ?
>
> We basically do that automatically in netdev_wait_allrefs:
>
> pr_emerg("unregister_netdevice: waiting for %s to become free. Usage
> count = %d\n",
> dev->name, refcnt);
>
> It is a very unpleasant warning and users probably think about a bug in
> the kernel at first.
>
> I don't think we should wait for user space to clean that up but have to
> do it automatically from the kernel. Maybe we can introduce a special
> value that basically NOPs the transmission. The hash table itself would
> install a netdevice notifier and would clean all tables. Could
> definitely cause some storm in the kernel, if a lot of keys are mapped
> to the same interface.
>
>> or should it be deleted from the array(s) automatically and
>> then user space will be notified post-deletion?
>> Both approaches have their pros and cons.
>
> I am leaning more towards deleting it automatically. But walking all
> tables and in there all keys might cause some unwanted load spikes.
yeah, when netdev is being unregistered we have to drop the ref
to avoid the warning.
Speaking of delete notifiers for maps.
Until recently all deletions were explicit and the program
could do extra perf_event_submit() if it needed to notify
user space of deletion.
But right now we have LRU map that deletes automatically and this
netdev_array will be deleting automatically too, so we probably
need some sort of generic map notifier for all map types.
It can be as simple as user space sleeping on read(map_fd, buf);
or waiting for epoll on map_fd and kernel wakes it up
when map element is deleted and pushes 'key/value' pair
into the buf. That should be generic for all map types,
but it needs some mechanism to avoid blocking, since number
of deletions can be huge. Something similar to perf's event record
'lost N records' should do the trick.
^ permalink raw reply
* Re: xdp_redirect ifindex vs port. Was: best API for returning/setting egress port?
From: Alexei Starovoitov @ 2017-04-30 1:04 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: Andy Gospodarek, John Fastabend, Alexei Starovoitov,
Daniel Borkmann, Daniel Borkmann, netdev@vger.kernel.org,
xdp-newbies@vger.kernel.org
In-Reply-To: <20170428125842.6b144d20@redhat.com>
On 4/28/17 3:58 AM, Jesper Dangaard Brouer wrote:
> On Thu, 27 Apr 2017 16:31:14 -0700
> Alexei Starovoitov <ast@fb.com> wrote:
>
>> On 4/27/17 1:41 AM, Jesper Dangaard Brouer wrote:
>>> When registering/attaching a XDP/bpf program, we would just send the
>>> file-descriptor for this port-map along (like we do with the bpf_prog
>>> FD). Plus, it own ingress-port number this program is in the port-map.
>>>
>>> It is not clear to me, in-which-data-structure on the kernel-side we
>>> store this reference to the port-map and ingress-port. As today we only
>>> have the "raw" struct bpf_prog pointer. I see several options:
>>>
>>> 1. Create a new xdp_prog struct that contains existing bpf_prog,
>>> a port-map pointer and ingress-port. (IMHO easiest solution)
>>>
>>> 2. Just create a new pointer to port-map and store it in driver rx-ring
>>> struct (like existing bpf_prog), but this create a race-challenge
>>> replacing (cmpxchg) the program (or perhaps it's not a problem as it
>>> runs under rcu and RTNL-lock).
>>>
>>> 3. Extend bpf_prog to store this port-map and ingress-port, and have a
>>> fast-way to access it. I assume it will be accessible via
>>> bpf_prog->bpf_prog_aux->used_maps[X] but it will be too slow for XDP.
>>
>> I'm not sure I completely follow the 3 proposals.
>> Are you suggesting to have only one netdev_array per program?
>
> Yes, but I can see you have a more clever idea below.
>
>> Why not to allow any number like we do for tailcall+prog_array, etc?
>
>> We can teach verifier to allow new helper
>> bpf_tx_port(netdev_array, port_num);
>> to only be used with netdev_array map type.
>> It will fetch netdevice pointer from netdev_array[port_num]
>> and will tx the packet into it.
>
> I love it.
>
> I just don't like the "netdev" part of the name "netdev_array" as one
> basic ideas of a port tabel, is that a port can be anything that can
> consume a XDP_buff packet. This generalization allow us to move code
> out of the drivers. We might be on the same page, as I do imagine that
> netdev_array or port_array is just a struct bpf_map pointer, and the
> bpf_map->map_type will tell us that this bpf_map contains net_device
> pointers. Thus, when later introducing a new type of redirect (like to
> a socket or remote-CPU) then we just add a new bpf_map_type for this,
> without needing to change anything in the drivers, right?
In theory, yes, but in practice I doubt it will be so easy.
We probably shouldn't allow very different types of netdev
into the same netdev_array or port_array (whatever the name).
They need to be similar enough, otherwise we'd have to do run-time
checks. If they're all the same, these checks can be done at
insertion time instead.
> Do you imagine that bpf-side bpf_tx_port() returns XDP_REDIRECT?
> Or does it return if the call was successful (e.g validate port_num
> existed in map)?
don't know :)
we need to brainstorm pros and cons.
> On the kernel side, we need to receive this info "port_array" and
> "port_num", given you don't provide the call a xdp_buff/ctx, then I
> assume you want the per-CPU temp-store solution. Then during the
> XDP_REDIRECT action we call a core redirect function that based on the
> bpf_map_type does a lookup, and find the net_device ptr.
hmm. didn't think that far either :)
indeed makes sense to pass 'ctx' into such helper as well,
so it's easier to deal with original netdev.
>> We can make it similar to bpf_tail_call(), so that program will
>> finish on successful bpf_tx_port() or
>> make it into 'delayed' tx which will be executed when program finishes.
>> Not sure which approach is better.
>
> I know you are talking about something slightly different, about
> delaying TX.
>
> But I want to mention (as I've done before) that it is important (for
> me) that we get bulking working/integrated. I imagine the driver will
> call a function that will delay the TX/redirect action and at the end
> of the NAPI cycle have a function that flush packets, bulk per
> destination port.
>
> I was wondering where to store these delayed TX packets, but now that
> we have an associated bpf_map data-structure (netdev_array), I'm thinking
> about storing packets (ordered by port) inside that. And then have a
> bpf_tx_flush(netdev_array) call in the driver (for every port-table-map
> seen, which will likely be small).
makes sense to me as well.
Ideally we should try to make an api such, that batching or no-batching
can be kernel choice. The program will just do
xdp_tx_port(...something here...)
and the kernel does the best for performance. If it needs to delay
the result to do batching the api should allow that transparently.
Like right now xdp program does 'return XDP_TX;' and
on the kernel side we can either do immediate xmit (like we do today)
or can change it to do batching and programs don't need to change.
>> We can also extend this netdev_array into broadcast/multicast. Like
>> bpf_tx_allports(&netdev_array);
>> call from the program will xmit the packet to all netdevices
>> in that 'netdev_array' map type.
>
> When broadcasting you often don't want to broadcast the packet out of
> the incoming interface. How can you support this?
>
> Normally you would know your ingress port, and then excluded that port
> in the broadcast. But with many netdev_array's how do the program know
> it's own ingress port.
absolutely!
bpf_tx_allports() should somehow exclude the port packet arrived on.
What you're proposing about passing 'ctx' into this helper,
should solve it, I guess.
> Thanks a lot for all this input, I got a much more clear picture of how
> I can/should implement this :-)
awesome :) Let's brainstorm more and get John's opinion on it as well,
since sounds like he'll be heavy user of such api.
^ permalink raw reply
* Re: [PATCH v3 binutils] Add BPF support to binutils...
From: Alexei Starovoitov @ 2017-04-30 0:48 UTC (permalink / raw)
To: David Miller; +Cc: daniel, aconole, netdev, xdp-newbies
In-Reply-To: <20170428.163355.2067951664875385680.davem@davemloft.net>
On 4/28/17 1:33 PM, David Miller wrote:
> New in this version:
>
> 1) All the relocation work I posted earlier today.
> 2) Teach readelf about a few bpf relocs as needed
> 3) Add a 'nop' instruction which facilitates the gas
> testsuite. I used "mov r0,r0"
>
> The whole gas testsuite passes now. :-) But that just
> means we have to add more tests I guess....
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
it seems by default bpf target is not enabled,
so I tried to build it with:
../configure --enable-targets=bpf,x86
make -j40
but it failed to build :(
Then I did ../configure --target=bpf
and it went better. At least I could compile objdump,
but gas refused to be configured:
checking whether byte ordering is bigendian... no
This target is no longer supported in gas
make[1]: *** [configure-gas] Error 1
Not sure what I'm doing wrong.
At least i tested objdump:
$ clang -O2 -target bpfeb -c test.c
$ llvm-objdump -S test.o
test.o: file format ELF64-BPF
Disassembly of section .text:
bpf_prog1:
0: 18 10 00 00 83 98 47 39 00 00 00 00 00 00 00 89 r1 =
590618314553ll
2: 7b a1 ff f8 00 00 00 00 *(u64 *)(r10 - 8) = r1
3: 79 1a ff f8 00 00 00 00 r1 = *(u64 *)(r10 - 8)
4: 07 10 00 00 8f ff 00 02 r1 += -1879113726
5: 79 01 00 00 00 00 00 00 r0 = *(u64 *)(r1 + 0)
6: 95 00 00 00 00 00 00 00 exit
$ bld/binutils/objdump -S test.o
test.o: file format elf64-bpfbe
Disassembly of section .text:
0000000000000000 <bpf_prog1>:
0: 18 10 00 00 83 98 47 39 ldimm64 r1, 590618314553
8: 00 00 00 00 00 00 00 89
10: 7b a1 ff f8 00 00 00 00 stdw [r10+65528], r1
18: 79 1a ff f8 00 00 00 00 lddw r1, [r10+65528]
20: 07 10 00 00 8f ff 00 02 add r1, -1879113726
28: 79 01 00 00 00 00 00 00 lddw r0, [r1+0]
30: 95 00 00 00 00 00 00 00 exit
looks good except negative offsets are reported as large positive.
If compiled with clang -O2 -target bpfel and result is:
$ bld/binutils/objdump -S test.o
test.o: file format elf64-bpfle
Disassembly of section .text:
0000000000000000 <bpf_prog1>:
0: 18 01 00 00 39 47 98 83 ldimm64 r0, 590618314553
8: 00 00 00 00 89 00 00 00
10: 7b 1a f8 ff 00 00 00 00 stdw [r1+65528], r10
18: 79 a1 f8 ff 00 00 00 00 lddw r10, [r1+65528]
Now the src/dst registers are swapped :(
The bytes printed are correct though.
clang debug info is not recognized as well:
$ clang -O2 -g -target bpfel -c test.c
/w/binutils-gdb/bld/binutils/objdump -S test.o
test.o: file format elf64-bpfle
Disassembly of section .text:
0000000000000000 <bpf_prog1>:
/w/binutils-gdb/bld/binutils/objdump: invalid relocation type 10
/w/binutils-gdb/bld/binutils/objdump: BFD (GNU Binutils)
2.28.51.20170429 assertion fail ../../bfd/elf64-bpf.c:139
...
/w/binutils-gdb/bld/binutils/objdump: BFD (GNU Binutils)
2.28.51.20170429 assertion fail ../../bfd/elf64-bpf.c:139
/w/binutils-gdb/bld/binutils/objdump: invalid relocation type 10
/w/binutils-gdb/bld/binutils/objdump: BFD (GNU Binutils)
2.28.51.20170429 assertion fail ../../bfd/elf64-bpf.c:139
0: 18 01 00 00 39 47 98 83 ldimm64 r0, 590618314553
8: 00 00 00 00 89 00 00 00
10: 7b 1a f8 ff 00 00 00 00 stdw [r1+65528], r10
$ clang -O2 -g -target bpfeb -c test.c
$ /w/binutils-gdb/bld/binutils/objdump -S test.o
test.o: file format elf64-bpfbe
Disassembly of section .text:
0000000000000000 <bpf_prog1>:
/w/binutils-gdb/bld/binutils/objdump: invalid relocation type 10
/w/binutils-gdb/bld/binutils/objdump: BFD (GNU Binutils)
2.28.51.20170429 assertion fail ../../bfd/elf64-bpf.c:139
...
/w/binutils-gdb/bld/binutils/objdump: invalid relocation type 10
/w/binutils-gdb/bld/binutils/objdump: Dwarf Error: found address size
'0', this reader can only handle address sizes '2', '4' and '8'.
0: 18 10 00 00 83 98 47 39 ldimm64 r1, 590618314553
8: 00 00 00 00 00 00 00 89
10: 7b a1 ff f8 00 00 00 00 stdw [r10+65528], r1
18: 79 1a ff f8 00 00 00 00 lddw r1, [r10+65528]
With llvm it should be like this:
$ ./bin/clang -O2 -g -target bpfel -c test.c
$ ./bin/llvm-objdump -S test.o
test.o: file format ELF64-BPF
Disassembly of section .text:
bpf_prog1:
; {
0: 18 01 00 00 39 47 98 83 00 00 00 00 89 00 00 00 r1 =
590618314553ll
; volatile unsigned long t = 0x8983984739ull;
2: 7b 1a f8 ff 00 00 00 00 *(u64 *)(r10 - 8) = r1
; return *(unsigned long *)((0xffffffff8fff0002ull) + t);
3: 79 a1 f8 ff 00 00 00 00 r1 = *(u64 *)(r10 - 8)
4: 07 01 00 00 02 00 ff 8f r1 += -1879113726
5: 79 10 00 00 00 00 00 00 r0 = *(u64 *)(r1 + 0)
6: 95 00 00 00 00 00 00 00 exit
The output of llvm with '-g -target bpfeb' is probably partially broken,
since 'llvm-objdump -S test.o' doesn't see any debug in there.
So gnu objdump's error:
Dwarf Error: found address size '0'
is likely legit.
^ permalink raw reply
* Re: ipsec doesn't route TCP with 4.11 kernel
From: Don Bowman @ 2017-04-30 0:39 UTC (permalink / raw)
To: Steffen Klassert
Cc: Cong Wang, linux-kernel@vger.kernel.org, Herbert Xu,
Linux Kernel Network Developers
In-Reply-To: <20170428071337.GG2649@secunet.com>
On 28 April 2017 at 03:13, Steffen Klassert
<steffen.klassert@secunet.com> wrote:
> On Thu, Apr 27, 2017 at 06:13:38PM -0400, Don Bowman wrote:
>> On 27 April 2017 at 04:42, Steffen Klassert <steffen.klassert@secunet.com>
>> wrote:
>> > On Wed, Apr 26, 2017 at 10:01:34PM -0700, Cong Wang wrote:
>> >> (Cc'ing netdev and IPSec maintainers)
>> >>
>> >> On Tue, Apr 25, 2017 at 6:08 PM, Don Bowman <db@donbowman.ca> wrote:
>>
<snip>
confirmed, with this patch in place that the tcp functions properly.
^ permalink raw reply
* [PATCH] net: sunhme: fix spelling mistakes: "ParityErro" -> "ParityError"
From: Colin King @ 2017-04-29 21:38 UTC (permalink / raw)
To: David S . Miller, Philippe Reynes, Jarod Wilson, Tobias Klauser,
netdev
Cc: linux-kernel
From: Colin Ian King <colin.king@canonical.com>
trivial fix to spelling mistakes in printk message.
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
drivers/net/ethernet/sun/sunhme.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c
index a6cc9a2d41c1..9e983e1d8249 100644
--- a/drivers/net/ethernet/sun/sunhme.c
+++ b/drivers/net/ethernet/sun/sunhme.c
@@ -1857,7 +1857,7 @@ static int happy_meal_is_not_so_happy(struct happy_meal *hp, u32 status)
if (status & GREG_STAT_TXLERR)
printk("LateError ");
if (status & GREG_STAT_TXPERR)
- printk("ParityErro ");
+ printk("ParityError ");
if (status & GREG_STAT_TXTERR)
printk("TagBotch ");
printk("]\n");
--
2.11.0
^ permalink raw reply related
* Re: Low speed MPLS to virtio-net
From: David Ahern @ 2017-04-29 20:50 UTC (permalink / raw)
To: Алексей Болдырев,
netdev
In-Reply-To: <428971493234129@web14g.yandex.ru>
On 4/26/17 1:15 PM, Алексей Болдырев wrote:
> Started MPLS on the branch - Everything was fine. When I tried to run MPLS on a real network of virtual machines, there were problems with the speed:
> root@containers:~# iperf3 -c 10.194.10.2 -B 10.194.10.1 -Z
> Connecting to host 10.194.10.2, port 5201
> [ 4] local 10.194.10.1 port 49533 connected to 10.194.10.2 port 5201
> [ ID] Interval Transfer Bandwidth Retr Cwnd
> [ 4] 0.00-1.00 sec 1018 KBytes 8.34 Mbits/sec 238 5.64 KBytes
> [ 4] 1.00-2.00 sec 1.42 MBytes 11.9 Mbits/sec 373 1.41 KBytes
> [ 4] 2.00-3.00 sec 1.43 MBytes 12.0 Mbits/sec 379 5.64 KBytes
> [ 4] 3.00-4.00 sec 1.43 MBytes 12.0 Mbits/sec 376 5.64 KBytes
> [ 4] 4.00-5.00 sec 1.41 MBytes 11.8 Mbits/sec 375 2.82 KBytes
> [ 4] 5.00-6.00 sec 1.42 MBytes 11.9 Mbits/sec 376 2.82 KBytes
> [ 4] 6.00-7.00 sec 1.42 MBytes 11.9 Mbits/sec 373 5.64 KBytes
> [ 4] 7.00-8.00 sec 1.41 MBytes 11.8 Mbits/sec 372 5.64 KBytes
> [ 4] 8.00-9.00 sec 1.42 MBytes 11.9 Mbits/sec 379 2.82 KBytes
> [ 4] 9.00-10.00 sec 1.42 MBytes 11.9 Mbits/sec 373 5.64 KBytes
> - - - - - - - - - - - - - - - - - - - - - - - - -
> [ ID] Interval Transfer Bandwidth Retr
> [ 4] 0.00-10.00 sec 13.8 MBytes 11.5 Mbits/sec 3614 sender
> [ 4] 0.00-10.00 sec 13.6 MBytes 11.4 Mbits/sec receiver
A picture of your network topology is more helpful than dumping network
config commands.
For this topology:
10.10.10.10
+-----+ +-----+ +-----+
| VM1 |-------| VM2 |-------| VM3 |
+-----+ +-----+ +-----+
push 100 pop 100
I get 4+ Gbps using netperf from VM1 to VM3:
$ netperf -c -C -H 10.10.10.10 -l 10 -t TCP_STREAM
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
10.10.10.10 () port 0 AF_INET
Recv Send Send Utilization Service
Demand
Socket Socket Message Elapsed Send Recv Send Recv
Size Size Size Time Throughput local remote local
remote
bytes bytes bytes secs. 10^6bits/s % S % S us/KB us/KB
87380 16384 16384 10.01 4377.58 37.69 60.78 1.411 2.275
So your bandwidth above is really low.
I suggest you confirm a good bandwidth without MPLS to make sure the VM
config is proper.
^ permalink raw reply
* Re: [PATCH v2] iov_iter: don't revert iov buffer if csum error
From: Al Viro @ 2017-04-29 20:48 UTC (permalink / raw)
To: Ding Tianhong
Cc: David Miller, pabeni, edumazet, hannes, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org, LinuxArm, weiyongjun (A)
In-Reply-To: <0cc7c1d2-87d0-6cc4-624b-1cf96678bfd7@huawei.com>
On Sat, Apr 29, 2017 at 05:37:38PM +0800, Ding Tianhong wrote:
> Looks good, if so, we don't need the csum_error any more,
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Dave, I could put that through my tree, but I think it would be better off
in net.git; either way, it needs to go into mainline before -final...
^ permalink raw reply
* Re: [PATCH RFC net-next v4 3/7] sk_buff: remove support for csum_bad in sk_buff
From: Tom Herbert @ 2017-04-29 20:21 UTC (permalink / raw)
To: Davide Caratti
Cc: Alexander Duyck, David Laight, David S . Miller,
Marcelo Ricardo Leitner, Linux Kernel Network Developers,
linux-sctp @ vger . kernel . org
In-Reply-To: <529dbc49e0124c4632e1c40df457fe33553584ca.1492692976.git.dcaratti@redhat.com>
On Thu, Apr 20, 2017 at 6:38 AM, Davide Caratti <dcaratti@redhat.com> wrote:
> This bit was introduced with 5a21232983aa ("net: Support for csum_bad in
> skbuff") to reduce the stack workload when processing RX packets carrying
> a wrong Internet Checksum. Up to now, only one driver (besides GRO core)
> are setting it.
> The test on NAPI_GRO_CB(skb)->flush in dev_gro_receive() is now done
> before the test on same_flow, to preserve behavior in case of wrong
> checksum.
>
> Suggested-by: Tom Herbert <tom@herbertland.com>
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---
> drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 2 +-
> include/linux/netdevice.h | 4 +---
> include/linux/skbuff.h | 23 ++---------------------
> net/bridge/netfilter/nft_reject_bridge.c | 5 +----
> net/core/dev.c | 8 +++-----
> net/ipv4/netfilter/nf_reject_ipv4.c | 2 +-
> net/ipv6/netfilter/nf_reject_ipv6.c | 3 ---
> 7 files changed, 9 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> index 3a8a4aa..9a08179 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> @@ -223,7 +223,7 @@ int aq_ring_rx_clean(struct aq_ring_s *self, int *work_done, int budget)
> skb->protocol = eth_type_trans(skb, ndev);
> if (unlikely(buff->is_cso_err)) {
> ++self->stats.rx.errors;
> - __skb_mark_checksum_bad(skb);
> + skb->ip_summed = CHECKSUM_NONE;
> } else {
> if (buff->is_ip_cso) {
> __skb_incr_checksum_unnecessary(skb);
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index bf84a67..ab9e3dc 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2546,9 +2546,7 @@ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb)
> if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \
> __ret = __skb_gro_checksum_validate_complete(skb, \
> compute_pseudo(skb, proto)); \
> - if (__ret) \
> - __skb_mark_checksum_bad(skb); \
> - else \
> + if (!__ret) \
> skb_gro_incr_csum_unnecessary(skb); \
> __ret; \
> })
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index ec4551b..927309e 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -743,7 +743,7 @@ struct sk_buff {
> __u8 csum_valid:1;
> __u8 csum_complete_sw:1;
> __u8 csum_level:2;
> - __u8 csum_bad:1;
> + __u8 __csum_bad_unused:1; /* one bit hole */
>
> __u8 dst_pending_confirm:1;
> #ifdef CONFIG_IPV6_NDISC_NODETYPE
> @@ -3387,21 +3387,6 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
> }
> }
>
> -static inline void __skb_mark_checksum_bad(struct sk_buff *skb)
> -{
> - /* Mark current checksum as bad (typically called from GRO
> - * path). In the case that ip_summed is CHECKSUM_NONE
> - * this must be the first checksum encountered in the packet.
> - * When ip_summed is CHECKSUM_UNNECESSARY, this is the first
> - * checksum after the last one validated. For UDP, a zero
> - * checksum can not be marked as bad.
> - */
> -
> - if (skb->ip_summed == CHECKSUM_NONE ||
> - skb->ip_summed == CHECKSUM_UNNECESSARY)
> - skb->csum_bad = 1;
> -}
> -
> /* Check if we need to perform checksum complete validation.
> *
> * Returns true if checksum complete is needed, false otherwise
> @@ -3455,9 +3440,6 @@ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb,
> skb->csum_valid = 1;
> return 0;
> }
> - } else if (skb->csum_bad) {
> - /* ip_summed == CHECKSUM_NONE in this case */
> - return (__force __sum16)1;
> }
>
> skb->csum = psum;
> @@ -3517,8 +3499,7 @@ static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto)
>
> static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
> {
> - return (skb->ip_summed == CHECKSUM_NONE &&
> - skb->csum_valid && !skb->csum_bad);
> + return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid);
> }
>
> static inline void __skb_checksum_convert(struct sk_buff *skb,
> diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
> index 346ef6b..c16dd3a 100644
> --- a/net/bridge/netfilter/nft_reject_bridge.c
> +++ b/net/bridge/netfilter/nft_reject_bridge.c
> @@ -111,7 +111,7 @@ static void nft_reject_br_send_v4_unreach(struct net *net,
> __wsum csum;
> u8 proto;
>
> - if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb))
> + if (!nft_bridge_iphdr_validate(oldskb))
> return;
>
> /* IP header checks: fragment. */
> @@ -226,9 +226,6 @@ static bool reject6_br_csum_ok(struct sk_buff *skb, int hook)
> __be16 fo;
> u8 proto = ip6h->nexthdr;
>
> - if (skb->csum_bad)
> - return false;
> -
> if (skb_csum_unnecessary(skb))
> return true;
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index c7aec95..77a2d73 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4533,9 +4533,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> if (!(skb->dev->features & NETIF_F_GRO))
> goto normal;
>
> - if (skb->csum_bad)
> - goto normal;
> -
> gro_list_prepare(napi, skb);
>
> rcu_read_lock();
> @@ -4595,11 +4592,12 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> napi->gro_count--;
> }
>
> + if (NAPI_GRO_CB(skb)->flush)
> + goto normal;
> +
> if (same_flow)
> goto ok;
>
> - if (NAPI_GRO_CB(skb)->flush)
> - goto normal;
>
> if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
> struct sk_buff *nskb = napi->gro_list;
> diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
> index 7cd8d0d..6f8d9e5 100644
> --- a/net/ipv4/netfilter/nf_reject_ipv4.c
> +++ b/net/ipv4/netfilter/nf_reject_ipv4.c
> @@ -172,7 +172,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
> struct iphdr *iph = ip_hdr(skb_in);
> u8 proto;
>
> - if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET))
> + if (iph->frag_off & htons(IP_OFFSET))
> return;
>
> if (skb_csum_unnecessary(skb_in)) {
> diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
> index eedee5d..f63b18e 100644
> --- a/net/ipv6/netfilter/nf_reject_ipv6.c
> +++ b/net/ipv6/netfilter/nf_reject_ipv6.c
> @@ -220,9 +220,6 @@ static bool reject6_csum_ok(struct sk_buff *skb, int hook)
> __be16 fo;
> u8 proto;
>
> - if (skb->csum_bad)
> - return false;
> -
> if (skb_csum_unnecessary(skb))
> return true;
>
> --
> 2.7.4
>
Acked-by: Tom Herbert <tom@herbertland.com>
^ permalink raw reply
* Re: [PATCH RFC net-next v4 7/7] sk_buff.h: improve description of CHECKSUM_{COMPLETE,UNNECESSARY}
From: Tom Herbert @ 2017-04-29 20:20 UTC (permalink / raw)
To: Davide Caratti
Cc: Alexander Duyck, David Laight, David S . Miller,
Marcelo Ricardo Leitner, Linux Kernel Network Developers,
linux-sctp @ vger . kernel . org
In-Reply-To: <a6f87efa3d7a00deaaddd080cd41be8a5b34387e.1492692976.git.dcaratti@redhat.com>
On Thu, Apr 20, 2017 at 6:38 AM, Davide Caratti <dcaratti@redhat.com> wrote:
> Add FCoE to the list of protocols that can set CHECKSUM_UNNECESSARY; add a
> note to CHECKSUM_COMPLETE section to specify that it does not apply to SCTP
> and FCoE protocols.
>
> Suggested-by: Tom Herbert <tom@herbertland.com>
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---
> include/linux/skbuff.h | 7 +++++--
> 1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 4002c11..c902b77 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -109,6 +109,7 @@
> * may perform further validation in this case.
> * GRE: only if the checksum is present in the header.
> * SCTP: indicates the CRC in SCTP header has been validated.
> + * FCOE: indicates the CRC in FC frame has been validated.
> *
> * skb->csum_level indicates the number of consecutive checksums found in
> * the packet minus one that have been verified as CHECKSUM_UNNECESSARY.
> @@ -126,8 +127,10 @@
> * packet as seen by netif_rx() and fills out in skb->csum. Meaning, the
> * hardware doesn't need to parse L3/L4 headers to implement this.
> *
> - * Note: Even if device supports only some protocols, but is able to produce
> - * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY.
> + * Notes:
> + * - Even if device supports only some protocols, but is able to produce
> + * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY.
> + * - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols.
> *
> * CHECKSUM_PARTIAL:
> *
> --
> 2.7.4
>
Acked-by: Tom Herbert <tom@herbertland.com>
^ permalink raw reply
* Re: [PATCH RFC net-next v4 4/7] net: use skb->csum_not_inet to identify packets needing crc32c
From: Tom Herbert @ 2017-04-29 20:18 UTC (permalink / raw)
To: Davide Caratti
Cc: Alexander Duyck, David Laight, David S . Miller,
Marcelo Ricardo Leitner, Linux Kernel Network Developers,
linux-sctp @ vger . kernel . org
In-Reply-To: <4f3a01b66bc46d3a2169a8ef2b4d4285e03f5894.1492692976.git.dcaratti@redhat.com>
On Thu, Apr 20, 2017 at 6:38 AM, Davide Caratti <dcaratti@redhat.com> wrote:
> skb->csum_not_inet carries the indication on which algorithm is needed to
> compute checksum on skb in the transmit path, when skb->ip_summed is equal
> to CHECKSUM_PARTIAL. If skb carries a SCTP packet and crc32c hasn't been
> yet written in L4 header, skb->csum_not_inet is assigned to 1; otherwise,
> assume Internet Checksum is needed and thus set skb->csum_not_inet to 0.
>
> Suggested-by: Tom Herbert <tom@herbertland.com>
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---
> include/linux/skbuff.h | 16 +++++++++-------
> net/core/dev.c | 1 +
> net/sched/act_csum.c | 1 +
> net/sctp/offload.c | 1 +
> net/sctp/output.c | 1 +
> 5 files changed, 13 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 927309e..419f4c8 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -189,12 +189,13 @@
> *
> * NETIF_F_SCTP_CRC - This feature indicates that a device is capable of
> * offloading the SCTP CRC in a packet. To perform this offload the stack
> - * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset
> - * accordingly. Note the there is no indication in the skbuff that the
> - * CHECKSUM_PARTIAL refers to an SCTP checksum, a driver that supports
> - * both IP checksum offload and SCTP CRC offload must verify which offload
> - * is configured for a packet presumably by inspecting packet headers; in
> - * case, skb_crc32c_csum_help is provided to compute CRC on SCTP packets.
> + * will set set csum_start and csum_offset accordingly, set ip_summed to
> + * CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in
> + * the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
> + * A driver that supports both IP checksum offload and SCTP CRC32c offload
> + * must verify which offload is configured for a packet by testing the
> + * value of skb->csum_not_inet; skb_crc32c_csum_help is provided to resolve
> + * CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1.
> *
> * NETIF_F_FCOE_CRC - This feature indicates that a device is capable of
> * offloading the FCOE CRC in a packet. To perform this offload the stack
> @@ -615,6 +616,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
> * @wifi_acked_valid: wifi_acked was set
> * @wifi_acked: whether frame was acked on wifi or not
> * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
> + * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
> * @dst_pending_confirm: need to confirm neighbour
> * @napi_id: id of the NAPI struct this skb came from
> * @secmark: security marking
> @@ -743,7 +745,7 @@ struct sk_buff {
> __u8 csum_valid:1;
> __u8 csum_complete_sw:1;
> __u8 csum_level:2;
> - __u8 __csum_bad_unused:1; /* one bit hole */
> + __u8 csum_not_inet:1;
>
> __u8 dst_pending_confirm:1;
> #ifdef CONFIG_IPV6_NDISC_NODETYPE
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 77a2d73..9f56f87 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2642,6 +2642,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
> }
> *(__le32 *)(skb->data + offset) = crc32c_csum;
> skb->ip_summed = CHECKSUM_NONE;
> + skb->csum_not_inet = 0;
> out:
> return ret;
> }
> diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
> index ab6fdbd..3317a2f 100644
> --- a/net/sched/act_csum.c
> +++ b/net/sched/act_csum.c
> @@ -350,6 +350,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
> sctph->checksum = sctp_compute_cksum(skb,
> skb_network_offset(skb) + ihl);
> skb->ip_summed = CHECKSUM_NONE;
> + skb->csum_not_inet = 0;
>
> return 1;
> }
> diff --git a/net/sctp/offload.c b/net/sctp/offload.c
> index 378f462..ef156ac 100644
> --- a/net/sctp/offload.c
> +++ b/net/sctp/offload.c
> @@ -35,6 +35,7 @@
> static __le32 sctp_gso_make_checksum(struct sk_buff *skb)
> {
> skb->ip_summed = CHECKSUM_NONE;
> + skb->csum_not_inet = 0;
> return sctp_compute_cksum(skb, skb_transport_offset(skb));
> }
>
> diff --git a/net/sctp/output.c b/net/sctp/output.c
> index 1409a87..e2edf2e 100644
> --- a/net/sctp/output.c
> +++ b/net/sctp/output.c
> @@ -538,6 +538,7 @@ static int sctp_packet_pack(struct sctp_packet *packet,
> } else {
> chksum:
> head->ip_summed = CHECKSUM_PARTIAL;
> + head->csum_not_inet = 1;
> head->csum_start = skb_transport_header(head) - head->head;
> head->csum_offset = offsetof(struct sctphdr, checksum);
> }
> --
> 2.7.4
>
Looks great!
Acked-by: Tom Herbert <tom@herbertland.com>
^ permalink raw reply
* Re: assembler mnenomics for call/tailcall plus maps...
From: David Miller @ 2017-04-29 18:38 UTC (permalink / raw)
To: ast; +Cc: daniel, netdev, xdp-newbies
In-Reply-To: <398986a4-ef4b-6631-c6eb-96505e0a5849@fb.com>
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 27 Apr 2017 19:06:14 -0700
> So in asm the map lookup will look like:
> .section maps,"aw",@progbits
> .globl hashmap_def
> hashmap_def:
> .long 1 # type
> .long 24 # key_size
> .long 40 # value_size
> .long 256 # max_entries
>
> .text
> .section xdp_tx_iptunnel,"ax",@progbits
> .globl _xdp_prog
> _xdp_prog:
> ldimm64 r1, hashmap_def
> mov r2, r10
> add r2, -8
> call bpf_map_lookup_elem
>
> this is 64-bit relo for ldimm64 insn
>
> This is how it's defined in llvm:
> ELF_RELOC(R_BPF_NONE, 0)
> ELF_RELOC(R_BPF_64_64, 1)
> ELF_RELOC(R_BPF_64_32, 10)
>
> The R_BPF_64_64 is the only relocation against .text
> The other one is used for relo into dwarf sections.
There is another missing element here, we don't use the relocation to
write the actual address of "&hashmap_def" into the ldimm64
instruction.
Instead, we use the relocation to write the file descriptor number for
that map into the instruction. The only parts of the relocation that
are used are the offset (to find the BPF instruction) and the symbol
reference (to find out which map we are referencing).
The BPF loader also doesn't even check the relocation number to make
sure it's of the correct type. It just assumes that any relocation it
sees is exactly this kind used for maps.
My relocations in binutils currently look like this:
START_RELOC_NUMBERS (elf_bpf_reloc_type)
RELOC_NUMBER (R_BPF_NONE, 0)
RELOC_NUMBER (R_BPF_INSN_16, 1)
RELOC_NUMBER (R_BPF_INSN_32, 2)
RELOC_NUMBER (R_BPF_INSN_64, 3)
RELOC_NUMBER (R_BPF_WDISP16, 4)
RELOC_NUMBER (R_BPF_DATA_8, 5)
RELOC_NUMBER (R_BPF_DATA_16, 6)
RELOC_NUMBER (R_BPF_DATA_32, 7)
RELOC_NUMBER (R_BPF_DATA_64, 8)
END_RELOC_NUMBERS (R_BPF_max)
There is of course R_BPF_NONE, and then we have 4 relocations for
instructions. One for the 16-bit offset field (non-pc-relative), one
for the 32-bit immediate field (again, non-pc-relative), one for
ldimm64's immediate field, and finally R_BPF_WDISP16 for doing a
pc-relative relocation to the offset field of a jump instruction.
Then we have the usual data relocations, for 8, 16, 32, and 64-bit.
I guess LLVM's R_BPF_64_64 is the one used for the ldimm64 reference
to a map, and is equivalent to R_BPF_INSN_64 above.
We just need to sort out how we want this semantically to interconnect
etc.
In binutils we can make the MAP reference special and explicit, so we
would for example add:
RELOC_NUMBER (R_BPF_MAP, 9)
or whatever. And then for assembler syntax, use something like:
%map(SYMBOL)
So you would go:
ldimm64 r1, %map(hash_map)
or, taking it one step further, do the following since we know this
maps to a 32-bit FD:
mov32 r1, %map(hash_map)
and this way avoid eating up a 16-byte opcode just for this.
In GCC it will be simple to get the backend to emit this, various
options exist. We can make it a special "__attribute__((map))", or
use address spaces to annotate the map object. And then when the
ldimm64 or whatever instruction is emitted, and it sees the symbol
referenced has this special type, it will emit "%%map(%s)" instead of
just "%s" for the symbol name in the asembler output.
That in turn will lead to the correct relocation.
But I guess for now what I could do is just make R_BPF_INSN_64 have
the same number as LLVM's R_BPF_64_64 and it should "just work" using
tooling.
I think we should spend serious time properly designing the
relocations and thinking ahead about people perhaps wanting to link
multiple objects together, call functions in other objects, and
perhaps even doing dynamic relocations. Nothing fundamentally in
eBPF prevents this.
^ permalink raw reply
* Re: [PATCH net-next 11/11] ibmvnic: Move queue restart
From: kbuild test robot @ 2017-04-29 17:18 UTC (permalink / raw)
To: Nathan Fontenot; +Cc: kbuild-all, netdev, brking, jallen, muvic, tlfalcon
In-Reply-To: <20170429081634.13438.38831.stgit@ltcalpine2-lp23.aus.stglabs.ibm.com>
[-- Attachment #1: Type: text/plain, Size: 4324 bytes --]
Hi Nathan,
[auto build test WARNING on net-next/master]
url: https://github.com/0day-ci/linux/commits/Nathan-Fontenot/ibmvnic-Updated-reset-handler-and-code-fixes/20170430-004245
config: powerpc-allmodconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=powerpc
Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
All warnings (new ones prefixed by >>):
In file included from include/linux/if_ether.h:23:0,
from include/uapi/linux/ethtool.h:18,
from include/linux/ethtool.h:17,
from include/linux/netdevice.h:42,
from drivers/net//ethernet/ibm/ibmvnic.c:54:
drivers/net//ethernet/ibm/ibmvnic.c: In function 'ibmvnic_interrupt_tx':
>> include/linux/skbuff.h:3689:12: warning: 'skb' may be used uninitialized in this function [-Wmaybe-uninitialized]
return skb->queue_mapping;
~~~^~~~~~~~~~~~~~~
drivers/net//ethernet/ibm/ibmvnic.c:1779:18: note: 'skb' was declared here
struct sk_buff *skb;
^~~
--
In file included from include/linux/if_ether.h:23:0,
from include/uapi/linux/ethtool.h:18,
from include/linux/ethtool.h:17,
from include/linux/netdevice.h:42,
from drivers/net/ethernet/ibm/ibmvnic.c:54:
drivers/net/ethernet/ibm/ibmvnic.c: In function 'ibmvnic_interrupt_tx':
>> include/linux/skbuff.h:3689:12: warning: 'skb' may be used uninitialized in this function [-Wmaybe-uninitialized]
return skb->queue_mapping;
~~~^~~~~~~~~~~~~~~
drivers/net/ethernet/ibm/ibmvnic.c:1779:18: note: 'skb' was declared here
struct sk_buff *skb;
^~~
vim +/skb +3689 include/linux/skbuff.h
574f7194 Eric W. Biederman 2014-04-01 3673 return !skb->destructor &&
574f7194 Eric W. Biederman 2014-04-01 3674 #if IS_ENABLED(CONFIG_XFRM)
574f7194 Eric W. Biederman 2014-04-01 3675 !skb->sp &&
574f7194 Eric W. Biederman 2014-04-01 3676 #endif
cb9c6836 Florian Westphal 2017-01-23 3677 !skb_nfct(skb) &&
574f7194 Eric W. Biederman 2014-04-01 3678 !skb->_skb_refdst &&
574f7194 Eric W. Biederman 2014-04-01 3679 !skb_has_frag_list(skb);
574f7194 Eric W. Biederman 2014-04-01 3680 }
574f7194 Eric W. Biederman 2014-04-01 3681
f25f4e44 Peter P Waskiewicz Jr 2007-07-06 3682 static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
f25f4e44 Peter P Waskiewicz Jr 2007-07-06 3683 {
f25f4e44 Peter P Waskiewicz Jr 2007-07-06 3684 skb->queue_mapping = queue_mapping;
f25f4e44 Peter P Waskiewicz Jr 2007-07-06 3685 }
f25f4e44 Peter P Waskiewicz Jr 2007-07-06 3686
9247744e Stephen Hemminger 2009-03-21 3687 static inline u16 skb_get_queue_mapping(const struct sk_buff *skb)
4e3ab47a Pavel Emelyanov 2007-10-21 3688 {
4e3ab47a Pavel Emelyanov 2007-10-21 @3689 return skb->queue_mapping;
4e3ab47a Pavel Emelyanov 2007-10-21 3690 }
4e3ab47a Pavel Emelyanov 2007-10-21 3691
f25f4e44 Peter P Waskiewicz Jr 2007-07-06 3692 static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from)
f25f4e44 Peter P Waskiewicz Jr 2007-07-06 3693 {
f25f4e44 Peter P Waskiewicz Jr 2007-07-06 3694 to->queue_mapping = from->queue_mapping;
f25f4e44 Peter P Waskiewicz Jr 2007-07-06 3695 }
f25f4e44 Peter P Waskiewicz Jr 2007-07-06 3696
d5a9e24a David S. Miller 2009-01-27 3697 static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
:::::: The code at line 3689 was first introduced by commit
:::::: 4e3ab47a547616e583c7a5458beced6aa34c8ef3 [NET]: Make and use skb_get_queue_mapping
:::::: TO: Pavel Emelyanov <xemul@openvz.org>
:::::: CC: David S. Miller <davem@sunset.davemloft.net>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 52793 bytes --]
^ permalink raw reply
* (unknown),
From: Dmitry Bazhenov @ 2017-04-29 15:25 UTC (permalink / raw)
To: netdev
unsubscribe
^ permalink raw reply
* Re: [PATCH net-next] lwtunnel: fix error path in lwtunnel_fill_encap()
From: David Ahern @ 2017-04-29 13:03 UTC (permalink / raw)
To: Dan Carpenter, David S. Miller, Pan Bian
Cc: Roopa Prabhu, Robert Shearman, Tom Herbert, David Lebrun, netdev,
kernel-janitors
In-Reply-To: <20170428130347.53hagk77wh6scmcs@mwanda>
On 4/28/17 7:03 AM, Dan Carpenter wrote:
> We recently added a check to see if nla_nest_start() fails. There are
> two issues with that. First, if it fails then I don't think we should
> call nla_nest_cancel(). Second, it's slightly convoluted but the
> current code returns success but we should return -EMSGSIZE instead.
>
> Fixes: a50fe0ffd76f ("lwtunnel: check return value of nla_nest_start")
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
^ permalink raw reply
* 54179 netdev
From: kindergartenchaos2 @ 2017-04-29 12:02 UTC (permalink / raw)
To: netdev
[-- Attachment #1: 93791932238.zip --]
[-- Type: application/zip, Size: 2992 bytes --]
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox