* [PATCH 1/3] i40e: Initial support for XDP
From: Björn Töpel @ 2016-12-08 17:00 UTC (permalink / raw)
To: jeffrey.t.kirsher, intel-wired-lan
Cc: Björn Töpel, john.r.fastabend, magnus.karlsson, netdev
In-Reply-To: <20161208170022.11555-1-bjorn.topel@gmail.com>
From: Björn Töpel <bjorn.topel@intel.com>
This commit adds basic XDP support for i40e derived NICs. All XDP
actions will end up in XDP_DROP.
Only the default/main VSI has support for enabling XDP.
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
drivers/net/ethernet/intel/i40e/i40e.h | 13 +++
drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 3 +
drivers/net/ethernet/intel/i40e/i40e_main.c | 74 +++++++++++++
drivers/net/ethernet/intel/i40e/i40e_txrx.c | 146 ++++++++++++++++++++-----
drivers/net/ethernet/intel/i40e/i40e_txrx.h | 2 +
5 files changed, 213 insertions(+), 25 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index ba8d30984bee..05d805f439e6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -545,6 +545,8 @@ struct i40e_vsi {
struct i40e_ring **rx_rings;
struct i40e_ring **tx_rings;
+ struct bpf_prog *xdp_prog;
+
u32 active_filters;
u32 promisc_threshold;
@@ -904,4 +906,15 @@ i40e_status i40e_get_npar_bw_setting(struct i40e_pf *pf);
i40e_status i40e_set_npar_bw_setting(struct i40e_pf *pf);
i40e_status i40e_commit_npar_bw_setting(struct i40e_pf *pf);
void i40e_print_link_message(struct i40e_vsi *vsi, bool isup);
+
+/**
+ * i40e_enabled_xdp_vsi - Check if VSI has XDP enabled
+ * @vsi: pointer to a vsi
+ *
+ * Returns true if the VSI has XDP enabled.
+ **/
+static inline bool i40e_enabled_xdp_vsi(const struct i40e_vsi *vsi)
+{
+ return vsi->xdp_prog;
+}
#endif /* _I40E_H_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index cc1465aac2ef..831bbc208fc8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1254,6 +1254,9 @@ static int i40e_set_ringparam(struct net_device *netdev,
if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending))
return -EINVAL;
+ if (i40e_enabled_xdp_vsi(vsi))
+ return -EINVAL;
+
if (ring->tx_pending > I40E_MAX_NUM_DESCRIPTORS ||
ring->tx_pending < I40E_MIN_NUM_DESCRIPTORS ||
ring->rx_pending > I40E_MAX_NUM_DESCRIPTORS ||
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index da4cbe32eb86..db0240213f3b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -24,6 +24,7 @@
*
******************************************************************************/
+#include <linux/bpf.h>
#include <linux/etherdevice.h>
#include <linux/of_net.h>
#include <linux/pci.h>
@@ -2431,6 +2432,13 @@ static int i40e_change_mtu(struct net_device *netdev, int new_mtu)
struct i40e_netdev_priv *np = netdev_priv(netdev);
struct i40e_vsi *vsi = np->vsi;
+ if (i40e_enabled_xdp_vsi(vsi)) {
+ int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+
+ if (max_frame > I40E_RXBUFFER_2048)
+ return -EINVAL;
+ }
+
netdev_info(netdev, "changing MTU from %d to %d\n",
netdev->mtu, new_mtu);
netdev->mtu = new_mtu;
@@ -3085,6 +3093,15 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
writel(0, ring->tail);
+ if (i40e_enabled_xdp_vsi(vsi)) {
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_add(vsi->xdp_prog, 1);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ ring->xdp_prog = prog;
+ }
+
i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
return 0;
@@ -9234,6 +9251,62 @@ static netdev_features_t i40e_features_check(struct sk_buff *skb,
return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
}
+/**
+ * i40e_xdp_setup - Add/remove an XDP program to a VSI
+ * @vsi: the VSI to add the program
+ * @prog: the XDP program
+ **/
+static int i40e_xdp_setup(struct i40e_vsi *vsi,
+ struct bpf_prog *prog)
+{
+ struct i40e_pf *pf = vsi->back;
+ struct net_device *netdev = vsi->netdev;
+ int frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+
+ if (frame_size > I40E_RXBUFFER_2048)
+ return -EINVAL;
+
+ if (!(pf->flags & I40E_FLAG_MSIX_ENABLED))
+ return -EINVAL;
+
+ if (!i40e_enabled_xdp_vsi(vsi) && !prog)
+ return 0;
+
+ i40e_prep_for_reset(pf);
+
+ if (vsi->xdp_prog)
+ bpf_prog_put(vsi->xdp_prog);
+ vsi->xdp_prog = prog;
+
+ i40e_reset_and_rebuild(pf, true);
+ return 0;
+}
+
+/**
+ * i40e_xdp - NDO for enabled/query
+ * @dev: the netdev
+ * @xdp: XDP program
+ **/
+static int i40e_xdp(struct net_device *dev,
+ struct netdev_xdp *xdp)
+{
+ struct i40e_netdev_priv *np = netdev_priv(dev);
+ struct i40e_vsi *vsi = np->vsi;
+
+ if (vsi->type != I40E_VSI_MAIN)
+ return -EINVAL;
+
+ switch (xdp->command) {
+ case XDP_SETUP_PROG:
+ return i40e_xdp_setup(vsi, xdp->prog);
+ case XDP_QUERY_PROG:
+ xdp->prog_attached = i40e_enabled_xdp_vsi(vsi);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
static const struct net_device_ops i40e_netdev_ops = {
.ndo_open = i40e_open,
.ndo_stop = i40e_close,
@@ -9270,6 +9343,7 @@ static const struct net_device_ops i40e_netdev_ops = {
.ndo_features_check = i40e_features_check,
.ndo_bridge_getlink = i40e_ndo_bridge_getlink,
.ndo_bridge_setlink = i40e_ndo_bridge_setlink,
+ .ndo_xdp = i40e_xdp,
};
/**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 352cf7cd2ef4..d835a51dafa6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -24,6 +24,7 @@
*
******************************************************************************/
+#include <linux/bpf.h>
#include <linux/prefetch.h>
#include <net/busy_poll.h>
#include "i40e.h"
@@ -1040,6 +1041,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
rx_ring->next_to_alloc = 0;
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
+
+ if (rx_ring->xdp_prog) {
+ bpf_prog_put(rx_ring->xdp_prog);
+ rx_ring->xdp_prog = NULL;
+ }
}
/**
@@ -1600,30 +1606,104 @@ static bool i40e_add_rx_frag(struct i40e_ring *rx_ring,
}
/**
+ * i40e_run_xdp - Runs an XDP program for an Rx ring
+ * @rx_ring: Rx ring used for XDP
+ * @rx_buffer: current Rx buffer
+ * @rx_desc: current Rx descriptor
+ * @xdp_prog: the XDP program to run
+ *
+ * Returns true if the XDP program consumed the incoming frame. False
+ * means pass the frame to the good old stack.
+ **/
+static bool i40e_run_xdp(struct i40e_ring *rx_ring,
+ struct i40e_rx_buffer *rx_buffer,
+ union i40e_rx_desc *rx_desc,
+ struct bpf_prog *xdp_prog)
+{
+ u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
+ unsigned int size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
+ I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
+ struct xdp_buff xdp;
+ u32 xdp_action;
+
+ WARN_ON(!i40e_test_staterr(rx_desc,
+ BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)));
+
+ xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset;
+ xdp.data_end = xdp.data + size;
+ xdp_action = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+ switch (xdp_action) {
+ case XDP_PASS:
+ return false;
+ default:
+ bpf_warn_invalid_xdp_action(xdp_action);
+ case XDP_ABORTED:
+ case XDP_TX:
+ case XDP_DROP:
+ if (likely(!i40e_page_is_reserved(rx_buffer->page))) {
+ i40e_reuse_rx_page(rx_ring, rx_buffer);
+ rx_ring->rx_stats.page_reuse_count++;
+ break;
+ }
+
+ /* we are not reusing the buffer so unmap it */
+ dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
+ DMA_FROM_DEVICE);
+ __free_pages(rx_buffer->page, 0);
+ }
+
+ /* clear contents of buffer_info */
+ rx_buffer->page = NULL;
+ return true; /* Swallowed by XDP */
+}
+
+/**
* i40e_fetch_rx_buffer - Allocate skb and populate it
* @rx_ring: rx descriptor ring to transact packets on
* @rx_desc: descriptor containing info written by hardware
+ * @skb: The allocated skb, if any
*
- * This function allocates an skb on the fly, and populates it with the page
- * data from the current receive descriptor, taking care to set up the skb
- * correctly, as well as handling calling the page recycle function if
- * necessary.
+ * Unless XDP is enabled, this function allocates an skb on the fly,
+ * and populates it with the page data from the current receive
+ * descriptor, taking care to set up the skb correctly, as well as
+ * handling calling the page recycle function if necessary.
+ *
+ * If the received frame was handled by XDP, true is
+ * returned. Otherwise, the skb is returned to the caller via the skb
+ * parameter.
*/
static inline
-struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
- union i40e_rx_desc *rx_desc)
+bool i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
+ union i40e_rx_desc *rx_desc,
+ struct sk_buff **skb)
{
struct i40e_rx_buffer *rx_buffer;
- struct sk_buff *skb;
struct page *page;
rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
page = rx_buffer->page;
prefetchw(page);
- skb = rx_buffer->skb;
+ /* we are reusing so sync this buffer for CPU use */
+ dma_sync_single_range_for_cpu(rx_ring->dev,
+ rx_buffer->dma,
+ rx_buffer->page_offset,
+ I40E_RXBUFFER_2048,
+ DMA_FROM_DEVICE);
+
+ if (rx_ring->xdp_prog) {
+ bool xdp_consumed;
+
+ xdp_consumed = i40e_run_xdp(rx_ring, rx_buffer,
+ rx_desc, rx_ring->xdp_prog);
+ if (xdp_consumed)
+ return true;
+ }
- if (likely(!skb)) {
+ *skb = rx_buffer->skb;
+
+ if (likely(!*skb)) {
void *page_addr = page_address(page) + rx_buffer->page_offset;
/* prefetch first cache line of first page */
@@ -1633,32 +1713,25 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
#endif
/* allocate a skb to store the frags */
- skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
- I40E_RX_HDR_SIZE,
- GFP_ATOMIC | __GFP_NOWARN);
- if (unlikely(!skb)) {
+ *skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
+ I40E_RX_HDR_SIZE,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!*skb)) {
rx_ring->rx_stats.alloc_buff_failed++;
- return NULL;
+ return false;
}
/* we will be copying header into skb->data in
* pskb_may_pull so it is in our interest to prefetch
* it now to avoid a possible cache miss
*/
- prefetchw(skb->data);
+ prefetchw((*skb)->data);
} else {
rx_buffer->skb = NULL;
}
- /* we are reusing so sync this buffer for CPU use */
- dma_sync_single_range_for_cpu(rx_ring->dev,
- rx_buffer->dma,
- rx_buffer->page_offset,
- I40E_RXBUFFER_2048,
- DMA_FROM_DEVICE);
-
/* pull page into skb */
- if (i40e_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
+ if (i40e_add_rx_frag(rx_ring, rx_buffer, rx_desc, *skb)) {
/* hand second half of page back to the ring */
i40e_reuse_rx_page(rx_ring, rx_buffer);
rx_ring->rx_stats.page_reuse_count++;
@@ -1671,7 +1744,7 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
/* clear contents of buffer_info */
rx_buffer->page = NULL;
- return skb;
+ return false;
}
/**
@@ -1716,6 +1789,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
}
/**
+ * i40e_update_rx_next_to_clean - Bumps the next-to-clean for an Rx ring
+ * @rx_ring: Rx ring to bump
+ **/
+static void i40e_update_rx_next_to_clean(struct i40e_ring *rx_ring)
+{
+ u32 ntc = rx_ring->next_to_clean + 1;
+
+ ntc = (ntc < rx_ring->count) ? ntc : 0;
+ rx_ring->next_to_clean = ntc;
+
+ prefetch(I40E_RX_DESC(rx_ring, ntc));
+}
+
+/**
* i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
* @rx_ring: rx descriptor ring to transact packets on
* @budget: Total limit on number of packets to process
@@ -1739,6 +1826,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
u16 vlan_tag;
u8 rx_ptype;
u64 qword;
+ bool xdp_consumed;
/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
@@ -1764,7 +1852,15 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
*/
dma_rmb();
- skb = i40e_fetch_rx_buffer(rx_ring, rx_desc);
+ xdp_consumed = i40e_fetch_rx_buffer(rx_ring, rx_desc, &skb);
+ if (xdp_consumed) {
+ cleaned_count++;
+
+ i40e_update_rx_next_to_clean(rx_ring);
+ total_rx_packets++;
+ continue;
+ }
+
if (!skb)
break;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index e065321ce8ed..957d856a82c4 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -341,6 +341,8 @@ struct i40e_ring {
struct rcu_head rcu; /* to avoid race on free */
u16 next_to_alloc;
+
+ struct bpf_prog *xdp_prog;
} ____cacheline_internodealigned_in_smp;
enum i40e_latency_range {
--
2.9.3
^ permalink raw reply related
* Re: [PATCH net] phy: Don't increment MDIO bus refcount unless it's a different owner
From: Johan Hovold @ 2016-12-08 17:01 UTC (permalink / raw)
To: Florian Fainelli; +Cc: Johan Hovold, netdev, rmk+kernel, andrew
In-Reply-To: <81ffa62a-b385-94ca-2396-f2137a320e8b@gmail.com>
On Thu, Dec 08, 2016 at 08:47:54AM -0800, Florian Fainelli wrote:
> On 12/08/2016 08:27 AM, Johan Hovold wrote:
> > On Tue, Dec 06, 2016 at 08:54:43PM -0800, Florian Fainelli wrote:
> >> Commit 3e3aaf649416 ("phy: fix mdiobus module safety") fixed the way we
> >> dealt with MDIO bus module reference count, but sort of introduced a
> >> regression in that, if an Ethernet driver registers its own MDIO bus
> >> driver, as is common, we will end up with the Ethernet driver's
> >> module->refnct set to 1, thus preventing this driver from any removal.
> >>
> >> Fix this by comparing the network device's device driver owner against
> >> the MDIO bus driver owner, and only if they are different, increment the
> >> MDIO bus module refcount.
> >>
> >> Fixes: 3e3aaf649416 ("phy: fix mdiobus module safety")
> >> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> >> ---
> >> Russell,
> >>
> >> I verified this against the ethoc driver primarily (on a TS7300 board)
> >> and bcmgenet.
> >>
> >> Thanks!
> >>
> >> drivers/net/phy/phy_device.c | 16 +++++++++++++---
> >> 1 file changed, 13 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> >> index 1a4bf8acad78..c4ceb082e970 100644
> >> --- a/drivers/net/phy/phy_device.c
> >> +++ b/drivers/net/phy/phy_device.c
> >> @@ -857,11 +857,17 @@ EXPORT_SYMBOL(phy_attached_print);
> >> int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
> >> u32 flags, phy_interface_t interface)
> >> {
> >> + struct module *ndev_owner = dev->dev.parent->driver->owner;
> >
> > Is this really safe? A driver does not need to set a parent device, and
> > in that case you get a NULL-deref here (I tried using cpsw).
>
> Humm, cpsw does call SET_NETDEV_DEV() which should take care of that, is
> the call made too late? Do you have an example oops?
Sorry if I was being unclear, cpsw does set a parent device, but there
are network drivers that do not. Perhaps such drivers will never hit this
code path, but I can't say for sure and everything appears to work for
cpsw if you comment out that SET_NETDEV_DEV (well, at least before this
patch).
> I don't mind safeguarding this with a check against dev->dev.parent, but
> I would like to fix the drivers where relevant too, since
> SET_NETDEV_DEV() should really be called, otherwise a number of things
> just don't work
I grepped for register_netdev and think I saw a number of drivers
which do not call SET_NETDEV_DEV.
Again, perhaps they will never hit this path, but thought I should ask.
Johan
^ permalink raw reply
* [PATCH 0/3] i40e: Support for XDP
From: Björn Töpel @ 2016-12-08 17:00 UTC (permalink / raw)
To: jeffrey.t.kirsher, intel-wired-lan
Cc: Björn Töpel, john.r.fastabend, magnus.karlsson, netdev
From: Björn Töpel <bjorn.topel@intel.com>
This series adds XDP support for i40e-based NICs.
The first patch adds XDP_RX support, the second XDP_TX support and the
last patch makes it possible to change an XDP program without
rebuilding the rings.
Björn
Björn Töpel (3):
i40e: Initial support for XDP
i40e: Add XDP_TX support
i40e: Don't reset/rebuild rings on XDP program swap
drivers/net/ethernet/intel/i40e/i40e.h | 18 +
drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 3 +
drivers/net/ethernet/intel/i40e/i40e_main.c | 358 +++++++++++++++++---
drivers/net/ethernet/intel/i40e/i40e_txrx.c | 445 +++++++++++++++++++++----
drivers/net/ethernet/intel/i40e/i40e_txrx.h | 7 +
5 files changed, 715 insertions(+), 116 deletions(-)
--
2.9.3
^ permalink raw reply
* Re: [PATCH v4 net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
From: John Fastabend @ 2016-12-08 17:00 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann
Cc: Martin KaFai Lau, netdev, Alexei Starovoitov, Brenden Blanco,
David Miller, Jakub Kicinski, Jesper Dangaard Brouer,
Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <20161208165515.GA92599@ast-mbp.thefacebook.com>
On 16-12-08 08:55 AM, Alexei Starovoitov wrote:
> On Thu, Dec 08, 2016 at 10:02:37AM +0100, Daniel Borkmann wrote:
>> On 12/08/2016 12:53 AM, Martin KaFai Lau wrote:
>>> This patch allows XDP prog to extend/remove the packet
>>> data at the head (like adding or removing header). It is
>>> done by adding a new XDP helper bpf_xdp_adjust_head().
>>>
>>> It also renames bpf_helper_changes_skb_data() to
>>> bpf_helper_changes_pkt_data() to better reflect
>>> that XDP prog does not work on skb.
>>>
>>> This patch adds one "xdp_adjust_head" bit to bpf_prog for the
>>> XDP-capable driver to check if the XDP prog requires
>>> bpf_xdp_adjust_head() support. The driver can then decide
>>> to error out during XDP_SETUP_PROG.
>>>
>>> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
>>
>> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
>
Thanks for fixing up the xdp_buff struct. And for good measure,
Acked-by: John Fastabend <john.r.fastabend@intel.com>
^ permalink raw reply
* Re: [PATCH net-next v6 0/2] net/sched: cls_flower: Support matching on ICMP
From: David Miller @ 2016-12-08 16:58 UTC (permalink / raw)
To: simon.horman; +Cc: jiri, tom, netdev, ogerlitz
In-Reply-To: <1481114908-24189-1-git-send-email-simon.horman@netronome.com>
From: Simon Horman <simon.horman@netronome.com>
Date: Wed, 7 Dec 2016 13:48:26 +0100
> this series adds support for matching on ICMP type and code to
> cls_flower.
Series applied, thanks Simon. I guess we'll see some netronome
offloading of this stuff coming soon? :-)
Simon and Or, you both added extensions to cls_flower at the same
time. Or's changes went in first, so his UAPI numbers did not change.
Simon, your changes went in next so your numbers did change and
therefore you will have to recompile any userland components you were
using for testing.
Just FYI...
^ permalink raw reply
* Re: [PATCH v4 net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
From: Alexei Starovoitov @ 2016-12-08 16:55 UTC (permalink / raw)
To: Daniel Borkmann
Cc: Martin KaFai Lau, netdev, Alexei Starovoitov, Brenden Blanco,
David Miller, Jakub Kicinski, Jesper Dangaard Brouer,
John Fastabend, Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <584921AD.3060004@iogearbox.net>
On Thu, Dec 08, 2016 at 10:02:37AM +0100, Daniel Borkmann wrote:
> On 12/08/2016 12:53 AM, Martin KaFai Lau wrote:
> >This patch allows XDP prog to extend/remove the packet
> >data at the head (like adding or removing header). It is
> >done by adding a new XDP helper bpf_xdp_adjust_head().
> >
> >It also renames bpf_helper_changes_skb_data() to
> >bpf_helper_changes_pkt_data() to better reflect
> >that XDP prog does not work on skb.
> >
> >This patch adds one "xdp_adjust_head" bit to bpf_prog for the
> >XDP-capable driver to check if the XDP prog requires
> >bpf_xdp_adjust_head() support. The driver can then decide
> >to error out during XDP_SETUP_PROG.
> >
> >Signed-off-by: Martin KaFai Lau <kafai@fb.com>
>
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
^ permalink raw reply
* Re: [PATCH net] phy: Don't increment MDIO bus refcount unless it's a different owner
From: Florian Fainelli @ 2016-12-08 16:47 UTC (permalink / raw)
To: Johan Hovold; +Cc: netdev, rmk+kernel, andrew
In-Reply-To: <20161208162740.GI31573@localhost>
On 12/08/2016 08:27 AM, Johan Hovold wrote:
> On Tue, Dec 06, 2016 at 08:54:43PM -0800, Florian Fainelli wrote:
>> Commit 3e3aaf649416 ("phy: fix mdiobus module safety") fixed the way we
>> dealt with MDIO bus module reference count, but sort of introduced a
>> regression in that, if an Ethernet driver registers its own MDIO bus
>> driver, as is common, we will end up with the Ethernet driver's
>> module->refnct set to 1, thus preventing this driver from any removal.
>>
>> Fix this by comparing the network device's device driver owner against
>> the MDIO bus driver owner, and only if they are different, increment the
>> MDIO bus module refcount.
>>
>> Fixes: 3e3aaf649416 ("phy: fix mdiobus module safety")
>> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
>> ---
>> Russell,
>>
>> I verified this against the ethoc driver primarily (on a TS7300 board)
>> and bcmgenet.
>>
>> Thanks!
>>
>> drivers/net/phy/phy_device.c | 16 +++++++++++++---
>> 1 file changed, 13 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>> index 1a4bf8acad78..c4ceb082e970 100644
>> --- a/drivers/net/phy/phy_device.c
>> +++ b/drivers/net/phy/phy_device.c
>> @@ -857,11 +857,17 @@ EXPORT_SYMBOL(phy_attached_print);
>> int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
>> u32 flags, phy_interface_t interface)
>> {
>> + struct module *ndev_owner = dev->dev.parent->driver->owner;
>
> Is this really safe? A driver does not need to set a parent device, and
> in that case you get a NULL-deref here (I tried using cpsw).
Humm, cpsw does call SET_NETDEV_DEV() which should take care of that, is
the call made too late? Do you have an example oops?
I don't mind safeguarding this with a check against dev->dev.parent, but
I would like to fix the drivers where relevant too, since
SET_NETDEV_DEV() should really be called, otherwise a number of things
just don't work
>
>> struct mii_bus *bus = phydev->mdio.bus;
>> struct device *d = &phydev->mdio.dev;
>> int err;
>>
>> - if (!try_module_get(bus->owner)) {
>> + /* For Ethernet device drivers that register their own MDIO bus, we
>> + * will have bus->owner match ndev_mod, so we do not want to increment
>
> You also wanted s/ndev_mod/ndev_owner/ here.
Meh, it's merged now, but thanks, I will fix this once we find out the
proper solution for cpsw.
--
Florian
^ permalink raw reply
* Re: [RESEND PATCH 3/3] net: stmmac: stmmac_platform: use correct setup function for gmac4
From: David Miller @ 2016-12-08 16:36 UTC (permalink / raw)
To: niklas.cassel
Cc: peppe.cavallaro, alexandre.torgue, niklass, netdev, linux-kernel
In-Reply-To: <1481114469-4788-3-git-send-email-niklass@axis.com>
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Wed, 7 Dec 2016 13:41:08 +0100
> From: Niklas Cassel <niklas.cassel@axis.com>
>
> devicetree binding for stmmac states:
> - compatible: Should be "snps,dwmac-<ip_version>", "snps,dwmac"
> For backwards compatibility: "st,spear600-gmac" is also supported.
>
> Previously, when specifying "snps,dwmac-4.10a", "snps,dwmac" as your
> compatible string, plat_stmmacenet_data would have both has_gmac and
> has_gmac4 set.
>
> This would lead to stmmac_hw_init calling dwmac1000_setup rather than
> dwmac4_setup, resulting in a non-functional driver.
> This happened since the check for has_gmac is done before the check for
> has_gmac4. However, the order should not matter, so it does not make sense
> to have both set.
>
> If something is valid for both, you should do as the stmmac_interrupt does:
> if (priv->plat->has_gmac || priv->plat->has_gmac4) ...
>
> The places where it was obvious that the author actually meant
> if (has_gmac || has_gmac4) rather than if (has_gmac) have been updated.
>
> Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
> Acked-by: Alexandre TORGUE <alexandre.torgue@st.com>
Applied.
^ permalink raw reply
* Re: [RESEND PATCH 2/3] net: stmmac: dwmac-generic: add missing compatible strings
From: David Miller @ 2016-12-08 16:35 UTC (permalink / raw)
To: niklas.cassel
Cc: peppe.cavallaro, alexandre.torgue, niklass, netdev, linux-kernel
In-Reply-To: <1481114469-4788-2-git-send-email-niklass@axis.com>
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Wed, 7 Dec 2016 13:41:07 +0100
> From: Niklas Cassel <niklas.cassel@axis.com>
>
> devicetree binding for stmmac states:
> - compatible: Should be "snps,dwmac-<ip_version>", "snps,dwmac"
> For backwards compatibility: "st,spear600-gmac" is also supported.
>
> Since dwmac-generic.c calls stmmac_probe_config_dt explicitly,
> another alternative would have been to remove all compatible strings
> other than "snps,dwmac" and "st,spear600-gmac" from dwmac-generic.c.
>
> However, that would probably do more harm than good, since when trying
> to figure out what hardware a certain driver supports, you usually look
> at the compatible strings in the struct of_device_id, and not in some
> function defined in a completely different file.
>
> No functional change intended.
>
> Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
> Acked-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Applied.
^ permalink raw reply
* Re: [RESEND PATCH 1/3] bindings: net: stmmac: correct note about TSO
From: David Miller @ 2016-12-08 16:35 UTC (permalink / raw)
To: niklas.cassel-VrBV9hrLPhE
Cc: robh+dt-DgEjT+Ai2ygdnm+yROfE0A, mark.rutland-5wv7dgnIgG8,
peppe.cavallaro-qxv4g6HH51o, alexandre.torgue-qxv4g6HH51o,
preid-qgqNFa1JUf/o2iN0hyhwsIdd74u8MsAO,
eric-op+oiCINJLTt9jDmeYuA0g, niklass-VrBV9hrLPhE,
netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1481114469-4788-1-git-send-email-niklass-VrBV9hrLPhE@public.gmane.org>
From: Niklas Cassel <niklas.cassel-VrBV9hrLPhE@public.gmane.org>
Date: Wed, 7 Dec 2016 13:41:06 +0100
> From: Niklas Cassel <niklas.cassel-VrBV9hrLPhE@public.gmane.org>
>
> snps,tso was previously placed under AXI BUS Mode parameters,
> suggesting that the property should be in the stmmac-axi-config node.
>
> TSO (TCP Segmentation Offloading) has nothing to do with AXI BUS Mode
> parameters, and the parser actually expects it to be in the root node,
> not in the stmmac-axi-config.
>
> Also added a note about snps,tso only being available on GMAC4 and newer.
>
> Signed-off-by: Niklas Cassel <niklas.cassel-VrBV9hrLPhE@public.gmane.org>
> Acked-by: Alexandre TORGUE <alexandre.torgue-qxv4g6HH51o@public.gmane.org>
> Acked-by: Rob Herring <robh-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> Acked-by: Giuseppe Cavallaro <peppe.cavallaro-qxv4g6HH51o@public.gmane.org>
Applied.
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH] net: axienet: Utilize of_get_mac_address()
From: David Miller @ 2016-12-08 16:34 UTC (permalink / raw)
To: tklauser; +Cc: netdev, michal.simek, soren.brinkmann, f.fainelli
In-Reply-To: <20161207122529.29508-1-tklauser@distanz.ch>
From: Tobias Klauser <tklauser@distanz.ch>
Date: Wed, 7 Dec 2016 13:25:28 +0100
> Do not open code getting the MAC address exclusively from the
> "local-mac-address" property, but instead use of_get_mac_address()
> which looks up the MAC address using the 3 typical property names.
>
> Also avoid casting away the const qualifier of the return value by
> making axienet_set_mac_address() take a const void* address.
>
> Follows commit b34296a9c047 ("net: ethoc: Utilize
> of_get_mac_address()").
>
> Cc: Florian Fainelli <f.fainelli@gmail.com>
> Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Applied.
^ permalink raw reply
* Re: [PATCH] net: ll_temac: Utilize of_get_mac_address()
From: David Miller @ 2016-12-08 16:34 UTC (permalink / raw)
To: tklauser; +Cc: netdev, michal.simek, soren.brinkmann, f.fainelli
In-Reply-To: <20161207122529.29508-2-tklauser@distanz.ch>
From: Tobias Klauser <tklauser@distanz.ch>
Date: Wed, 7 Dec 2016 13:25:29 +0100
> Do not open code getting the MAC address exclusively from the
> "local-mac-address" property, but instead use of_get_mac_address()
> which looks up the MAC address using the 3 typical property names.
>
> Also avoid casting away the const qualifier of the return value by
> making temac_init_mac_address() take a const void* address.
>
> Follows commit b34296a9c047 ("net: ethoc: Utilize
> of_get_mac_address()").
>
> Cc: Florian Fainelli <f.fainelli@gmail.com>
> Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Applied.
^ permalink raw reply
* Re: [PATCH net-next V3 0/2] net/sched: cls_flower: Add support for matching on dissection flags
From: David Miller @ 2016-12-08 16:33 UTC (permalink / raw)
To: ogerlitz; +Cc: netdev, jiri, roid, hadarh
In-Reply-To: <1481112191-14115-1-git-send-email-ogerlitz@mellanox.com>
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed, 7 Dec 2016 14:03:09 +0200
> This series adds the UAPI to provide a set of flags for matching, where the
> flags provided from user-space are mapped to flow-dissector flags.
>
> The 1st flag allows to match on whether the packet is an
> IP fragment and corresponds to the FLOW_DIS_IS_FRAGMENT flag.
...
> v2->v3:
> - replace BIT() with << (kbuild test robot)
>
> v1->v2:
> - dropped the flow dissector patch (#1) as no changes are needed there (Jiri)
> - applied code review comments from Jiri to the flower patch
Series applied.
^ permalink raw reply
* Re: [patch net-next] net: mvneta: Indent some statements
From: David Miller @ 2016-12-08 16:31 UTC (permalink / raw)
To: dan.carpenter; +Cc: thomas.petazzoni, mw, netdev, kernel-janitors
In-Reply-To: <20161207113217.GE5507@elgon.mountain>
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 7 Dec 2016 14:32:17 +0300
> These two statements were not indented correctly so it's sort of
> confusing.
>
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Applied.
^ permalink raw reply
* Re: fs, net: deadlock between bind/splice on af_unix
From: Dmitry Vyukov @ 2016-12-08 16:30 UTC (permalink / raw)
To: Al Viro, linux-fsdevel@vger.kernel.org, LKML, David Miller,
Rainer Weikusat, Hannes Frederic Sowa, Cong Wang, netdev,
Eric Dumazet
Cc: syzkaller
In-Reply-To: <CACT4Y+Z981V+QLHr=PnQy1Dvxrpp-nCDhQtf+5HuNAusH+Vqxw@mail.gmail.com>
On Thu, Dec 8, 2016 at 3:47 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
> Hello,
>
> I am getting the following deadlock reports while running syzkaller
> fuzzer on 318c8932ddec5c1c26a4af0f3c053784841c598e (Dec 7).
>
>
> [ INFO: possible circular locking dependency detected ]
> 4.9.0-rc8+ #77 Not tainted
> -------------------------------------------------------
> syz-executor0/3155 is trying to acquire lock:
> (&u->bindlock){+.+.+.}, at: [<ffffffff871bca1a>]
> unix_autobind.isra.26+0xca/0x8a0 net/unix/af_unix.c:852
> but task is already holding lock:
> (&pipe->mutex/1){+.+.+.}, at: [< inline >] pipe_lock_nested
> fs/pipe.c:66
> (&pipe->mutex/1){+.+.+.}, at: [<ffffffff81a8ea4b>]
> pipe_lock+0x5b/0x70 fs/pipe.c:74
> which lock already depends on the new lock.
>
> the existing dependency chain (in reverse order) is:
>
> [ 202.103497] [< inline >] validate_chain
> kernel/locking/lockdep.c:2265
> [ 202.103497] [<ffffffff81569576>]
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
> [ 202.103497] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
> [ 202.103497] [< inline >] __mutex_lock_common
> kernel/locking/mutex.c:521
> [ 202.103497] [<ffffffff88195bcf>]
> mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
> [ 202.103497] [< inline >] pipe_lock_nested fs/pipe.c:66
> [ 202.103497] [<ffffffff81a8ea4b>] pipe_lock+0x5b/0x70 fs/pipe.c:74
> [ 202.103497] [<ffffffff81b451f7>]
> iter_file_splice_write+0x267/0xfa0 fs/splice.c:717
> [ 202.103497] [< inline >] do_splice_from fs/splice.c:869
> [ 202.103497] [< inline >] do_splice fs/splice.c:1160
> [ 202.103497] [< inline >] SYSC_splice fs/splice.c:1410
> [ 202.103497] [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0
> fs/splice.c:1393
> [ 202.103497] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
>
> [ 202.103497] [< inline >] validate_chain
> kernel/locking/lockdep.c:2265
> [ 202.103497] [<ffffffff81569576>]
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
> [ 202.103497] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
> [ 202.103497] [< inline >]
> percpu_down_read_preempt_disable include/linux/percpu-rwsem.h:35
> [ 202.103497] [< inline >] percpu_down_read
> include/linux/percpu-rwsem.h:58
> [ 202.103497] [<ffffffff81a7bb33>]
> __sb_start_write+0x193/0x2a0 fs/super.c:1252
> [ 202.103497] [< inline >] sb_start_write
> include/linux/fs.h:1549
> [ 202.103497] [<ffffffff81af9954>] mnt_want_write+0x44/0xb0
> fs/namespace.c:389
> [ 202.103497] [<ffffffff81ab09f6>] filename_create+0x156/0x620
> fs/namei.c:3598
> [ 202.103497] [<ffffffff81ab0ef8>] kern_path_create+0x38/0x50
> fs/namei.c:3644
> [ 202.103497] [< inline >] unix_mknod net/unix/af_unix.c:967
> [ 202.103497] [<ffffffff871c0e11>] unix_bind+0x4d1/0xe60
> net/unix/af_unix.c:1035
> [ 202.103497] [<ffffffff86a76b7e>] SYSC_bind+0x20e/0x4c0
> net/socket.c:1382
> [ 202.103497] [<ffffffff86a7a509>] SyS_bind+0x29/0x30 net/socket.c:1368
> [ 202.103497] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
>
> [ 202.103497] [< inline >] check_prev_add
> kernel/locking/lockdep.c:1828
> [ 202.103497] [<ffffffff8156309b>]
> check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
> [ 202.103497] [< inline >] validate_chain
> kernel/locking/lockdep.c:2265
> [ 202.103497] [<ffffffff81569576>]
> __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
> [ 202.103497] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
> kernel/locking/lockdep.c:3749
> [ 202.103497] [< inline >] __mutex_lock_common
> kernel/locking/mutex.c:521
> [ 202.103497] [<ffffffff88196b82>]
> mutex_lock_interruptible_nested+0x2d2/0x11d0
> kernel/locking/mutex.c:650
> [ 202.103497] [<ffffffff871bca1a>]
> unix_autobind.isra.26+0xca/0x8a0 net/unix/af_unix.c:852
> [ 202.103497] [<ffffffff871c76dd>]
> unix_dgram_sendmsg+0x105d/0x1730 net/unix/af_unix.c:1667
> [ 202.103497] [<ffffffff871c7ea8>]
> unix_seqpacket_sendmsg+0xf8/0x170 net/unix/af_unix.c:2071
> [ 202.103497] [< inline >] sock_sendmsg_nosec net/socket.c:621
> [ 202.103497] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
> net/socket.c:631
> [ 202.103497] [<ffffffff86a7683c>] kernel_sendmsg+0x4c/0x60
> net/socket.c:639
> [ 202.103497] [<ffffffff86a8101d>]
> sock_no_sendpage+0x20d/0x310 net/core/sock.c:2321
> [ 202.103497] [<ffffffff86a74c95>] kernel_sendpage+0x95/0xf0
> net/socket.c:3289
> [ 202.103497] [<ffffffff86a74d92>] sock_sendpage+0xa2/0xd0
> net/socket.c:775
> [ 202.103497] [<ffffffff81b3ee1e>]
> pipe_to_sendpage+0x2ae/0x390 fs/splice.c:469
> [ 202.103497] [< inline >] splice_from_pipe_feed fs/splice.c:520
> [ 202.103497] [<ffffffff81b42f3f>]
> __splice_from_pipe+0x31f/0x750 fs/splice.c:644
> [ 202.103497] [<ffffffff81b4665c>]
> splice_from_pipe+0x1dc/0x300 fs/splice.c:679
> [ 202.103497] [<ffffffff81b467c5>]
> generic_splice_sendpage+0x45/0x60 fs/splice.c:850
> [ 202.103497] [< inline >] do_splice_from fs/splice.c:869
> [ 202.103497] [< inline >] do_splice fs/splice.c:1160
> [ 202.103497] [< inline >] SYSC_splice fs/splice.c:1410
> [ 202.103497] [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0
> fs/splice.c:1393
> [ 202.103497] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
>
> other info that might help us debug this:
>
> Chain exists of:
> Possible unsafe locking scenario:
>
> CPU0 CPU1
> ---- ----
> lock(&pipe->mutex/1);
> lock(sb_writers#5);
> lock(&pipe->mutex/1);
> lock(&u->bindlock);
>
> *** DEADLOCK ***
>
> 1 lock held by syz-executor0/3155:
> #0: (&pipe->mutex/1){+.+.+.}, at: [< inline >]
> pipe_lock_nested fs/pipe.c:66
> #0: (&pipe->mutex/1){+.+.+.}, at: [<ffffffff81a8ea4b>]
> pipe_lock+0x5b/0x70 fs/pipe.c:74
>
> stack backtrace:
> CPU: 3 PID: 3155 Comm: syz-executor0 Not tainted 4.9.0-rc8+ #77
> Hardware name: Google Google/Google, BIOS Google 01/01/2011
> ffff88004b1fe288 ffffffff834c44f9 ffffffff00000003 1ffff1000963fbe4
> ffffed000963fbdc 0000000041b58ab3 ffffffff895816f0 ffffffff834c420b
> 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> Call Trace:
> [< inline >] __dump_stack lib/dump_stack.c:15
> [<ffffffff834c44f9>] dump_stack+0x2ee/0x3f5 lib/dump_stack.c:51
> [<ffffffff81560cb0>] print_circular_bug+0x310/0x3c0
> kernel/locking/lockdep.c:1202
> [< inline >] check_prev_add kernel/locking/lockdep.c:1828
> [<ffffffff8156309b>] check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
> [< inline >] validate_chain kernel/locking/lockdep.c:2265
> [<ffffffff81569576>] __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
> [<ffffffff8156b672>] lock_acquire+0x2a2/0x790 kernel/locking/lockdep.c:3749
> [< inline >] __mutex_lock_common kernel/locking/mutex.c:521
> [<ffffffff88196b82>] mutex_lock_interruptible_nested+0x2d2/0x11d0
> kernel/locking/mutex.c:650
> [<ffffffff871bca1a>] unix_autobind.isra.26+0xca/0x8a0 net/unix/af_unix.c:852
> [<ffffffff871c76dd>] unix_dgram_sendmsg+0x105d/0x1730 net/unix/af_unix.c:1667
> [<ffffffff871c7ea8>] unix_seqpacket_sendmsg+0xf8/0x170 net/unix/af_unix.c:2071
> [< inline >] sock_sendmsg_nosec net/socket.c:621
> [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110 net/socket.c:631
> [<ffffffff86a7683c>] kernel_sendmsg+0x4c/0x60 net/socket.c:639
> [<ffffffff86a8101d>] sock_no_sendpage+0x20d/0x310 net/core/sock.c:2321
> [<ffffffff86a74c95>] kernel_sendpage+0x95/0xf0 net/socket.c:3289
> [<ffffffff86a74d92>] sock_sendpage+0xa2/0xd0 net/socket.c:775
> [<ffffffff81b3ee1e>] pipe_to_sendpage+0x2ae/0x390 fs/splice.c:469
> [< inline >] splice_from_pipe_feed fs/splice.c:520
> [<ffffffff81b42f3f>] __splice_from_pipe+0x31f/0x750 fs/splice.c:644
> [<ffffffff81b4665c>] splice_from_pipe+0x1dc/0x300 fs/splice.c:679
> [<ffffffff81b467c5>] generic_splice_sendpage+0x45/0x60 fs/splice.c:850
> [< inline >] do_splice_from fs/splice.c:869
> [< inline >] do_splice fs/splice.c:1160
> [< inline >] SYSC_splice fs/splice.c:1410
> [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0 fs/splice.c:1393
> [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
Seems to be the same, but detected in the context of the second thread:
[ INFO: possible circular locking dependency detected ]
4.9.0-rc8+ #77 Not tainted
-------------------------------------------------------
syz-executor3/24365 is trying to acquire lock:
(&pipe->mutex/1){+.+.+.}, at: [< inline >] pipe_lock_nested
fs/pipe.c:66
(&pipe->mutex/1){+.+.+.}, at: [<ffffffff81a8ea4b>]
pipe_lock+0x5b/0x70 fs/pipe.c:74
but task is already holding lock:
(sb_writers#5){.+.+.+}, at: [< inline >] file_start_write
include/linux/fs.h:2592
(sb_writers#5){.+.+.+}, at: [< inline >] do_splice fs/splice.c:1159
(sb_writers#5){.+.+.+}, at: [< inline >] SYSC_splice fs/splice.c:1410
(sb_writers#5){.+.+.+}, at: [<ffffffff81b47d9f>]
SyS_splice+0x11af/0x16a0 fs/splice.c:1393
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
[ 131.709013] [< inline >] validate_chain
kernel/locking/lockdep.c:2265
[ 131.709013] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
[ 131.709013] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
[ 131.709013] [< inline >]
percpu_down_read_preempt_disable include/linux/percpu-rwsem.h:35
[ 131.709013] [< inline >] percpu_down_read
include/linux/percpu-rwsem.h:58
[ 131.709013] [<ffffffff81a7bb33>]
__sb_start_write+0x193/0x2a0 fs/super.c:1252
[ 131.709013] [< inline >] sb_start_write
include/linux/fs.h:1549
[ 131.709013] [<ffffffff81af9954>] mnt_want_write+0x44/0xb0
fs/namespace.c:389
[ 131.709013] [<ffffffff81ab09f6>] filename_create+0x156/0x620
fs/namei.c:3598
[ 131.709013] [<ffffffff81ab0ef8>] kern_path_create+0x38/0x50
fs/namei.c:3644
[ 131.709013] [< inline >] unix_mknod net/unix/af_unix.c:967
[ 131.709013] [<ffffffff871c0e11>] unix_bind+0x4d1/0xe60
net/unix/af_unix.c:1035
[ 131.709013] [<ffffffff86a76b7e>] SYSC_bind+0x20e/0x4c0
net/socket.c:1382
[ 131.709013] [<ffffffff86a7a509>] SyS_bind+0x29/0x30 net/socket.c:1368
[ 131.709013] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
[ 131.709013] [< inline >] validate_chain
kernel/locking/lockdep.c:2265
[ 131.709013] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
[ 131.709013] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
[ 131.709013] [< inline >] __mutex_lock_common
kernel/locking/mutex.c:521
[ 131.709013] [<ffffffff88196b82>]
mutex_lock_interruptible_nested+0x2d2/0x11d0
kernel/locking/mutex.c:650
[ 131.709013] [<ffffffff871bca1a>]
unix_autobind.isra.26+0xca/0x8a0 net/unix/af_unix.c:852
[ 131.709013] [<ffffffff871c76dd>]
unix_dgram_sendmsg+0x105d/0x1730 net/unix/af_unix.c:1667
[ 131.709013] [<ffffffff871c7ea8>]
unix_seqpacket_sendmsg+0xf8/0x170 net/unix/af_unix.c:2071
[ 131.709013] [< inline >] sock_sendmsg_nosec net/socket.c:621
[ 131.709013] [<ffffffff86a7618f>] sock_sendmsg+0xcf/0x110
net/socket.c:631
[ 131.709013] [<ffffffff86a7683c>] kernel_sendmsg+0x4c/0x60
net/socket.c:639
[ 131.709013] [<ffffffff86a8101d>]
sock_no_sendpage+0x20d/0x310 net/core/sock.c:2321
[ 131.709013] [<ffffffff86a74c95>] kernel_sendpage+0x95/0xf0
net/socket.c:3289
[ 131.709013] [<ffffffff86a74d92>] sock_sendpage+0xa2/0xd0
net/socket.c:775
[ 131.709013] [<ffffffff81b3ee1e>]
pipe_to_sendpage+0x2ae/0x390 fs/splice.c:469
[ 131.709013] [< inline >] splice_from_pipe_feed fs/splice.c:520
[ 131.709013] [<ffffffff81b42f3f>]
__splice_from_pipe+0x31f/0x750 fs/splice.c:644
[ 131.709013] [<ffffffff81b4665c>]
splice_from_pipe+0x1dc/0x300 fs/splice.c:679
[ 131.709013] [<ffffffff81b467c5>]
generic_splice_sendpage+0x45/0x60 fs/splice.c:850
[ 131.709013] [< inline >] do_splice_from fs/splice.c:869
[ 131.709013] [< inline >] do_splice fs/splice.c:1160
[ 131.709013] [< inline >] SYSC_splice fs/splice.c:1410
[ 131.709013] [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0
fs/splice.c:1393
[ 131.709013] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
[ 131.709013] [< inline >] check_prev_add
kernel/locking/lockdep.c:1828
[ 131.709013] [<ffffffff8156309b>]
check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
[ 131.709013] [< inline >] validate_chain
kernel/locking/lockdep.c:2265
[ 131.709013] [<ffffffff81569576>]
__lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
[ 131.709013] [<ffffffff8156b672>] lock_acquire+0x2a2/0x790
kernel/locking/lockdep.c:3749
[ 131.709013] [< inline >] __mutex_lock_common
kernel/locking/mutex.c:521
[ 131.709013] [<ffffffff88195bcf>]
mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
[ 131.709013] [< inline >] pipe_lock_nested fs/pipe.c:66
[ 131.709013] [<ffffffff81a8ea4b>] pipe_lock+0x5b/0x70 fs/pipe.c:74
[ 131.709013] [<ffffffff81b451f7>]
iter_file_splice_write+0x267/0xfa0 fs/splice.c:717
[ 131.709013] [< inline >] do_splice_from fs/splice.c:869
[ 131.709013] [< inline >] do_splice fs/splice.c:1160
[ 131.709013] [< inline >] SYSC_splice fs/splice.c:1410
[ 131.709013] [<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0
fs/splice.c:1393
[ 131.709013] [<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
other info that might help us debug this:
Chain exists of:
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(sb_writers#5);
lock(&u->bindlock);
lock(sb_writers#5);
lock(&pipe->mutex/1);
*** DEADLOCK ***
1 lock held by syz-executor3/24365:
#0: (sb_writers#5){.+.+.+}, at: [< inline >]
file_start_write include/linux/fs.h:2592
#0: (sb_writers#5){.+.+.+}, at: [< inline >] do_splice
fs/splice.c:1159
#0: (sb_writers#5){.+.+.+}, at: [< inline >] SYSC_splice
fs/splice.c:1410
#0: (sb_writers#5){.+.+.+}, at: [<ffffffff81b47d9f>]
SyS_splice+0x11af/0x16a0 fs/splice.c:1393
stack backtrace:
CPU: 2 PID: 24365 Comm: syz-executor3 Not tainted 4.9.0-rc8+ #77
Hardware name: Google Google/Google, BIOS Google 01/01/2011
ffff8800597b6af8 ffffffff834c44f9 ffffffff00000002 1ffff1000b2f6cf2
ffffed000b2f6cea 0000000041b58ab3 ffffffff895816f0 ffffffff834c420b
0000000041b58ab3 ffffffff894dbca8 ffffffff8155c780 ffff8800597b6878
Call Trace:
[< inline >] __dump_stack lib/dump_stack.c:15
[<ffffffff834c44f9>] dump_stack+0x2ee/0x3f5 lib/dump_stack.c:51
[<ffffffff81560cb0>] print_circular_bug+0x310/0x3c0
kernel/locking/lockdep.c:1202
[< inline >] check_prev_add kernel/locking/lockdep.c:1828
[<ffffffff8156309b>] check_prevs_add+0xaab/0x1c20 kernel/locking/lockdep.c:1938
[< inline >] validate_chain kernel/locking/lockdep.c:2265
[<ffffffff81569576>] __lock_acquire+0x2156/0x3380 kernel/locking/lockdep.c:3338
[<ffffffff8156b672>] lock_acquire+0x2a2/0x790 kernel/locking/lockdep.c:3749
[< inline >] __mutex_lock_common kernel/locking/mutex.c:521
[<ffffffff88195bcf>] mutex_lock_nested+0x23f/0xf20 kernel/locking/mutex.c:621
[< inline >] pipe_lock_nested fs/pipe.c:66
[<ffffffff81a8ea4b>] pipe_lock+0x5b/0x70 fs/pipe.c:74
[<ffffffff81b451f7>] iter_file_splice_write+0x267/0xfa0 fs/splice.c:717
[< inline >] do_splice_from fs/splice.c:869
[< inline >] do_splice fs/splice.c:1160
[< inline >] SYSC_splice fs/splice.c:1410
[<ffffffff81b473c7>] SyS_splice+0x7d7/0x16a0 fs/splice.c:1393
[<ffffffff881a5f85>] entry_SYSCALL_64_fastpath+0x23/0xc6
^ permalink raw reply
* Re: [patch] drivers: net: xgene: uninitialized variable in xgene_enet_free_pagepool()
From: David Miller @ 2016-12-08 16:30 UTC (permalink / raw)
To: dan.carpenter
Cc: isubramanian, kchudgar, netdev, linux-kernel, kernel-janitors
In-Reply-To: <20161207111424.GA5507@elgon.mountain>
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 7 Dec 2016 14:14:24 +0300
> We never set "slots" in this function.
>
> Fixes: a9380b0f7be8 ("drivers: net: xgene: Add support for Jumbo frame")
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Applied.
^ permalink raw reply
* Re: [PATCH 2/2] vhost: remove unnecessary smp_mb from vhost_work_queue
From: David Miller @ 2016-12-08 16:30 UTC (permalink / raw)
To: bergwolf; +Cc: stefanha, kvm, virtualization, netdev
In-Reply-To: <1481104340-77035-2-git-send-email-bergwolf@gmail.com>
From: Peng Tao <bergwolf@gmail.com>
Date: Wed, 7 Dec 2016 17:52:19 +0800
> test_and_set_bit() already implies a memory barrier.
>
> Signed-off-by: Peng Tao <bergwolf@gmail.com>
Applied.
^ permalink raw reply
* Re: [PATCH 1/2] vhost-vsock: remove unused vq variable
From: David Miller @ 2016-12-08 16:30 UTC (permalink / raw)
To: bergwolf; +Cc: stefanha, kvm, virtualization, netdev
In-Reply-To: <1481104340-77035-1-git-send-email-bergwolf@gmail.com>
From: Peng Tao <bergwolf@gmail.com>
Date: Wed, 7 Dec 2016 17:52:18 +0800
> Signed-off-by: Peng Tao <bergwolf@gmail.com>
Applied.
^ permalink raw reply
* Re: [PATCH] sh_eth: add wake-on-lan support via magic packet
From: Sergei Shtylyov @ 2016-12-08 16:29 UTC (permalink / raw)
To: Niklas Söderlund, Simon Horman, netdev, linux-renesas-soc
In-Reply-To: <59a1f246-9062-20e1-3f85-a1c5f6fcfc29@cogentembedded.com>
On 12/08/2016 03:28 PM, Sergei Shtylyov wrote:
> Good to see that somebody cares still about this driver, one more task off
> my back. :-)
>
> On 12/07/2016 07:28 PM, Niklas Söderlund wrote:
>
> You only enable the WOL support for the R-Car gen2 chips but never say that
> explicitly, neither in the subject nor here.
Some patch description wouldn't hurt here, especially with the way you
implemented this support, e.g. RPM vs clk API -- that needs some explanation...
>> Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
>> ---
>> drivers/net/ethernet/renesas/sh_eth.c | 120 +++++++++++++++++++++++++++++++---
>> drivers/net/ethernet/renesas/sh_eth.h | 4 ++
>> 2 files changed, 116 insertions(+), 8 deletions(-)
>> diff --git a/drivers/net/ethernet/renesas/sh_eth.c
>> b/drivers/net/ethernet/renesas/sh_eth.c
>> index 05b0dc5..3974046 100644
>> --- a/drivers/net/ethernet/renesas/sh_eth.c
>> +++ b/drivers/net/ethernet/renesas/sh_eth.c
[...]
>> diff --git a/drivers/net/ethernet/renesas/sh_eth.h
>> b/drivers/net/ethernet/renesas/sh_eth.h
>> index d050f37..26c6620 100644
>> --- a/drivers/net/ethernet/renesas/sh_eth.h
>> +++ b/drivers/net/ethernet/renesas/sh_eth.h
>> @@ -493,6 +493,7 @@ struct sh_eth_cpu_data {
>> unsigned shift_rd0:1; /* shift Rx descriptor word 0 right by 16 */
>> unsigned rmiimode:1; /* EtherC has RMIIMODE register */
>> unsigned rtrate:1; /* EtherC has RTRATE register */
>> + unsigned magic:1; /* EtherC have PMDE in ECMR and MPDIP in ECSIPR */
>
> OK, e.g. RZ/A1 doesn't have these bits...
However, I'd prefer that the comment be reworded as such:
/* EtherC has ECMR.PMDE and ECSR.MPD */
or
/* EtherC has ECMR_PMDE and ECSR_MPD */
MBR, Sergei
^ permalink raw reply
* Re: [PATCH net] phy: Don't increment MDIO bus refcount unless it's a different owner
From: Johan Hovold @ 2016-12-08 16:27 UTC (permalink / raw)
To: Florian Fainelli; +Cc: netdev, johan, rmk+kernel, andrew
In-Reply-To: <20161207045443.26246-1-f.fainelli@gmail.com>
On Tue, Dec 06, 2016 at 08:54:43PM -0800, Florian Fainelli wrote:
> Commit 3e3aaf649416 ("phy: fix mdiobus module safety") fixed the way we
> dealt with MDIO bus module reference count, but sort of introduced a
> regression in that, if an Ethernet driver registers its own MDIO bus
> driver, as is common, we will end up with the Ethernet driver's
> module->refcnt set to 1, thus preventing this driver from any removal.
>
> Fix this by comparing the network device's device driver owner against
> the MDIO bus driver owner, and only if they are different, increment the
> MDIO bus module refcount.
>
> Fixes: 3e3aaf649416 ("phy: fix mdiobus module safety")
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
> Russell,
>
> I verified this against the ethoc driver primarily (on a TS7300 board)
> and bcmgenet.
>
> Thanks!
>
> drivers/net/phy/phy_device.c | 16 +++++++++++++---
> 1 file changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> index 1a4bf8acad78..c4ceb082e970 100644
> --- a/drivers/net/phy/phy_device.c
> +++ b/drivers/net/phy/phy_device.c
> @@ -857,11 +857,17 @@ EXPORT_SYMBOL(phy_attached_print);
> int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
> u32 flags, phy_interface_t interface)
> {
> + struct module *ndev_owner = dev->dev.parent->driver->owner;
Is this really safe? A driver does not need to set a parent device, and
in that case you get a NULL-deref here (I tried using cpsw).
> struct mii_bus *bus = phydev->mdio.bus;
> struct device *d = &phydev->mdio.dev;
> int err;
>
> - if (!try_module_get(bus->owner)) {
> + /* For Ethernet device drivers that register their own MDIO bus, we
> + * will have bus->owner match ndev_mod, so we do not want to increment
You also wanted s/ndev_mod/ndev_owner/ here.
> + * our own module->refcnt here, otherwise we would not be able to
> + * unload later on.
> + */
> + if (ndev_owner != bus->owner && !try_module_get(bus->owner)) {
> dev_err(&dev->dev, "failed to get the bus module\n");
> return -EIO;
Johan
^ permalink raw reply
* Re: [net-next] icmp: correct return value of icmp_rcv()
From: David Miller @ 2016-12-08 16:24 UTC (permalink / raw)
To: zhangshengju; +Cc: netdev
In-Reply-To: <1481093573-5123-1-git-send-email-zhangshengju@cmss.chinamobile.com>
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Wed, 7 Dec 2016 14:52:53 +0800
> Currently, icmp_rcv() always returns zero on a packet delivery upcall.
>
> To make its behavior more compliant with the way this API should be
> used, this patch changes this to let it return NET_RX_SUCCESS when the
> packet is properly handled, and NET_RX_DROP otherwise.
>
> Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Applied.
^ permalink raw reply
* Re: [PATCH] linux/types.h: enable endian checks for all sparse builds
From: Michael S. Tsirkin @ 2016-12-08 16:17 UTC (permalink / raw)
To: Bart Van Assche
Cc: linux-kernel@vger.kernel.org, Linus Torvalds, Christoph Hellwig,
Jason Wang, linux-kbuild@vger.kernel.org, Michal Marek,
Arnd Bergmann, Greg Kroah-Hartman, Matt Mackall, Herbert Xu,
David Airlie, Gerd Hoffmann, Ohad Ben-Cohen,
Christian Borntraeger, Cornelia Huck, James E.J. Bottomley,
David S. Miller, Jens Axboe, Neil Armstrong <narmstron
In-Reply-To: <BLUPR02MB168304F8FBA50C916A6A4E6081840@BLUPR02MB1683.namprd02.prod.outlook.com>
On Thu, Dec 08, 2016 at 06:38:11AM +0000, Bart Van Assche wrote:
> On 12/07/16 21:54, Michael S. Tsirkin wrote:
> > On Thu, Dec 08, 2016 at 05:21:47AM +0000, Bart Van Assche wrote:
> >> Additionally, there are notable exceptions to the rule that most drivers
> >> are endian-clean, e.g. drivers/scsi/qla2xxx. I would appreciate it if it
> >> would remain possible to check such drivers with sparse without enabling
> >> endianness checks. Have you considered to change #ifdef __CHECK_ENDIAN__
> >> into e.g. #ifndef __DONT_CHECK_ENDIAN__?
> >
> > The right thing is probably just to fix these, isn't it?
> > Until then, why not just ignore the warnings?
>
> Neither option is realistic. With endian-checking enabled the qla2xxx
> driver triggers so many warnings that it becomes a real challenge to
> filter the non-endian warnings out manually:
>
> $ for f in "" CF=-D__CHECK_ENDIAN__; do make M=drivers/scsi/qla2xxx C=2\
> $f | &grep -c ': warning:'; done
> 4
> 752
You can always revert this patch in your tree, or whatever. It does not
look like this will get fixed otherwise.
> If you think it would be easy to fix the endian warnings triggered by
> the qla2xxx driver, you are welcome to try to fix these.
>
> Bart.
Yea, this hardware was designed by someone who thought mixing
LE and BE all over the place is a good idea.
But who said it should be easy?
Maybe this change will be enough to motivate the maintainers.
Here's a minor buglet for you as a motivator:
if (ct_rsp->header.response !=
cpu_to_be16(CT_ACCEPT_RESPONSE)) {
ql_dbg(ql_dbg_disc + ql_dbg_buffer, vha, 0x2077,
"%s failed rejected request on port_id: %02x%02x%02x Compeltion status 0x%x, response 0x%x\n",
routine, vha->d_id.b.domain,
vha->d_id.b.area, vha->d_id.b.al_pa, comp_status, ct_rsp->header.response);
response is BE and isn't printed correctly.
another:
eiter->a.max_frame_size = cpu_to_be32(eiter->a.max_frame_size);
size += 4 + 4;
ql_dbg(ql_dbg_disc, vha, 0x20bc,
"Max_Frame_Size = %x.\n", eiter->a.max_frame_size);
printed too late, it's BE by that time.
Here's another suspicious line
ctio24->u.status1.flags = (atio->u.isp24.attr << 9) |
cpu_to_le16(CTIO7_FLAGS_STATUS_MODE_1 |
CTIO7_FLAGS_TERMINATE);
shifting attr by 9 bits gives different results on BE and LE,
mixing it with le16 looks rather strange.
Another:
ha->flags.dport_enabled =
(mid_init_cb->init_cb.firmware_options_1 & BIT_7) != 0;
BIT_7 is native endian, firmware_options_1 is LE I think.
Look at qla27xx_find_valid_image as well.
if (pri_image_status.signature != QLA27XX_IMG_STATUS_SIGN)
qla27xx_image_status seems to be data coming from flash, but is
somehow native-endian? Maybe ...
lun = a->u.isp24.fcp_cmnd.lun;
I think lun here is in hardware format (le?), code treats it
as native.
Not to speak about interface abuse all over the place.
How about this:
uint32_t *
qla24xx_read_flash_data(scsi_qla_host_t *vha, uint32_t *dwptr, uint32_t
faddr,
uint32_t dwords)
{
uint32_t i;
struct qla_hw_data *ha = vha->hw;
/* Dword reads to flash. */
for (i = 0; i < dwords; i++, faddr++)
dwptr[i] = cpu_to_le32(qla24xx_read_flash_dword(ha,
flash_data_addr(ha, faddr)));
return dwptr;
}
OK so we convert to LE ...
qla24xx_read_flash_data(vha, dcode, faddr, 4);
risc_addr = be32_to_cpu(dcode[2]);
*srisc_addr = *srisc_addr == 0 ? risc_addr : *srisc_addr;
risc_size = be32_to_cpu(dcode[3]);
then happily assume it's BE.
And again, coming from flash, it's unlikely to actually be in the native
endian-ness as callers seem to assume. I'm guessing it's all BE.
I poked at it a bit and was able to cut down # of warnings
from 1700 to 1400 in an hour. Someone familiar with the code
should look at it.
--
MST
^ permalink raw reply
* Re: net: deadlock on genl_mutex
From: Dmitry Vyukov @ 2016-12-08 16:16 UTC (permalink / raw)
To: syzkaller
Cc: Eric Dumazet, David Miller, Matti Vaittinen, Tycho Andersen,
Cong Wang, Florian Westphal, stephen hemminger, Tom Herbert,
netdev, LKML, Richard Guy Briggs, netdev-owner
In-Reply-To: <0227d7e83cc5ac0a192d1ba0fee61413@codeaurora.org>
On Tue, Nov 29, 2016 at 6:59 AM, <subashab@codeaurora.org> wrote:
>>
>> Issue was reported yesterday and is under investigation.
>>
>>
>> http://marc.info/?l=linux-netdev&m=148014004331663&w=2
>>
>>
>> Thanks !
>
>
> Hi Dmitry
>
> Can you try the patch below with your reproducer? I haven't seen similar
> crashes reported after this (or even with Eric's patch).
I've synced to 318c8932ddec5c1c26a4af0f3c053784841c598e (Dec 7) and do
_not_ see this report happening anymore.
Thanks.
^ permalink raw reply
* Re: [PATCH net-next] udp: under rx pressure, try to condense skbs
From: Eric Dumazet @ 2016-12-08 16:08 UTC (permalink / raw)
To: Rick Jones; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Paolo Abeni
In-Reply-To: <de654cac-fa3c-a4e1-a24a-27187456b8d7@hpe.com>
On Thu, 2016-12-08 at 07:36 -0800, Rick Jones wrote:
> On 12/08/2016 07:30 AM, Eric Dumazet wrote:
> > On Thu, 2016-12-08 at 10:46 +0100, Jesper Dangaard Brouer wrote:
> >
> >> Hmmm... I'm not thrilled to have such heuristics, that change memory
> >> behavior when half of the queue size (sk->sk_rcvbuf) is reached.
> >
> > Well, copybreak drivers do that unconditionally, even under no stress at
> > all, you really should complain then.
>
> Isn't that behaviour based (in part?) on the observation/belief that it
> is fewer cycles to copy the small packet into a small buffer than to
> send the larger buffer up the stack and have to allocate and map a
> replacement?
If properly done yes ;)
Some drivers do a copybreak, but throw away the original page frag and
reallocate a fresh one anyway.
Like if you have a PAGE_SIZE=65536, it is split in ~32 frags, and
drivers might not bother trying to reuse 1 frag.
^ permalink raw reply
* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
From: Alexander Duyck @ 2016-12-08 16:05 UTC (permalink / raw)
To: David Laight; +Cc: tndave, Jeff Kirsher, intel-wired-lan, Netdev
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DB02382BD@AcuExch.aculab.com>
On Thu, Dec 8, 2016 at 2:43 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Alexander Duyck
>> Sent: 06 December 2016 17:10
> ...
>> I was thinking about it and I realized we can probably simplify this
>> even further. In the case of most other architectures the
>> DMA_ATTR_WEAK_ORDERING has no effect anyway. So from what I can tell
>> there is probably no reason not to just always pass that attribute
>> with the DMA mappings. From what I can tell the only other
>> architecture that uses this is the PowerPC Cell architecture.
>
> And I should have read all the thread :-(
>
>> Also I was wondering if you actually needed to enable this attribute
>> for both Rx and Tx buffers or just Rx buffers? The patch that enabled
>> DMA_ATTR_WEAK_ORDERING for Sparc64 seems to call out writes, but I
>> didn't see anything about reads. I'm just wondering if changing the
>> code for Tx has any effect? If not you could probably drop those
>> changes and just focus on Rx.
>
> 'Weak ordering' only applies to PCIe read transfers, so can only have
> an effect on descriptor reads and transmit buffer reads.
>
> Basically PCIe is a comms protocol and an endpoint (or the host) can
> have multiple outstanding read requests (each of which might generate
> multiple response messages).
> The responses for each request must arrive in order, but responses for
> different requests can be interleaved.
> Setting 'not weak ordering' lets the host interwork with broken endpoints.
> (Or, like we did, you fix the fpga's PCIe implementation.)
I get the basics of relaxed ordering. The question is how does the
Sparc64 IOMMU translate DMA_ATTR_WEAK_ORDERING into relaxed ordering
messages, and at what level the ordering is relaxed. Odds are the
wording in the description where this attribute was added to Sparc is
just awkward, but I was wanting to verify if this only applies to
writes, or also read completions.
> In this case you need the reads of both transmit and receive rings to
> 'overtake' reads of transmit data.
Actually that isn't quite right. With relaxed ordering completions
and writes can pass each other if I recall correctly, but reads will
always force all writes ahead of them to be completed before you can
begin generating the read completions.
> I'm not at all clear how this 'flag' can be set on dma_map().
> It is a property of the PCIe subsystem.
That was where my original question on this came in. We can do a
blanket enable of relaxed ordering for Tx and Rx data buffers, but if
we only need it on Rx then there isn't any need for us to make
unnecessary changes.
- Alex
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox