* [PATCH net] packet: check for ndo_select_queue during queue selection
@ 2014-02-13 17:18 Daniel Borkmann
  2014-02-14  5:40 ` David Miller
  0 siblings, 1 reply; 5+ messages in thread
From: Daniel Borkmann @ 2014-02-13 17:18 UTC (permalink / raw)
  To: davem; +Cc: mathias.kretschmer, netdev, Jesper Dangaard Brouer
Mathias reported that on an AMD Geode LX embedded board (ALiX)
with ath9k driver PACKET_QDISC_BYPASS triggers a WARN_ON()
coming from the driver itself via 066dae93bdf ("ath9k: rework
tx queue selection and fix queue stopping/waking"). The reason
why this happened is that ndo_select_queue() call is not
invoked from direct xmit path i.e. for ieee80211 subsystem
that sets queue and TID (similar to 802.1d tag) which is being
put into the frame through 802.11e (WMM, QoS). If that is not
set, pending frame counter for e.g. ath9k can get messed up.
So the WARN_ON() in ath9k is absolutely legitimate. Generally,
the hw queue selection in ieee80211 depends on the type of
traffic, and priorities are set according to ieee80211_ac_numbers
mapping; working in a similar way as DiffServ only on a lower
layer, so that the AP can favour frames that have "real-time"
requirements like voice or video data frames. We were evaluating
this and concluded that it's best for PACKET_QDISC_BYPASS to
just probe for existance of this function and invoke it if
available so that drivers that implement it can make an
informed decision to which hw queue the frame should be
dispatched to. Only few drivers implement this function, so
for the majority of drivers nothing changes. The original
pktgen scenario for which PACKET_QDISC_BYPASS was designed
for is still intact with this change anyway. We think
restricting this feature only to ethernet drivers would be an
artificial barrier where we would instead only need to call
ndo_select_queue(). Besides, checking for ARPHRD_ETHER as pktgen
does wouldn't be sufficient as also many ieee80211 drivers
use it as dev->type. If side-effects that we documented are
undesired or use case involves buffering, then people just go
with the normal dev_queue_xmit() that is on default enabled.
Reported-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
---
 Intended for 3.14.
 include/linux/netdevice.h | 20 ++++++++++++++++++++
 net/core/flow_dissector.c | 13 +------------
 net/packet/af_packet.c    | 20 ++++++++++++++++----
 3 files changed, 37 insertions(+), 16 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 440a02e..5f8c860 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2276,6 +2276,26 @@ static inline void netdev_reset_queue(struct net_device *dev_queue)
 }
 
 /**
+ * 	netdev_cap_txqueue - check if selected tx queue exceeds device queues
+ * 	@dev: network device
+ * 	@queue_index: given tx queue index
+ *
+ * 	Returns 0 if given tx queue index >= number of device tx queues,
+ * 	otherwise returns the originally passed tx queue index.
+ */
+static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index)
+{
+	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
+		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
+				     dev->name, queue_index,
+				     dev->real_num_tx_queues);
+		return 0;
+	}
+
+	return queue_index;
+}
+
+/**
  *	netif_running - test if up
  *	@dev: network device
  *
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 87577d4..d0e1fb1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -323,17 +323,6 @@ u32 __skb_get_poff(const struct sk_buff *skb)
 	return poff;
 }
 
-static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
-{
-	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
-		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
-				     dev->name, queue_index,
-				     dev->real_num_tx_queues);
-		return 0;
-	}
-	return queue_index;
-}
-
 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
@@ -409,7 +398,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 			queue_index = __netdev_pick_tx(dev, skb);
 
 		if (!accel_priv)
-			queue_index = dev_cap_txqueue(dev, queue_index);
+			queue_index = netdev_cap_txqueue(dev, queue_index);
 	}
 
 	skb_set_queue_mapping(skb, queue_index);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 6a2bb37..562ced9 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -308,9 +308,19 @@ static bool packet_use_direct_xmit(const struct packet_sock *po)
 	return po->xmit == packet_direct_xmit;
 }
 
-static u16 packet_pick_tx_queue(struct net_device *dev)
+static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
 {
-	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
+	const struct net_device_ops *ops = dev->netdev_ops;
+	u16 queue_index;
+
+	if (ops->ndo_select_queue) {
+		queue_index = ops->ndo_select_queue(dev, skb, NULL);
+		queue_index = netdev_cap_txqueue(dev, queue_index);
+	} else {
+		queue_index = raw_smp_processor_id() % dev->real_num_tx_queues;
+	}
+
+	skb_set_queue_mapping(skb, queue_index);
 }
 
 /* register_prot_hook must be invoked with the po->bind_lock held,
@@ -2285,7 +2295,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			}
 		}
 
-		skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
+		packet_pick_tx_queue(dev, skb);
+
 		skb->destructor = tpacket_destruct_skb;
 		__packet_set_status(po, ph, TP_STATUS_SENDING);
 		packet_inc_pending(&po->tx_ring);
@@ -2499,7 +2510,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	skb->dev = dev;
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
-	skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
+
+	packet_pick_tx_queue(dev, skb);
 
 	if (po->has_vnet_hdr) {
 		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-- 
1.7.11.7
^ permalink raw reply related	[flat|nested] 5+ messages in thread
* Re: [PATCH net] packet: check for ndo_select_queue during queue selection
  2014-02-13 17:18 [PATCH net] packet: check for ndo_select_queue during queue selection Daniel Borkmann
@ 2014-02-14  5:40 ` David Miller
  2014-02-14 10:21   ` Daniel Borkmann
  0 siblings, 1 reply; 5+ messages in thread
From: David Miller @ 2014-02-14  5:40 UTC (permalink / raw)
  To: dborkman; +Cc: mathias.kretschmer, netdev, brouer
From: Daniel Borkmann <dborkman@redhat.com>
Date: Thu, 13 Feb 2014 18:18:55 +0100
> The original pktgen scenario for which PACKET_QDISC_BYPASS was
> designed for is still intact with this change anyway. We think
I guess you have no intention of using this feature on bnx2x, ixgbe,
or mlx4 devices then?  That covers a rather large component of gigabit
ethernet devices out there, doesn't it?
They both hook up an ndo_select_queue method.
^ permalink raw reply	[flat|nested] 5+ messages in thread
* Re: [PATCH net] packet: check for ndo_select_queue during queue selection
  2014-02-14  5:40 ` David Miller
@ 2014-02-14 10:21   ` Daniel Borkmann
  2014-02-14 18:48     ` David Miller
  0 siblings, 1 reply; 5+ messages in thread
From: Daniel Borkmann @ 2014-02-14 10:21 UTC (permalink / raw)
  To: David Miller; +Cc: mathias.kretschmer, netdev, brouer
On 02/14/2014 06:40 AM, David Miller wrote:
> From: Daniel Borkmann <dborkman@redhat.com>
> Date: Thu, 13 Feb 2014 18:18:55 +0100
>
>> The original pktgen scenario for which PACKET_QDISC_BYPASS was
>> designed for is still intact with this change anyway. We think
>
> I guess you have no intention of using this feature on bnx2x, ixgbe,
> or mlx4 devices then?  That covers a rather large component of gigabit
> ethernet devices out there, doesn't it?
>
> They both hook up an ndo_select_queue method.
Yes, it's suboptimal :/ The other two possibilities I see is 1) to check
for dev->ieee80211_ptr on setsockopt(2) and bind(2) (which could happen
_after_ the option was set) and just disallow that option for those cases
(doesn't make sense to use it for ieee80211 devs anyway, and they seem to
be the only special-cased users), or, preferably 2) to just leave the
code as it currently is, that is, similar to pktgen.
^ permalink raw reply	[flat|nested] 5+ messages in thread
* Re: [PATCH net] packet: check for ndo_select_queue during queue selection
  2014-02-14 10:21   ` Daniel Borkmann
@ 2014-02-14 18:48     ` David Miller
  2014-02-14 21:03       ` Daniel Borkmann
  0 siblings, 1 reply; 5+ messages in thread
From: David Miller @ 2014-02-14 18:48 UTC (permalink / raw)
  To: dborkman; +Cc: mathias.kretschmer, netdev, brouer
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 14 Feb 2014 11:21:22 +0100
> The other two possibilities I see is 1) to check for
> dev->ieee80211_ptr on setsockopt(2) and bind(2) (which could happen
> _after_ the option was set) and just disallow that option for those
> cases (doesn't make sense to use it for ieee80211 devs anyway, and
> they seem to be the only special-cased users), or, preferably 2) to
> just leave the code as it currently is, that is, similar to pktgen.
Don't sell yourself so short, think out of the box:
typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb);
 ...
	u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
				void *accel_priv, select_queue_fallback_t fallback);
And your packet code would do something like:
static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}
static u16 packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	u16 queue_index;
	if (ops->ndo_select_queue)
		queue_index = ops->ndo_select_queue(dev, skb, NULL, __packet_pick_tx_queue);
	else
		queue_index = __packet_pick_tx_queue(dev, skb);
	skb_set_queue_mapping(skb, queue_index);
}
Then you go through the existing ndo_select_queue implementations, you'll notice
a lot of them are of the form:
	if (special_stuff)
		queue_index = whatever;
	else
		queue_index = __netdev_pick_tx(...);
replace that __netdev_pick_tx call with one to "fallback".
^ permalink raw reply	[flat|nested] 5+ messages in thread
* Re: [PATCH net] packet: check for ndo_select_queue during queue selection
  2014-02-14 18:48     ` David Miller
@ 2014-02-14 21:03       ` Daniel Borkmann
  0 siblings, 0 replies; 5+ messages in thread
From: Daniel Borkmann @ 2014-02-14 21:03 UTC (permalink / raw)
  To: David Miller; +Cc: mathias.kretschmer, netdev, brouer
Thanks for the feedback Dave, will fix that up tomorrow.
^ permalink raw reply	[flat|nested] 5+ messages in thread
end of thread, other threads:[~2014-02-14 21:03 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-02-13 17:18 [PATCH net] packet: check for ndo_select_queue during queue selection Daniel Borkmann
2014-02-14  5:40 ` David Miller
2014-02-14 10:21   ` Daniel Borkmann
2014-02-14 18:48     ` David Miller
2014-02-14 21:03       ` Daniel Borkmann
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).