netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Björn Töpel" <bjorn.topel@gmail.com>
To: bjorn.topel@gmail.com, magnus.karlsson@intel.com,
	alexander.h.duyck@intel.com, alexander.duyck@gmail.com,
	john.fastabend@gmail.com, ast@fb.com, brouer@redhat.com,
	michael.lundkvist@ericsson.com, ravineet.singh@ericsson.com,
	daniel@iogearbox.net, netdev@vger.kernel.org
Cc: jesse.brandeburg@intel.com, anjali.singhai@intel.com,
	rami.rosen@intel.com, jeffrey.b.shaw@intel.com,
	ferruh.yigit@intel.com, qi.z.zhang@intel.com
Subject: [RFC PATCH 05/14] packet: enable Tx support for AF_PACKET V4
Date: Tue, 31 Oct 2017 13:41:36 +0100	[thread overview]
Message-ID: <20171031124145.9667-6-bjorn.topel@gmail.com> (raw)
In-Reply-To: <20171031124145.9667-1-bjorn.topel@gmail.com>

From: Magnus Karlsson <magnus.karlsson@intel.com>

In this commit AF_PACKET V4 egress support is added.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 include/linux/tpacket4.h | 192 +++++++++++++++++++++++++++++++++++++++++++++++
 net/packet/af_packet.c   | 169 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 350 insertions(+), 11 deletions(-)

diff --git a/include/linux/tpacket4.h b/include/linux/tpacket4.h
index 1d4c13d472e5..ac6c721294e8 100644
--- a/include/linux/tpacket4.h
+++ b/include/linux/tpacket4.h
@@ -18,6 +18,8 @@
 #define TP4_UMEM_MIN_FRAME_SIZE 2048
 #define TP4_KERNEL_HEADROOM 256 /* Headrom for XDP */
 
+#define TP4A_FRAME_COMPLETED TP4_DESC_KERNEL
+
 enum tp4_validation {
 	TP4_VALIDATION_NONE,	/* No validation is performed */
 	TP4_VALIDATION_IDX,	/* Only address to packet buffer is validated */
@@ -402,6 +404,60 @@ static inline int tp4q_enqueue_from_array(struct tp4_packet_array *a,
 }
 
 /**
+ * tp4q_enqueue_completed_from_array - Enqueue only completed entries
+ *				       from packet array
+ *
+ * @a: Pointer to the packet array to enqueue from
+ * @dcnt: Max number of entries to enqueue
+ *
+ * Returns the number of leading completed entries enqueued (0 if none),
+ * or -ENOSPC when q->num_free is less than @dcnt.
+ **/
+static inline int tp4q_enqueue_completed_from_array(struct tp4_packet_array *a,
+						    u32 dcnt)
+{
+	struct tp4_queue *q = a->tp4q;
+	unsigned int used_idx = q->used_idx;
+	struct tpacket4_desc *d = a->items;
+	int i, j;
+
+	if (q->num_free < dcnt)
+		return -ENOSPC;
+	/* Copy descriptor payload fields until the first uncompleted entry */
+	for (i = 0; i < dcnt; i++) {
+		unsigned int didx = (a->start + i) & a->mask;
+
+		if (d[didx].flags & TP4A_FRAME_COMPLETED) {
+			unsigned int idx = (used_idx++) & q->ring_mask;
+
+			q->ring[idx].idx = d[didx].idx;
+			q->ring[idx].len = d[didx].len;
+			q->ring[idx].offset = d[didx].offset;
+			q->ring[idx].error = d[didx].error;
+		} else {
+			break;
+		}
+	}
+
+	if (i == 0)
+		return 0;
+
+	/* Make the descriptor fields above visible before the flags below */
+	smp_wmb();
+	/* Publish flags last-to-first with TP4_DESC_KERNEL cleared */
+	for (j = i - 1; j >= 0; j--) {
+		unsigned int idx = (q->used_idx + j) & q->ring_mask;
+		unsigned int didx = (a->start + j) & a->mask;
+
+		q->ring[idx].flags = d[didx].flags & ~TP4_DESC_KERNEL;
+	}
+	q->num_free -= i;
+	q->used_idx += i;
+
+	return i;
+}
+
+/**
  * tp4q_dequeue_to_array - Dequeue entries from tp4 queue to packet array
  *
  * @a: Pointer to the packet array to dequeue from
@@ -581,6 +637,15 @@ static inline struct tpacket4_desc *tp4q_get_desc(struct tp4_frame_set *p)
  **/
 
 /**
+ * tp4f_reset - Rewind the frame set so traversal restarts at its first frame
+ * @p: pointer to frame set
+ **/
+static inline void tp4f_reset(struct tp4_frame_set *p)
+{
+	p->curr = p->start;
+}
+
+/**
  * tp4f_next_frame - Go to next frame in frame set
  * @p: pointer to frame set
  *
@@ -597,6 +662,38 @@ static inline bool tp4f_next_frame(struct tp4_frame_set *p)
 }
 
 /**
+ * tp4f_get_frame_id - Get packet buffer id of frame
+ * @p: pointer to frame set
+ *
+ * Returns the idx field of the descriptor the frame set is currently on
+ **/
+static inline u64 tp4f_get_frame_id(struct tp4_frame_set *p)
+{
+	return p->pkt_arr->items[p->curr & p->pkt_arr->mask].idx;
+}
+
+/**
+ * tp4f_get_frame_len - Get length of data in current frame
+ * @p: pointer to frame set
+ *
+ * Returns the len field of the descriptor the frame set is currently on
+ **/
+static inline u32 tp4f_get_frame_len(struct tp4_frame_set *p)
+{
+	return p->pkt_arr->items[p->curr & p->pkt_arr->mask].len;
+}
+
+/**
+ * tp4f_set_error - Set an error on the current frame
+ * @p: pointer to frame set
+ * @errno: value stored unmodified in the current descriptor's error field
+ **/
+static inline void tp4f_set_error(struct tp4_frame_set *p, int errno)
+{
+	p->pkt_arr->items[p->curr & p->pkt_arr->mask].error = errno;
+}
+
+/**
  * tp4f_get_data - Gets a pointer to the frame the frame set is on
  * @p: pointer to the frame set
  *
@@ -627,6 +724,48 @@ static inline void tp4f_set_frame(struct tp4_frame_set *p, u32 len, u16 offset,
 		d->flags |= TP4_PKT_CONT;
 }
 
+/*************** PACKET OPERATIONS *******************************/
+/* A packet consists of one or more frames. Both frames and packets
+ * are represented by a tp4_frame_set. The only difference is that
+ * packet functions look at the EOP flag.
+ **/
+
+/**
+ * tp4f_get_packet_len - Total data length of a packet
+ * @p: pointer to packet
+ *
+ * Rewinds @p to its first frame, then walks every frame summing the lengths.
+ * Returns the packet length in bytes.
+ **/
+static inline u32 tp4f_get_packet_len(struct tp4_frame_set *p)
+{
+	u32 total;
+
+	tp4f_reset(p);
+	total = tp4f_get_frame_len(p);
+
+	while (tp4f_next_frame(p))
+		total += tp4f_get_frame_len(p);
+
+	return total;
+}
+
+/**
+ * tp4f_packet_completed - Flag every frame of a packet as completed
+ * @p: pointer to packet
+ *
+ * Rewinds @p first, so the walk always starts at the packet's first frame.
+ **/
+static inline void tp4f_packet_completed(struct tp4_frame_set *p)
+{
+	struct tp4_packet_array *arr = p->pkt_arr;
+
+	tp4f_reset(p);
+	do {
+		arr->items[p->curr & arr->mask].flags |= TP4A_FRAME_COMPLETED;
+	} while (tp4f_next_frame(p));
+}
+
 /**************** PACKET_ARRAY FUNCTIONS ********************************/
 
 static inline struct tp4_packet_array *__tp4a_new(
@@ -815,6 +954,59 @@ static inline unsigned int tp4a_max_data_size(struct tp4_packet_array *a)
 }
 
 /**
+ * tp4a_next_packet - Get next packet in array and advance curr pointer
+ * @a: pointer to packet array
+ * @p: supplied pointer to packet structure that is filled in by function
+ *
+ * Returns true if a complete packet was found (described by *p), else false.
+ **/
+static inline bool tp4a_next_packet(struct tp4_packet_array *a,
+				    struct tp4_frame_set *p)
+{
+	u32 avail = a->end - a->curr;
+
+	if (avail == 0)
+		return false; /* empty */
+
+	p->pkt_arr = a;
+	p->start = a->curr;
+	p->curr = a->curr;
+	p->end = a->curr;
+
+	/* XXX Sanity check for too-many-frames packets? */
+	while (a->items[p->end++ & a->mask].flags & TP4_PKT_CONT) {
+		avail--;
+		if (avail == 0)
+			return false; /* packet incomplete, a->curr untouched */
+	}
+	/* Consume all frames of the packet, including the final one */
+	a->curr += (p->end - p->start);
+	return true;
+}
+
+/**
+ * tp4a_flush_completed - Flushes only frames marked as completed
+ * @a: pointer to packet array
+ *
+ * Returns 0 for success and -1 for failure
+ **/
+static inline int tp4a_flush_completed(struct tp4_packet_array *a)
+{
+	u32 avail = a->curr - a->start;
+	int ret;
+
+	if (avail == 0)
+		return 0; /* nothing to flush */
+
+	ret = tp4q_enqueue_completed_from_array(a, avail);
+	if (ret < 0)
+		return -1;
+
+	a->start += ret;
+	return 0;
+}
+
+/**
  * tp4a_populate - Populate an array with packets from associated tp4q
  * @a: pointer to packet array
  **/
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 830d97ff4358..444eb4834362 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2462,6 +2462,28 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	goto drop_n_restore;
 }
 
+/* skb destructor for V4 Tx: mark the frame completed and flush it. */
+static void packet_v4_destruct_skb(struct sk_buff *skb)
+{
+	struct packet_sock *po = pkt_sk(skb->sk);
+
+	if (likely(po->tx_ring.pg_vec)) {
+		/* Go via unsigned long: a pointer may be narrower than u64 */
+		u64 idx = (unsigned long)skb_shinfo(skb)->destructor_arg;
+		struct tp4_frame_set p = { .start = idx, .curr = idx,
+					   .end = idx + 1, .pkt_arr = po->tx_ring.tp4a };
+
+		spin_lock(&po->sk.sk_write_queue.lock);
+		tp4f_packet_completed(&p);
+		WARN_ON_ONCE(tp4a_flush_completed(po->tx_ring.tp4a));
+		spin_unlock(&po->sk.sk_write_queue.lock);
+
+		packet_dec_pending(&po->tx_ring);
+	}
+
+	sock_wfree(skb);
+}
+
 static void tpacket_destruct_skb(struct sk_buff *skb)
 {
 	struct packet_sock *po = pkt_sk(skb->sk);
@@ -2519,24 +2541,24 @@ static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
 }
 
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
-		void *frame, struct net_device *dev, void *data, int tp_len,
+		void *dtor_arg, struct net_device *dev, void *data, int tp_len,
 		__be16 proto, unsigned char *addr, int hlen, int copylen,
 		const struct sockcm_cookie *sockc)
 {
-	union tpacket_uhdr ph;
 	int to_write, offset, len, nr_frags, len_max;
 	struct socket *sock = po->sk.sk_socket;
 	struct page *page;
 	int err;
 
-	ph.raw = frame;
-
 	skb->protocol = proto;
 	skb->dev = dev;
 	skb->priority = po->sk.sk_priority;
 	skb->mark = po->sk.sk_mark;
-	sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
-	skb_shinfo(skb)->destructor_arg = ph.raw;
+	if (sockc) {
+		sock_tx_timestamp(&po->sk, sockc->tsflags,
+				  &skb_shinfo(skb)->tx_flags);
+	}
+	skb_shinfo(skb)->destructor_arg = dtor_arg;
 
 	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
@@ -2840,6 +2862,126 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	return err;
 }
 
+/*
+ * packet_v4_snd - send the packets queued on an AF_PACKET V4 Tx ring.
+ * A packet that fails gets its errno stored in its frame, which is then
+ * flushed as completed. Returns -EINVAL if msg carries a destination
+ * address, otherwise 0.
+ */
+static int packet_v4_snd(struct packet_sock *po, struct msghdr *msg)
+{
+	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
+	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
+	struct packet_ring_buffer *rb = &po->tx_ring;
+	int err = 0, dlen, size_max, hlen, tlen;
+	struct tp4_frame_set p;
+	struct net_device *dev;
+	struct sk_buff *skb;
+	unsigned char *addr;
+	bool has_packet;
+	__be16 proto;
+	void *data;
+
+	/* Not supported yet; bail out before pg_vec_lock is taken */
+	if (unlikely(saddr)) {
+		pr_warn("packet v4 not implemented!\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&po->pg_vec_lock);
+
+	dev = packet_cached_dev_get(po);
+	proto = po->num;
+	addr = NULL;
+
+	err = -ENXIO;
+	if (unlikely(!dev))
+		goto out;
+	err = -ENETDOWN;
+	if (unlikely(!(dev->flags & IFF_UP)))
+		goto out_put;
+
+	size_max = tp4a_max_data_size(rb->tp4a);
+	if (size_max > dev->mtu + dev->hard_header_len + VLAN_HLEN)
+		size_max = dev->mtu + dev->hard_header_len + VLAN_HLEN;
+
+	spin_lock_bh(&po->sk.sk_write_queue.lock);
+	tp4a_populate(rb->tp4a);
+	spin_unlock_bh(&po->sk.sk_write_queue.lock);
+
+	do {
+		spin_lock_bh(&po->sk.sk_write_queue.lock);
+		has_packet = tp4a_next_packet(rb->tp4a, &p);
+		spin_unlock_bh(&po->sk.sk_write_queue.lock);
+
+		if (!has_packet) {
+			if (need_wait && need_resched()) {
+				schedule();
+				continue;
+			}
+			break;
+		}
+
+		dlen = tp4f_get_packet_len(&p);
+		data = tp4f_get_data(&p);
+		hlen = LL_RESERVED_SPACE(dev);
+		tlen = dev->needed_tailroom;
+		skb = sock_alloc_send_skb(&po->sk, hlen + tlen +
+					  sizeof(struct sockaddr_ll),
+					  !need_wait, &err);
+		if (unlikely(!skb)) {
+			err = -EAGAIN;
+			goto out_err;
+		}
+
+		dlen = tpacket_fill_skb(po, skb,
+					(void *)(long)tp4f_get_frame_id(&p),
+					dev, data, dlen, proto, addr, hlen,
+					dev->hard_header_len, NULL);
+		if (likely(dlen >= 0) &&
+		    dlen > dev->mtu + dev->hard_header_len &&
+		    !packet_extra_vlan_len_allowed(dev, skb)) {
+			dlen = -EMSGSIZE;
+		}
+		if (unlikely(dlen < 0)) {
+			err = dlen;
+			goto out_err;
+		}
+
+		skb->destructor = packet_v4_destruct_skb;
+		packet_inc_pending(&po->tx_ring);
+		err = po->xmit(skb);
+		/* Ignore NET_XMIT_CN as packet might have been sent */
+		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
+			err = -EAGAIN;
+			packet_dec_pending(&po->tx_ring);
+			skb = NULL;
+			goto out_err;
+		}
+	} while (!err ||
+		/* Note: packet_read_pending() might be slow if we have
+		 * to call it as it's per_cpu variable, but in fast-path
+		 * we already short-circuit the loop with the first
+		 * condition, and luckily don't have to go that path
+		 * anyway.
+		 */
+		 (need_wait && packet_read_pending(&po->tx_ring)));
+
+	goto out_put;
+out_err:
+	spin_lock_bh(&po->sk.sk_write_queue.lock);
+	tp4f_set_error(&p, -err);
+	tp4f_packet_completed(&p);
+	WARN_ON_ONCE(tp4a_flush_completed(rb->tp4a));
+	spin_unlock_bh(&po->sk.sk_write_queue.lock);
+	kfree_skb(skb);
+out_put:
+	dev_put(dev);
+out:
+	mutex_unlock(&po->pg_vec_lock);
+	return 0;
+}
+
 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
 				        size_t reserve, size_t len,
 				        size_t linear, int noblock,
@@ -3015,10 +3157,10 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	struct packet_sock *po = pkt_sk(sk);
 
 	if (po->tx_ring.pg_vec) {
-		if (po->tp_version == TPACKET_V4)
-			return -EINVAL;
+		if (po->tp_version != TPACKET_V4)
+			return tpacket_snd(po, msg);
 
-		return tpacket_snd(po, msg);
+		return packet_v4_snd(po, msg);
 	}
 
 	return packet_snd(sock, msg, len);
@@ -4329,9 +4471,14 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
 		po->pressure = 0;
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
 	spin_lock_bh(&sk->sk_write_queue.lock);
-	if (po->tx_ring.pg_vec && po->tp_version != TPACKET_V4) {
-		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
+	if (po->tx_ring.pg_vec) {
+		if (po->tp_version == TPACKET_V4) {
+			if (tp4q_nb_avail(&po->tx_ring.tp4q, 1))
+				mask |= POLLOUT | POLLWRNORM;
+		} else if (packet_current_frame(po, &po->tx_ring,
+					 TP_STATUS_AVAILABLE)) {
 			mask |= POLLOUT | POLLWRNORM;
+		}
 	}
 	spin_unlock_bh(&sk->sk_write_queue.lock);
 	return mask;
-- 
2.11.0

  parent reply	other threads:[~2017-10-31 12:42 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-10-31 12:41 [RFC PATCH 00/14] Introducing AF_PACKET V4 support Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 01/14] packet: introduce AF_PACKET V4 userspace API Björn Töpel
2017-11-02  1:45   ` Willem de Bruijn
2017-11-02 10:06     ` Björn Töpel
2017-11-02 16:40       ` Tushar Dave
2017-11-02 16:47         ` Björn Töpel
2017-11-03  2:29       ` Willem de Bruijn
2017-11-03  9:54         ` Björn Töpel
2017-11-15 22:21           ` chet l
2017-11-16 16:53             ` Jesper Dangaard Brouer
2017-11-17  3:32               ` chetan L
2017-11-15 22:34   ` chet l
2017-11-16  1:44     ` David Miller
2017-11-16 19:32       ` chetan L
2017-10-31 12:41 ` [RFC PATCH 02/14] packet: implement PACKET_MEMREG setsockopt Björn Töpel
2017-11-03  3:00   ` Willem de Bruijn
2017-11-03  9:57     ` Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 03/14] packet: enable AF_PACKET V4 rings Björn Töpel
2017-11-03  4:16   ` Willem de Bruijn
2017-11-03 10:02     ` Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 04/14] packet: enable Rx for AF_PACKET V4 Björn Töpel
2017-10-31 12:41 ` Björn Töpel [this message]
2017-10-31 12:41 ` [RFC PATCH 06/14] netdevice: add AF_PACKET V4 zerocopy ops Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 07/14] packet: wire up zerocopy for AF_PACKET V4 Björn Töpel
2017-11-03  3:17   ` Willem de Bruijn
2017-11-03 10:47     ` Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 08/14] i40e: AF_PACKET V4 ndo_tp4_zerocopy Rx support Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 09/14] i40e: AF_PACKET V4 ndo_tp4_zerocopy Tx support Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 10/14] samples/tpacket4: added tpbench Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 11/14] veth: added support for PACKET_ZEROCOPY Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 12/14] samples/tpacket4: added veth support Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 13/14] i40e: added XDP support for TP4 enabled queue pairs Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 14/14] xdp: introducing XDP_PASS_TO_KERNEL for PACKET_ZEROCOPY use Björn Töpel
2017-11-03  4:34 ` [RFC PATCH 00/14] Introducing AF_PACKET V4 support Willem de Bruijn
2017-11-03 10:13   ` Karlsson, Magnus
2017-11-03 13:55     ` Willem de Bruijn
2017-11-13 13:07 ` Björn Töpel
2017-11-13 14:34   ` John Fastabend
2017-11-13 23:50   ` Alexei Starovoitov
2017-11-14  5:33     ` Björn Töpel
2017-11-14  7:02       ` John Fastabend
2017-11-14 12:20         ` Willem de Bruijn
2017-11-16  2:55           ` Alexei Starovoitov
2017-11-16  3:35             ` Willem de Bruijn
2017-11-16  7:09               ` Björn Töpel
2017-11-16  8:26                 ` Jesper Dangaard Brouer
2017-11-14 17:19   ` [RFC PATCH 00/14] Introducing AF_PACKET V4 support (AF_XDP or AF_CHANNEL?) Jesper Dangaard Brouer
2017-11-14 19:01     ` Björn Töpel
2017-11-16  8:00       ` Jesper Dangaard Brouer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20171031124145.9667-6-bjorn.topel@gmail.com \
    --to=bjorn.topel@gmail.com \
    --cc=alexander.duyck@gmail.com \
    --cc=alexander.h.duyck@intel.com \
    --cc=anjali.singhai@intel.com \
    --cc=ast@fb.com \
    --cc=brouer@redhat.com \
    --cc=daniel@iogearbox.net \
    --cc=ferruh.yigit@intel.com \
    --cc=jeffrey.b.shaw@intel.com \
    --cc=jesse.brandeburg@intel.com \
    --cc=john.fastabend@gmail.com \
    --cc=magnus.karlsson@intel.com \
    --cc=michael.lundkvist@ericsson.com \
    --cc=netdev@vger.kernel.org \
    --cc=qi.z.zhang@intel.com \
    --cc=rami.rosen@intel.com \
    --cc=ravineet.singh@ericsson.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).