From: "Björn Töpel" <bjorn.topel@gmail.com>
To: bjorn.topel@gmail.com, magnus.karlsson@intel.com,
alexander.h.duyck@intel.com, alexander.duyck@gmail.com,
john.fastabend@gmail.com, ast@fb.com, brouer@redhat.com,
michael.lundkvist@ericsson.com, ravineet.singh@ericsson.com,
daniel@iogearbox.net, netdev@vger.kernel.org
Cc: "Björn Töpel" <bjorn.topel@intel.com>,
jesse.brandeburg@intel.com, anjali.singhai@intel.com,
rami.rosen@intel.com, jeffrey.b.shaw@intel.com,
ferruh.yigit@intel.com, qi.z.zhang@intel.com
Subject: [RFC PATCH 02/14] packet: implement PACKET_MEMREG setsockopt
Date: Tue, 31 Oct 2017 13:41:33 +0100 [thread overview]
Message-ID: <20171031124145.9667-3-bjorn.topel@gmail.com> (raw)
In-Reply-To: <20171031124145.9667-1-bjorn.topel@gmail.com>
From: Björn Töpel <bjorn.topel@intel.com>
Here, the PACKET_MEMREG setsockopt is implemented for the AF_PACKET
protocol family. PACKET_MEMREG allows the user to register memory
regions that can be used by AF_PACKET V4 as packet data buffers.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
include/linux/tpacket4.h | 101 +++++++++++++++++++++++++++++
net/packet/af_packet.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++
net/packet/internal.h | 4 ++
3 files changed, 268 insertions(+)
create mode 100644 include/linux/tpacket4.h
diff --git a/include/linux/tpacket4.h b/include/linux/tpacket4.h
new file mode 100644
index 000000000000..fcf4c333c78d
--- /dev/null
+++ b/include/linux/tpacket4.h
@@ -0,0 +1,101 @@
+/*
+ * tpacket v4
+ * Copyright(c) 2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_TPACKET4_H
+#define _LINUX_TPACKET4_H
+
+#define TP4_UMEM_MIN_FRAME_SIZE 2048
+#define TP4_KERNEL_HEADROOM 256 /* Headroom for XDP */
+
+/*
+ * struct tp4_umem - descriptor of a user memory region that has been
+ * registered as packet buffer memory and divided into equally sized frames.
+ */
+struct tp4_umem {
+ struct pid *pid; /* pid reference of the task that created the umem */
+ struct page **pgs; /* array of the pages backing the region */
+ unsigned int npgs; /* number of valid entries in pgs */
+ size_t size; /* size of the region as passed at creation */
+ unsigned long address; /* start address of the region, page aligned */
+ unsigned int frame_size; /* size of one frame, a power of two */
+ unsigned int frame_size_log2; /* ilog2(frame_size) */
+ unsigned int nframes; /* size / frame_size */
+ unsigned int nfpplog2; /* num frames per page in log2 */
+ unsigned int data_headroom; /* requested headroom, rounded up to 64 */
+};
+
+/*************** V4 QUEUE OPERATIONS *******************************/
+
+/**
+ * tp4q_umem_new - Creates a new umem (packet buffer)
+ *
+ * @addr: The address to the umem, must be page aligned
+ * @size: The size of the umem, must hold at least one frame
+ * @frame_size: The size of each frame, a power of two between
+ *		TP4_UMEM_MIN_FRAME_SIZE and PAGE_SIZE
+ * @data_headroom: The desired data headroom before start of the packet,
+ *		   rounded up to a multiple of 64
+ *
+ * Validates the arguments and allocates the descriptor. A reference to
+ * the pid of the calling task is taken and stored in the umem; the
+ * caller is responsible for dropping it with put_pid().
+ *
+ * Returns a pointer to the new umem, or an ERR_PTR() encoded errno on
+ * failure (-EINVAL for bad arguments, -ENOMEM if allocation fails).
+ * Never returns NULL.
+ **/
+static inline struct tp4_umem *tp4q_umem_new(unsigned long addr, size_t size,
+					     unsigned int frame_size,
+					     unsigned int data_headroom)
+{
+	struct tp4_umem *umem;
+	size_t nframes;
+
+	if (frame_size < TP4_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+		/* Strictly speaking we could support this, if:
+		 * - huge pages, or
+		 * - using an IOMMU, or
+		 * - making sure the memory area is consecutive
+		 * but for now, we simply say "computer says no".
+		 */
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!is_power_of_2(frame_size))
+		return ERR_PTR(-EINVAL);
+
+	if (!PAGE_ALIGNED(addr)) {
+		/* Memory area has to be page size aligned. For
+		 * simplicity, this might change.
+		 */
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Reject regions that wrap around the address space. */
+	if ((addr + size) < addr)
+		return ERR_PTR(-EINVAL);
+
+	/* The frame count is stored in an unsigned int; refuse to
+	 * silently truncate it.
+	 */
+	nframes = size / frame_size;
+	if (nframes == 0 || nframes > UINT_MAX)
+		return ERR_PTR(-EINVAL);
+
+	/* All operands are unsigned, so "frame_size - headroom < 0" can
+	 * never be true. Compare against the available room instead.
+	 * frame_size is a power of two >= TP4_UMEM_MIN_FRAME_SIZE, hence
+	 * the limit is a multiple of 64 and a value that passes the check
+	 * still passes after being rounded up (and cannot wrap in ALIGN).
+	 */
+	if (data_headroom > frame_size - TP4_KERNEL_HEADROOM)
+		return ERR_PTR(-EINVAL);
+
+	data_headroom = ALIGN(data_headroom, 64);
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	umem->pid = get_task_pid(current, PIDTYPE_PID);
+	umem->size = size;
+	umem->address = addr;
+	umem->frame_size = frame_size;
+	umem->frame_size_log2 = ilog2(frame_size);
+	umem->nframes = nframes;
+	umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size);
+	umem->data_headroom = data_headroom;
+
+	return umem;
+}
+
+#endif /* _LINUX_TPACKET4_H */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 9603f6ff17a4..b39be424ec0e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -89,11 +89,15 @@
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
+#include <linux/log2.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
#include "internal.h"
@@ -2975,6 +2979,132 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
return packet_snd(sock, msg, len);
}
+/* Mark every page held in umem->pgs dirty, drop the reference on it,
+ * then free the page array itself and clear the pointer.
+ */
+static void
+packet_umem_unpin_pages(struct tp4_umem *umem)
+{
+	unsigned int idx = 0;
+
+	while (idx < umem->npgs) {
+		struct page *pg = umem->pgs[idx++];
+
+		set_page_dirty_lock(pg);
+		put_page(pg);
+	}
+
+	kfree(umem->pgs);
+	umem->pgs = NULL;
+}
+
+/* Release a umem: unpin its pages, drop the pid reference, give the
+ * pinned page count back to the owning mm (if the task and its mm still
+ * exist) and free the descriptor.
+ */
+static void
+packet_umem_free(struct tp4_umem *umem)
+{
+	/* Uncharge exactly the number of pages that were pinned. Using
+	 * umem->size >> PAGE_SHIFT under-counts whenever size is not a
+	 * multiple of PAGE_SIZE, which would leak pinned_vm accounting.
+	 */
+	unsigned long npinned = umem->npgs;
+	struct mm_struct *mm;
+	struct task_struct *task;
+
+	packet_umem_unpin_pages(umem);
+
+	task = get_pid_task(umem->pid, PIDTYPE_PID);
+	put_pid(umem->pid);
+	if (!task)
+		goto out;
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	down_write(&mm->mmap_sem);
+	mm->pinned_vm -= npinned;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+out:
+	kfree(umem);
+}
+
+/* Create a umem for the given user memory region and pin its pages.
+ *
+ * The arguments are validated by tp4q_umem_new(). The pages covering
+ * all frames are then pinned with get_user_pages() and charged to the
+ * pinned_vm counter of the calling task's mm, subject to
+ * RLIMIT_MEMLOCK unless the caller has CAP_IPC_LOCK.
+ *
+ * Returns the new umem, or an ERR_PTR() encoded errno on failure. On
+ * failure no pages stay pinned and pinned_vm is left unchanged.
+ */
+static struct tp4_umem *
+packet_umem_new(unsigned long addr, size_t size, unsigned int frame_size,
+		unsigned int data_headroom)
+{
+	unsigned long lock_limit, locked, npages;
+	unsigned int gup_flags = FOLL_WRITE;
+	int need_release = 0, i, ret;
+	struct page **page_list;
+	struct tp4_umem *umem;
+	unsigned int j = 0;
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	umem = tp4q_umem_new(addr, size, frame_size, data_headroom);
+	if (IS_ERR(umem))
+		return umem;
+
+	/* One scratch page used as a batch buffer for get_user_pages(). */
+	page_list = (struct page **)__get_free_page(GFP_KERNEL);
+	if (!page_list) {
+		put_pid(umem->pid);
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Widen before multiplying; the product of two unsigned ints may
+	 * not fit in 32 bits.
+	 */
+	npages = PAGE_ALIGN((size_t)umem->nframes * umem->frame_size);
+	npages >>= PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	locked = npages + current->mm->pinned_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* umem->npgs is an unsigned int, so the page count must fit. */
+	if (npages == 0 || npages > UINT_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	umem->pgs = kcalloc(npages, sizeof(*umem->pgs), GFP_KERNEL);
+	if (!umem->pgs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	need_release = 1;
+	while (npages) {
+		/* Pin at most as many pages as page_list has slots for. */
+		ret = get_user_pages(addr,
+				     min_t(unsigned long, npages,
+					   PAGE_SIZE / sizeof(struct page *)),
+				     gup_flags, page_list, NULL);
+
+		if (ret < 0)
+			goto out;
+
+		/* No progress would spin forever with mmap_sem held. */
+		if (ret == 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		umem->npgs += ret;
+		addr += ret * PAGE_SIZE;
+		npages -= ret;
+
+		for (i = 0; i < ret; i++)
+			umem->pgs[j++] = page_list[i];
+	}
+
+	ret = 0;
+
+out:
+	if (ret < 0) {
+		/* Drops the pages pinned so far (umem->npgs of them). */
+		if (need_release)
+			packet_umem_unpin_pages(umem);
+		put_pid(umem->pid);
+		kfree(umem);
+	} else {
+		current->mm->pinned_vm = locked;
+	}
+
+	up_write(&current->mm->mmap_sem);
+	free_page((unsigned long)page_list);
+
+	return ret < 0 ? ERR_PTR(ret) : umem;
+}
+
/*
* Close a PACKET socket. This is fairly simple. We immediately go
* to 'closed' state and remove our protocol entry in the device list.
@@ -3024,6 +3154,11 @@ static int packet_release(struct socket *sock)
packet_set_ring(sk, &req_u, 1, 1);
}
+ if (po->umem) {
+ packet_umem_free(po->umem);
+ po->umem = NULL;
+ }
+
f = fanout_release(sk);
synchronize_net();
@@ -3828,6 +3963,31 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
return 0;
}
+ case PACKET_MEMREG:
+ {
+ struct tpacket_memreg_req req;
+ struct tp4_umem *umem;
+
+ if (optlen < sizeof(req))
+ return -EINVAL;
+ if (copy_from_user(&req, optval, sizeof(req)))
+ return -EFAULT;
+
+ umem = packet_umem_new(req.addr, req.len, req.frame_size,
+ req.data_headroom);
+ if (IS_ERR(umem))
+ return PTR_ERR(umem);
+
+ lock_sock(sk);
+ if (po->umem) {
+ release_sock(sk);
+ packet_umem_free(umem);
+ return -EBUSY;
+ }
+ po->umem = umem;
+ release_sock(sk);
+ return 0;
+ }
default:
return -ENOPROTOOPT;
}
@@ -4245,6 +4405,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
case TPACKET_V3:
po->tp_hdrlen = TPACKET3_HDRLEN;
break;
+ default:
+ err = -EINVAL;
+ goto out;
}
err = -EINVAL;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 94d1d405a116..9c07cfe1b8a3 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -2,6 +2,7 @@
#define __PACKET_INTERNAL_H__
#include <linux/refcount.h>
+#include <linux/tpacket4.h>
struct packet_mclist {
struct packet_mclist *next;
@@ -109,6 +110,9 @@ struct packet_sock {
union tpacket_stats_u stats;
struct packet_ring_buffer rx_ring;
struct packet_ring_buffer tx_ring;
+
+ struct tp4_umem *umem;
+
int copy_thresh;
spinlock_t bind_lock;
struct mutex pg_vec_lock;
--
2.11.0
next prev parent reply other threads:[~2017-10-31 12:42 UTC|newest]
Thread overview: 49+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-10-31 12:41 [RFC PATCH 00/14] Introducing AF_PACKET V4 support Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 01/14] packet: introduce AF_PACKET V4 userspace API Björn Töpel
2017-11-02 1:45 ` Willem de Bruijn
2017-11-02 10:06 ` Björn Töpel
2017-11-02 16:40 ` Tushar Dave
2017-11-02 16:47 ` Björn Töpel
2017-11-03 2:29 ` Willem de Bruijn
2017-11-03 9:54 ` Björn Töpel
2017-11-15 22:21 ` chet l
2017-11-16 16:53 ` Jesper Dangaard Brouer
2017-11-17 3:32 ` chetan L
2017-11-15 22:34 ` chet l
2017-11-16 1:44 ` David Miller
2017-11-16 19:32 ` chetan L
2017-10-31 12:41 ` Björn Töpel [this message]
2017-11-03 3:00 ` [RFC PATCH 02/14] packet: implement PACKET_MEMREG setsockopt Willem de Bruijn
2017-11-03 9:57 ` Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 03/14] packet: enable AF_PACKET V4 rings Björn Töpel
2017-11-03 4:16 ` Willem de Bruijn
2017-11-03 10:02 ` Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 04/14] packet: enable Rx for AF_PACKET V4 Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 05/14] packet: enable Tx support " Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 06/14] netdevice: add AF_PACKET V4 zerocopy ops Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 07/14] packet: wire up zerocopy for AF_PACKET V4 Björn Töpel
2017-11-03 3:17 ` Willem de Bruijn
2017-11-03 10:47 ` Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 08/14] i40e: AF_PACKET V4 ndo_tp4_zerocopy Rx support Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 09/14] i40e: AF_PACKET V4 ndo_tp4_zerocopy Tx support Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 10/14] samples/tpacket4: added tpbench Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 11/14] veth: added support for PACKET_ZEROCOPY Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 12/14] samples/tpacket4: added veth support Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 13/14] i40e: added XDP support for TP4 enabled queue pairs Björn Töpel
2017-10-31 12:41 ` [RFC PATCH 14/14] xdp: introducing XDP_PASS_TO_KERNEL for PACKET_ZEROCOPY use Björn Töpel
2017-11-03 4:34 ` [RFC PATCH 00/14] Introducing AF_PACKET V4 support Willem de Bruijn
2017-11-03 10:13 ` Karlsson, Magnus
2017-11-03 13:55 ` Willem de Bruijn
2017-11-13 13:07 ` Björn Töpel
2017-11-13 14:34 ` John Fastabend
2017-11-13 23:50 ` Alexei Starovoitov
2017-11-14 5:33 ` Björn Töpel
2017-11-14 7:02 ` John Fastabend
2017-11-14 12:20 ` Willem de Bruijn
2017-11-16 2:55 ` Alexei Starovoitov
2017-11-16 3:35 ` Willem de Bruijn
2017-11-16 7:09 ` Björn Töpel
2017-11-16 8:26 ` Jesper Dangaard Brouer
2017-11-14 17:19 ` [RFC PATCH 00/14] Introducing AF_PACKET V4 support (AF_XDP or AF_CHANNEL?) Jesper Dangaard Brouer
2017-11-14 19:01 ` Björn Töpel
2017-11-16 8:00 ` Jesper Dangaard Brouer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20171031124145.9667-3-bjorn.topel@gmail.com \
--to=bjorn.topel@gmail.com \
--cc=alexander.duyck@gmail.com \
--cc=alexander.h.duyck@intel.com \
--cc=anjali.singhai@intel.com \
--cc=ast@fb.com \
--cc=bjorn.topel@intel.com \
--cc=brouer@redhat.com \
--cc=daniel@iogearbox.net \
--cc=ferruh.yigit@intel.com \
--cc=jeffrey.b.shaw@intel.com \
--cc=jesse.brandeburg@intel.com \
--cc=john.fastabend@gmail.com \
--cc=magnus.karlsson@intel.com \
--cc=michael.lundkvist@ericsson.com \
--cc=netdev@vger.kernel.org \
--cc=qi.z.zhang@intel.com \
--cc=rami.rosen@intel.com \
--cc=ravineet.singh@ericsson.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).