From: "Leşe Doru Călin" <lesedorucalin01@gmail.com>
To: Paolo Abeni <pabeni@redhat.com>, netdev@vger.kernel.org
Cc: David Miller <davem@davemloft.net>,
Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>,
Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Subject: [PATCH v4] net: UDP repair mode for retrieving the send queue of corked UDP socket
Date: Thu, 16 Apr 2020 16:22:42 +0300 [thread overview]
Message-ID: <20200416132242.GA2586@white> (raw)
In this year's edition of GSoC, there is a project idea for CRIU to add
support for checkpoint/restore of corked UDP sockets. To add it, however,
the kernel API needs to be extended.
This is what this patch does. It adds a "repair mode" for UDP sockets,
similar in approach to the TCP "repair mode", except that only the send
queue needs to be retrieved. To that end the patch extends the recv and
setsockopt syscalls. Using the UDP_REPAIR option of setsockopt, the caller
can put the socket into repair mode. While it is set, recv/recvfrom/recvmsg
return the contents of the write queue and the destination of the data. As
in TCP repair mode, changing the repair mode requires the CAP_NET_ADMIN
capability, and the caller must pass the MSG_PEEK flag to receive data.
Signed-off-by: Lese Doru Calin <lesedorucalin01@gmail.com>
---
include/linux/udp.h | 3 +
include/net/udp.h | 3 +
include/uapi/linux/udp.h | 1
net/ipv4/udp.c | 85 +++++++++++++++++++++++++++++++++++++++--------
net/ipv6/udp.c | 64 ++++++++++++++++++++++++-----------
5 files changed, 122 insertions(+), 34 deletions(-)
diff --git a/include/linux/udp.h b/include/linux/udp.h
index aa84597bdc33..b22bd70118ce 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -51,7 +51,8 @@ struct udp_sock {
* different encapsulation layer set
* this
*/
- gro_enabled:1; /* Can accept GRO packets */
+ gro_enabled:1, /* Can accept GRO packets */
+ repair:1;/* Receive the send queue */
/*
* Following member retains the information to create a UDP header
* when the socket is uncorked.
diff --git a/include/net/udp.h b/include/net/udp.h
index a8fa6c0c6ded..f7a7fab0712f 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -282,8 +282,11 @@ int udp_get_port(struct sock *sk, unsigned short snum,
int udp_err(struct sk_buff *, u32);
int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
+int udp_peek_sndq(struct sock *sk, struct msghdr *msg, int off, int len);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 4828794efcf8..2fe78329d6da 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -29,6 +29,7 @@ struct udphdr {
/* UDP socket options */
#define UDP_CORK 1 /* Never send partially complete segments */
+#define UDP_REPAIR 19 /* Receive the send queue */
#define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */
#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */
#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 32564b350823..58b59db42ca3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1720,6 +1720,48 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
}
EXPORT_SYMBOL(__skb_recv_udp);
+/*
+ * Copy up to @len bytes from the skbs on sk->sk_write_queue into @msg
+ * without removing anything from the queue.  The first @off bytes of every
+ * queued skb are skipped.  The socket lock is held for the whole walk.
+ *
+ * Returns the number of bytes copied, or the negative error reported by
+ * skb_copy_datagram_msg().
+ *
+ * NOTE(review): the same @off is applied to every skb on the queue; confirm
+ * that each queued skb (not only the first one) really starts with @off
+ * bytes of headers, otherwise payload of the later skbs is dropped.
+ */
+int udp_peek_sndq(struct sock *sk, struct msghdr *msg, int off, int len)
+{
+	int copy, avail, copied = 0, err = 0;
+	struct sk_buff *skb;
+
+	lock_sock(sk);
+	skb_queue_walk(&sk->sk_write_queue, skb) {
+		/* Also covers len <= 0: nothing (more) was asked for. */
+		if (copied >= len)
+			break;
+
+		/*
+		 * skb->len is unsigned: do the subtraction in int so that an
+		 * skb shorter than @off yields a non-positive value instead
+		 * of wrapping around to a huge length.
+		 */
+		avail = (int)skb->len - off;
+		if (avail <= 0)
+			continue;
+
+		copy = len - copied;
+		if (copy > avail)
+			copy = avail;
+
+		err = skb_copy_datagram_msg(skb, off, msg, copy);
+		if (err)
+			break;
+
+		copied += copy;
+	}
+	release_sock(sk);
+	return err ?: copied;
+}
+EXPORT_SYMBOL(udp_peek_sndq);
+
+/*
+ * Fill the buffer behind msg->msg_name, if there is one, with an AF_INET
+ * address built from @addr and @port, store its size in *@addr_len and,
+ * when cgroup BPF is enabled, run the UDP4 recvmsg hook on it.
+ *
+ * @addr and @port are stored into sin_addr.s_addr and sin_port without any
+ * conversion, so they are typed as network byte order values.
+ * Does nothing when msg->msg_name is NULL.
+ */
+static void udp_set_source_addr(struct sock *sk, struct msghdr *msg,
+				int *addr_len, __be32 addr, __be16 port)
+{
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+	if (!sin)
+		return;
+
+	sin->sin_family = AF_INET;
+	sin->sin_port = port;
+	sin->sin_addr.s_addr = addr;
+	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	*addr_len = sizeof(*sin);
+
+	if (cgroup_bpf_enabled)
+		BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+						(struct sockaddr *)sin);
+}
+
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
@@ -1729,8 +1771,9 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
- DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct udp_sock *up = udp_sk(sk);
struct sk_buff *skb;
+ struct flowi4 *fl4;
unsigned int ulen, copied;
int off, err, peeking = flags & MSG_PEEK;
int is_udplite = IS_UDPLITE(sk);
@@ -1739,6 +1782,17 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
if (flags & MSG_ERRQUEUE)
return ip_recv_error(sk, msg, len, addr_len);
+ if (unlikely(up->repair)) {
+ if (!peeking)
+ return -EPERM;
+
+ off = sizeof(struct iphdr) + sizeof(struct udphdr);
+ fl4 = &inet->cork.fl.u.ip4;
+ udp_set_source_addr(sk, msg, addr_len, fl4->daddr,
+ fl4->fl4_dport);
+ return udp_peek_sndq(sk, msg, off, len);
+ }
+
try_again:
off = sk_peek_offset(sk, flags);
skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
@@ -1793,19 +1847,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
UDP_MIB_INDATAGRAMS, is_udplite);
sock_recv_ts_and_drops(msg, sk, skb);
-
- /* Copy the address. */
- if (sin) {
- sin->sin_family = AF_INET;
- sin->sin_port = udp_hdr(skb)->source;
- sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- *addr_len = sizeof(*sin);
-
- if (cgroup_bpf_enabled)
- BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
- (struct sockaddr *)sin);
- }
+ udp_set_source_addr(sk, msg, addr_len, ip_hdr(skb)->saddr,
+ udp_hdr(skb)->source);
if (udp_sk(sk)->gro_enabled)
udp_cmsg_recv(msg, sk, skb);
@@ -1833,6 +1876,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
msg->msg_flags &= ~MSG_TRUNC;
goto try_again;
}
+EXPORT_SYMBOL(udp_recvmsg);
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -2557,6 +2601,15 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
}
break;
+ case UDP_REPAIR:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ err = -EPERM;
+ else if (val != 0)
+ up->repair = 1;
+ else
+ up->repair = 0;
+ break;
+
case UDP_ENCAP:
switch (val) {
case 0:
@@ -2678,6 +2731,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = up->corkflag;
break;
+ case UDP_REPAIR:
+ val = up->repair;
+ break;
+
case UDP_ENCAP:
val = up->encap_type;
break;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 7d4151747340..be2a668c29c5 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -250,6 +250,24 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
#endif
+/*
+ * Fill the buffer behind msg->msg_name, if there is one, with an AF_INET6
+ * address built from @addr, @port and @scope_id (flowinfo is always 0),
+ * store its size in *@addr_len and, when cgroup BPF is enabled, run the
+ * UDP6 recvmsg hook on it.
+ *
+ * @port is stored into sin6_port without conversion, so it is typed as a
+ * network byte order value.  @addr is only read.
+ * Does nothing when msg->msg_name is NULL, like the IPv4 counterpart
+ * udp_set_source_addr().
+ */
+static void udpv6_set_source_addr(struct sock *sk, struct msghdr *msg,
+				  int *addr_len, const struct in6_addr *addr,
+				  __be16 port, u32 scope_id)
+{
+	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+
+	if (!sin6)
+		return;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_port = port;
+	sin6->sin6_flowinfo = 0;
+	sin6->sin6_addr = *addr;
+	sin6->sin6_scope_id = scope_id;
+	*addr_len = sizeof(*sin6);
+
+	if (cgroup_bpf_enabled)
+		BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+						(struct sockaddr *)sin6);
+}
+
/* do not use the scratch area len for jumbogram: their length execeeds the
* scratch area space; note that the IP6CB flags is still in the first
* cacheline, so checking for jumbograms is cheap
@@ -269,8 +287,11 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct inet_sock *inet = inet_sk(sk);
+ struct udp_sock *up = udp_sk(sk);
+ struct in6_addr saddr;
struct sk_buff *skb;
- unsigned int ulen, copied;
+ struct flowi6 *fl6;
+ unsigned int ulen, scpid, copied;
int off, err, peeking = flags & MSG_PEEK;
int is_udplite = IS_UDPLITE(sk);
struct udp_mib __percpu *mib;
@@ -283,6 +304,23 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (np->rxpmtu && np->rxopt.bits.rxpmtu)
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
+ if (unlikely(up->repair)) {
+ if (!peeking)
+ return -EPERM;
+
+ if (up->pending == AF_INET)
+ return udp_recvmsg(sk, msg, len, noblock,
+ flags, addr_len);
+
+ off = sizeof(struct ipv6hdr) + sizeof(struct udphdr);
+ if (msg->msg_name) {
+ fl6 = &inet->cork.fl.u.ip6;
+ udpv6_set_source_addr(sk, msg, addr_len, &fl6->daddr,
+ fl6->fl6_dport, fl6->flowi6_oif);
+ }
+ return udp_peek_sndq(sk, msg, off, len);
+ }
+
try_again:
off = sk_peek_offset(sk, flags);
skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
@@ -336,28 +374,16 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
sock_recv_ts_and_drops(msg, sk, skb);
- /* Copy the address. */
if (msg->msg_name) {
- DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
- sin6->sin6_family = AF_INET6;
- sin6->sin6_port = udp_hdr(skb)->source;
- sin6->sin6_flowinfo = 0;
-
if (is_udp4) {
- ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
- &sin6->sin6_addr);
- sin6->sin6_scope_id = 0;
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &saddr);
+ scpid = 0;
} else {
- sin6->sin6_addr = ipv6_hdr(skb)->saddr;
- sin6->sin6_scope_id =
- ipv6_iface_scope_id(&sin6->sin6_addr,
- inet6_iif(skb));
+ saddr = ipv6_hdr(skb)->saddr;
+ scpid = ipv6_iface_scope_id(&saddr, inet6_iif(skb));
}
- *addr_len = sizeof(*sin6);
-
- if (cgroup_bpf_enabled)
- BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
- (struct sockaddr *)sin6);
+ udpv6_set_source_addr(sk, msg, addr_len, &saddr,
+ udp_hdr(skb)->source, scpid);
}
if (udp_sk(sk)->gro_enabled)
next reply other threads:[~2020-04-16 13:24 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-04-16 13:22 Leşe Doru Călin [this message]
2020-04-16 13:38 ` [PATCH v4] net: UDP repair mode for retrieving the send queue of corked UDP socket Leşe Doru Călin
2020-04-22 10:04 ` Paolo Abeni
2020-04-21 20:06 ` David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200416132242.GA2586@white \
--to=lesedorucalin01@gmail.com \
--cc=davem@davemloft.net \
--cc=kuznet@ms2.inr.ac.ru \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=yoshfuji@linux-ipv6.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.