From: Xin Long <lucien.xin@gmail.com>
To: network dev <netdev@vger.kernel.org>
Cc: davem@davemloft.net, kuba@kernel.org,
Eric Dumazet <edumazet@google.com>,
Paolo Abeni <pabeni@redhat.com>, David Ahern <dsahern@gmail.com>,
Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>,
Pravin B Shelar <pshelar@ovn.org>,
Jamal Hadi Salim <jhs@mojatatu.com>,
Cong Wang <xiyou.wangcong@gmail.com>,
Jiri Pirko <jiri@resnulli.us>,
Pablo Neira Ayuso <pablo@netfilter.org>,
Florian Westphal <fw@strlen.de>,
Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>,
Ilya Maximets <i.maximets@ovn.org>,
Aaron Conole <aconole@redhat.com>,
Roopa Prabhu <roopa@nvidia.com>,
Nikolay Aleksandrov <razor@blackwall.org>,
Mahesh Bandewar <maheshb@google.com>,
Paul Moore <paul@paul-moore.com>,
Guillaume Nault <gnault@redhat.com>
Subject: [PATCHv4 net-next 10/10] net: add support for ipv4 big tcp
Date: Sat, 28 Jan 2023 10:58:39 -0500 [thread overview]
Message-ID: <637aa55b8dbf0c85c2ee8892df26a8bbbf9f2ef5.1674921359.git.lucien.xin@gmail.com> (raw)
In-Reply-To: <cover.1674921359.git.lucien.xin@gmail.com>
Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP.
Firstly, allow sk->sk_gso_max_size to be set to a value greater than
GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size()
for IPv4 TCP sockets.
Then on TX path, set IP header tot_len to 0 when skb->len > IP_MAX_MTU
in __ip_local_out() to allow to send BIG TCP packets, and this implies
that skb->len is the length of a IPv4 packet; On RX path, use skb->len
as the length of the IPv4 packet when the IP header tot_len is 0 and
skb->len > IP_MAX_MTU in ip_rcv_core(). As the API iph_set_totlen() and
skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only
need to update these APIs.
Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allows
the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In
GRO complete, set IP header tot_len to 0 when the merged packet size
greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed
on RX path.
Note that by checking skb_is_gso_tcp() in API iph_totlen(), it makes
this implementation safe to use iph->len == 0 indicates IPv4 BIG TCP
packets.
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
net/core/gro.c | 12 +++++++-----
net/core/sock.c | 26 ++++++++++++++------------
net/ipv4/af_inet.c | 7 ++++---
net/ipv4/ip_input.c | 2 +-
net/ipv4/ip_output.c | 2 +-
5 files changed, 27 insertions(+), 22 deletions(-)
diff --git a/net/core/gro.c b/net/core/gro.c
index 506f83d715f8..b15f85546bdd 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
struct sk_buff *lp;
int segs;
- /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
- gro_max_size = READ_ONCE(p->dev->gro_max_size);
+ /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
+ gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
+ READ_ONCE(p->dev->gro_max_size) :
+ READ_ONCE(p->dev->gro_ipv4_max_size);
if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
return -E2BIG;
if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
- if (p->protocol != htons(ETH_P_IPV6) ||
- skb_headroom(p) < sizeof(struct hop_jumbo_hdr) ||
- ipv6_hdr(p)->nexthdr != IPPROTO_TCP ||
+ if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
+ (p->protocol == htons(ETH_P_IPV6) &&
+ skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
p->encapsulation)
return -E2BIG;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 7ba4891460ad..f08b76acde9b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2373,17 +2373,22 @@ void sk_free_unlock_clone(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
-static void sk_trim_gso_size(struct sock *sk)
+static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
{
- if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
- return;
+ bool is_ipv6 = false;
+ u32 max_size;
+
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6 &&
- sk_is_tcp(sk) &&
- !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
- return;
+ is_ipv6 = (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
#endif
- sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
+ /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
+ max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
+ READ_ONCE(dst->dev->gso_ipv4_max_size);
+ if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
+ max_size = GSO_LEGACY_MAX_SIZE;
+
+ return max_size - (MAX_TCP_HEADER + 1);
}
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
@@ -2403,10 +2408,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
- /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
- sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
- sk_trim_gso_size(sk);
- sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
+ sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6c0ec2789943..2f992a323b95 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
if (unlikely(ip_fast_csum((u8 *)iph, 5)))
goto out;
+ NAPI_GRO_CB(skb)->proto = proto;
id = ntohl(*(__be32 *)&iph->id);
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16;
@@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
- __be16 newlen = htons(skb->len - nhoff);
struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
const struct net_offload *ops;
+ __be16 totlen = iph->tot_len;
int proto = iph->protocol;
int err = -ENOSYS;
@@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
skb_set_inner_network_header(skb, nhoff);
}
- csum_replace2(&iph->check, iph->tot_len, newlen);
- iph->tot_len = newlen;
+ iph_set_totlen(iph, skb->len - nhoff);
+ csum_replace2(&iph->check, totlen, iph->tot_len);
ops = rcu_dereference(inet_offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e880ce77322a..fe9ead9ee863 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto csum_error;
- len = ntohs(iph->tot_len);
+ len = iph_totlen(skb, iph);
if (skb->len < len) {
drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 922c87ef1ab5..4e4e308c3230 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
- iph->tot_len = htons(skb->len);
+ iph_set_totlen(iph, skb->len);
ip_send_check(iph);
/* if egress device is enslaved to an L3 master device pass the
--
2.31.1
next prev parent reply other threads:[~2023-01-28 15:59 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-01-28 15:58 [PATCHv4 net-next 00/10] net: support ipv4 big tcp Xin Long
2023-01-28 15:58 ` [PATCHv4 net-next 01/10] net: add a couple of helpers for iph tot_len Xin Long
2023-02-01 15:31 ` David Ahern
2023-01-28 15:58 ` [PATCHv4 net-next 02/10] bridge: use skb_ip_totlen in br netfilter Xin Long
2023-01-31 15:01 ` Nikolay Aleksandrov
2023-01-28 15:58 ` [PATCHv4 net-next 03/10] openvswitch: use skb_ip_totlen in conntrack Xin Long
2023-02-01 13:29 ` Aaron Conole
2023-01-28 15:58 ` [PATCHv4 net-next 04/10] net: sched: use skb_ip_totlen and iph_totlen Xin Long
2023-01-28 15:58 ` [PATCHv4 net-next 05/10] netfilter: " Xin Long
2023-01-28 15:58 ` [PATCHv4 net-next 06/10] cipso_ipv4: use iph_set_totlen in skbuff_setattr Xin Long
2023-01-28 15:58 ` [PATCHv4 net-next 07/10] ipvlan: use skb_ip_totlen in ipvlan_get_L3_hdr Xin Long
2023-01-28 15:58 ` [PATCHv4 net-next 08/10] packet: add TP_STATUS_GSO_TCP for tp_status Xin Long
2023-02-01 15:32 ` David Ahern
2023-01-28 15:58 ` [PATCHv4 net-next 09/10] net: add gso_ipv4_max_size and gro_ipv4_max_size per device Xin Long
2023-01-31 14:59 ` Paolo Abeni
2023-01-31 17:55 ` Xin Long
2023-02-01 15:36 ` David Ahern
2023-01-28 15:58 ` Xin Long [this message]
2023-02-01 15:38 ` [PATCHv4 net-next 10/10] net: add support for ipv4 big tcp David Ahern
2023-02-02 9:24 ` [PATCHv4 net-next 10/10] net: add support for ipv4 big tcp: manual merge Matthieu Baerts
2023-02-01 8:53 ` [PATCHv4 net-next 00/10] net: support ipv4 big tcp Eric Dumazet
2023-02-01 15:39 ` David Ahern
2023-02-02 5:10 ` patchwork-bot+netdevbpf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=637aa55b8dbf0c85c2ee8892df26a8bbbf9f2ef5.1674921359.git.lucien.xin@gmail.com \
--to=lucien.xin@gmail.com \
--cc=aconole@redhat.com \
--cc=davem@davemloft.net \
--cc=dsahern@gmail.com \
--cc=edumazet@google.com \
--cc=fw@strlen.de \
--cc=gnault@redhat.com \
--cc=i.maximets@ovn.org \
--cc=jhs@mojatatu.com \
--cc=jiri@resnulli.us \
--cc=kuba@kernel.org \
--cc=maheshb@google.com \
--cc=marcelo.leitner@gmail.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=pablo@netfilter.org \
--cc=paul@paul-moore.com \
--cc=pshelar@ovn.org \
--cc=razor@blackwall.org \
--cc=roopa@nvidia.com \
--cc=xiyou.wangcong@gmail.com \
--cc=yoshfuji@linux-ipv6.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).