* [PATCH 2/2] v2 GRE: Add segmentation offload for GRE
From: Pravin B Shelar @ 2013-01-24 22:16 UTC
To: netdev; +Cc: jesse, eric.dumazet, Pravin B Shelar
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
Fixed according to comments from Jesse and Eric.
- Factored a MAC layer handler out of skb_gso_segment().
- Eliminated copy operation from gre_gso_segment().
- Refresh header pointer after pskb_may_pull().
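As a note for reviewers: the GRE header length (ghl) computed in
gre_gso_segment() below follows the usual layout, a four-byte base
(flags + protocol) plus one four-byte section per optional field;
GRE_SEQ is rejected up front by gre_gso_send_check(). A standalone
sketch of that computation (illustrative only, not part of the patch;
gre_base_hdr and GRE_HEADER_SECTION are introduced by the patch,
GRE_CSUM/GRE_KEY come from <linux/if_tunnel.h>):

    #include <linux/if_tunnel.h>    /* GRE_CSUM, GRE_KEY */

    struct gre_base_hdr {
        __be16 flags;
        __be16 protocol;
    };
    #define GRE_HEADER_SECTION 4

    /* Sketch: GRE header length for a given flags word.  Only the
     * optional key and checksum sections can add to the base here,
     * since sequence numbers are refused before segmentation.
     */
    static int gre_header_len(__be16 flags)
    {
        int ghl = GRE_HEADER_SECTION;

        if (flags & GRE_KEY)
            ghl += GRE_HEADER_SECTION;
        if (flags & GRE_CSUM)
            ghl += GRE_HEADER_SECTION;
        return ghl;
    }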
---
include/linux/netdevice.h | 4 +-
include/linux/skbuff.h | 7 +++
net/core/dev.c | 58 ++++++++++++++----------
net/core/skbuff.c | 7 ++-
net/ipv4/af_inet.c | 1 +
net/ipv4/gre.c | 109 +++++++++++++++++++++++++++++++++++++++++++++
net/ipv4/ip_gre.c | 94 ++++++++++++++++++++++++++++++--------
net/ipv4/tcp.c | 1 +
net/ipv4/udp.c | 3 +-
net/ipv6/ip6_offload.c | 1 +
net/ipv6/udp_offload.c | 3 +-
11 files changed, 243 insertions(+), 45 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 549f5ad..109d27b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2662,8 +2662,10 @@ extern int netdev_master_upper_dev_link(struct net_device *dev,
extern void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev);
extern int skb_checksum_help(struct sk_buff *skb);
+extern struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features);
extern struct sk_buff *skb_gso_segment(struct sk_buff *skb,
- netdev_features_t features);
+ netdev_features_t features);
#ifdef CONFIG_BUG
extern void netdev_rx_csum_fault(struct net_device *dev);
#else
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7c00664..7b98ee6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -307,6 +307,8 @@ enum {
SKB_GSO_TCPV6 = 1 << 4,
SKB_GSO_FCOE = 1 << 5,
+
+ SKB_GSO_GRE = 1 << 6,
};
#if BITS_PER_LONG > 32
@@ -2711,6 +2713,11 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
}
#endif
+struct skb_gso_cb {
+ int tnl_hoffset;
+};
+#define NAPI_GSO_CB(skb) ((struct skb_gso_cb *)(skb)->cb)
+
static inline bool skb_is_gso(const struct sk_buff *skb)
{
return skb_shinfo(skb)->gso_size;
diff --git a/net/core/dev.c b/net/core/dev.c
index c69cd87..208eb8b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2293,18 +2293,8 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
-/**
- * skb_gso_segment - Perform segmentation on skb.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- *
- * This function segments the given skb and returns a list of segments.
- *
- * It may return NULL if the skb requires no segmentation. This is
- * only possible when GSO is used for verifying header integrity.
- */
-struct sk_buff *skb_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_offload *ptype;
@@ -2323,18 +2313,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
vlan_depth += VLAN_HLEN;
}
- skb_reset_mac_header(skb);
- skb->mac_len = skb->network_header - skb->mac_header;
__skb_pull(skb, skb->mac_len);
-
- if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- skb_warn_bad_offload(skb);
-
- if (skb_header_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
- return ERR_PTR(err);
- }
-
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
if (ptype->type == type && ptype->callbacks.gso_segment) {
@@ -2356,6 +2335,39 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
return segs;
}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+/**
+ * skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ */
+struct sk_buff *skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ int err;
+
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ skb_warn_bad_offload(skb);
+
+ if (skb_header_cloned(skb)) {
+ err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (err)
+ return ERR_PTR(err);
+ }
+ }
+ NAPI_GSO_CB(skb)->tnl_hoffset = skb_headroom(skb);
+
+ skb_reset_mac_header(skb);
+ skb->mac_len = skb->network_header - skb->mac_header;
+
+ return skb_mac_gso_segment(skb, features);
+}
EXPORT_SYMBOL(skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2568c44..c12b6f3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2752,6 +2752,8 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
unsigned int doffset = skb->data - skb_mac_header(skb);
unsigned int offset = doffset;
unsigned int headroom;
+ unsigned int tnl_hlen = (skb_mac_header(skb) - skb->head) -
+ NAPI_GSO_CB(skb)->tnl_hoffset;
unsigned int len;
int sg = !!(features & NETIF_F_SG);
int nfrags = skb_shinfo(skb)->nr_frags;
@@ -2827,7 +2829,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
skb_set_network_header(nskb, skb->mac_len);
nskb->transport_header = (nskb->network_header +
skb_network_header_len(skb));
- skb_copy_from_linear_data(skb, nskb->data, doffset);
+
+ skb_copy_from_linear_data_offset(skb, -tnl_hlen,
+ nskb->data - tnl_hlen,
+ doffset + tnl_hlen);
if (fskb != skb_shinfo(skb)->frag_list)
continue;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4b70539..2bd9998 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1306,6 +1306,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
0)))
goto out;
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 42a4910..1f86421 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -19,6 +19,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
#include <linux/spinlock.h>
#include <net/protocol.h>
#include <net/gre.h>
@@ -26,6 +27,11 @@
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
static DEFINE_SPINLOCK(gre_proto_lock);
+struct gre_base_hdr {
+ __be16 flags;
+ __be16 protocol;
+};
+#define GRE_HEADER_SECTION 4
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
@@ -112,12 +118,108 @@ static void gre_err(struct sk_buff *skb, u32 info)
rcu_read_unlock();
}
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ int ghl = GRE_HEADER_SECTION;
+ struct gre_base_hdr *greh;
+ int mac_len = skb->mac_len;
+ int hlen;
+ bool csum;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE)))
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ goto out;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+
+ if (greh->flags & GRE_KEY)
+ ghl += GRE_HEADER_SECTION;
+ if (greh->flags & GRE_CSUM) {
+ ghl += GRE_HEADER_SECTION;
+ csum = true;
+ } else
+ csum = false;
+
+ /* setup inner skb. */
+ if (greh->protocol == htons(ETH_P_TEB)) {
+ struct ethhdr *eth = eth_hdr(skb);
+ skb->protocol = eth->h_proto;
+ } else {
+ skb->protocol = greh->protocol;
+ }
+
+ hlen = mac_len + sizeof(struct iphdr);
+ skb->encapsulation = 0;
+
+ if (unlikely(!pskb_may_pull(skb, ghl)))
+ goto out;
+ __skb_pull(skb, ghl);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+
+ /* segment inner packet. */
+ segs = skb_mac_gso_segment(skb, 0);
+ if (!segs || IS_ERR(segs))
+ goto out;
+
+ skb = segs;
+ do {
+ __skb_push(skb, ghl + hlen);
+ if (csum) {
+ __be32 *pcsum;
+
+ greh = (struct gre_base_hdr *)(skb->data + hlen);
+ pcsum = (__be32 *)(greh + 1);
+ *pcsum = 0;
+ *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, hlen,
+ skb->len - hlen, 0));
+ }
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb->mac_len = mac_len;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+static int gre_gso_send_check(struct sk_buff *skb)
+{
+ struct gre_base_hdr *greh;
+
+ if (!skb->encapsulation)
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ if (greh->flags & GRE_SEQ)
+ return -EINVAL;
+ return 0;
+}
+
static const struct net_protocol net_gre_protocol = {
.handler = gre_rcv,
.err_handler = gre_err,
.netns_ok = 1,
};
+static const struct net_offload gre_offload = {
+ .callbacks = {
+ .gso_send_check = gre_gso_send_check,
+ .gso_segment = gre_gso_segment,
+ },
+};
+
static int __init gre_init(void)
{
pr_info("GRE over IPv4 demultiplexor driver\n");
@@ -127,11 +229,18 @@ static int __init gre_init(void)
return -EAGAIN;
}
+ if (inet_add_offload(&gre_offload, IPPROTO_GRE)) {
+ pr_err("can't add protocol offload\n");
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+ return -EAGAIN;
+ }
+
return 0;
}
static void __exit gre_exit(void)
{
+ inet_del_offload(&gre_offload, IPPROTO_GRE);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 303012a..ac8cb5e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -735,6 +735,42 @@ drop:
return 0;
}
+static struct sk_buff *handle_offloads(struct sk_buff *skb)
+{
+ int err;
+
+ if (skb_is_gso(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ goto error;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
+ return skb;
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ /* Pages aren't locked and could change at any time.
+ * If this happens after we compute the checksum, the
+ * checksum will be wrong. We linearize now to avoid
+ * this problem.
+ */
+ if (skb_is_nonlinear(skb)) {
+ err = __skb_linearize(skb);
+ if (unlikely(err))
+ goto error;
+ }
+
+ err = skb_checksum_help(skb);
+ if (unlikely(err))
+ goto error;
+ }
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return skb;
+
+error:
+ kfree_skb(skb);
+ return ERR_PTR(err);
+}
+
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -751,10 +787,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
__be32 dst;
int mtu;
u8 ttl;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(skb))
- goto tx_error;
+ int pkt_len;
+ struct pcpu_tstats *tstats;
+ int err;
if (dev->type == ARPHRD_ETHER)
IPCB(skb)->flags = 0;
@@ -852,13 +887,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if (skb->protocol == htons(ETH_P_IP)) {
df |= (old_iph->frag_off&htons(IP_DF));
-
- if ((old_iph->frag_off&htons(IP_DF)) &&
- mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
- }
}
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6)) {
@@ -873,11 +901,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
}
- if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- ip_rt_put(rt);
- goto tx_error;
- }
}
#endif
@@ -908,10 +931,19 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
skb_set_owner_w(new_skb, skb->sk);
dev_kfree_skb(skb);
skb = new_skb;
- old_iph = ip_hdr(skb);
/* Warning : tiph value might point to freed memory */
}
+ if (!skb->encapsulation) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ skb = handle_offloads(skb);
+ if (IS_ERR(skb))
+ return NETDEV_TX_OK;
+
+ old_iph = ip_hdr(skb);
skb_push(skb, gre_hlen);
skb_reset_network_header(skb);
skb_set_transport_header(skb, sizeof(*iph));
@@ -967,8 +999,23 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
}
}
+ pkt_len = skb->len - skb_transport_offset(skb);
+ tstats = this_cpu_ptr(dev->tstats);
+
+ nf_reset(skb);
+ ip_select_ident(iph, skb_dst(skb), NULL);
+
+ err = ip_local_out(skb);
+ if (likely(net_xmit_eval(err) == 0)) {
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_bytes += pkt_len;
+ tstats->tx_packets++;
+ u64_stats_update_end(&tstats->syncp);
+ } else {
+ dev->stats.tx_errors++;
+ dev->stats.tx_aborted_errors++;
+ }
- iptunnel_xmit(skb, dev);
return NETDEV_TX_OK;
#if IS_ENABLED(CONFIG_IPV6)
@@ -1374,6 +1421,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
return err;
}
+ if (!(tunnel->parms.o_flags & GRE_SEQ)) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
return 0;
}
@@ -1564,6 +1615,10 @@ static int ipgre_tap_init(struct net_device *dev)
if (!dev->tstats)
return -ENOMEM;
+ if (!(tunnel->parms.o_flags & GRE_SEQ)) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
return 0;
}
@@ -1587,6 +1642,9 @@ static void ipgre_tap_setup(struct net_device *dev)
dev->iflink = 0;
dev->features |= NETIF_F_NETNS_LOCAL;
+
+ dev->features |= GRE_FEATURES;
+ dev->hw_features |= GRE_FEATURES;
}
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5227194..8c47b7d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3032,6 +3032,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_TCPV6 |
+ SKB_GSO_GRE |
0) ||
!(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
goto out;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e0610e4..6824272 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2304,7 +2304,8 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
/* Packet is from an untrusted source, reset gso_segs. */
int type = skb_shinfo(skb)->gso_type;
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
+ if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_GRE) ||
!(type & (SKB_GSO_UDP))))
goto out;
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index f26f0da..8234c1d 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -99,6 +99,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
~(SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
SKB_GSO_TCPV6 |
0)))
goto out;
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 0c8934a..cf05cf0 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -56,7 +56,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
/* Packet is from an untrusted source, reset gso_segs. */
int type = skb_shinfo(skb)->gso_type;
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
+ if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_GRE) ||
!(type & (SKB_GSO_UDP))))
goto out;
--
1.7.10
* Re: [PATCH 2/2] v2 GRE: Add segmentation offload for GRE
From: Eric Dumazet @ 2013-01-24 23:29 UTC
To: Pravin B Shelar; +Cc: netdev, jesse
On Thu, 2013-01-24 at 14:16 -0800, Pravin B Shelar wrote:
> Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
> ---
> Fixed according to comments from Jesse and Eric.
> - Factored a MAC layer handler out of skb_gso_segment().
> - Eliminated copy operation from gre_gso_segment().
> - Refresh header pointer after pskb_may_pull().
Seems nice!
> + if (skb_is_gso(skb)) {
> + err = skb_unclone(skb, GFP_ATOMIC);
> + if (unlikely(err))
> + goto error;
> + skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
> + return skb;
> + } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
> + /* Pages aren't locked and could change at any time.
> + * If this happens after we compute the checksum, the
> + * checksum will be wrong. We linearize now to avoid
> + * this problem.
> + */
> + if (skb_is_nonlinear(skb)) {
> + err = __skb_linearize(skb);
> + if (unlikely(err))
> + goto error;
> + }
> +
> + err = skb_checksum_help(skb);
> + if (unlikely(err))
> + goto error;
> + }
> +
I really don't understand why you put this chunk in this patch.
Packet being GSO or not, the underlying problem still remains.
This must be addressed separately and at a different layer
(in skb_checksum_help(), most probably).
If the packet is GSO and we compute the checksum in software,
then we also have to copy all frags that could potentially
be overwritten.
> + skb->ip_summed = CHECKSUM_NONE;
> +
> + return skb;
> +
> +error:
> + kfree_skb(skb);
> + return ERR_PTR(err);
> +}
> +
> static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
> {
> struct ip_tunnel *tunnel = netdev_priv(dev);
> @@ -751,10 +787,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
> __be32 dst;
> int mtu;
> u8 ttl;
> -
> - if (skb->ip_summed == CHECKSUM_PARTIAL &&
> - skb_checksum_help(skb))
> - goto tx_error;
> + int pkt_len;
> + struct pcpu_tstats *tstats;
> + int err;
>
> if (dev->type == ARPHRD_ETHER)
> IPCB(skb)->flags = 0;
> @@ -852,13 +887,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
>
> if (skb->protocol == htons(ETH_P_IP)) {
> df |= (old_iph->frag_off&htons(IP_DF));
> -
> - if ((old_iph->frag_off&htons(IP_DF)) &&
> - mtu < ntohs(old_iph->tot_len)) {
> - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
> - ip_rt_put(rt);
> - goto tx_error;
> - }
Not clear why this chunk can be safely removed, even for non GSO
packet ?
* Re: [PATCH 2/2] v2 GRE: Add segmentation offload for GRE
From: Pravin Shelar @ 2013-01-25 0:14 UTC
To: Eric Dumazet; +Cc: netdev, jesse
On Thu, Jan 24, 2013 at 3:29 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2013-01-24 at 14:16 -0800, Pravin B Shelar wrote:
>> Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
>> ---
>> Fixed according to comments from Jesse and Eric.
>> - Factored a MAC layer handler out of skb_gso_segment().
>> - Eliminated copy operation from gre_gso_segment().
>> - Refresh header pointer after pskb_may_pull().
>
> Seems nice !
>
>> + if (skb_is_gso(skb)) {
>> + err = skb_unclone(skb, GFP_ATOMIC);
>> + if (unlikely(err))
>> + goto error;
>> + skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
>> + return skb;
>> + } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
>> + /* Pages aren't locked and could change at any time.
>> + * If this happens after we compute the checksum, the
>> + * checksum will be wrong. We linearize now to avoid
>> + * this problem.
>> + */
>> + if (skb_is_nonlinear(skb)) {
>> + err = __skb_linearize(skb);
>> + if (unlikely(err))
>> + goto error;
>> + }
>> +
>> + err = skb_checksum_help(skb);
>> + if (unlikely(err))
>> + goto error;
>> + }
>> +
>
> I really don't understand why you put this chunk in this patch.
>
> Packet being GSO or not, the underlying problem still remains.
>
> This must be addressed separately and at a different layer
> (in skb_checksum_help(), most probably).
>
> If the packet is GSO and we compute the checksum in software,
> then we also have to copy all frags that could potentially
> be overwritten.
>
I think this patch does fix the csum issue without causing any performance
regression, so it should be enough to solve the GRE-GSO issue. Once
you have a fix, this code can be optimized even more.
>
>> + skb->ip_summed = CHECKSUM_NONE;
>> +
>> + return skb;
>> +
>> +error:
>> + kfree_skb(skb);
>> + return ERR_PTR(err);
>> +}
>> +
>> static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
>> {
>> struct ip_tunnel *tunnel = netdev_priv(dev);
>> @@ -751,10 +787,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
>> __be32 dst;
>> int mtu;
>> u8 ttl;
>> -
>> - if (skb->ip_summed == CHECKSUM_PARTIAL &&
>> - skb_checksum_help(skb))
>> - goto tx_error;
>> + int pkt_len;
>> + struct pcpu_tstats *tstats;
>> + int err;
>>
>> if (dev->type == ARPHRD_ETHER)
>> IPCB(skb)->flags = 0;
>> @@ -852,13 +887,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
>>
>> if (skb->protocol == htons(ETH_P_IP)) {
>> df |= (old_iph->frag_off&htons(IP_DF));
>> -
>> - if ((old_iph->frag_off&htons(IP_DF)) &&
>> - mtu < ntohs(old_iph->tot_len)) {
>> - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
>> - ip_rt_put(rt);
>> - goto tx_error;
>> - }
>
> Not clear why this chunk can be safely removed, even for non GSO
> packet ?
>
This actually does not work for TAP devices, since the ICMP
response cannot reach the sender when the tunnel endpoint is not
part of that IP network.
This was discussed in the VXLAN patch thread
(http://markmail.org/message/xmqmvdh4noljfq2n).
But I agree we should keep it for non-TAP GRE and non-GSO packets.
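Something along these lines, as a sketch (reusing the variables
already in ipgre_tunnel_xmit(); untested):

    /* Sketch: keep the PMTU check for non-TAP tunnels and non-GSO
     * skbs, where the ICMP error can actually reach the sender.
     */
    if (dev->type != ARPHRD_ETHER && !skb_is_gso(skb) &&
        (old_iph->frag_off & htons(IP_DF)) &&
        mtu < ntohs(old_iph->tot_len)) {
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
        ip_rt_put(rt);
        goto tx_error;
    }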
* Re: [PATCH 2/2] v2 GRE: Add segmentation offload for GRE
From: Michał Mirosław @ 2013-01-25 1:14 UTC
To: Pravin B Shelar; +Cc: netdev, jesse, eric.dumazet
2013/1/24 Pravin B Shelar <pshelar@nicira.com>:
[...]
> @@ -1374,6 +1421,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
> return err;
> }
>
> + if (!(tunnel->parms.o_flags & GRE_SEQ)) {
> + dev->features |= NETIF_F_GSO_SOFTWARE;
> + dev->hw_features |= NETIF_F_GSO_SOFTWARE;
> + }
> return 0;
> }
>
Can o_flags change after tunnel creation? If so, NETIF_F_GSO_SOFTWARE
should always be set in dev->hw_features, and it should be forced to
zero in ndo_fix_features when GRE_SEQ is set. ipgre_netlink_parms()
should call netdev_update_features() when o_flags changes.
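A minimal sketch of that arrangement (the hook name and wiring are
assumptions, untested):

    /* Sketch: always advertise software GSO in hw_features and let
     * ndo_fix_features mask it off while sequence numbers are in use.
     */
    static netdev_features_t ipgre_fix_features(struct net_device *dev,
                                                netdev_features_t features)
    {
        struct ip_tunnel *tunnel = netdev_priv(dev);

        if (tunnel->parms.o_flags & GRE_SEQ)
            features &= ~NETIF_F_GSO_SOFTWARE;
        return features;
    }

with .ndo_fix_features = ipgre_fix_features in the tunnel's
netdev_ops, and ipgre_netlink_parms() calling
netdev_update_features() after o_flags changes.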
> @@ -1564,6 +1615,10 @@ static int ipgre_tap_init(struct net_device *dev)
> if (!dev->tstats)
> return -ENOMEM;
>
> + if (!(tunnel->parms.o_flags & GRE_SEQ)) {
> + dev->features |= NETIF_F_GSO_SOFTWARE;
> + dev->hw_features |= NETIF_F_GSO_SOFTWARE;
> + }
> return 0;
> }
>
Same here.
Best Regards,
Michał Mirosław
* Re: [PATCH 2/2] v2 GRE: Add segmentation offload for GRE
From: Eric Dumazet @ 2013-01-25 1:34 UTC
To: Pravin Shelar; +Cc: netdev, jesse
On Thu, 2013-01-24 at 16:14 -0800, Pravin Shelar wrote:
> I think this patch does fix the csum issue without causing any performance
> regression, so it should be enough to solve the GRE-GSO issue. Once
> you have a fix, this code can be optimized even more.
It adds an extra copy, since you assume no SG capability, so
skb_segment() _does_ a copy.
As the checksum is needed, it's true the copy is almost unnoticed,
but one day NICs will be able to perform the checksum for us.
(Maybe it's already the case for some of them.)
I would first fix the checksum issue in a generic way, then
apply this patch on top of the fix, so that we can use SG and avoid
the extra copy for the typical tcp_sendmsg() case.
It seems you are focusing on the TAP use case only, seeing you removed
code that doesn't work for TAP but does work for regular locally
terminated flows.
You made a lot of implementation choices, and none of them
are described in the changelog, making future work a bit hard.
* Re: [PATCH 2/2] v2 GRE: Add segmentation offload for GRE
From: Pravin Shelar @ 2013-01-25 3:38 UTC
To: Eric Dumazet; +Cc: netdev, jesse
On Thu, Jan 24, 2013 at 5:34 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2013-01-24 at 16:14 -0800, Pravin Shelar wrote:
>
>> I think this patch does fix the csum issue without causing any performance
>> regression, so it should be enough to solve the GRE-GSO issue. Once
>> you have a fix, this code can be optimized even more.
>
> It adds an extra copy, since you assume no SG capability, so
> skb_segment() _does_ a copy.
>
OK, I will use device features.
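i.e., instead of hard-coding 0 there, propagate the features that
were handed to gre_gso_segment() (a sketch; the mask may still need
filtering for the tunnel case):

    /* Sketch: let the inner protocol see the real features, so
     * SG-capable paths avoid the linear copy in skb_segment().
     */
    segs = skb_mac_gso_segment(skb, features);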
> As the checksum is needed, it's true the copy is almost unnoticed,
> but one day NICs will be able to perform the checksum for us.
> (Maybe it's already the case for some of them.)
>
> I would first fix the checksum issue in a generic way, then
> apply this patch on top of the fix, so that we can use SG and avoid
> the extra copy for the typical tcp_sendmsg() case.
I thought you were working on the fix. If not, I will post a patch.
>
> It seems you are focusing on the TAP use case only, seeing you removed
> code that doesn't work for TAP but does work for regular locally
> terminated flows.
>
I have tested the patch with GRE TAP and non-TAP devices.
> You made a lot of implementation choices, and none of them
> are described in the changelog, making future work a bit hard.
>
OK, I will update the changelog.
* Re: [PATCH 2/2] v2 GRE: Add segmentation offload for GRE
From: Eric Dumazet @ 2013-01-25 19:52 UTC
To: Pravin Shelar; +Cc: netdev, jesse
On Thu, 2013-01-24 at 19:38 -0800, Pravin Shelar wrote:
> I thought you were working on the fix. If not, I will post a patch.
I am working on a patch, will send it shortly.
As it is a stable candidate, I did a short one.
* [PATCH net-next] net: fix possible wrong checksum generation
From: Eric Dumazet @ 2013-01-26 6:34 UTC
To: Pravin Shelar, David Miller; +Cc: netdev, jesse
From: Eric Dumazet <edumazet@google.com>
Pravin Shelar mentioned that GSO could potentially generate
a wrong TX checksum if an skb has fragments that are overwritten
by the user between the checksum computation and transmit.
He suggested linearizing skbs, but this extra copy can be
avoided for normal TCP skbs cooked by tcp_sendmsg().
This patch introduces a new SKB_GSO_SHARED_FRAG flag, set
in skb_shinfo(skb)->gso_type if at least one frag can be
modified by the user.
Typical sources of such possible overwrites are {vm}splice(),
sendfile(), and macvtap/tun/virtio_net drivers.
Tested:
$ netperf -H 7.7.8.84
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.8.84 () port 0 AF_INET
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec
 87380  16384  16384    10.00    3959.52
$ netperf -H 7.7.8.84 -t TCP_SENDFILE
TCP SENDFILE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.8.84 () port 0 AF_INET
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec
 87380  16384  16384    10.00    3216.80
Performance of the SENDFILE test is impacted by the extra allocation
and copy, and because we use order-0 pages, while the TCP_STREAM test
uses bigger pages.
Reported-by: Pravin Shelar <pshelar@nicira.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
drivers/net/macvtap.c | 3 ++-
drivers/net/tun.c | 12 ++++++++----
drivers/net/virtio_net.c | 12 ++++++++----
include/linux/skbuff.h | 19 +++++++++++++++++++
net/core/dev.c | 9 +++++++++
net/core/skbuff.c | 4 ++++
net/ipv4/af_inet.c | 1 +
net/ipv4/ip_gre.c | 4 +++-
net/ipv4/ipip.c | 4 +++-
net/ipv4/tcp.c | 3 +++
net/ipv4/tcp_input.c | 4 ++--
net/ipv4/tcp_output.c | 4 ++--
net/ipv6/ip6_offload.c | 1 +
13 files changed, 65 insertions(+), 15 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 0f0f9ce..b181dfb 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -543,6 +543,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
skb->data_len += len;
skb->len += len;
skb->truesize += truesize;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG;
atomic_add(truesize, &skb->sk->sk_wmem_alloc);
while (len) {
int off = base & ~PAGE_MASK;
@@ -598,7 +599,7 @@ static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
skb_shinfo(skb)->gso_size = vnet_hdr->gso_size;
- skb_shinfo(skb)->gso_type = gso_type;
+ skb_shinfo(skb)->gso_type |= gso_type;
/* Header must be checked, and gso_segs computed. */
skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c81680d..293ce8d 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1005,6 +1005,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
skb->data_len += len;
skb->len += len;
skb->truesize += truesize;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG;
atomic_add(truesize, &skb->sk->sk_wmem_alloc);
while (len) {
int off = base & ~PAGE_MASK;
@@ -1150,16 +1151,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
}
if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ unsigned short gso_type = 0;
+
pr_debug("GSO!\n");
switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
case VIRTIO_NET_HDR_GSO_TCPV4:
- skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+ gso_type = SKB_GSO_TCPV4;
break;
case VIRTIO_NET_HDR_GSO_TCPV6:
- skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+ gso_type = SKB_GSO_TCPV6;
break;
case VIRTIO_NET_HDR_GSO_UDP:
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+ gso_type = SKB_GSO_UDP;
break;
default:
tun->dev->stats.rx_frame_errors++;
@@ -1168,9 +1171,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
}
if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+ gso_type |= SKB_GSO_TCP_ECN;
skb_shinfo(skb)->gso_size = gso.gso_size;
+ skb_shinfo(skb)->gso_type |= gso_type;
if (skb_shinfo(skb)->gso_size == 0) {
tun->dev->stats.rx_frame_errors++;
kfree_skb(skb);
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 701408a..58914c8 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -220,6 +220,7 @@ static void set_skb_frag(struct sk_buff *skb, struct page *page,
skb->len += size;
skb->truesize += PAGE_SIZE;
skb_shinfo(skb)->nr_frags++;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG;
*len -= size;
}
@@ -379,16 +380,18 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
ntohs(skb->protocol), skb->len, skb->pkt_type);
if (hdr->hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ unsigned short gso_type = 0;
+
pr_debug("GSO!\n");
switch (hdr->hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
case VIRTIO_NET_HDR_GSO_TCPV4:
- skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+ gso_type = SKB_GSO_TCPV4;
break;
case VIRTIO_NET_HDR_GSO_UDP:
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+ gso_type = SKB_GSO_UDP;
break;
case VIRTIO_NET_HDR_GSO_TCPV6:
- skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+ gso_type = SKB_GSO_TCPV6;
break;
default:
net_warn_ratelimited("%s: bad gso type %u.\n",
@@ -397,7 +400,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
}
if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+ gso_type |= SKB_GSO_TCP_ECN;
skb_shinfo(skb)->gso_size = hdr->hdr.gso_size;
if (skb_shinfo(skb)->gso_size == 0) {
@@ -405,6 +408,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
goto frame_err;
}
+ skb_shinfo(skb)->gso_type |= gso_type;
/* Header must be checked, and gso_segs computed. */
skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
skb_shinfo(skb)->gso_segs = 0;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8b2256e..0259b71 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -307,6 +307,13 @@ enum {
SKB_GSO_TCPV6 = 1 << 4,
SKB_GSO_FCOE = 1 << 5,
+
+ /* This indicates at least one fragment might be overwritten
+ * (as in vmsplice(), sendfile() ...)
+ * If we need to compute a TX checksum, we'll need to copy
+ * all frags to avoid possible bad checksum
+ */
+ SKB_GSO_SHARED_FRAG = 1 << 6,
};
#if BITS_PER_LONG > 32
@@ -2201,6 +2208,18 @@ static inline int skb_linearize(struct sk_buff *skb)
}
/**
+ * skb_has_shared_frag - can any frag be overwritten
+ * @skb: buffer to test
+ *
+ * Return true if the skb has at least one frag that might be modified
+ * by an external entity (as in vmsplice()/sendfile())
+ */
+static inline bool skb_has_shared_frag(const struct sk_buff *skb)
+{
+ return skb_shinfo(skb)->gso_type & SKB_GSO_SHARED_FRAG;
+}
+
+/**
* skb_linearize_cow - make sure skb is linear and writable
* @skb: buffer to process
*
diff --git a/net/core/dev.c b/net/core/dev.c
index c69cd87..a83375d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2271,6 +2271,15 @@ int skb_checksum_help(struct sk_buff *skb)
return -EINVAL;
}
+ /* Before computing a checksum, we should make sure no frag could
+ * be modified by an external entity : checksum could be wrong.
+ */
+ if (skb_has_shared_frag(skb)) {
+ ret = __skb_linearize(skb);
+ if (ret)
+ goto out;
+ }
+
offset = skb_checksum_start_offset(skb);
BUG_ON(offset >= skb_headlen(skb));
csum = skb_checksum(skb, offset, skb->len - offset, 0);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2568c44..bddc1dd 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2340,6 +2340,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
int pos = skb_headlen(skb);
+ skb_shinfo(skb1)->gso_type = skb_shinfo(skb)->gso_type;
+
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
else /* Second chunk has no header, nothing to copy. */
@@ -2845,6 +2847,8 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
skb_copy_from_linear_data_offset(skb, offset,
skb_put(nskb, hsize), hsize);
+ skb_shinfo(nskb)->gso_type = skb_shinfo(skb)->gso_type;
+
while (pos < offset + len && i < nfrags) {
*frag = skb_shinfo(skb)->frags[i];
__skb_frag_ref(frag);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4b70539..49ddca3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1306,6 +1306,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_SHARED_FRAG |
0)))
goto out;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 303012a..af6be70 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -738,7 +738,7 @@ drop:
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- const struct iphdr *old_iph = ip_hdr(skb);
+ const struct iphdr *old_iph;
const struct iphdr *tiph;
struct flowi4 fl4;
u8 tos;
@@ -756,6 +756,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
skb_checksum_help(skb))
goto tx_error;
+ old_iph = ip_hdr(skb);
+
if (dev->type == ARPHRD_ETHER)
IPCB(skb)->flags = 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 191fc24..8f024d4 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -472,7 +472,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
__be16 df = tiph->frag_off;
struct rtable *rt; /* Route to the other host */
struct net_device *tdev; /* Device to other host */
- const struct iphdr *old_iph = ip_hdr(skb);
+ const struct iphdr *old_iph;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
__be32 dst = tiph->daddr;
@@ -486,6 +486,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
skb_checksum_help(skb))
goto tx_error;
+ old_iph = ip_hdr(skb);
+
if (tos & 1)
tos = old_iph->tos;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5227194..3ec1f69 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -896,6 +896,8 @@ new_segment:
skb_fill_page_desc(skb, i, page, offset, copy);
}
+ skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG;
+
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
@@ -3032,6 +3034,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_TCPV6 |
+ SKB_GSO_SHARED_FRAG |
0) ||
!(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
goto out;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0905997..492c7cf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1240,13 +1240,13 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
*/
if (!skb_shinfo(prev)->gso_size) {
skb_shinfo(prev)->gso_size = mss;
- skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+ skb_shinfo(prev)->gso_type |= sk->sk_gso_type;
}
/* CHECKME: To clear or not to clear? Mimics normal skb currently */
if (skb_shinfo(skb)->gso_segs <= 1) {
skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
+ skb_shinfo(skb)->gso_type &= SKB_GSO_SHARED_FRAG;
}
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 667a6ad..367e2ec 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1133,6 +1133,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
unsigned int mss_now)
{
+ skb_shinfo(skb)->gso_type &= SKB_GSO_SHARED_FRAG;
if (skb->len <= mss_now || !sk_can_gso(sk) ||
skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
@@ -1140,11 +1141,10 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
*/
skb_shinfo(skb)->gso_segs = 1;
skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
} else {
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
skb_shinfo(skb)->gso_size = mss_now;
- skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+ skb_shinfo(skb)->gso_type |= sk->sk_gso_type;
}
}
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index f26f0da..d141fc3 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -100,6 +100,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_TCPV6 |
+ SKB_GSO_SHARED_FRAG |
0)))
goto out;
* Re: [PATCH net-next] net: fix possible wrong checksum generation
From: David Miller @ 2013-01-28 5:28 UTC
To: eric.dumazet; +Cc: pshelar, netdev, jesse
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 25 Jan 2013 22:34:37 -0800
> From: Eric Dumazet <edumazet@google.com>
>
> Pravin Shelar mentioned that GSO could potentially generate
> a wrong TX checksum if an skb has fragments that are overwritten
> by the user between the checksum computation and transmit.
>
> He suggested linearizing skbs, but this extra copy can be
> avoided for normal TCP skbs cooked by tcp_sendmsg().
>
> This patch introduces a new SKB_GSO_SHARED_FRAG flag, set
> in skb_shinfo(skb)->gso_type if at least one frag can be
> modified by the user.
>
> Typical sources of such possible overwrites are {vm}splice(),
> sendfile(), and macvtap/tun/virtio_net drivers.
>
> Tested:
...
> Performance of the SENDFILE is impacted by the extra allocation and
> copy, and because we use order-0 pages, while the TCP_STREAM uses
> bigger pages.
>
> Reported-by: Pravin Shelar <pshelar@nicira.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Applied, thanks Eric.
* Re: [PATCH 2/2] v2 GRE: Add segmentation offload for GRE
From: Ben Hutchings @ 2013-01-28 18:28 UTC
To: Pravin B Shelar; +Cc: netdev, jesse, eric.dumazet
On Thu, 2013-01-24 at 14:16 -0800, Pravin B Shelar wrote:
> Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
> ---
> Fixed according to comments from Jesse and Eric.
> - Factored a MAC layer handler out of skb_gso_segment().
> - Eliminated copy operation from gre_gso_segment().
> - Refresh header pointer after pskb_may_pull().
[...]
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -307,6 +307,8 @@ enum {
> SKB_GSO_TCPV6 = 1 << 4,
>
> SKB_GSO_FCOE = 1 << 5,
> +
> + SKB_GSO_GRE = 1 << 6,
> };
>
> #if BITS_PER_LONG > 32
[...]
I think each new GSO flag must have a corresponding net device feature
flag. In that case you'll need to replace NETIF_F_GSO_RESERVED1 in
include/linux/netdev_features.h and add a name for the feature in
net/core/ethtool.c.
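Roughly what that would look like (a sketch; the bit name and the
ethtool string here are assumptions):

    /* include/linux/netdev_features.h: take over the reserved bit */
    NETIF_F_GSO_GRE_BIT,            /* ... GRE with TSO */
    #define NETIF_F_GSO_GRE         __NETIF_F(GSO_GRE)

    /* net/core/ethtool.c: matching netdev_features_strings[] entry */
    [NETIF_F_GSO_GRE_BIT] =         "tx-gre-segmentation",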
Ben.
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
* Re: [PATCH net-next] net: fix possible wrong checksum generation
From: Ben Hutchings @ 2013-01-28 18:35 UTC
To: Eric Dumazet; +Cc: Pravin Shelar, David Miller, netdev, jesse
On Fri, 2013-01-25 at 22:34 -0800, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> Pravin Shelar mentioned that GSO could potentially generate
> a wrong TX checksum if an skb has fragments that are overwritten
> by the user between the checksum computation and transmit.
>
> He suggested linearizing skbs, but this extra copy can be
> avoided for normal TCP skbs cooked by tcp_sendmsg().
>
> This patch introduces a new SKB_GSO_SHARED_FRAG flag, set
> in skb_shinfo(skb)->gso_type if at least one frag can be
> modified by the user.
[...]
Again, this needs a net device feature, doesn't it? net_gso_ok() should
allow hardware checksum/segmentation offload of such packets, and it
won't if there is this new GSO flag with no corresponding net device
feature.
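For reference, net_gso_ok() simply shifts the gso_type bits into
feature-bit positions, which is why every SKB_GSO_* bit needs a
matching NETIF_F_* bit (roughly as it stood at the time):

    static inline bool net_gso_ok(netdev_features_t features, int gso_type)
    {
        netdev_features_t feature = gso_type << NETIF_F_GSO_SHIFT;

        return (features & feature) == feature;
    }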
Ben.
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
* Re: [PATCH net-next] net: fix possible wrong checksum generation
2013-01-28 18:35 ` Ben Hutchings
@ 2013-01-29 19:30 ` Pravin Shelar
0 siblings, 0 replies; 12+ messages in thread
From: Pravin Shelar @ 2013-01-29 19:30 UTC (permalink / raw)
To: Eric Dumazet, Ben Hutchings; +Cc: David Miller, netdev, jesse
On Mon, Jan 28, 2013 at 10:35 AM, Ben Hutchings
<bhutchings@solarflare.com> wrote:
> On Fri, 2013-01-25 at 22:34 -0800, Eric Dumazet wrote:
>> From: Eric Dumazet <edumazet@google.com>
>>
>> Pravin Shelar mentioned that GSO could potentially generate
>> a wrong TX checksum if an skb has fragments that are overwritten
>> by the user between the checksum computation and transmit.
>>
>> He suggested linearizing skbs, but this extra copy can be
>> avoided for normal TCP skbs cooked by tcp_sendmsg().
>>
>> This patch introduces a new SKB_GSO_SHARED_FRAG flag, set
>> in skb_shinfo(skb)->gso_type if at least one frag can be
>> modified by the user.
> [...]
>
> Again, this needs a net device feature, doesn't it? net_gso_ok() should
> allow hardware checksum/segmentation offload of such packets, and it
> won't if there is this new GSO flag with no corresponding net device
> feature.
>
I am not sure this flag needs to be a GSO type. There is a possibility
of having such a page fragment in non-GSO packets.
We could have this flag in skb_shinfo(skb)->tx_flags.
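A sketch of that alternative (the flag name is hypothetical, and the
bit value would have to be chosen against the existing SKBTX_* bits):

    /* Sketch: mark shared frags per-skb in tx_flags instead of
     * using a gso_type bit, so non-GSO packets are covered too.
     */
    #define SKBTX_SHARED_FRAG       (1 << 5)        /* hypothetical */

    static inline bool skb_has_shared_frag(const struct sk_buff *skb)
    {
        return skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
    }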
Thanks,
Pravin.