* [PATCH 1/3 net-next] net: Export skb_zerocopy() to zerocopy from one skb to another
2013-11-11 15:47 [PATCH 0/3 v5] Open vSwitch zerocopy upcall Thomas Graf
@ 2013-11-11 15:47 ` Thomas Graf
2013-11-11 15:47 ` [PATCH 2/3 net-next] openvswitch: Allow user space to announce ability to accept unaligned Netlink messages Thomas Graf
2013-11-11 15:47 ` [PATCH 3/3 net-next] openvswitch: Use skb_zerocopy() for upcall Thomas Graf
2 siblings, 0 replies; 4+ messages in thread
From: Thomas Graf @ 2013-11-11 15:47 UTC (permalink / raw)
To: jesse, davem; +Cc: dev, netdev, eric.dumazet, dborkman, bhutchings
Make the skb zerocopy logic written for nfnetlink queue available for
use by other modules.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
include/linux/skbuff.h | 3 ++
net/core/skbuff.c | 85 ++++++++++++++++++++++++++++++++++++
net/netfilter/nfnetlink_queue_core.c | 59 ++-----------------------
3 files changed, 92 insertions(+), 55 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 036ec7d..9e6a293 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2372,6 +2372,9 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int len,
unsigned int flags);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
+unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
+void skb_zerocopy(struct sk_buff *to, const struct sk_buff *from,
+ int len, int hlen);
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
void skb_scrub_packet(struct sk_buff *skb, bool xnet);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8c5197f..c5cefbe 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2125,6 +2125,91 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+ /**
+ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
+ * @from: source buffer
+ *
+ * Calculates the amount of linear headroom needed in the 'to' skb passed
+ * into skb_zerocopy().
+ */
+unsigned int
+skb_zerocopy_headlen(const struct sk_buff *from)
+{
+ unsigned int hlen = 0;
+
+ if (!from->head_frag ||
+ skb_headlen(from) < L1_CACHE_BYTES ||
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ hlen = skb_headlen(from);
+
+ if (skb_has_frag_list(from))
+ hlen = from->len;
+
+ return hlen;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
+
+/**
+ * skb_zerocopy - Zero copy skb to skb
+ * @to: destination buffer
+ * @source: source buffer
+ * @len: number of bytes to copy from source buffer
+ * @hlen: size of linear headroom in destination buffer
+ *
+ * Copies up to `len` bytes from `from` to `to` by creating references
+ * to the frags in the source buffer.
+ *
+ * The `hlen` as calculated by skb_zerocopy_headlen() specifies the
+ * headroom in the `to` buffer.
+ */
+void
+skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
+{
+ int i, j = 0;
+ int plen = 0; /* length of skb->head fragment */
+ struct page *page;
+ unsigned int offset;
+
+ BUG_ON(!from->head_frag && !hlen);
+
+ /* dont bother with small payloads */
+ if (len <= skb_tailroom(to)) {
+ skb_copy_bits(from, 0, skb_put(to, len), len);
+ return;
+ }
+
+ if (hlen) {
+ skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ len -= hlen;
+ } else {
+ plen = min_t(int, skb_headlen(from), len);
+ if (plen) {
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+ __skb_fill_page_desc(to, 0, page, offset, plen);
+ get_page(page);
+ j = 1;
+ len -= plen;
+ }
+ }
+
+ to->truesize += len + plen;
+ to->len += len + plen;
+ to->data_len += len + plen;
+
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ if (!len)
+ break;
+ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+ skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
+ len -= skb_shinfo(to)->frags[j].size;
+ skb_frag_ref(to, j);
+ j++;
+ }
+ skb_shinfo(to)->nr_frags = j;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy);
+
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
__wsum csum;
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 21258cf..615ee12 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -235,51 +235,6 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
spin_unlock_bh(&queue->lock);
}
-static void
-nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
-{
- int i, j = 0;
- int plen = 0; /* length of skb->head fragment */
- struct page *page;
- unsigned int offset;
-
- /* dont bother with small payloads */
- if (len <= skb_tailroom(to)) {
- skb_copy_bits(from, 0, skb_put(to, len), len);
- return;
- }
-
- if (hlen) {
- skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
- len -= hlen;
- } else {
- plen = min_t(int, skb_headlen(from), len);
- if (plen) {
- page = virt_to_head_page(from->head);
- offset = from->data - (unsigned char *)page_address(page);
- __skb_fill_page_desc(to, 0, page, offset, plen);
- get_page(page);
- j = 1;
- len -= plen;
- }
- }
-
- to->truesize += len + plen;
- to->len += len + plen;
- to->data_len += len + plen;
-
- for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
- if (!len)
- break;
- skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
- skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
- len -= skb_shinfo(to)->frags[j].size;
- skb_frag_ref(to, j);
- j++;
- }
- skb_shinfo(to)->nr_frags = j;
-}
-
static int
nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
bool csum_verify)
@@ -304,7 +259,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
{
size_t size;
size_t data_len = 0, cap_len = 0;
- int hlen = 0;
+ unsigned int hlen = 0;
struct sk_buff *skb;
struct nlattr *nla;
struct nfqnl_msg_packet_hdr *pmsg;
@@ -356,14 +311,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (data_len > entskb->len)
data_len = entskb->len;
- if (!entskb->head_frag ||
- skb_headlen(entskb) < L1_CACHE_BYTES ||
- skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS)
- hlen = skb_headlen(entskb);
-
- if (skb_has_frag_list(entskb))
- hlen = entskb->len;
- hlen = min_t(int, data_len, hlen);
+ hlen = skb_zerocopy_headlen(entskb);
+ hlen = min_t(unsigned int, hlen, data_len);
size += sizeof(struct nlattr) + hlen;
cap_len = entskb->len;
break;
@@ -504,7 +453,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nla->nla_type = NFQA_PAYLOAD;
nla->nla_len = nla_attr_size(data_len);
- nfqnl_zcopy(skb, entskb, data_len, hlen);
+ skb_zerocopy(skb, entskb, data_len, hlen);
}
nlh->nlmsg_len = skb->len;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 2/3 net-next] openvswitch: Allow user space to announce ability to accept unaligned Netlink messages
2013-11-11 15:47 [PATCH 0/3 v5] Open vSwitch zerocopy upcall Thomas Graf
2013-11-11 15:47 ` [PATCH 1/3 net-next] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
@ 2013-11-11 15:47 ` Thomas Graf
2013-11-11 15:47 ` [PATCH 3/3 net-next] openvswitch: Use skb_zerocopy() for upcall Thomas Graf
2 siblings, 0 replies; 4+ messages in thread
From: Thomas Graf @ 2013-11-11 15:47 UTC (permalink / raw)
To: jesse, davem; +Cc: dev, netdev, eric.dumazet, dborkman, bhutchings
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
include/uapi/linux/openvswitch.h | 4 ++++
net/openvswitch/datapath.c | 4 ++++
net/openvswitch/datapath.h | 2 ++
3 files changed, 10 insertions(+)
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index d120f9f..07ef2c3 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -75,6 +75,7 @@ enum ovs_datapath_attr {
OVS_DP_ATTR_UPCALL_PID, /* Netlink PID to receive upcalls */
OVS_DP_ATTR_STATS, /* struct ovs_dp_stats */
OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */
+ OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */
__OVS_DP_ATTR_MAX
};
@@ -106,6 +107,9 @@ struct ovs_vport_stats {
__u64 tx_dropped; /* no space available in linux */
};
+/* Allow last Netlink attribute to be unaligned */
+#define OVS_DP_F_UNALIGNED (1 << 0)
+
/* Fixed logical ports. */
#define OVSP_LOCAL ((__u32)0)
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 1408adc..0a50574 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1061,6 +1061,7 @@ static struct genl_ops dp_flow_genl_ops[] = {
static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+ [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
};
static struct genl_family dp_datapath_genl_family = {
@@ -1217,6 +1218,9 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
parms.port_no = OVSP_LOCAL;
parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]);
+ if (a[OVS_DP_ATTR_USER_FEATURES])
+ dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
+
vport = new_vport(&parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index d3d14a58..ecdf44c 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -88,6 +88,8 @@ struct datapath {
/* Network namespace ref. */
struct net *net;
#endif
+
+ u32 user_features;
};
/**
--
1.8.3.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 3/3 net-next] openvswitch: Use skb_zerocopy() for upcall
2013-11-11 15:47 [PATCH 0/3 v5] Open vSwitch zerocopy upcall Thomas Graf
2013-11-11 15:47 ` [PATCH 1/3 net-next] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
2013-11-11 15:47 ` [PATCH 2/3 net-next] openvswitch: Allow user space to announce ability to accept unaligned Netlink messages Thomas Graf
@ 2013-11-11 15:47 ` Thomas Graf
2 siblings, 0 replies; 4+ messages in thread
From: Thomas Graf @ 2013-11-11 15:47 UTC (permalink / raw)
To: jesse, davem; +Cc: dev, netdev, eric.dumazet, dborkman, bhutchings
Use of skb_zerocopy() avoids the expensive call to memcpy() when
copying the packet data into the Netlink skb. Completes checksum
through skb_checksum_help() if needed.
Cost of memcpy is significantly reduced from:
+ 7.48% vhost-8471 [k] memcpy
+ 5.57% ovs-vswitchd [k] memcpy
+ 2.81% vhost-8471 [k] csum_partial_copy_generic
to:
+ 5.72% ovs-vswitchd [k] memcpy
+ 3.32% vhost-5153 [k] memcpy
+ 0.68% vhost-5153 [k] skb_zerocopy
(megaflows disabled)
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
net/openvswitch/datapath.c | 54 +++++++++++++++++++++++++++++++---------------
1 file changed, 37 insertions(+), 17 deletions(-)
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 0a50574..8dc2496 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -108,10 +108,10 @@ int lockdep_ovsl_is_held(void)
#endif
static struct vport *new_vport(const struct vport_parms *);
-static int queue_gso_packets(struct net *, int dp_ifindex, struct sk_buff *,
- const struct dp_upcall_info *);
-static int queue_userspace_packet(struct net *, int dp_ifindex,
- struct sk_buff *,
+static int queue_gso_packets(struct datapath *, struct net *, int dp_ifindex,
+ struct sk_buff *, const struct dp_upcall_info *);
+static int queue_userspace_packet(struct datapath *, struct net *,
+ int dp_ifindex, struct sk_buff *,
const struct dp_upcall_info *);
/* Must be called with rcu_read_lock or ovs_mutex. */
@@ -292,9 +292,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
}
if (!skb_is_gso(skb))
- err = queue_userspace_packet(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
+ err = queue_userspace_packet(dp, ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
else
- err = queue_gso_packets(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
+ err = queue_gso_packets(dp, ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
if (err)
goto err;
@@ -310,7 +310,7 @@ err:
return err;
}
-static int queue_gso_packets(struct net *net, int dp_ifindex,
+static int queue_gso_packets(struct datapath *dp, struct net *net, int dp_ifindex,
struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
@@ -327,7 +327,7 @@ static int queue_gso_packets(struct net *net, int dp_ifindex,
/* Queue all of the segments. */
skb = segs;
do {
- err = queue_userspace_packet(net, dp_ifindex, skb, upcall_info);
+ err = queue_userspace_packet(dp, net, dp_ifindex, skb, upcall_info);
if (err)
break;
@@ -381,10 +381,11 @@ static size_t key_attr_size(void)
}
static size_t upcall_msg_size(const struct sk_buff *skb,
- const struct nlattr *userdata)
+ const struct nlattr *userdata,
+ unsigned int hdrlen)
{
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
- + nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */
+ + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
+ nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */
/* OVS_PACKET_ATTR_USERDATA */
@@ -394,14 +395,15 @@ static size_t upcall_msg_size(const struct sk_buff *skb,
return size;
}
-static int queue_userspace_packet(struct net *net, int dp_ifindex,
- struct sk_buff *skb,
+static int queue_userspace_packet(struct datapath *dp, struct net *net,
+ int dp_ifindex, struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
struct ovs_header *upcall;
struct sk_buff *nskb = NULL;
struct sk_buff *user_skb; /* to be queued to userspace */
struct nlattr *nla;
+ unsigned int hlen;
int err;
if (vlan_tx_tag_present(skb)) {
@@ -422,7 +424,21 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
goto out;
}
- user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC);
+ /* Complete checksum if needed */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto out;
+
+ /* Older versions of OVS user space enforce alignment of the last
+ * Netlink attribute to NLA_ALIGNTO which would require extensive
+ * padding logic. Only perform zerocopy if padding is not required.
+ */
+ if (dp->user_features & OVS_DP_F_UNALIGNED)
+ hlen = skb_zerocopy_headlen(skb);
+ else
+ hlen = skb->len;
+
+ user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata, hlen), GFP_ATOMIC);
if (!user_skb) {
err = -ENOMEM;
goto out;
@@ -441,13 +457,17 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
nla_len(upcall_info->userdata),
nla_data(upcall_info->userdata));
- nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
+ /* Only reserve room for attribute header, packet data is added
+ * in skb_zerocopy() */
+ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0)))
+ goto out;
+ nla->nla_len = nla_attr_size(skb->len);
+
+ skb_zerocopy(user_skb, skb, skb->len, hlen);
- skb_copy_and_csum_dev(skb, nla_data(nla));
+ ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
- genlmsg_end(user_skb, upcall);
err = genlmsg_unicast(net, user_skb, upcall_info->portid);
-
out:
kfree_skb(nskb);
return err;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 4+ messages in thread