* [PATCH 0/3 v5] Open vSwitch zerocopy upcall
@ 2013-11-11 15:47 Thomas Graf
2013-11-11 15:47 ` [PATCH 1/3 net-next] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Thomas Graf @ 2013-11-11 15:47 UTC (permalink / raw)
To: jesse, davem; +Cc: dev, netdev, eric.dumazet, dborkman, bhutchings
Respin of the zerocopy patches for the openvswitch upcall.
V5: - Removed padding requirement in user space
- Added OVS_DP_F_UNALIGNED flag let user space signal ability to
receive unaligned Netlink messages, fall back to linear copy
otherwise.
V4: - Daniel Borkmann pointed out that the style in skbuff.h has changed
in net-next, adapted to no longer using extern in headers.
V3: - Removed unneeded alignment of nlmsg_len after padding
V2: - Added skb_zerocopy_headlen() to calculate headroom of destination
buffer. This also takes care of the from->head_frag issue.
- Attribute alignment for frag_list case
- API documentation
- performance data for CHECKSUM_PARTIAL tx case
Thomas Graf (3):
net: Export skb_zerocopy() to zerocopy from one skb to another
openvswitch: Allow user space to announce ability to accept unaligned
Netlink messages
openvswitch: Use skb_zerocopy() for upcall
include/linux/skbuff.h | 3 ++
include/uapi/linux/openvswitch.h | 4 ++
net/core/skbuff.c | 85 ++++++++++++++++++++++++++++++++++++
net/netfilter/nfnetlink_queue_core.c | 59 ++-----------------------
net/openvswitch/datapath.c | 58 ++++++++++++++++--------
net/openvswitch/datapath.h | 2 +
6 files changed, 139 insertions(+), 72 deletions(-)
--
1.8.3.1
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH 1/3 net-next] net: Export skb_zerocopy() to zerocopy from one skb to another
2013-11-11 15:47 [PATCH 0/3 v5] Open vSwitch zerocopy upcall Thomas Graf
@ 2013-11-11 15:47 ` Thomas Graf
2013-11-11 15:47 ` [PATCH 2/3 net-next] openvswitch: Allow user space to announce ability to accept unaligned Netlink messages Thomas Graf
2013-11-11 15:47 ` [PATCH 3/3 net-next] openvswitch: Use skb_zerocopy() for upcall Thomas Graf
2 siblings, 0 replies; 4+ messages in thread
From: Thomas Graf @ 2013-11-11 15:47 UTC (permalink / raw)
To: jesse, davem; +Cc: dev, netdev, eric.dumazet, dborkman, bhutchings
Make the skb zerocopy logic written for nfnetlink queue available for
use by other modules.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
include/linux/skbuff.h | 3 ++
net/core/skbuff.c | 85 ++++++++++++++++++++++++++++++++++++
net/netfilter/nfnetlink_queue_core.c | 59 ++-----------------------
3 files changed, 92 insertions(+), 55 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 036ec7d..9e6a293 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2372,6 +2372,9 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int len,
unsigned int flags);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
+unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
+void skb_zerocopy(struct sk_buff *to, const struct sk_buff *from,
+ int len, int hlen);
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
void skb_scrub_packet(struct sk_buff *skb, bool xnet);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8c5197f..c5cefbe 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2125,6 +2125,91 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+ /**
+ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
+ * @from: source buffer
+ *
+ * Calculates the amount of linear headroom needed in the 'to' skb passed
+ * into skb_zerocopy().
+ */
+unsigned int
+skb_zerocopy_headlen(const struct sk_buff *from)
+{
+ unsigned int hlen = 0;
+
+ if (!from->head_frag ||
+ skb_headlen(from) < L1_CACHE_BYTES ||
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ hlen = skb_headlen(from);
+
+ if (skb_has_frag_list(from))
+ hlen = from->len;
+
+ return hlen;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
+
+/**
+ * skb_zerocopy - Zero copy skb to skb
+ * @to: destination buffer
+ * @source: source buffer
+ * @len: number of bytes to copy from source buffer
+ * @hlen: size of linear headroom in destination buffer
+ *
+ * Copies up to `len` bytes from `from` to `to` by creating references
+ * to the frags in the source buffer.
+ *
+ * The `hlen` as calculated by skb_zerocopy_headlen() specifies the
+ * headroom in the `to` buffer.
+ */
+void
+skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
+{
+ int i, j = 0;
+ int plen = 0; /* length of skb->head fragment */
+ struct page *page;
+ unsigned int offset;
+
+ BUG_ON(!from->head_frag && !hlen);
+
+ /* dont bother with small payloads */
+ if (len <= skb_tailroom(to)) {
+ skb_copy_bits(from, 0, skb_put(to, len), len);
+ return;
+ }
+
+ if (hlen) {
+ skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ len -= hlen;
+ } else {
+ plen = min_t(int, skb_headlen(from), len);
+ if (plen) {
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+ __skb_fill_page_desc(to, 0, page, offset, plen);
+ get_page(page);
+ j = 1;
+ len -= plen;
+ }
+ }
+
+ to->truesize += len + plen;
+ to->len += len + plen;
+ to->data_len += len + plen;
+
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ if (!len)
+ break;
+ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+ skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
+ len -= skb_shinfo(to)->frags[j].size;
+ skb_frag_ref(to, j);
+ j++;
+ }
+ skb_shinfo(to)->nr_frags = j;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy);
+
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
__wsum csum;
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 21258cf..615ee12 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -235,51 +235,6 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
spin_unlock_bh(&queue->lock);
}
-static void
-nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
-{
- int i, j = 0;
- int plen = 0; /* length of skb->head fragment */
- struct page *page;
- unsigned int offset;
-
- /* dont bother with small payloads */
- if (len <= skb_tailroom(to)) {
- skb_copy_bits(from, 0, skb_put(to, len), len);
- return;
- }
-
- if (hlen) {
- skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
- len -= hlen;
- } else {
- plen = min_t(int, skb_headlen(from), len);
- if (plen) {
- page = virt_to_head_page(from->head);
- offset = from->data - (unsigned char *)page_address(page);
- __skb_fill_page_desc(to, 0, page, offset, plen);
- get_page(page);
- j = 1;
- len -= plen;
- }
- }
-
- to->truesize += len + plen;
- to->len += len + plen;
- to->data_len += len + plen;
-
- for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
- if (!len)
- break;
- skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
- skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
- len -= skb_shinfo(to)->frags[j].size;
- skb_frag_ref(to, j);
- j++;
- }
- skb_shinfo(to)->nr_frags = j;
-}
-
static int
nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
bool csum_verify)
@@ -304,7 +259,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
{
size_t size;
size_t data_len = 0, cap_len = 0;
- int hlen = 0;
+ unsigned int hlen = 0;
struct sk_buff *skb;
struct nlattr *nla;
struct nfqnl_msg_packet_hdr *pmsg;
@@ -356,14 +311,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (data_len > entskb->len)
data_len = entskb->len;
- if (!entskb->head_frag ||
- skb_headlen(entskb) < L1_CACHE_BYTES ||
- skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS)
- hlen = skb_headlen(entskb);
-
- if (skb_has_frag_list(entskb))
- hlen = entskb->len;
- hlen = min_t(int, data_len, hlen);
+ hlen = skb_zerocopy_headlen(entskb);
+ hlen = min_t(unsigned int, hlen, data_len);
size += sizeof(struct nlattr) + hlen;
cap_len = entskb->len;
break;
@@ -504,7 +453,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nla->nla_type = NFQA_PAYLOAD;
nla->nla_len = nla_attr_size(data_len);
- nfqnl_zcopy(skb, entskb, data_len, hlen);
+ skb_zerocopy(skb, entskb, data_len, hlen);
}
nlh->nlmsg_len = skb->len;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 2/3 net-next] openvswitch: Allow user space to announce ability to accept unaligned Netlink messages
2013-11-11 15:47 [PATCH 0/3 v5] Open vSwitch zerocopy upcall Thomas Graf
2013-11-11 15:47 ` [PATCH 1/3 net-next] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
@ 2013-11-11 15:47 ` Thomas Graf
2013-11-11 15:47 ` [PATCH 3/3 net-next] openvswitch: Use skb_zerocopy() for upcall Thomas Graf
2 siblings, 0 replies; 4+ messages in thread
From: Thomas Graf @ 2013-11-11 15:47 UTC (permalink / raw)
To: jesse, davem; +Cc: dev, netdev, eric.dumazet, dborkman, bhutchings
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
include/uapi/linux/openvswitch.h | 4 ++++
net/openvswitch/datapath.c | 4 ++++
net/openvswitch/datapath.h | 2 ++
3 files changed, 10 insertions(+)
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index d120f9f..07ef2c3 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -75,6 +75,7 @@ enum ovs_datapath_attr {
OVS_DP_ATTR_UPCALL_PID, /* Netlink PID to receive upcalls */
OVS_DP_ATTR_STATS, /* struct ovs_dp_stats */
OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */
+ OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */
__OVS_DP_ATTR_MAX
};
@@ -106,6 +107,9 @@ struct ovs_vport_stats {
__u64 tx_dropped; /* no space available in linux */
};
+/* Allow last Netlink attribute to be unaligned */
+#define OVS_DP_F_UNALIGNED (1 << 0)
+
/* Fixed logical ports. */
#define OVSP_LOCAL ((__u32)0)
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 1408adc..0a50574 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1061,6 +1061,7 @@ static struct genl_ops dp_flow_genl_ops[] = {
static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+ [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
};
static struct genl_family dp_datapath_genl_family = {
@@ -1217,6 +1218,9 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
parms.port_no = OVSP_LOCAL;
parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]);
+ if (a[OVS_DP_ATTR_USER_FEATURES])
+ dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
+
vport = new_vport(&parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index d3d14a58..ecdf44c 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -88,6 +88,8 @@ struct datapath {
/* Network namespace ref. */
struct net *net;
#endif
+
+ u32 user_features;
};
/**
--
1.8.3.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 3/3 net-next] openvswitch: Use skb_zerocopy() for upcall
2013-11-11 15:47 [PATCH 0/3 v5] Open vSwitch zerocopy upcall Thomas Graf
2013-11-11 15:47 ` [PATCH 1/3 net-next] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
2013-11-11 15:47 ` [PATCH 2/3 net-next] openvswitch: Allow user space to announce ability to accept unaligned Netlink messages Thomas Graf
@ 2013-11-11 15:47 ` Thomas Graf
2 siblings, 0 replies; 4+ messages in thread
From: Thomas Graf @ 2013-11-11 15:47 UTC (permalink / raw)
To: jesse, davem; +Cc: dev, netdev, eric.dumazet, dborkman, bhutchings
Use of skb_zerocopy() avoids the expensive call to memcpy() when
copying the packet data into the Netlink skb. Completes checksum
through skb_checksum_help() if needed.
Cost of memcpy is significantly reduced from:
+ 7.48% vhost-8471 [k] memcpy
+ 5.57% ovs-vswitchd [k] memcpy
+ 2.81% vhost-8471 [k] csum_partial_copy_generic
to:
+ 5.72% ovs-vswitchd [k] memcpy
+ 3.32% vhost-5153 [k] memcpy
+ 0.68% vhost-5153 [k] skb_zerocopy
(megaflows disabled)
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
net/openvswitch/datapath.c | 54 +++++++++++++++++++++++++++++++---------------
1 file changed, 37 insertions(+), 17 deletions(-)
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 0a50574..8dc2496 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -108,10 +108,10 @@ int lockdep_ovsl_is_held(void)
#endif
static struct vport *new_vport(const struct vport_parms *);
-static int queue_gso_packets(struct net *, int dp_ifindex, struct sk_buff *,
- const struct dp_upcall_info *);
-static int queue_userspace_packet(struct net *, int dp_ifindex,
- struct sk_buff *,
+static int queue_gso_packets(struct datapath *, struct net *, int dp_ifindex,
+ struct sk_buff *, const struct dp_upcall_info *);
+static int queue_userspace_packet(struct datapath *, struct net *,
+ int dp_ifindex, struct sk_buff *,
const struct dp_upcall_info *);
/* Must be called with rcu_read_lock or ovs_mutex. */
@@ -292,9 +292,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
}
if (!skb_is_gso(skb))
- err = queue_userspace_packet(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
+ err = queue_userspace_packet(dp, ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
else
- err = queue_gso_packets(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
+ err = queue_gso_packets(dp, ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
if (err)
goto err;
@@ -310,7 +310,7 @@ err:
return err;
}
-static int queue_gso_packets(struct net *net, int dp_ifindex,
+static int queue_gso_packets(struct datapath *dp, struct net *net, int dp_ifindex,
struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
@@ -327,7 +327,7 @@ static int queue_gso_packets(struct net *net, int dp_ifindex,
/* Queue all of the segments. */
skb = segs;
do {
- err = queue_userspace_packet(net, dp_ifindex, skb, upcall_info);
+ err = queue_userspace_packet(dp, net, dp_ifindex, skb, upcall_info);
if (err)
break;
@@ -381,10 +381,11 @@ static size_t key_attr_size(void)
}
static size_t upcall_msg_size(const struct sk_buff *skb,
- const struct nlattr *userdata)
+ const struct nlattr *userdata,
+ unsigned int hdrlen)
{
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
- + nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */
+ + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
+ nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */
/* OVS_PACKET_ATTR_USERDATA */
@@ -394,14 +395,15 @@ static size_t upcall_msg_size(const struct sk_buff *skb,
return size;
}
-static int queue_userspace_packet(struct net *net, int dp_ifindex,
- struct sk_buff *skb,
+static int queue_userspace_packet(struct datapath *dp, struct net *net,
+ int dp_ifindex, struct sk_buff *skb,
const struct dp_upcall_info *upcall_info)
{
struct ovs_header *upcall;
struct sk_buff *nskb = NULL;
struct sk_buff *user_skb; /* to be queued to userspace */
struct nlattr *nla;
+ unsigned int hlen;
int err;
if (vlan_tx_tag_present(skb)) {
@@ -422,7 +424,21 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
goto out;
}
- user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC);
+ /* Complete checksum if needed */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto out;
+
+ /* Older versions of OVS user space enforce alignment of the last
+ * Netlink attribute to NLA_ALIGNTO which would require extensive
+ * padding logic. Only perform zerocopy if padding is not required.
+ */
+ if (dp->user_features & OVS_DP_F_UNALIGNED)
+ hlen = skb_zerocopy_headlen(skb);
+ else
+ hlen = skb->len;
+
+ user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata, hlen), GFP_ATOMIC);
if (!user_skb) {
err = -ENOMEM;
goto out;
@@ -441,13 +457,17 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
nla_len(upcall_info->userdata),
nla_data(upcall_info->userdata));
- nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
+ /* Only reserve room for attribute header, packet data is added
+ * in skb_zerocopy() */
+ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0)))
+ goto out;
+ nla->nla_len = nla_attr_size(skb->len);
+
+ skb_zerocopy(user_skb, skb, skb->len, hlen);
- skb_copy_and_csum_dev(skb, nla_data(nla));
+ ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
- genlmsg_end(user_skb, upcall);
err = genlmsg_unicast(net, user_skb, upcall_info->portid);
-
out:
kfree_skb(nskb);
return err;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2013-11-11 15:48 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-11-11 15:47 [PATCH 0/3 v5] Open vSwitch zerocopy upcall Thomas Graf
2013-11-11 15:47 ` [PATCH 1/3 net-next] net: Export skb_zerocopy() to zerocopy from one skb to another Thomas Graf
2013-11-11 15:47 ` [PATCH 2/3 net-next] openvswitch: Allow user space to announce ability to accept unaligned Netlink messages Thomas Graf
2013-11-11 15:47 ` [PATCH 3/3 net-next] openvswitch: Use skb_zerocopy() for upcall Thomas Graf
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.