* [PATCH net-next v3 2/4] geneve: Merge ipv4 and ipv6 geneve_build_skb()
From: Pravin B Shelar @ 2016-11-21 19:02 UTC (permalink / raw)
To: netdev; +Cc: Pravin B Shelar
In-Reply-To: <1479754981-17600-1-git-send-email-pshelar@ovn.org>
There is minimal difference in building the Geneve header
between ipv4 and ipv6 geneve tunnels. The following patch
refactors the code to unify it.
Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
---
drivers/net/geneve.c | 100 ++++++++++++++-------------------------------------
1 file changed, 26 insertions(+), 74 deletions(-)
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 658531d..2cd5c41 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -630,67 +630,34 @@ static int geneve_stop(struct net_device *dev)
}
static void geneve_build_header(struct genevehdr *geneveh,
- __be16 tun_flags, u8 vni[3],
- u8 options_len, u8 *options)
+ const struct ip_tunnel_info *info)
{
geneveh->ver = GENEVE_VER;
- geneveh->opt_len = options_len / 4;
- geneveh->oam = !!(tun_flags & TUNNEL_OAM);
- geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
+ geneveh->opt_len = info->options_len / 4;
+ geneveh->oam = !!(info->key.tun_flags & TUNNEL_OAM);
+ geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT);
geneveh->rsvd1 = 0;
- memcpy(geneveh->vni, vni, 3);
+ tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
geneveh->proto_type = htons(ETH_P_TEB);
geneveh->rsvd2 = 0;
- memcpy(geneveh->options, options, options_len);
+ ip_tunnel_info_opts_get(geneveh->options, info);
}
-static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb,
- __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
- bool xnet)
-{
- bool udp_sum = !!(tun_flags & TUNNEL_CSUM);
- struct genevehdr *gnvh;
- int min_headroom;
- int err;
-
- skb_scrub_packet(skb, xnet);
-
- min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
- + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr);
- err = skb_cow_head(skb, min_headroom);
- if (unlikely(err))
- goto free_rt;
-
- err = udp_tunnel_handle_offloads(skb, udp_sum);
- if (err)
- goto free_rt;
-
- gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
- geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
-
- skb_set_inner_protocol(skb, htons(ETH_P_TEB));
- return 0;
-
-free_rt:
- ip_rt_put(rt);
- return err;
-}
-
-#if IS_ENABLED(CONFIG_IPV6)
-static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb,
- __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
- bool xnet)
+static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
+ const struct ip_tunnel_info *info,
+ bool xnet, int ip_hdr_len)
{
- bool udp_sum = !!(tun_flags & TUNNEL_CSUM);
+ bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
struct genevehdr *gnvh;
int min_headroom;
int err;
+ skb_reset_mac_header(skb);
skb_scrub_packet(skb, xnet);
- min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
- + GENEVE_BASE_HLEN + opt_len + sizeof(struct ipv6hdr);
+ min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len +
+ GENEVE_BASE_HLEN + info->options_len + ip_hdr_len;
err = skb_cow_head(skb, min_headroom);
if (unlikely(err))
goto free_dst;
@@ -699,9 +666,9 @@ static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb,
if (err)
goto free_dst;
- gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
- geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
-
+ gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) +
+ info->options_len);
+ geneve_build_header(gnvh, info);
skb_set_inner_protocol(skb, htons(ETH_P_TEB));
return 0;
@@ -709,12 +676,11 @@ static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb,
dst_release(dst);
return err;
}
-#endif
static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
struct net_device *dev,
struct flowi4 *fl4,
- struct ip_tunnel_info *info)
+ const struct ip_tunnel_info *info)
{
bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
struct geneve_dev *geneve = netdev_priv(dev);
@@ -738,7 +704,7 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
}
fl4->flowi4_tos = RT_TOS(tos);
- dst_cache = &info->dst_cache;
+ dst_cache = (struct dst_cache *)&info->dst_cache;
if (use_cache) {
rt = dst_cache_get_ip4(dst_cache, &fl4->saddr);
if (rt)
@@ -763,7 +729,7 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
struct net_device *dev,
struct flowi6 *fl6,
- struct ip_tunnel_info *info)
+ const struct ip_tunnel_info *info)
{
bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
struct geneve_dev *geneve = netdev_priv(dev);
@@ -789,7 +755,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio),
info->key.label);
- dst_cache = &info->dst_cache;
+ dst_cache = (struct dst_cache *)&info->dst_cache;
if (use_cache) {
dst = dst_cache_get_ip6(dst_cache, &fl6->saddr);
if (dst)
@@ -812,7 +778,8 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
#endif
static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
- struct geneve_dev *geneve, struct ip_tunnel_info *info)
+ struct geneve_dev *geneve,
+ const struct ip_tunnel_info *info)
{
bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
@@ -820,11 +787,9 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
struct rtable *rt;
int err = -EINVAL;
struct flowi4 fl4;
- u8 *opts = NULL;
__u8 tos, ttl;
__be16 sport;
__be16 df;
- u8 vni[3];
if (!gs4)
return err;
@@ -843,13 +808,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
}
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
- tunnel_id_to_vni(key->tun_id, vni);
- if (info->options_len)
- opts = ip_tunnel_info_opts(info);
-
- skb_reset_mac_header(skb);
- err = geneve_build_skb(rt, skb, key->tun_flags, vni,
- info->options_len, opts, xnet);
+ err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
if (unlikely(err))
return err;
@@ -862,7 +821,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
#if IS_ENABLED(CONFIG_IPV6)
static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
- struct geneve_dev *geneve, struct ip_tunnel_info *info)
+ struct geneve_dev *geneve,
+ const struct ip_tunnel_info *info)
{
bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
@@ -870,10 +830,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
struct dst_entry *dst = NULL;
int err = -EINVAL;
struct flowi6 fl6;
- u8 *opts = NULL;
__u8 prio, ttl;
__be16 sport;
- u8 vni[3];
if (!gs6)
return err;
@@ -891,13 +849,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
ip_hdr(skb), skb);
ttl = key->ttl ? : ip6_dst_hoplimit(dst);
}
- tunnel_id_to_vni(key->tun_id, vni);
- if (info->options_len)
- opts = ip_tunnel_info_opts(info);
-
- skb_reset_mac_header(skb);
- err = geneve6_build_skb(dst, skb, key->tun_flags, vni,
- info->options_len, opts, xnet);
+ err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
if (unlikely(err))
return err;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v3 1/4] geneve: Unify LWT and netdev handling.
From: Pravin B Shelar @ 2016-11-21 19:02 UTC (permalink / raw)
To: netdev; +Cc: Pravin B Shelar
In-Reply-To: <1479754981-17600-1-git-send-email-pshelar@ovn.org>
The current geneve implementation has two separate cases to handle:
1. netdev xmit
2. LWT xmit.
In the netdev case, the geneve configuration is stored in various
struct geneve_dev members, for example geneve_addr, ttl, tos,
label, flags, dst_cache, etc. For LWT, the configuration is passed
to the device with each skb as a struct ip_tunnel_info.
The following patch uses struct ip_tunnel_info to store almost all
of the configuration of a geneve netdevice. This allows us to unify
most of the geneve driver code around struct ip_tunnel_info.
This dramatically simplifies the geneve code, since it does not
need to handle two different configuration cases. It removes
duplicate code, and a single code path can handle either type
of geneve device.
Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
---
drivers/net/geneve.c | 612 ++++++++++++++++++++++-----------------------------
1 file changed, 263 insertions(+), 349 deletions(-)
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 90dc6b1..658531d 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -45,41 +45,22 @@ struct geneve_net {
static unsigned int geneve_net_id;
-union geneve_addr {
- struct sockaddr_in sin;
- struct sockaddr_in6 sin6;
- struct sockaddr sa;
-};
-
-static union geneve_addr geneve_remote_unspec = { .sa.sa_family = AF_UNSPEC, };
-
/* Pseudo network device */
struct geneve_dev {
struct hlist_node hlist; /* vni hash table */
struct net *net; /* netns for packet i/o */
struct net_device *dev; /* netdev for geneve tunnel */
+ struct ip_tunnel_info info;
struct geneve_sock __rcu *sock4; /* IPv4 socket used for geneve tunnel */
#if IS_ENABLED(CONFIG_IPV6)
struct geneve_sock __rcu *sock6; /* IPv6 socket used for geneve tunnel */
#endif
- u8 vni[3]; /* virtual network ID for tunnel */
- u8 ttl; /* TTL override */
- u8 tos; /* TOS override */
- union geneve_addr remote; /* IP address for link partner */
struct list_head next; /* geneve's per namespace list */
- __be32 label; /* IPv6 flowlabel override */
- __be16 dst_port;
- bool collect_md;
struct gro_cells gro_cells;
- u32 flags;
- struct dst_cache dst_cache;
+ bool collect_md;
+ bool use_udp6_rx_checksums;
};
-/* Geneve device flags */
-#define GENEVE_F_UDP_ZERO_CSUM_TX BIT(0)
-#define GENEVE_F_UDP_ZERO_CSUM6_TX BIT(1)
-#define GENEVE_F_UDP_ZERO_CSUM6_RX BIT(2)
-
struct geneve_sock {
bool collect_md;
struct list_head list;
@@ -87,7 +68,6 @@ struct geneve_sock {
struct rcu_head rcu;
int refcnt;
struct hlist_head vni_list[VNI_HASH_SIZE];
- u32 flags;
};
static inline __u32 geneve_net_vni_hash(u8 vni[3])
@@ -109,6 +89,20 @@ static __be64 vni_to_tunnel_id(const __u8 *vni)
#endif
}
+/* Convert 64 bit tunnel ID to 24 bit VNI. */
+static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+ vni[0] = (__force __u8)(tun_id >> 16);
+ vni[1] = (__force __u8)(tun_id >> 8);
+ vni[2] = (__force __u8)tun_id;
+#else
+ vni[0] = (__force __u8)((__force u64)tun_id >> 40);
+ vni[1] = (__force __u8)((__force u64)tun_id >> 48);
+ vni[2] = (__force __u8)((__force u64)tun_id >> 56);
+#endif
+}
+
static sa_family_t geneve_get_sk_family(struct geneve_sock *gs)
{
return gs->sock->sk->sk_family;
@@ -117,6 +111,7 @@ static sa_family_t geneve_get_sk_family(struct geneve_sock *gs)
static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
__be32 addr, u8 vni[])
{
+ __be64 id = vni_to_tunnel_id(vni);
struct hlist_head *vni_list_head;
struct geneve_dev *geneve;
__u32 hash;
@@ -125,8 +120,8 @@ static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
hash = geneve_net_vni_hash(vni);
vni_list_head = &gs->vni_list[hash];
hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
- if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
- addr == geneve->remote.sin.sin_addr.s_addr)
+ if (!memcmp(&id, &geneve->info.key.tun_id, sizeof(id)) &&
+ addr == geneve->info.key.u.ipv4.dst)
return geneve;
}
return NULL;
@@ -136,6 +131,7 @@ static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs,
struct in6_addr addr6, u8 vni[])
{
+ __be64 id = vni_to_tunnel_id(vni);
struct hlist_head *vni_list_head;
struct geneve_dev *geneve;
__u32 hash;
@@ -144,8 +140,8 @@ static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs,
hash = geneve_net_vni_hash(vni);
vni_list_head = &gs->vni_list[hash];
hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
- if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
- ipv6_addr_equal(&addr6, &geneve->remote.sin6.sin6_addr))
+ if (!memcmp(&id, &geneve->info.key.tun_id, sizeof(id)) &&
+ ipv6_addr_equal(&addr6, &geneve->info.key.u.ipv6.dst))
return geneve;
}
return NULL;
@@ -160,15 +156,12 @@ static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs,
struct sk_buff *skb)
{
- u8 *vni;
- __be32 addr;
static u8 zero_vni[3];
-#if IS_ENABLED(CONFIG_IPV6)
- static struct in6_addr zero_addr6;
-#endif
+ u8 *vni;
if (geneve_get_sk_family(gs) == AF_INET) {
struct iphdr *iph;
+ __be32 addr;
iph = ip_hdr(skb); /* outer IP header... */
@@ -183,6 +176,7 @@ static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs,
return geneve_lookup(gs, addr, vni);
#if IS_ENABLED(CONFIG_IPV6)
} else if (geneve_get_sk_family(gs) == AF_INET6) {
+ static struct in6_addr zero_addr6;
struct ipv6hdr *ip6h;
struct in6_addr addr6;
@@ -305,13 +299,12 @@ static int geneve_init(struct net_device *dev)
return err;
}
- err = dst_cache_init(&geneve->dst_cache, GFP_KERNEL);
+ err = dst_cache_init(&geneve->info.dst_cache, GFP_KERNEL);
if (err) {
free_percpu(dev->tstats);
gro_cells_destroy(&geneve->gro_cells);
return err;
}
-
return 0;
}
@@ -319,7 +312,7 @@ static void geneve_uninit(struct net_device *dev)
{
struct geneve_dev *geneve = netdev_priv(dev);
- dst_cache_destroy(&geneve->dst_cache);
+ dst_cache_destroy(&geneve->info.dst_cache);
gro_cells_destroy(&geneve->gro_cells);
free_percpu(dev->tstats);
}
@@ -368,7 +361,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
}
static struct socket *geneve_create_sock(struct net *net, bool ipv6,
- __be16 port, u32 flags)
+ __be16 port, bool ipv6_rx_csum)
{
struct socket *sock;
struct udp_port_cfg udp_conf;
@@ -379,8 +372,7 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6,
if (ipv6) {
udp_conf.family = AF_INET6;
udp_conf.ipv6_v6only = 1;
- udp_conf.use_udp6_rx_checksums =
- !(flags & GENEVE_F_UDP_ZERO_CSUM6_RX);
+ udp_conf.use_udp6_rx_checksums = ipv6_rx_csum;
} else {
udp_conf.family = AF_INET;
udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
@@ -491,7 +483,7 @@ static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
/* Create new listen socket if needed */
static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
- bool ipv6, u32 flags)
+ bool ipv6, bool ipv6_rx_csum)
{
struct geneve_net *gn = net_generic(net, geneve_net_id);
struct geneve_sock *gs;
@@ -503,7 +495,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
if (!gs)
return ERR_PTR(-ENOMEM);
- sock = geneve_create_sock(net, ipv6, port, flags);
+ sock = geneve_create_sock(net, ipv6, port, ipv6_rx_csum);
if (IS_ERR(sock)) {
kfree(gs);
return ERR_CAST(sock);
@@ -579,21 +571,22 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
struct net *net = geneve->net;
struct geneve_net *gn = net_generic(net, geneve_net_id);
struct geneve_sock *gs;
+ __u8 vni[3];
__u32 hash;
- gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->dst_port);
+ gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->info.key.tp_dst);
if (gs) {
gs->refcnt++;
goto out;
}
- gs = geneve_socket_create(net, geneve->dst_port, ipv6, geneve->flags);
+ gs = geneve_socket_create(net, geneve->info.key.tp_dst, ipv6,
+ geneve->use_udp6_rx_checksums);
if (IS_ERR(gs))
return PTR_ERR(gs);
out:
gs->collect_md = geneve->collect_md;
- gs->flags = geneve->flags;
#if IS_ENABLED(CONFIG_IPV6)
if (ipv6)
rcu_assign_pointer(geneve->sock6, gs);
@@ -601,7 +594,8 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
#endif
rcu_assign_pointer(geneve->sock4, gs);
- hash = geneve_net_vni_hash(geneve->vni);
+ tunnel_id_to_vni(geneve->info.key.tun_id, vni);
+ hash = geneve_net_vni_hash(vni);
hlist_add_head_rcu(&geneve->hlist, &gs->vni_list[hash]);
return 0;
}
@@ -609,7 +603,7 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
static int geneve_open(struct net_device *dev)
{
struct geneve_dev *geneve = netdev_priv(dev);
- bool ipv6 = geneve->remote.sa.sa_family == AF_INET6;
+ bool ipv6 = !!(geneve->info.mode & IP_TUNNEL_INFO_IPV6);
bool metadata = geneve->collect_md;
int ret = 0;
@@ -653,12 +647,12 @@ static void geneve_build_header(struct genevehdr *geneveh,
static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb,
__be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
- u32 flags, bool xnet)
+ bool xnet)
{
+ bool udp_sum = !!(tun_flags & TUNNEL_CSUM);
struct genevehdr *gnvh;
int min_headroom;
int err;
- bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM_TX);
skb_scrub_packet(skb, xnet);
@@ -686,12 +680,12 @@ static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb,
#if IS_ENABLED(CONFIG_IPV6)
static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb,
__be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
- u32 flags, bool xnet)
+ bool xnet)
{
+ bool udp_sum = !!(tun_flags & TUNNEL_CSUM);
struct genevehdr *gnvh;
int min_headroom;
int err;
- bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM6_TX);
skb_scrub_packet(skb, xnet);
@@ -734,32 +728,22 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
memset(fl4, 0, sizeof(*fl4));
fl4->flowi4_mark = skb->mark;
fl4->flowi4_proto = IPPROTO_UDP;
+ fl4->daddr = info->key.u.ipv4.dst;
+ fl4->saddr = info->key.u.ipv4.src;
- if (info) {
- fl4->daddr = info->key.u.ipv4.dst;
- fl4->saddr = info->key.u.ipv4.src;
- fl4->flowi4_tos = RT_TOS(info->key.tos);
- dst_cache = &info->dst_cache;
- } else {
- tos = geneve->tos;
- if (tos == 1) {
- const struct iphdr *iip = ip_hdr(skb);
-
- tos = ip_tunnel_get_dsfield(iip, skb);
- use_cache = false;
- }
-
- fl4->flowi4_tos = RT_TOS(tos);
- fl4->daddr = geneve->remote.sin.sin_addr.s_addr;
- dst_cache = &geneve->dst_cache;
+ tos = info->key.tos;
+ if ((tos == 1) && !geneve->collect_md) {
+ tos = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
+ use_cache = false;
}
+ fl4->flowi4_tos = RT_TOS(tos);
+ dst_cache = &info->dst_cache;
if (use_cache) {
rt = dst_cache_get_ip4(dst_cache, &fl4->saddr);
if (rt)
return rt;
}
-
rt = ip_route_output_key(geneve->net, fl4);
if (IS_ERR(rt)) {
netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr);
@@ -795,34 +779,22 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
memset(fl6, 0, sizeof(*fl6));
fl6->flowi6_mark = skb->mark;
fl6->flowi6_proto = IPPROTO_UDP;
-
- if (info) {
- fl6->daddr = info->key.u.ipv6.dst;
- fl6->saddr = info->key.u.ipv6.src;
- fl6->flowlabel = ip6_make_flowinfo(RT_TOS(info->key.tos),
- info->key.label);
- dst_cache = &info->dst_cache;
- } else {
- prio = geneve->tos;
- if (prio == 1) {
- const struct iphdr *iip = ip_hdr(skb);
-
- prio = ip_tunnel_get_dsfield(iip, skb);
- use_cache = false;
- }
-
- fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio),
- geneve->label);
- fl6->daddr = geneve->remote.sin6.sin6_addr;
- dst_cache = &geneve->dst_cache;
+ fl6->daddr = info->key.u.ipv6.dst;
+ fl6->saddr = info->key.u.ipv6.src;
+ prio = info->key.tos;
+ if ((prio == 1) && !geneve->collect_md) {
+ prio = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
+ use_cache = false;
}
+ fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio),
+ info->key.label);
+ dst_cache = &info->dst_cache;
if (use_cache) {
dst = dst_cache_get_ip6(dst_cache, &fl6->saddr);
if (dst)
return dst;
}
-
if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) {
netdev_dbg(dev, "no route to %pI6\n", &fl6->daddr);
return ERR_PTR(-ENETUNREACH);
@@ -839,195 +811,130 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
}
#endif
-/* Convert 64 bit tunnel ID to 24 bit VNI. */
-static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
-{
-#ifdef __BIG_ENDIAN
- vni[0] = (__force __u8)(tun_id >> 16);
- vni[1] = (__force __u8)(tun_id >> 8);
- vni[2] = (__force __u8)tun_id;
-#else
- vni[0] = (__force __u8)((__force u64)tun_id >> 40);
- vni[1] = (__force __u8)((__force u64)tun_id >> 48);
- vni[2] = (__force __u8)((__force u64)tun_id >> 56);
-#endif
-}
-
-static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
- struct ip_tunnel_info *info)
+static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
+ struct geneve_dev *geneve, struct ip_tunnel_info *info)
{
- struct geneve_dev *geneve = netdev_priv(dev);
- struct geneve_sock *gs4;
- struct rtable *rt = NULL;
- const struct iphdr *iip; /* interior IP header */
+ bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
+ struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
+ const struct ip_tunnel_key *key = &info->key;
+ struct rtable *rt;
int err = -EINVAL;
struct flowi4 fl4;
+ u8 *opts = NULL;
__u8 tos, ttl;
__be16 sport;
__be16 df;
- bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
- u32 flags = geneve->flags;
+ u8 vni[3];
- gs4 = rcu_dereference(geneve->sock4);
if (!gs4)
- goto tx_error;
-
- if (geneve->collect_md) {
- if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) {
- netdev_dbg(dev, "no tunnel metadata\n");
- goto tx_error;
- }
- if (info && ip_tunnel_info_af(info) != AF_INET)
- goto tx_error;
- }
+ return err;
rt = geneve_get_v4_rt(skb, dev, &fl4, info);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto tx_error;
- }
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
- skb_reset_mac_header(skb);
-
- iip = ip_hdr(skb);
-
- if (info) {
- const struct ip_tunnel_key *key = &info->key;
- u8 *opts = NULL;
- u8 vni[3];
-
- tunnel_id_to_vni(key->tun_id, vni);
- if (info->options_len)
- opts = ip_tunnel_info_opts(info);
-
- if (key->tun_flags & TUNNEL_CSUM)
- flags &= ~GENEVE_F_UDP_ZERO_CSUM_TX;
- else
- flags |= GENEVE_F_UDP_ZERO_CSUM_TX;
-
- err = geneve_build_skb(rt, skb, key->tun_flags, vni,
- info->options_len, opts, flags, xnet);
- if (unlikely(err))
- goto tx_error;
-
- tos = ip_tunnel_ecn_encap(key->tos, iip, skb);
+ if (geneve->collect_md) {
+ tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
ttl = key->ttl;
- df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
} else {
- err = geneve_build_skb(rt, skb, 0, geneve->vni,
- 0, NULL, flags, xnet);
- if (unlikely(err))
- goto tx_error;
-
- tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb);
- ttl = geneve->ttl;
- if (!ttl && IN_MULTICAST(ntohl(fl4.daddr)))
- ttl = 1;
- ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
- df = 0;
+ tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb);
+ ttl = key->ttl ? : ip4_dst_hoplimit(&rt->dst);
}
- udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, fl4.saddr, fl4.daddr,
- tos, ttl, df, sport, geneve->dst_port,
- !net_eq(geneve->net, dev_net(geneve->dev)),
- !!(flags & GENEVE_F_UDP_ZERO_CSUM_TX));
+ df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
- return NETDEV_TX_OK;
-
-tx_error:
- dev_kfree_skb(skb);
+ tunnel_id_to_vni(key->tun_id, vni);
+ if (info->options_len)
+ opts = ip_tunnel_info_opts(info);
- if (err == -ELOOP)
- dev->stats.collisions++;
- else if (err == -ENETUNREACH)
- dev->stats.tx_carrier_errors++;
+ skb_reset_mac_header(skb);
+ err = geneve_build_skb(rt, skb, key->tun_flags, vni,
+ info->options_len, opts, xnet);
+ if (unlikely(err))
+ return err;
- dev->stats.tx_errors++;
- return NETDEV_TX_OK;
+ udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, fl4.saddr, fl4.daddr,
+ tos, ttl, df, sport, geneve->info.key.tp_dst,
+ !net_eq(geneve->net, dev_net(geneve->dev)),
+ !(info->key.tun_flags & TUNNEL_CSUM));
+ return 0;
}
#if IS_ENABLED(CONFIG_IPV6)
-static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
- struct ip_tunnel_info *info)
+static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
+ struct geneve_dev *geneve, struct ip_tunnel_info *info)
{
- struct geneve_dev *geneve = netdev_priv(dev);
+ bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
+ struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
+ const struct ip_tunnel_key *key = &info->key;
struct dst_entry *dst = NULL;
- const struct iphdr *iip; /* interior IP header */
- struct geneve_sock *gs6;
int err = -EINVAL;
struct flowi6 fl6;
+ u8 *opts = NULL;
__u8 prio, ttl;
__be16 sport;
- __be32 label;
- bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
- u32 flags = geneve->flags;
+ u8 vni[3];
- gs6 = rcu_dereference(geneve->sock6);
if (!gs6)
- goto tx_error;
-
- if (geneve->collect_md) {
- if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) {
- netdev_dbg(dev, "no tunnel metadata\n");
- goto tx_error;
- }
- }
+ return err;
dst = geneve_get_v6_dst(skb, dev, &fl6, info);
- if (IS_ERR(dst)) {
- err = PTR_ERR(dst);
- goto tx_error;
- }
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
- skb_reset_mac_header(skb);
-
- iip = ip_hdr(skb);
+ if (geneve->collect_md) {
+ prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
+ ttl = key->ttl;
+ } else {
+ prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel),
+ ip_hdr(skb), skb);
+ ttl = key->ttl ? : ip6_dst_hoplimit(dst);
+ }
+ tunnel_id_to_vni(key->tun_id, vni);
+ if (info->options_len)
+ opts = ip_tunnel_info_opts(info);
- if (info) {
- const struct ip_tunnel_key *key = &info->key;
- u8 *opts = NULL;
- u8 vni[3];
+ skb_reset_mac_header(skb);
+ err = geneve6_build_skb(dst, skb, key->tun_flags, vni,
+ info->options_len, opts, xnet);
+ if (unlikely(err))
+ return err;
- tunnel_id_to_vni(key->tun_id, vni);
- if (info->options_len)
- opts = ip_tunnel_info_opts(info);
+ udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
+ &fl6.saddr, &fl6.daddr, prio, ttl,
+ info->key.label, sport, geneve->info.key.tp_dst,
+ !(info->key.tun_flags & TUNNEL_CSUM));
+ return 0;
+}
+#endif
- if (key->tun_flags & TUNNEL_CSUM)
- flags &= ~GENEVE_F_UDP_ZERO_CSUM6_TX;
- else
- flags |= GENEVE_F_UDP_ZERO_CSUM6_TX;
+static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct geneve_dev *geneve = netdev_priv(dev);
+ struct ip_tunnel_info *info = NULL;
+ int err;
- err = geneve6_build_skb(dst, skb, key->tun_flags, vni,
- info->options_len, opts,
- flags, xnet);
- if (unlikely(err))
+ if (geneve->collect_md) {
+ info = skb_tunnel_info(skb);
+ if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) {
+ err = -EINVAL;
+ netdev_dbg(dev, "no tunnel metadata\n");
goto tx_error;
-
- prio = ip_tunnel_ecn_encap(key->tos, iip, skb);
- ttl = key->ttl;
- label = info->key.label;
+ }
} else {
- err = geneve6_build_skb(dst, skb, 0, geneve->vni,
- 0, NULL, flags, xnet);
- if (unlikely(err))
- goto tx_error;
-
- prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel),
- iip, skb);
- ttl = geneve->ttl;
- if (!ttl && ipv6_addr_is_multicast(&fl6.daddr))
- ttl = 1;
- ttl = ttl ? : ip6_dst_hoplimit(dst);
- label = geneve->label;
+ info = &geneve->info;
}
- udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
- &fl6.saddr, &fl6.daddr, prio, ttl, label,
- sport, geneve->dst_port,
- !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX));
- return NETDEV_TX_OK;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (info->mode & IP_TUNNEL_INFO_IPV6)
+ err = geneve6_xmit_skb(skb, dev, geneve, info);
+ else
+#endif
+ err = geneve_xmit_skb(skb, dev, geneve, info);
+ if (likely(!err))
+ return NETDEV_TX_OK;
tx_error:
dev_kfree_skb(skb);
@@ -1039,23 +946,6 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
dev->stats.tx_errors++;
return NETDEV_TX_OK;
}
-#endif
-
-static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- struct geneve_dev *geneve = netdev_priv(dev);
- struct ip_tunnel_info *info = NULL;
-
- if (geneve->collect_md)
- info = skb_tunnel_info(skb);
-
-#if IS_ENABLED(CONFIG_IPV6)
- if ((info && ip_tunnel_info_af(info) == AF_INET6) ||
- (!info && geneve->remote.sa.sa_family == AF_INET6))
- return geneve6_xmit_skb(skb, dev, info);
-#endif
- return geneve_xmit_skb(skb, dev, info);
-}
static int geneve_change_mtu(struct net_device *dev, int new_mtu)
{
@@ -1073,14 +963,11 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
struct ip_tunnel_info *info = skb_tunnel_info(skb);
struct geneve_dev *geneve = netdev_priv(dev);
- struct rtable *rt;
- struct flowi4 fl4;
-#if IS_ENABLED(CONFIG_IPV6)
- struct dst_entry *dst;
- struct flowi6 fl6;
-#endif
if (ip_tunnel_info_af(info) == AF_INET) {
+ struct rtable *rt;
+ struct flowi4 fl4;
+
rt = geneve_get_v4_rt(skb, dev, &fl4, info);
if (IS_ERR(rt))
return PTR_ERR(rt);
@@ -1089,6 +976,9 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
info->key.u.ipv4.src = fl4.saddr;
#if IS_ENABLED(CONFIG_IPV6)
} else if (ip_tunnel_info_af(info) == AF_INET6) {
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
dst = geneve_get_v6_dst(skb, dev, &fl6, info);
if (IS_ERR(dst))
return PTR_ERR(dst);
@@ -1102,7 +992,7 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
info->key.tp_src = udp_flow_src_port(geneve->net, skb,
1, USHRT_MAX, true);
- info->key.tp_dst = geneve->dst_port;
+ info->key.tp_dst = geneve->info.key.tp_dst;
return 0;
}
@@ -1224,78 +1114,69 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
}
static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
- __be16 dst_port,
- union geneve_addr *remote,
- u8 vni[],
+ const struct ip_tunnel_info *info,
bool *tun_on_same_port,
bool *tun_collect_md)
{
- struct geneve_dev *geneve, *t;
+ struct geneve_dev *geneve, *t = NULL;
*tun_on_same_port = false;
*tun_collect_md = false;
- t = NULL;
list_for_each_entry(geneve, &gn->geneve_list, next) {
- if (geneve->dst_port == dst_port) {
+ if (info->key.tp_dst == geneve->info.key.tp_dst) {
*tun_collect_md = geneve->collect_md;
*tun_on_same_port = true;
}
- if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
- !memcmp(remote, &geneve->remote, sizeof(geneve->remote)) &&
- dst_port == geneve->dst_port)
+ if (info->key.tun_id == geneve->info.key.tun_id &&
+ info->key.tp_dst == geneve->info.key.tp_dst &&
+ !memcmp(&info->key.u, &geneve->info.key.u, sizeof(info->key.u)))
t = geneve;
}
return t;
}
+static bool is_all_zero(const u8 *fp, size_t size)
+{
+ int i;
+
+ for (i = 0; i < size; i++)
+ if (fp[i])
+ return false;
+ return true;
+}
+
+static bool is_tnl_info_zero(const struct ip_tunnel_info *info)
+{
+ if (info->key.tun_id || info->key.tun_flags || info->key.tos ||
+ info->key.ttl || info->key.label || info->key.tp_src ||
+ !is_all_zero((const u8 *)&info->key.u, sizeof(info->key.u)))
+ return false;
+ else
+ return true;
+}
+
static int geneve_configure(struct net *net, struct net_device *dev,
- union geneve_addr *remote,
- __u32 vni, __u8 ttl, __u8 tos, __be32 label,
- __be16 dst_port, bool metadata, u32 flags)
+ const struct ip_tunnel_info *info,
+ bool metadata, bool ipv6_rx_csum)
{
struct geneve_net *gn = net_generic(net, geneve_net_id);
struct geneve_dev *t, *geneve = netdev_priv(dev);
bool tun_collect_md, tun_on_same_port;
int err, encap_len;
- if (!remote)
- return -EINVAL;
- if (metadata &&
- (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl || label))
+ if (metadata && !is_tnl_info_zero(info))
return -EINVAL;
geneve->net = net;
geneve->dev = dev;
- geneve->vni[0] = (vni & 0x00ff0000) >> 16;
- geneve->vni[1] = (vni & 0x0000ff00) >> 8;
- geneve->vni[2] = vni & 0x000000ff;
-
- if ((remote->sa.sa_family == AF_INET &&
- IN_MULTICAST(ntohl(remote->sin.sin_addr.s_addr))) ||
- (remote->sa.sa_family == AF_INET6 &&
- ipv6_addr_is_multicast(&remote->sin6.sin6_addr)))
- return -EINVAL;
- if (label && remote->sa.sa_family != AF_INET6)
- return -EINVAL;
-
- geneve->remote = *remote;
-
- geneve->ttl = ttl;
- geneve->tos = tos;
- geneve->label = label;
- geneve->dst_port = dst_port;
- geneve->collect_md = metadata;
- geneve->flags = flags;
-
- t = geneve_find_dev(gn, dst_port, remote, geneve->vni,
- &tun_on_same_port, &tun_collect_md);
+ t = geneve_find_dev(gn, info, &tun_on_same_port, &tun_collect_md);
if (t)
return -EBUSY;
/* make enough headroom for basic scenario */
encap_len = GENEVE_BASE_HLEN + ETH_HLEN;
- if (remote->sa.sa_family == AF_INET) {
+ if (ip_tunnel_info_af(info) == AF_INET) {
encap_len += sizeof(struct iphdr);
dev->max_mtu -= sizeof(struct iphdr);
} else {
@@ -1312,7 +1193,10 @@ static int geneve_configure(struct net *net, struct net_device *dev,
return -EPERM;
}
- dst_cache_reset(&geneve->dst_cache);
+ dst_cache_reset(&geneve->info.dst_cache);
+ geneve->info = *info;
+ geneve->collect_md = metadata;
+ geneve->use_udp6_rx_checksums = ipv6_rx_csum;
err = register_netdevice(dev);
if (err)
@@ -1322,74 +1206,99 @@ static int geneve_configure(struct net *net, struct net_device *dev,
return 0;
}
+static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port)
+{
+ memset(info, 0, sizeof(*info));
+ info->key.tp_dst = htons(dst_port);
+}
+
static int geneve_newlink(struct net *net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
- __be16 dst_port = htons(GENEVE_UDP_PORT);
- __u8 ttl = 0, tos = 0;
+ bool use_udp6_rx_checksums = false;
+ struct ip_tunnel_info info;
bool metadata = false;
- union geneve_addr remote = geneve_remote_unspec;
- __be32 label = 0;
- __u32 vni = 0;
- u32 flags = 0;
+
+ init_tnl_info(&info, GENEVE_UDP_PORT);
if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6])
return -EINVAL;
if (data[IFLA_GENEVE_REMOTE]) {
- remote.sa.sa_family = AF_INET;
- remote.sin.sin_addr.s_addr =
+ info.key.u.ipv4.dst =
nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
+
+ if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) {
+ netdev_dbg(dev, "multicast remote is unsupported\n");
+ return -EINVAL;
+ }
}
if (data[IFLA_GENEVE_REMOTE6]) {
- if (!IS_ENABLED(CONFIG_IPV6))
- return -EPFNOSUPPORT;
-
- remote.sa.sa_family = AF_INET6;
- remote.sin6.sin6_addr =
+ #if IS_ENABLED(CONFIG_IPV6)
+ info.mode = IP_TUNNEL_INFO_IPV6;
+ info.key.u.ipv6.dst =
nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]);
- if (ipv6_addr_type(&remote.sin6.sin6_addr) &
+ if (ipv6_addr_type(&info.key.u.ipv6.dst) &
IPV6_ADDR_LINKLOCAL) {
netdev_dbg(dev, "link-local remote is unsupported\n");
return -EINVAL;
}
+ if (ipv6_addr_is_multicast(&info.key.u.ipv6.dst)) {
+ netdev_dbg(dev, "multicast remote is unsupported\n");
+ return -EINVAL;
+ }
+ info.key.tun_flags |= TUNNEL_CSUM;
+ use_udp6_rx_checksums = true;
+#else
+ return -EPFNOSUPPORT;
+#endif
}
- if (data[IFLA_GENEVE_ID])
+ if (data[IFLA_GENEVE_ID]) {
+ __u32 vni;
+ __u8 tvni[3];
+
vni = nla_get_u32(data[IFLA_GENEVE_ID]);
+ tvni[0] = (vni & 0x00ff0000) >> 16;
+ tvni[1] = (vni & 0x0000ff00) >> 8;
+ tvni[2] = vni & 0x000000ff;
+ info.key.tun_id = vni_to_tunnel_id(tvni);
+ }
if (data[IFLA_GENEVE_TTL])
- ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
+ info.key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
if (data[IFLA_GENEVE_TOS])
- tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
+ info.key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
- if (data[IFLA_GENEVE_LABEL])
- label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
- IPV6_FLOWLABEL_MASK;
+ if (data[IFLA_GENEVE_LABEL]) {
+ info.key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
+ IPV6_FLOWLABEL_MASK;
+ if (info.key.label && (!(info.mode & IP_TUNNEL_INFO_IPV6)))
+ return -EINVAL;
+ }
if (data[IFLA_GENEVE_PORT])
- dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]);
+ info.key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]);
if (data[IFLA_GENEVE_COLLECT_METADATA])
metadata = true;
if (data[IFLA_GENEVE_UDP_CSUM] &&
!nla_get_u8(data[IFLA_GENEVE_UDP_CSUM]))
- flags |= GENEVE_F_UDP_ZERO_CSUM_TX;
+ info.key.tun_flags |= TUNNEL_CSUM;
if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] &&
nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]))
- flags |= GENEVE_F_UDP_ZERO_CSUM6_TX;
+ info.key.tun_flags &= ~TUNNEL_CSUM;
if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] &&
nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]))
- flags |= GENEVE_F_UDP_ZERO_CSUM6_RX;
+ use_udp6_rx_checksums = false;
- return geneve_configure(net, dev, &remote, vni, ttl, tos, label,
- dst_port, metadata, flags);
+ return geneve_configure(net, dev, &info, metadata, use_udp6_rx_checksums);
}
static void geneve_dellink(struct net_device *dev, struct list_head *head)
@@ -1418,45 +1327,52 @@ static size_t geneve_get_size(const struct net_device *dev)
static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct geneve_dev *geneve = netdev_priv(dev);
+ struct ip_tunnel_info *info = &geneve->info;
+ __u8 tmp_vni[3];
__u32 vni;
- vni = (geneve->vni[0] << 16) | (geneve->vni[1] << 8) | geneve->vni[2];
+ tunnel_id_to_vni(info->key.tun_id, tmp_vni);
+ vni = (tmp_vni[0] << 16) | (tmp_vni[1] << 8) | tmp_vni[2];
if (nla_put_u32(skb, IFLA_GENEVE_ID, vni))
goto nla_put_failure;
- if (geneve->remote.sa.sa_family == AF_INET) {
+ if (ip_tunnel_info_af(info) == AF_INET) {
if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE,
- geneve->remote.sin.sin_addr.s_addr))
+ info->key.u.ipv4.dst))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM,
+ !!(info->key.tun_flags & TUNNEL_CSUM)))
goto nla_put_failure;
+
#if IS_ENABLED(CONFIG_IPV6)
} else {
if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6,
- &geneve->remote.sin6.sin6_addr))
+ &info->key.u.ipv6.dst))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
+ !(info->key.tun_flags & TUNNEL_CSUM)))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
+ !geneve->use_udp6_rx_checksums))
goto nla_put_failure;
#endif
}
- if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) ||
- nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos) ||
- nla_put_be32(skb, IFLA_GENEVE_LABEL, geneve->label))
+ if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) ||
+ nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) ||
+ nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label))
goto nla_put_failure;
- if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port))
+ if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst))
goto nla_put_failure;
if (geneve->collect_md) {
if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA))
goto nla_put_failure;
}
-
- if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM,
- !(geneve->flags & GENEVE_F_UDP_ZERO_CSUM_TX)) ||
- nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
- !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_TX)) ||
- nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
- !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_RX)))
- goto nla_put_failure;
-
return 0;
nla_put_failure:
@@ -1480,6 +1396,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
u8 name_assign_type, u16 dst_port)
{
struct nlattr *tb[IFLA_MAX + 1];
+ struct ip_tunnel_info info;
struct net_device *dev;
LIST_HEAD(list_kill);
int err;
@@ -1490,9 +1407,8 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
if (IS_ERR(dev))
return dev;
- err = geneve_configure(net, dev, &geneve_remote_unspec,
- 0, 0, 0, 0, htons(dst_port), true,
- GENEVE_F_UDP_ZERO_CSUM6_RX);
+ init_tnl_info(&info, dst_port);
+ err = geneve_configure(net, dev, &info, true, true);
if (err) {
free_netdev(dev);
return ERR_PTR(err);
@@ -1510,8 +1426,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
goto err;
return dev;
-
- err:
+err:
geneve_dellink(dev, &list_kill);
unregister_netdevice_many(&list_kill);
return ERR_PTR(err);
@@ -1594,7 +1509,6 @@ static int __init geneve_init_module(void)
goto out3;
return 0;
-
out3:
unregister_netdevice_notifier(&geneve_notifier_block);
out2:
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v3 0/4] geneve: Use LWT more effectively.
From: Pravin B Shelar @ 2016-11-21 19:02 UTC (permalink / raw)
To: netdev; +Cc: Pravin B Shelar
The following patch series makes use of the geneve LWT code path for
geneve netdev type devices.
This allows us to simplify the geneve module without changing any
functionality.
v2-v3:
Rebase against latest net-next.
v1-v2:
Fix warning reported by kbuild test robot.
Pravin B Shelar (4):
geneve: Unify LWT and netdev handling.
geneve: Merge ipv4 and ipv6 geneve_build_skb()
geneve: Remove redundant socket checks.
geneve: Optimize geneve device lookup.
drivers/net/geneve.c | 679 +++++++++++++++++++++------------------------------
1 file changed, 274 insertions(+), 405 deletions(-)
--
1.8.3.1
^ permalink raw reply
* [GIT] Networking
From: David Miller @ 2016-11-21 18:34 UTC (permalink / raw)
To: torvalds; +Cc: akpm, netdev, linux-kernel
1) Clear congestion control state when changing algorithms on an
existing socket, from Florian Westphal.
2) Fix register bit values in altr_tse_pcs portion of stmmac driver,
from Jia Jie Ho.
3) Fix PTP handling in stmmac driver for GMAC4, from Giuseppe
CAVALLARO.
4) Fix udplite multicast delivery handling, it ignores the udp_table
parameter passed into the lookups, from Pablo Neira Ayuso.
5) Synchronize the space estimated by rtnl_vfinfo_size and the space
actually used by rtnl_fill_vfinfo. From Sabrina Dubroca.
6) Fix memory leak in fib_info when splitting nodes, from Alexander
Duyck.
7) If a driver does a napi_hash_del() explicitly and not via
netif_napi_del(), it must perform RCU synchronization as needed.
Fix this in virtio-net and bnxt drivers, from Eric Dumazet.
8) Likewise, it is not necessary to invoke napi_hash_del() if we are
also doing netif_napi_del() in the same code path. Remove such
calls from be2net and cxgb4 drivers, also from Eric Dumazet.
9) Don't allocate an ID in peernet2id_alloc() if the netns is dead,
from WANG Cong.
10) Fix OF node and device struct leaks in of_mdio, from Johan Hovold.
11) We cannot cache routes in ip6_tunnel when using inherited traffic
classes, from Paolo Abeni.
12) Fix several crashes and leaks in cpsw driver, from Johan Hovold.
13) Splice operations cannot use freezable blocking calls in AF_UNIX,
from WANG Cong.
14) Link dump filtering by master device and kind support added an
error in loop index updates during the dump if we actually do
filter, fix from Zhang Shengju.
Please pull, thanks a lot!
The following changes since commit e76d21c40bd6c67fd4e2c1540d77e113df962b4d:
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2016-11-14 14:15:53 -0800)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git
for you to fetch changes up to 7082c5c3f2407c52022507ffaf644dbbab97a883:
tcp: zero ca_priv area when switching cc algorithms (2016-11-21 13:13:56 -0500)
----------------------------------------------------------------
Alex (1):
net/phy/vitesse: Configure RGMII skew on VSC8601, if needed
Alexander Duyck (2):
ipv4: Restore fib_trie_flush_external function and fix call ordering
ipv4: Fix memory leak in exception case for splitting tries
Alexander Kochetkov (2):
net: arc_emac: annonce IFF_MULTICAST support
net: arc_emac: don't pass multicast packets to kernel in non-multicast mode
Alexey Khoroshilov (1):
net: macb: add check for dma mapping error in start_xmit()
Benjamin Beichler (1):
mac80211_hwsim: fix beacon delta calculation
David S. Miller (7):
Merge branch 'stmmac-ptp'
Merge branch 'fib-tables-fixes'
Merge branch 'thunderx-fixes'
Merge branch 'phy-dev-leaks'
Merge branch 'cpsw-fixes'
Merge tag 'mac80211-for-davem-2016-11-18' of git://git.kernel.org/.../jberg/mac80211
Merge tag 'batadv-net-for-davem-20161119' of git://git.open-mesh.org/linux-merge
Eric Dumazet (5):
gro_cells: mark napi struct as not busy poll candidates
virtio-net: add a missing synchronize_net()
be2net: do not call napi_hash_del()
cxgb4: do not call napi_hash_del()
bnxt: add a missing rcu synchronization
Felix Fietkau (4):
Revert "mac80211: allow using AP_LINK_PS with mac80211-generated TIM IE"
mac80211: update A-MPDU flag on tx dequeue
mac80211: remove bogus skb vif assignment
mac80211: fix A-MSDU aggregation with fast-xmit + txq
Filip Matusiak (1):
mac80211: Ignore VHT IE from peer with wrong rx_mcs_map
Florian Fainelli (1):
net: dsa: b53: Fix VLAN usage and how we treat CPU port
Florian Westphal (1):
tcp: zero ca_priv area when switching cc algorithms
Gao Feng (1):
net: l2tp: Treat NET_XMIT_CN as success in l2tp_eth_dev_xmit
Giuseppe CAVALLARO (3):
stmmac: update the PTP header file
stmmac: fix PTP support for GMAC4
stmmac: fix PTP type ethtool stats
Guillaume Nault (1):
l2tp: fix racy SOCK_ZAPPED flag check in l2tp_ip{,6}_bind()
Hangbin Liu (1):
igmp: do not remove igmp souce list info when set link down
Jeremy Linton (1):
net: sky2: Fix shutdown crash
Jia Jie Ho (1):
net: ethernet: Fix SGMII unable to switch speed and autonego failure
Johan Hovold (10):
of_mdio: fix node leak in of_phy_register_fixed_link error path
of_mdio: fix device reference leak in of_phy_find_device
net: phy: fixed_phy: fix of_node leak in fixed_phy_unregister
net: ethernet: ti: cpsw: fix bad register access in probe error path
net: ethernet: ti: cpsw: fix mdio device reference leak
net: ethernet: ti: cpsw: fix deferred probe
net: ethernet: ti: cpsw: fix of_node and phydev leaks
net: ethernet: ti: cpsw: fix secondary-emac probe error path
net: ethernet: ti: cpsw: add missing sanity check
net: ethernet: ti: cpsw: fix fixed-link phy probe deferral
Johannes Berg (1):
cfg80211: limit scan results cache size
Jon Paul Maloy (1):
tipc: eliminate obsolete socket locking policy description
Josef Bacik (1):
bpf: fix range arithmetic for bpf map access
Pablo Neira (1):
udp: restore UDPlite many-cast delivery
Paolo Abeni (1):
ip6_tunnel: disable caching when the traffic class is inherited
Pedersen, Thomas (1):
cfg80211: add bitrate for 20MHz MCS 9
Peter Robinson (1):
ethernet: stmmac: make DWMAC_STM32 depend on it's associated SoC
Radha Mohan Chintakuntla (1):
net: thunderx: Introduce BGX_ID_MASK macro to extract bgx_id
Roman Mashak (1):
net sched filters: pass netlink message flags in event notification
Sabrina Dubroca (3):
rtnetlink: fix rtnl_vfinfo_size
rtnetlink: fix rtnl message size computation for XDP
rtnetlink: fix FDB size computation
Stefan Hajnoczi (1):
netns: fix get_net_ns_by_fd(int pid) typo
Sunil Goutham (4):
net: thunderx: Program LMAC credits based on MTU
net: thunderx: Fix configuration of L3/L4 length checking
net: thunderx: Fix VF driver's interface statistics
net: thunderx: Fix memory leak and other issues upon interface toggle
Sven Eckelmann (2):
batman-adv: Revert "fix splat on disabling an interface"
batman-adv: Detect missing primaryif during tp_send as error
WANG Cong (2):
net: check dead netns for peernet2id_alloc()
af_unix: conditionally use freezable blocking calls in read
Zhang Shengju (1):
rtnl: fix the loop index update error in rtnl_dump_ifinfo()
drivers/net/dsa/b53/b53_common.c | 16 ++++----------
drivers/net/ethernet/arc/emac_main.c | 7 +++---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++++
drivers/net/ethernet/cadence/macb.c | 6 ++++++
drivers/net/ethernet/cavium/thunder/nic.h | 64 ++++++++++++++++++++++++++++++------------------------
drivers/net/ethernet/cavium/thunder/nic_main.c | 37 ++++++++++++++++++++++----------
drivers/net/ethernet/cavium/thunder/nic_reg.h | 1 +
drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------
drivers/net/ethernet/cavium/thunder/nicvf_main.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------
drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------
drivers/net/ethernet/cavium/thunder/nicvf_queues.h | 24 ++-------------------
drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 4 ++--
drivers/net/ethernet/cavium/thunder/thunder_bgx.h | 2 ++
drivers/net/ethernet/chelsio/cxgb4/sge.c | 1 -
drivers/net/ethernet/emulex/benet/be_main.c | 1 -
drivers/net/ethernet/marvell/sky2.c | 13 +++++++++++
drivers/net/ethernet/stmicro/stmmac/Kconfig | 2 +-
drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c | 4 ++--
drivers/net/ethernet/stmicro/stmmac/common.h | 24 ++++++++++++---------
drivers/net/ethernet/stmicro/stmmac/descs.h | 20 ++++++++++-------
drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.h | 4 ++++
drivers/net/ethernet/stmicro/stmmac/enh_desc.c | 28 +++++++++++++++---------
drivers/net/ethernet/stmicro/stmmac/stmmac.h | 1 +
drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 19 ++++++++++-------
drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 43 +++++++++++++++++++++++++++++--------
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 97 ++++++++++++++++++++++++++++++++++++++++++----------------------------------------
drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 9 ++++----
drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h | 72 +++++++++++++++++++++++++++++++------------------------------
drivers/net/ethernet/ti/cpsw.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
drivers/net/phy/fixed_phy.c | 2 +-
drivers/net/phy/vitesse.c | 34 ++++++++++++++++++++++++++++-
drivers/net/virtio_net.c | 5 +++++
drivers/net/wireless/mac80211_hwsim.c | 2 +-
drivers/of/of_mdio.c | 6 +++++-
include/linux/bpf_verifier.h | 5 +++--
include/net/gro_cells.h | 3 +++
include/net/ip_fib.h | 1 +
include/net/net_namespace.h | 2 +-
kernel/bpf/verifier.c | 70 ++++++++++++++++++++++++++++++++++++++++--------------------
net/batman-adv/hard-interface.c | 1 +
net/batman-adv/tp_meter.c | 1 +
net/core/net_namespace.c | 2 ++
net/core/rtnetlink.c | 22 ++++++++++++-------
net/ipv4/fib_frontend.c | 20 ++++++++++++-----
net/ipv4/fib_trie.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
net/ipv4/igmp.c | 50 +++++++++++++++++++++++++++++++------------
net/ipv4/tcp_cong.c | 4 +++-
net/ipv4/udp.c | 6 +++---
net/ipv6/ip6_tunnel.c | 13 +++++++++--
net/ipv6/udp.c | 6 +++---
net/l2tp/l2tp_eth.c | 2 +-
net/l2tp/l2tp_ip.c | 5 +++--
net/l2tp/l2tp_ip6.c | 5 +++--
net/mac80211/sta_info.c | 2 +-
net/mac80211/tx.c | 14 ++++++++----
net/mac80211/vht.c | 16 ++++++++++++++
net/sched/cls_api.c | 5 +++--
net/tipc/socket.c | 48 +----------------------------------------
net/unix/af_unix.c | 17 +++++++++------
net/wireless/core.h | 1 +
net/wireless/scan.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
net/wireless/util.c | 3 ++-
63 files changed, 1020 insertions(+), 560 deletions(-)
^ permalink raw reply
* Re: [PATCH] VSOCK: add loopback to virtio_transport
From: David Miller @ 2016-11-21 18:22 UTC (permalink / raw)
To: jhansen; +Cc: stefanha, netdev, cavery, imbrenda
In-Reply-To: <BY2PR0501MB20563DC7163282F633F7A30EDAB50@BY2PR0501MB2056.namprd05.prod.outlook.com>
From: "Jorgen S. Hansen" <jhansen@vmware.com>
Date: Mon, 21 Nov 2016 12:40:33 +0000
> That should make it on par with the VMCI transport.
Please do not top-post.
^ permalink raw reply
* Re: [PATCH v2 next 0/2] tcp: make undo_cwnd mandatory for congestion modules
From: David Miller @ 2016-11-21 18:20 UTC (permalink / raw)
To: fw; +Cc: netdev
In-Reply-To: <1479734318-30607-1-git-send-email-fw@strlen.de>
From: Florian Westphal <fw@strlen.de>
Date: Mon, 21 Nov 2016 14:18:36 +0100
> highspeed, illinois, scalable, veno and yeah congestion control algorithms
> don't provide a 'cwnd_undo' function. This makes the stack default to a
> 'reno undo' which doubles cwnd. However, the ssthresh implementation of
> these algorithms do not halve the slowstart threshold. This causes similar
> issue as the one fixed for dctcp in ce6dd23329b1e ("dctcp: avoid bogus
> doubling of cwnd after loss").
>
> In light of this it seems better to remove the fallback and make undo_cwnd
> mandatory.
>
> First patch fixes those spots where reno undo seems incorrect by providing
> .cwnd_undo functions, second patch removes the fallback.
Series applied, thanks for following up on this.
^ permalink raw reply
* Re: [PATCH net-next 0/2] bridge: add support for IGMPv3 and MLDv2 querier
From: David Miller @ 2016-11-21 18:17 UTC (permalink / raw)
To: nikolay; +Cc: netdev, roopa, sashok, stephen, liuhangbin
In-Reply-To: <1479729805-23108-1-git-send-email-nikolay@cumulusnetworks.com>
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 21 Nov 2016 13:03:23 +0100
> This patch-set adds support for IGMPv3 and MLDv2 querier in the bridge.
> Two new options which can be toggled via netlink and sysfs are added that
> control the version per-bridge:
> multicast_igmp_version - default 2, can be set to 3
> multicast_mld_version - default 1, can be set to 2 (this option is
> disabled if CONFIG_IPV6=n)
>
> Note that the names do not include "querier", I think that these options
> can be re-used later as more IGMPv3 support is added to the bridge so we
> can avoid adding more options to switch between v2 and v3 behaviour.
>
> The set uses the already existing br_ip{4,6}_multicast_alloc_query
> functions and adds the appropriate header based on the chosen version.
>
> For the initial support I have removed the compatibility implementation
> (RFC3376 sec 7.3.1, 7.3.2; RFC3810 sec 8.3.1, 8.3.2), because there are
> some details that we need to sort out.
Series applied, thanks.
^ permalink raw reply
* Re: [PATCH net] tcp: zero ca_priv area when switching cc algorithms
From: David Miller @ 2016-11-21 18:14 UTC (permalink / raw)
To: fw; +Cc: netdev
In-Reply-To: <1479719317-22437-1-git-send-email-fw@strlen.de>
From: Florian Westphal <fw@strlen.de>
Date: Mon, 21 Nov 2016 10:08:37 +0100
> We need to zero out the private data area when application switches
> connection to different algorithm (TCP_CONGESTION setsockopt).
>
> When congestion ops get assigned at connect time everything is already
> zeroed because sk_alloc uses GFP_ZERO flag. But in the setsockopt case
> this contains whatever previous cc placed there.
>
> Signed-off-by: Florian Westphal <fw@strlen.de>
Good catch, applied, thanks Florian.
^ permalink raw reply
* Re: [PATCH] VSOCK: add loopback to virtio_transport
From: Jorgen S. Hansen @ 2016-11-21 12:40 UTC (permalink / raw)
To: Stefan Hajnoczi, netdev@vger.kernel.org
Cc: cavery@redhat.com, Claudio Imbrenda, David S . Miller
In-Reply-To: <1479397763-22319-1-git-send-email-stefanha@redhat.com>
Hi Stefan,
That should make it on par with the VMCI transport.
Thanks,
Jørgen
________________________________________
From: Stefan Hajnoczi <stefanha@redhat.com>
Sent: Thursday, November 17, 2016 4:49 PM
To: netdev@vger.kernel.org
Cc: cavery@redhat.com; Claudio Imbrenda; Jorgen S. Hansen; David S . Miller; Stefan Hajnoczi
Subject: [PATCH] VSOCK: add loopback to virtio_transport
The VMware VMCI transport supports loopback inside virtual machines.
This patch implements loopback for virtio-vsock.
Flow control is handled by the virtio-vsock protocol as usual. The
sending process stops transmitting on a connection when the peer's
receive buffer space is exhausted.
Cathy Avery <cavery@redhat.com> noticed this difference between VMCI and
virtio-vsock when a test case using loopback failed. Although loopback
isn't the main point of AF_VSOCK, it is useful for testing and
virtio-vsock must match VMCI semantics so that userspace programs run
regardless of the underlying transport.
My understanding is that loopback is not supported on the host side with
VMCI. Follow that by implementing it only in the guest driver, not the
vhost host driver.
Cc: Jorgen Hansen <jhansen@vmware.com>
Reported-by: Cathy Avery <cavery@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
net/vmw_vsock/virtio_transport.c | 57 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 57 insertions(+)
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 936d7ee..f2c4071 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -44,6 +44,10 @@ struct virtio_vsock {
spinlock_t send_pkt_list_lock;
struct list_head send_pkt_list;
+ struct work_struct loopback_work;
+ spinlock_t loopback_list_lock;
+ struct list_head loopback_list;
+
atomic_t queued_replies;
/* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX]
@@ -74,6 +78,42 @@ static u32 virtio_transport_get_local_cid(void)
return vsock->guest_cid;
}
+static void virtio_transport_loopback_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, loopback_work);
+ LIST_HEAD(pkts);
+
+ spin_lock_bh(&vsock->loopback_list_lock);
+ list_splice_init(&vsock->loopback_list, &pkts);
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
+ mutex_lock(&vsock->rx_lock);
+ while (!list_empty(&pkts)) {
+ struct virtio_vsock_pkt *pkt;
+
+ pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+
+ virtio_transport_recv_pkt(pkt);
+ }
+ mutex_unlock(&vsock->rx_lock);
+}
+
+static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
+ struct virtio_vsock_pkt *pkt)
+{
+ int len = pkt->len;
+
+ spin_lock_bh(&vsock->loopback_list_lock);
+ list_add_tail(&pkt->list, &vsock->loopback_list);
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
+ queue_work(virtio_vsock_workqueue, &vsock->loopback_work);
+
+ return len;
+}
+
static void
virtio_transport_send_pkt_work(struct work_struct *work)
{
@@ -159,6 +199,10 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
return -ENODEV;
}
+ if (le32_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) {
+ return virtio_transport_send_pkt_loopback(vsock, pkt);
+ }
+
if (pkt->reply)
atomic_inc(&vsock->queued_replies);
@@ -510,10 +554,13 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
mutex_init(&vsock->event_lock);
spin_lock_init(&vsock->send_pkt_list_lock);
INIT_LIST_HEAD(&vsock->send_pkt_list);
+ spin_lock_init(&vsock->loopback_list_lock);
+ INIT_LIST_HEAD(&vsock->loopback_list);
INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
INIT_WORK(&vsock->event_work, virtio_transport_event_work);
INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
+ INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work);
mutex_lock(&vsock->rx_lock);
virtio_vsock_rx_fill(vsock);
@@ -539,6 +586,7 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
struct virtio_vsock *vsock = vdev->priv;
struct virtio_vsock_pkt *pkt;
+ flush_work(&vsock->loopback_work);
flush_work(&vsock->rx_work);
flush_work(&vsock->tx_work);
flush_work(&vsock->event_work);
@@ -565,6 +613,15 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
}
spin_unlock_bh(&vsock->send_pkt_list_lock);
+ spin_lock_bh(&vsock->loopback_list_lock);
+ while (!list_empty(&vsock->loopback_list)) {
+ pkt = list_first_entry(&vsock->loopback_list,
+ struct virtio_vsock_pkt, list);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ spin_unlock_bh(&vsock->loopback_list_lock);
+
mutex_lock(&the_virtio_vsock_mutex);
the_virtio_vsock = NULL;
vsock_core_exit();
^ permalink raw reply related
* Re: [PATCH net 1/1] net: l2tp: Treat NET_XMIT_CN as success in l2tp_eth_dev_xmit
From: David Miller @ 2016-11-21 18:11 UTC (permalink / raw)
To: fgao; +Cc: edumazet, javier, netdev, gfree.wind
In-Reply-To: <1479689781-2125-1-git-send-email-fgao@ikuai8.com>
From: fgao@ikuai8.com
Date: Mon, 21 Nov 2016 08:56:21 +0800
> From: Gao Feng <gfree.wind@gmail.com>
>
> The tc could return NET_XMIT_CN as one congestion notification, but
> it does not mean the packet is lost. Other modules like ipvlan,
> macvlan, and others treat NET_XMIT_CN as success too.
> So l2tp_eth_dev_xmit should add the NET_XMIT_CN check.
>
> Signed-off-by: Gao Feng <gfree.wind@gmail.com>
Applied.
^ permalink raw reply
* Re: Netperf UDP issue with connected sockets
From: Eric Dumazet @ 2016-11-21 18:10 UTC (permalink / raw)
To: Jesper Dangaard Brouer; +Cc: Rick Jones, netdev, Saeed Mahameed, Tariq Toukan
In-Reply-To: <20161121170351.50a09ee1@redhat.com>
On Mon, 2016-11-21 at 17:03 +0100, Jesper Dangaard Brouer wrote:
> On Thu, 17 Nov 2016 10:51:23 -0800
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> > On Thu, 2016-11-17 at 19:30 +0100, Jesper Dangaard Brouer wrote:
> >
> > > The point is I can see a socket Send-Q forming, thus we do know the
> > > application has something to send. Thus, a possibility for
> > > non-opportunistic bulking. Allowing/implementing bulk enqueue from
> > > socket layer into qdisc layer, should be fairly simple (and rest of
> > > xmit_more is already in place).
> >
> >
> > As I said, you are fooled by TX completions.
>
> Obviously TX completions play a role yes, and I bet I can adjust the
> TX completion to cause xmit_more to happen, at the expense of
> introducing added latency.
>
> The point is the "bloated" spinlock in __dev_queue_xmit is still caused
> by the MMIO tailptr/doorbell. The added cost occurs when enqueueing
> packets, and result in the inability to get enough packets into the
> qdisc for xmit_more going (on my system). I argue that a bulk enqueue
> API would allow us to get past the hurdle of transitioning into
> xmit_more mode more easily.
>
This is very nice, but we already have bulk enqueue, it is called
xmit_more.
Kernel does not know your application is sending a packet after the one
you send.
xmit_more is not often used when applications/stacks send many small packets.
qdisc is empty (one enqueued packet is immediately dequeued so
skb->xmit_more is 0), and even bypassed (TCQ_F_CAN_BYPASS)
Not sure if this has been tried before, but the doorbell avoidance could
be done by the driver itself, because it knows a TX completion will come
shortly (well... if softirqs are not delayed too much !)
Doorbell would be forced only if :
( "skb->xmit_more is not set" AND "TX engine is not 'started yet'" )
OR
( too many [1] packets were put in TX ring buffer, no point deferring
more)
Start the pump, but once it is started, let the doorbells be done by
TX completion.
ndo_start_xmit and TX completion handler would have to maintain a shared
state describing if packets were ready but doorbell deferred.
Note that TX completion means "if at least one packet was drained",
otherwise busy polling, constantly calling napi->poll() would force a
doorbell too soon for devices sharing a NAPI for both RX and TX.
But then, maybe busy poll would like to force a doorbell...
I could try these ideas on mlx4 shortly.
[1] limit could be derived from active "ethtool -c" params, eg tx-frames
^ permalink raw reply
* Re: [PATCH for-next 03/11] IB/hns: Optimize the logic of allocating memory using APIs
From: Leon Romanovsky @ 2016-11-21 17:14 UTC (permalink / raw)
To: Salil Mehta
Cc: dledford@redhat.com, Huwei (Xavier), oulijun,
mehta.salil.lnk@gmail.com, linux-rdma@vger.kernel.org,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org, Linuxarm,
Zhangping (ZP)
In-Reply-To: <F4CC6FACFEB3C54C9141D49AD221F7F91A7AD7DF@lhreml503-mbx>
[-- Attachment #1: Type: text/plain, Size: 4775 bytes --]
On Mon, Nov 21, 2016 at 04:12:38PM +0000, Salil Mehta wrote:
> > -----Original Message-----
> > From: Leon Romanovsky [mailto:leon@kernel.org]
> > Sent: Wednesday, November 16, 2016 8:36 AM
> > To: Salil Mehta
> > Cc: dledford@redhat.com; Huwei (Xavier); oulijun;
> > mehta.salil.lnk@gmail.com; linux-rdma@vger.kernel.org;
> > netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm;
> > Zhangping (ZP)
> > Subject: Re: [PATCH for-next 03/11] IB/hns: Optimize the logic of
> > allocating memory using APIs
> >
> > On Tue, Nov 15, 2016 at 03:52:46PM +0000, Salil Mehta wrote:
> > > > -----Original Message-----
> > > > From: Leon Romanovsky [mailto:leon@kernel.org]
> > > > Sent: Wednesday, November 09, 2016 7:22 AM
> > > > To: Salil Mehta
> > > > Cc: dledford@redhat.com; Huwei (Xavier); oulijun;
> > > > mehta.salil.lnk@gmail.com; linux-rdma@vger.kernel.org;
> > > > netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm;
> > > > Zhangping (ZP)
> > > > Subject: Re: [PATCH for-next 03/11] IB/hns: Optimize the logic of
> > > > allocating memory using APIs
> > > >
> > > > On Fri, Nov 04, 2016 at 04:36:25PM +0000, Salil Mehta wrote:
> > > > > From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>
> > > > >
> > > > > This patch modified the logic of allocating memory using APIs in
> > > > > hns RoCE driver. We used kcalloc instead of kmalloc_array and
> > > > > bitmap_zero. And When kcalloc failed, call vzalloc to alloc
> > > > > memory.
> > > > >
> > > > > Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
> > > > > Signed-off-by: Ping Zhang <zhangping5@huawei.com>
> > > > > Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
> > > > > ---
> > > > > drivers/infiniband/hw/hns/hns_roce_mr.c | 15 ++++++++-------
> > > > > 1 file changed, 8 insertions(+), 7 deletions(-)
> > > > >
> > > > > diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c
> > > > b/drivers/infiniband/hw/hns/hns_roce_mr.c
> > > > > index fb87883..d3dfb5f 100644
> > > > > --- a/drivers/infiniband/hw/hns/hns_roce_mr.c
> > > > > +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
> > > > > @@ -137,11 +137,12 @@ static int hns_roce_buddy_init(struct
> > > > hns_roce_buddy *buddy, int max_order)
> > > > >
> > > > > for (i = 0; i <= buddy->max_order; ++i) {
> > > > > s = BITS_TO_LONGS(1 << (buddy->max_order - i));
> > > > > - buddy->bits[i] = kmalloc_array(s, sizeof(long),
> > > > GFP_KERNEL);
> > > > > - if (!buddy->bits[i])
> > > > > - goto err_out_free;
> > > > > -
> > > > > - bitmap_zero(buddy->bits[i], 1 << (buddy->max_order -
> > i));
> > > > > + buddy->bits[i] = kcalloc(s, sizeof(long),
> > GFP_KERNEL);
> > > > > + if (!buddy->bits[i]) {
> > > > > + buddy->bits[i] = vzalloc(s * sizeof(long));
> > > >
> > > > I wonder, why don't you use directly vzalloc instead of kcalloc
> > > > fallback?
> > > As we know we will have physical contiguous pages if the kcalloc
> > > call succeeds. This will give us a chance to have better performance
> > > over the allocations which are just virtually contiguous through the
> > > function vzalloc(). Therefore, later has only been used as a fallback
> > > when our memory request cannot be entertained through kcalloc.
> > >
> > > Are you suggesting that there will not be much performance penalty
> > > if we use just vzalloc ?
> >
> > Not exactly,
> > I asked it, because we have similar code in our drivers and this
> > construction looks strange to me.
> >
> > 1. If performance is critical, we will use kmalloc.
> > 2. If performance is not critical, we will use vmalloc.
> >
> > But in this case, such construction shows me that we can live with
> > vmalloc performance and kmalloc allocation are not really needed.
> >
> > In your specific case, I'm not sure that kcalloc will ever fail.
> Performance is definitely critical here. Though, I agree this is bit
> unusual way of memory allocation. In actual, we were encountering
> memory alloc failures using kmalloc (if you see allocation amount
> is on the higher side and is exponential) so we ended up using
> vmalloc as fall back - It is very naïve allocation scheme.
I understand it, we did the same, see our mlx5_vzalloc call.
BTW, we used __GFP_NOWARN flag, which you should consider to use
in your case too.
>
> Maybe we need to rethink this allocation scheme part? Also, I can pull
> back this particular patch for now or just live with vzalloc() till
> we figure out proper solution to this?
It is up to you, I don't think that you should drop it, AFAIK, there is
no other proper solution.
>
> >
> > Thanks
> >
> >
> > >
> > > >
> > > > > + if (!buddy->bits[i])
> > > > > + goto err_out_free;
> > > > > + }
> > > > > }
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]
^ permalink raw reply
* Re: [PATCH net] ipv6 addrconf: Implemented enhanced DAD (RFC7527)
From: Erik Nordmark @ 2016-11-21 17:10 UTC (permalink / raw)
To: Hannes Frederic Sowa, netdev
In-Reply-To: <b9854c18-f71c-2e71-d352-531bca72eeb4@stressinduktion.org>
On 11/16/16 10:49 PM, Hannes Frederic Sowa wrote:
> I thought about even removing the sysctl altogether and enable enhanced
> DAD by default. ;)
>
> I am in favor of enabling it by default.
>
> But given that there could be broken implementations out there, we
> should give users a choice and provide.
OK, I'll make it the default and send out a new version of the patch. I
was told I should base the patch on net-next instead of linux-stable so
I'll move it there.
>
> Could you always generate a nonce in the interface structure? You could
> check the sysctl in the send and receive path to attach and check the
> nonce. This has the advantage that you don't need to delete the
> interface and recreate it to enable/disable enhanced dad on an interface
> (also you can get away with the loop around get_random_bytes to make
> sure its value is not zero as we don't depend on a non-zero nonce
> variable to signal enabling of the feature, see below).
The nonce is per interface address and not per interface. Furthermore,
the RFC says that on a retry of DAD the nodes will end up using a
different nonce implying that even for the same interface address it
should pick a different nonce for each DAD attempt.
Note that since there is no automatic retry of DAD (per RFC4862) and
each try would check the current sysctl setting so I don't think
pre-generating the nonce would change the behavior.
>> Is that because get_random_bytes() will not fill in anything if there is
>> insufficient entropy available?
> No, just because 0 is a possible return value from the random number
> generator. ;)
Ah - makes sense.
Thanks again for the review,
Erik
>>>> inc = ipv6_addr_is_multicast(daddr);
>>>>
>>>> @@ -797,6 +811,16 @@ static void ndisc_recv_ns(struct sk_buff
>>>> have_ifp:
>>>> if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
>>>> if (dad) {
>>>> + if (nonce != 0 && ifp->dad_nonce == nonce) {
>>>> + /* Matching nonce if looped back */
>>>> + if (net_ratelimit())
>>>> + ND_PRINTK(2, notice,
>>>> + "%s: IPv6 DAD loopback for address %pI6c
>>>> nonce %llu ignored\n",
>>>> + ifp->idev->dev->name,
>>>> + &ifp->addr,
>>>> + nonce);
>>> If we print the nonce for debugging reasons, we should keep it in
>>> correct endianness on the wire vs. in the debug output.
>> How about printing it as colon-separated hex bytes since that is more
>> clear than decimal?
>> Would follow the network byte order in the packet.
> I would be totally fine with it. It will be probably easier to switch to
> a char[6] array for the nonce then.
^ permalink raw reply
* Re: [PATCH net-next 1/1] driver: macvlan: Remove duplicated IFF_UP condition check in macvlan_forward_source
From: David Miller @ 2016-11-21 16:59 UTC (permalink / raw)
To: fgao; +Cc: kaber, netdev, gfree.wind
In-Reply-To: <1479687998-456-1-git-send-email-fgao@ikuai8.com>
From: fgao@ikuai8.com
Date: Mon, 21 Nov 2016 08:26:38 +0800
> From: Gao Feng <gfree.wind@gmail.com>
>
> The function macvlan_forward_source_one has already checked the flag
> IFF_UP, so needn't check it outside in macvlan_forward_source too.
>
> Signed-off-by: Gao Feng <gfree.wind@gmail.com>
> ---
> v2: Remove the IFF_UP check in macvlan_forward_source instead of macvlan_forward_source_one
> v1: Initial patch
Applied.
^ permalink raw reply
* [PATCH] net: phy: micrel: fix KSZ8041FTL supported value
From: Kirill Esipov @ 2016-11-21 16:53 UTC (permalink / raw)
To: netdev; +Cc: linux-kernel, Kirill Esipov
Fix setting of SUPPORTED_FIBRE bit as it was not present in features
of KSZ8041.
Signed-off-by: Kirill Esipov <yesipov@gmail.com>
---
drivers/net/phy/micrel.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 081df68..ea92d52 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -318,12 +318,12 @@ static int ksz8041_config_init(struct phy_device *phydev)
/* Limit supported and advertised modes in fiber mode */
if (of_property_read_bool(of_node, "micrel,fiber-mode")) {
phydev->dev_flags |= MICREL_PHY_FXEN;
- phydev->supported &= SUPPORTED_FIBRE |
- SUPPORTED_100baseT_Full |
+ phydev->supported &= SUPPORTED_100baseT_Full |
SUPPORTED_100baseT_Half;
- phydev->advertising &= ADVERTISED_FIBRE |
- ADVERTISED_100baseT_Full |
+ phydev->supported |= SUPPORTED_FIBRE;
+ phydev->advertising &= ADVERTISED_100baseT_Full |
ADVERTISED_100baseT_Half;
+ phydev->advertising |= ADVERTISED_FIBRE;
phydev->autoneg = AUTONEG_DISABLE;
}
--
2.7.4
^ permalink raw reply related
* Re: [RFC PATCH net v2 2/3] dt: bindings: add ethernet phy eee-disable-advert option documentation
From: Andrew Lunn @ 2016-11-21 16:47 UTC (permalink / raw)
To: Jerome Brunet
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA,
Florian Fainelli, Alexandre TORGUE, Neil Armstrong,
Martin Blumenstingl, Kevin Hilman,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andre Roth,
linux-amlogic-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Carlo Caione,
Giuseppe Cavallaro,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <1479744993.17538.85.camel-rdvid1DuHRBWk0Htik3J/w@public.gmane.org>
> What I did not realize when doing this patch for the realtek driver is
> that there is already 6 valid modes defined in the kernel
>
> #define MDIO_EEE_100TX MDIO_AN_EEE_ADV_100TX /*
> 100TX EEE cap */
> #define MDIO_EEE_1000T MDIO_AN_EEE_ADV_1000T /*
> 1000T EEE cap */
> #define MDIO_EEE_10GT 0x0008 /* 10GT EEE cap */
> #define MDIO_EEE_1000KX 0x0010 /* 1000KX EEE cap
> */
> #define MDIO_EEE_10GKX4 0x0020 /* 10G KX4 EEE cap
> */
> #define MDIO_EEE_10GKR 0x0040 /* 10G KR EEE cap
> */
>
> I took care of only 2 in the case of realtek.c since it only support
> MDIO_EEE_100TX and MDIO_EEE_1000T.
>
> Defining a property for each is certainly doable but it does not look
> very nice either. If it extends in the future, it will get even more
> messier, especially if you want to disable everything.
Yes, agreed.
> What do you think about keeping a single mask value but use the define
> above in the DT ? It would be more readable than hex and easy to
> extend, don't you think ?
>
> These defines are already part of the uapi so I guess we can use those
> in the DT bindings ?
I don't think they are accessible from the dtc include path. You will
need to make a copy, in include/dt-bindings/net/phy.h
But yes, using these defines is a good idea.
Andrew
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH net-next v2 0/4] geneve: Use LWT more effectively.
From: David Miller @ 2016-11-21 16:28 UTC (permalink / raw)
To: pshelar; +Cc: netdev
In-Reply-To: <1479521411-53012-1-git-send-email-pshelar@ovn.org>
From: Pravin B Shelar <pshelar@ovn.org>
Date: Fri, 18 Nov 2016 18:10:07 -0800
> Following patch series make use of geneve LWT code path for
> geneve netdev type of device.
> This allows us to simplify geneve module.
>
> v1-v2:
> Fix warning reported by kbuild test robot.
This doesn't apply cleanly to net-next, please respin.
Thanks.
^ permalink raw reply
* Re: [PATCH net-next] udp: avoid one cache line miss in recvmsg()
From: David Miller @ 2016-11-21 16:27 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
In-Reply-To: <1479518283.8455.312.camel@edumazet-glaptop3.roam.corp.google.com>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 18 Nov 2016 17:18:03 -0800
> From: Eric Dumazet <edumazet@google.com>
>
> UDP_SKB_CB(skb)->partial_cov is located at offset 66 in skb,
> requesting a cold cache line being read in cpu cache.
>
> We can avoid this cache line miss for UDP sockets,
> as partial_cov has a meaning only for UDPLite.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Applied.
^ permalink raw reply
* Re: [PATCH v2] ethernet: stmmac: make DWMAC_STM32 depend on it's associated SoC
From: David Miller @ 2016-11-21 16:34 UTC (permalink / raw)
To: pbrobinson; +Cc: peppe.cavallaro, alexandre.torgue, mcoquelin.stm32, netdev
In-Reply-To: <20161120172238.7919-1-pbrobinson@gmail.com>
From: Peter Robinson <pbrobinson@gmail.com>
Date: Sun, 20 Nov 2016 17:22:38 +0000
> There's not much point, except compile test, enabling the stmmac
> platform drivers unless the STM32 SoC is enabled. It's not
> useful without it.
>
> Signed-off-by: Peter Robinson <pbrobinson@gmail.com>
Applied.
^ permalink raw reply
* Re: [PATCH v2 net-next] mlx4: avoid unnecessary dirtying of critical fields
From: David Miller @ 2016-11-21 16:33 UTC (permalink / raw)
To: eric.dumazet; +Cc: ttoukan.linux, netdev, tariqt
In-Reply-To: <1479662676.8455.364.camel@edumazet-glaptop3.roam.corp.google.com>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 20 Nov 2016 09:24:36 -0800
> From: Eric Dumazet <edumazet@google.com>
>
> While stressing a 40Gbit mlx4 NIC with busy polling, I found false
> sharing in mlx4 driver that can be easily avoided.
>
> This patch brings an additional 7 % performance improvement in UDP_RR
> workload.
>
> 1) If we received no frame during one mlx4_en_process_rx_cq()
> invocation, no need to call mlx4_cq_set_ci() and/or dirty ring->cons
>
> 2) Do not refill rx buffers if we have plenty of them.
> This avoids false sharing and allows some bulk/batch optimizations.
> Page allocator and its locks will thank us.
>
> Finally, mlx4_en_poll_rx_cq() should not return 0 if it determined
> cpu handling NIC IRQ should be changed. We should return budget-1
> instead, to not fool net_rx_action() and its netdev_budget.
>
>
> v2: keep AVG_PERF_COUNTER(... polled) even if polled is 0
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Applied.
^ permalink raw reply
* Re: [PATCH net-next] bnx2: use READ_ONCE() instead of barrier()
From: David Miller @ 2016-11-21 16:32 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev, rasesh.mody, harish.patil
In-Reply-To: <1479596231.8455.354.camel@edumazet-glaptop3.roam.corp.google.com>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 19 Nov 2016 14:57:11 -0800
> From: Eric Dumazet <edumazet@google.com>
>
> barrier() is a big hammer compared to READ_ONCE(),
> and requires comments explaining what is protected.
>
> READ_ONCE() is more precise and compiler should generate
> better overall code.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Applied.
^ permalink raw reply
* Re: [PATCH net-next v3 0/4] Couple of BPF refcount fixes for mlx5
From: David Miller @ 2016-11-21 16:26 UTC (permalink / raw)
To: daniel; +Cc: alexei.starovoitov, bblanco, zhiyisun, ranas, saeedm, netdev
In-Reply-To: <cover.1479514784.git.daniel@iogearbox.net>
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 19 Nov 2016 01:44:59 +0100
> Various mlx5 bugs on eBPF refcount handling found during review.
> Last patch in series adds a __must_check to BPF helpers to make
> sure we won't run into it again w/o compiler complaining first.
Series applied, thanks Daniel.
^ permalink raw reply
* Re: [PATCH net 07/18] net/ena: refactor ena_get_stats64 to be atomic context safe
From: kbuild test robot @ 2016-11-21 16:23 UTC (permalink / raw)
To: Netanel Belgazal
Cc: kbuild-all, linux-kernel, davem, netdev, Netanel Belgazal, dwmw,
zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1479631547-29354-8-git-send-email-netanel@annapurnalabs.com>
[-- Attachment #1: Type: text/plain, Size: 3508 bytes --]
Hi Netanel,
[auto build test WARNING on net/master]
url: https://github.com/0day-ci/linux/commits/Netanel-Belgazal/Update-ENA-driver-to-version-1-1-2/20161120-165649
config: i386-randconfig-h1-11212236 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
All warnings (new ones prefixed by >>):
In file included from include/linux/mmzone.h:15:0,
from include/linux/gfp.h:5,
from include/linux/cpu_rmap.h:14,
from drivers/net/ethernet/amazon/ena/ena_netdev.c:36:
drivers/net/ethernet/amazon/ena/ena_netdev.c: In function 'ena_get_stats64':
>> include/linux/seqlock.h:204:19: warning: 'rx_ring' may be used uninitialized in this function [-Wmaybe-uninitialized]
return unlikely(s->sequence != start);
^~
drivers/net/ethernet/amazon/ena/ena_netdev.c:2188:19: note: 'rx_ring' was declared here
struct ena_ring *rx_ring, *tx_ring;
^~~~~~~
vim +/rx_ring +204 include/linux/seqlock.h
4f988f15 Linus Torvalds 2012-05-04 188 /**
3c22cd57 Nick Piggin 2011-01-07 189 * __read_seqcount_retry - end a seq-read critical section (without barrier)
3c22cd57 Nick Piggin 2011-01-07 190 * @s: pointer to seqcount_t
3c22cd57 Nick Piggin 2011-01-07 191 * @start: count, from read_seqcount_begin
3c22cd57 Nick Piggin 2011-01-07 192 * Returns: 1 if retry is required, else 0
3c22cd57 Nick Piggin 2011-01-07 193 *
3c22cd57 Nick Piggin 2011-01-07 194 * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
3c22cd57 Nick Piggin 2011-01-07 195 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
3c22cd57 Nick Piggin 2011-01-07 196 * provided before actually loading any of the variables that are to be
3c22cd57 Nick Piggin 2011-01-07 197 * protected in this critical section.
3c22cd57 Nick Piggin 2011-01-07 198 *
3c22cd57 Nick Piggin 2011-01-07 199 * Use carefully, only in critical code, and comment how the barrier is
3c22cd57 Nick Piggin 2011-01-07 200 * provided.
3c22cd57 Nick Piggin 2011-01-07 201 */
3c22cd57 Nick Piggin 2011-01-07 202 static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start)
3c22cd57 Nick Piggin 2011-01-07 203 {
3c22cd57 Nick Piggin 2011-01-07 @204 return unlikely(s->sequence != start);
3c22cd57 Nick Piggin 2011-01-07 205 }
3c22cd57 Nick Piggin 2011-01-07 206
3c22cd57 Nick Piggin 2011-01-07 207 /**
3c22cd57 Nick Piggin 2011-01-07 208 * read_seqcount_retry - end a seq-read critical section
3c22cd57 Nick Piggin 2011-01-07 209 * @s: pointer to seqcount_t
3c22cd57 Nick Piggin 2011-01-07 210 * @start: count, from read_seqcount_begin
3c22cd57 Nick Piggin 2011-01-07 211 * Returns: 1 if retry is required, else 0
3c22cd57 Nick Piggin 2011-01-07 212 *
:::::: The code at line 204 was first introduced by commit
:::::: 3c22cd5709e8143444a6d08682a87f4c57902df3 kernel: optimise seqlock
:::::: TO: Nick Piggin <npiggin@kernel.dk>
:::::: CC: Nick Piggin <npiggin@kernel.dk>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 31320 bytes --]
^ permalink raw reply
* Re: [mm PATCH v3 21/23] mm: Add support for releasing multiple instances of a page
From: Alexander Duyck @ 2016-11-21 16:21 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Duyck, linux-mm, Netdev, linux-kernel@vger.kernel.org
In-Reply-To: <20161118152716.3f7acf6e25f142846909b2f6@linux-foundation.org>
On Fri, Nov 18, 2016 at 3:27 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Thu, 10 Nov 2016 06:36:06 -0500 Alexander Duyck <alexander.h.duyck@intel.com> wrote:
>
>> This patch adds a function that allows us to batch free a page that has
>> multiple references outstanding. Specifically this function can be used to
>> drop a page being used in the page frag alloc cache. With this drivers can
>> make use of functionality similar to the page frag alloc cache without
>> having to do any workarounds for the fact that there is no function that
>> frees multiple references.
>>
>> ...
>>
>> --- a/include/linux/gfp.h
>> +++ b/include/linux/gfp.h
>> @@ -506,6 +506,8 @@ extern void free_hot_cold_page(struct page *page, bool cold);
>> extern void free_hot_cold_page_list(struct list_head *list, bool cold);
>>
>> struct page_frag_cache;
>> +extern void __page_frag_drain(struct page *page, unsigned int order,
>> + unsigned int count);
>> extern void *__alloc_page_frag(struct page_frag_cache *nc,
>> unsigned int fragsz, gfp_t gfp_mask);
>> extern void __free_page_frag(void *addr);
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 0fbfead..54fea40 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -3912,6 +3912,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
>> return page;
>> }
>>
>> +void __page_frag_drain(struct page *page, unsigned int order,
>> + unsigned int count)
>> +{
>> + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
>> +
>> + if (page_ref_sub_and_test(page, count)) {
>> + if (order == 0)
>> + free_hot_cold_page(page, false);
>> + else
>> + __free_pages_ok(page, order);
>> + }
>> +}
>> +EXPORT_SYMBOL(__page_frag_drain);
>
> It's an exported-to-modules library function. It should be documented,
> please? The page-frag API is only partially documented, but that's no
> excuse.
Okay. I assume you want the documentation as a follow-up patch since
I received a notice that the patch was added to -mm?
> And perhaps documentation will help explain the naming choice. Why
> "drain"? I'd have expected "put"?
The idea was that this is supposed to be a counterpart to
__page_frag_refill. Basically it is a function we can use if we need
to tear down the page frag cache and free the backing page. If you
want I could update the names for these functions to make that
clarification that this is meant to drain a frag cache versus just
freeing a page frag. I had originally thought about coming up with an
mput or something like that since we are dropping multiple references,
but then I figured since we already had __page_frag_refill I would go
for __page_frag_drain.
> And why the leading underscores. The page-frag API is pretty weird :(
>
> And inconsistent. __alloc_page_frag -> page_frag_alloc,
> __free_page_frag -> page_frag_free(), etc. I must have been asleep
> when I let that lot through.
The leading underscores are inherited. Most of it has to do with the
fact that this is a backing API for the netdev sk_buff allocator.
When this stuff existed in net it was already named this way and I
just moved it over. I'm not sure if you approved it or not as I don't
see an Ack-by or Signed-off-by from you on the patch. The timing of
it was such that I think Linus approved it and it was then pulled in
through Dave's tree.
If you would like I could look at doing a couple of renaming patches
so that we make the API a bit more consistent. I could move the
__alloc and __free to what you have suggested, and then take a look at
trying to rename the refill/drain to be a bit more consistent in terms
of what they are supposed to work on and how they are supposed to be
used.
- Alex
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* Re: [RFC PATCH net v2 2/3] dt: bindings: add ethernet phy eee-disable-advert option documentation
From: Jerome Brunet @ 2016-11-21 16:16 UTC (permalink / raw)
To: Andrew Lunn
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA,
Florian Fainelli, Alexandre TORGUE, Neil Armstrong,
Martin Blumenstingl, Kevin Hilman,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andre Roth,
linux-amlogic-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Carlo Caione,
Giuseppe Cavallaro,
linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <20161121160149.GF1922-g2DYL2Zd6BY@public.gmane.org>
On Mon, 2016-11-21 at 17:01 +0100, Andrew Lunn wrote:
> On Mon, Nov 21, 2016 at 04:35:23PM +0100, Jerome Brunet wrote:
> >
> > Signed-off-by: Jerome Brunet <jbrunet-rdvid1DuHRBWk0Htik3J/w@public.gmane.org>
> > ---
> > Documentation/devicetree/bindings/net/phy.txt | 5 +++++
> > 1 file changed, 5 insertions(+)
> >
> > diff --git a/Documentation/devicetree/bindings/net/phy.txt
> > b/Documentation/devicetree/bindings/net/phy.txt
> > index bc1c3c8bf8fa..7f066b7c1e2c 100644
> > --- a/Documentation/devicetree/bindings/net/phy.txt
> > +++ b/Documentation/devicetree/bindings/net/phy.txt
> > @@ -35,6 +35,11 @@ Optional Properties:
> > - broken-turn-around: If set, indicates the PHY device does not
> > correctly
> > release the turn around line low at the end of a MDIO
> > transaction.
> >
> > +- eee-advert-disable: Bits to clear in the MDIO_AN_EEE_ADV
> > register to
> > + disable EEE modes. Example
> > + * 0x4: disable EEE for 1000T,
> > + * 0x6: disable EEE for 100TX and 1000T
> > +
>
> Hi Jerome
>
> I like the direction this patchset is taking. But hex values are
> pretty unfriendly.
Agreed
> Please add a set of boolean properties, and do the
> mapping to hex in the C code.
>
> That would also make extending this API easier. e.g. say you have a
> 10Gbps PHY with EEE, and you need to disable it. This hex value
> quickly gets ugly, eee-advert-disable-10000 is nice and simple.
What I did not realize when doing this patch for the realtek driver is
that there are already 6 valid modes defined in the kernel
#define MDIO_EEE_100TX MDIO_AN_EEE_ADV_100TX /*
100TX EEE cap */
#define MDIO_EEE_1000T MDIO_AN_EEE_ADV_1000T /*
1000T EEE cap */
#define MDIO_EEE_10GT 0x0008 /* 10GT EEE cap */
#define MDIO_EEE_1000KX 0x0010 /* 1000KX EEE cap
*/
#define MDIO_EEE_10GKX4 0x0020 /* 10G KX4 EEE cap
*/
#define MDIO_EEE_10GKR 0x0040 /* 10G KR EEE cap
*/
I took care of only 2 in the case of realtek.c since it only supports
MDIO_EEE_100TX and MDIO_EEE_1000T.
Defining a property for each is certainly doable but it does not look
very nice either. If it extends in the future, it will get even
messier, especially if you want to disable everything.
What do you think about keeping a single mask value but use the define
above in the DT ? It would be more readable than hex and easy to
extend, don't you think ?
These defines are already part of the uapi so I guess we can use those
in the DT bindings ?
>
> Andrew
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox