Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-next v3 4/4] vxlan: implement GPE
From: Tom Herbert @ 2016-04-05 13:50 UTC (permalink / raw)
  To: Jiri Benc; +Cc: Linux Kernel Network Developers, Jesse Gross
In-Reply-To: <2f2080cbe948be319e5725b2aa484e8996ed36ae.1459860270.git.jbenc@redhat.com>

On Tue, Apr 5, 2016 at 9:47 AM, Jiri Benc <jbenc@redhat.com> wrote:
> Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
> possible to support static configuration, too, if there is demand for it).
>
> The GPE header parsing has to be moved before iptunnel_pull_header, as we
> need to know the protocol.
>
> v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
>     (now called "raw mode") is added by this patch. This mode does not allow
>     Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
>     specify the encapsulation, IP header is encapsulated instead. The patch
>     does support Ethernet to be encapsulated, though, using ETH_P_TEB in
>     skb->protocol. This will be utilized by other COLLECT_METADATA users
>     (openvswitch in particular).
>
>     If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
>     ip route, it's easy to add a new flag switching the interface to
>     "Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
>     leave this out, it seems we don't need it.
>
>     Disallowed more flag combinations, especially RCO with GPE.
>     Added comment explaining that GBP and GPE cannot be set together.
>
I requested input from VXLAN protocol experts on whether RCO is
architecturally correct in VXLAN and whether we are using the right
fields both on the mailing list and in WG meeting yesterday @IETF.
Have not gotten any response, so I am going to assume all this is
reasonable. I will add explicit support to VXLAN-RCO draft for
VXLAN-GPE. The configuration option for RX RCO can be removed and RCO
can be supported for VXLAN/VXLAN-GPE in same way. Presumably, GBP
might make same assumptions but GBP format as defined for VXLAN isn't
compatible with VXLAN-GPE.

Tom

> Signed-off-by: Jiri Benc <jbenc@redhat.com>
> ---
>  drivers/net/vxlan.c          | 170 ++++++++++++++++++++++++++++++++++++++-----
>  include/net/vxlan.h          |  68 +++++++++++++++++
>  include/uapi/linux/if_link.h |   1 +
>  3 files changed, 222 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index d62eebaa9720..51cccddfe403 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -1192,6 +1192,45 @@ out:
>         unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
>  }
>
> +static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
> +                               __be32 *protocol,
> +                               struct sk_buff *skb, u32 vxflags)
> +{
> +       struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;
> +
> +       /* Need to have Next Protocol set for interfaces in GPE mode. */
> +       if (!gpe->np_applied)
> +               return false;
> +       /* "The initial version is 0. If a receiver does not support the
> +        * version indicated it MUST drop the packet.
> +        */
> +       if (gpe->version != 0)
> +               return false;
> +       /* "When the O bit is set to 1, the packet is an OAM packet and OAM
> +        * processing MUST occur." However, we don't implement OAM
> +        * processing, thus drop the packet.
> +        */
> +       if (gpe->oam_flag)
> +               return false;
> +
> +       switch (gpe->next_protocol) {
> +       case VXLAN_GPE_NP_IPV4:
> +               *protocol = htons(ETH_P_IP);
> +               break;
> +       case VXLAN_GPE_NP_IPV6:
> +               *protocol = htons(ETH_P_IPV6);
> +               break;
> +       case VXLAN_GPE_NP_ETHERNET:
> +               *protocol = htons(ETH_P_TEB);
> +               break;
> +       default:
> +               return false;
> +       }
> +
> +       unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
> +       return true;
> +}
> +
>  static bool vxlan_set_mac(struct vxlan_dev *vxlan,
>                           struct vxlan_sock *vs,
>                           struct sk_buff *skb)
> @@ -1257,9 +1296,11 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
>         struct vxlanhdr unparsed;
>         struct vxlan_metadata _md;
>         struct vxlan_metadata *md = &_md;
> +       __be32 protocol = htons(ETH_P_TEB);
> +       bool raw_proto = false;
>         void *oiph;
>
> -       /* Need Vxlan and inner Ethernet header to be present */
> +       /* Need UDP and VXLAN header to be present */
>         if (!pskb_may_pull(skb, VXLAN_HLEN))
>                 return 1;
>
> @@ -1283,9 +1324,18 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
>         if (!vxlan)
>                 goto drop;
>
> -       if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB),
> -                                !net_eq(vxlan->net, dev_net(vxlan->dev))))
> -               goto drop;
> +       /* For backwards compatibility, only allow reserved fields to be
> +        * used by VXLAN extensions if explicitly requested.
> +        */
> +       if (vs->flags & VXLAN_F_GPE) {
> +               if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
> +                       goto drop;
> +               raw_proto = true;
> +       }
> +
> +       if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
> +                                  !net_eq(vxlan->net, dev_net(vxlan->dev))))
> +                       goto drop;
>
>         if (vxlan_collect_metadata(vs)) {
>                 __be32 vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
> @@ -1304,14 +1354,14 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
>                 memset(md, 0, sizeof(*md));
>         }
>
> -       /* For backwards compatibility, only allow reserved fields to be
> -        * used by VXLAN extensions if explicitly requested.
> -        */
>         if (vs->flags & VXLAN_F_REMCSUM_RX)
>                 if (!vxlan_remcsum(&unparsed, skb, vs->flags))
>                         goto drop;
>         if (vs->flags & VXLAN_F_GBP)
>                 vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
> +       /* Note that GBP and GPE can never be active together. This is
> +        * ensured in vxlan_dev_configure.
> +        */
>
>         if (unparsed.vx_flags || unparsed.vx_vni) {
>                 /* If there are any unprocessed flags remaining treat
> @@ -1325,8 +1375,13 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
>                 goto drop;
>         }
>
> -       if (!vxlan_set_mac(vxlan, vs, skb))
> -               goto drop;
> +       if (!raw_proto) {
> +               if (!vxlan_set_mac(vxlan, vs, skb))
> +                       goto drop;
> +       } else {
> +               skb->dev = vxlan->dev;
> +               skb->pkt_type = PACKET_HOST;
> +       }
>
>         oiph = skb_network_header(skb);
>         skb_reset_network_header(skb);
> @@ -1685,6 +1740,27 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
>         gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
>  }
>
> +static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
> +                              __be16 protocol)
> +{
> +       struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
> +
> +       gpe->np_applied = 1;
> +
> +       switch (protocol) {
> +       case htons(ETH_P_IP):
> +               gpe->next_protocol = VXLAN_GPE_NP_IPV4;
> +               return 0;
> +       case htons(ETH_P_IPV6):
> +               gpe->next_protocol = VXLAN_GPE_NP_IPV6;
> +               return 0;
> +       case htons(ETH_P_TEB):
> +               gpe->next_protocol = VXLAN_GPE_NP_ETHERNET;
> +               return 0;
> +       }
> +       return -EPFNOSUPPORT;
> +}
> +
>  static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
>                            int iphdr_len, __be32 vni,
>                            struct vxlan_metadata *md, u32 vxflags,
> @@ -1694,6 +1770,7 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
>         int min_headroom;
>         int err;
>         int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
> +       __be16 inner_protocol = htons(ETH_P_TEB);
>
>         if ((vxflags & VXLAN_F_REMCSUM_TX) &&
>             skb->ip_summed == CHECKSUM_PARTIAL) {
> @@ -1712,10 +1789,8 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
>
>         /* Need space for new headers (invalidates iph ptr) */
>         err = skb_cow_head(skb, min_headroom);
> -       if (unlikely(err)) {
> -               kfree_skb(skb);
> -               return err;
> -       }
> +       if (unlikely(err))
> +               goto out_free;
>
>         skb = vlan_hwaccel_push_inside(skb);
>         if (WARN_ON(!skb))
> @@ -1744,9 +1819,19 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
>
>         if (vxflags & VXLAN_F_GBP)
>                 vxlan_build_gbp_hdr(vxh, vxflags, md);
> +       if (vxflags & VXLAN_F_GPE) {
> +               err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
> +               if (err < 0)
> +                       goto out_free;
> +               inner_protocol = skb->protocol;
> +       }
>
> -       skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> +       skb_set_inner_protocol(skb, inner_protocol);
>         return 0;
> +
> +out_free:
> +       kfree_skb(skb);
> +       return err;
>  }
>
>  static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
> @@ -2421,6 +2506,17 @@ static const struct net_device_ops vxlan_netdev_ether_ops = {
>         .ndo_fill_metadata_dst  = vxlan_fill_metadata_dst,
>  };
>
> +static const struct net_device_ops vxlan_netdev_raw_ops = {
> +       .ndo_init               = vxlan_init,
> +       .ndo_uninit             = vxlan_uninit,
> +       .ndo_open               = vxlan_open,
> +       .ndo_stop               = vxlan_stop,
> +       .ndo_start_xmit         = vxlan_xmit,
> +       .ndo_get_stats64        = ip_tunnel_get_stats64,
> +       .ndo_change_mtu         = vxlan_change_mtu,
> +       .ndo_fill_metadata_dst  = vxlan_fill_metadata_dst,
> +};
> +
>  /* Info for udev, that this is a virtual tunnel endpoint */
>  static struct device_type vxlan_type = {
>         .name = "vxlan",
> @@ -2500,6 +2596,17 @@ static void vxlan_ether_setup(struct net_device *dev)
>         dev->netdev_ops = &vxlan_netdev_ether_ops;
>  }
>
> +static void vxlan_raw_setup(struct net_device *dev)
> +{
> +       dev->type = ARPHRD_NONE;
> +       dev->hard_header_len = 0;
> +       dev->addr_len = 0;
> +       dev->mtu = ETH_DATA_LEN;
> +       dev->tx_queue_len = 1000;
> +       dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
> +       dev->netdev_ops = &vxlan_netdev_raw_ops;
> +}
> +
>  static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
>         [IFLA_VXLAN_ID]         = { .type = NLA_U32 },
>         [IFLA_VXLAN_GROUP]      = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
> @@ -2526,6 +2633,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
>         [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 },
>         [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 },
>         [IFLA_VXLAN_GBP]        = { .type = NLA_FLAG, },
> +       [IFLA_VXLAN_GPE]        = { .type = NLA_FLAG, },
>         [IFLA_VXLAN_REMCSUM_NOPARTIAL]  = { .type = NLA_FLAG },
>  };
>
> @@ -2726,7 +2834,20 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
>         __be16 default_port = vxlan->cfg.dst_port;
>         struct net_device *lowerdev = NULL;
>
> -       vxlan_ether_setup(dev);
> +       if (conf->flags & VXLAN_F_GPE) {
> +               if (conf->flags & ~VXLAN_F_ALLOWED_GPE)
> +                       return -EINVAL;
> +               /* For now, allow GPE only together with COLLECT_METADATA.
> +                * This can be relaxed later; in such case, the other side
> +                * of the PtP link will have to be provided.
> +                */
> +               if (!(conf->flags & VXLAN_F_COLLECT_METADATA))
> +                       return -EINVAL;
> +
> +               vxlan_raw_setup(dev);
> +       } else {
> +               vxlan_ether_setup(dev);
> +       }
>
>         vxlan->net = src_net;
>
> @@ -2789,8 +2910,12 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
>         dev->needed_headroom = needed_headroom;
>
>         memcpy(&vxlan->cfg, conf, sizeof(*conf));
> -       if (!vxlan->cfg.dst_port)
> -               vxlan->cfg.dst_port = default_port;
> +       if (!vxlan->cfg.dst_port) {
> +               if (conf->flags & VXLAN_F_GPE)
> +                       vxlan->cfg.dst_port = 4790; /* IANA assigned VXLAN-GPE port */
> +               else
> +                       vxlan->cfg.dst_port = default_port;
> +       }
>         vxlan->flags |= conf->flags;
>
>         if (!vxlan->cfg.age_interval)
> @@ -2961,6 +3086,9 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
>         if (data[IFLA_VXLAN_GBP])
>                 conf.flags |= VXLAN_F_GBP;
>
> +       if (data[IFLA_VXLAN_GPE])
> +               conf.flags |= VXLAN_F_GPE;
> +
>         if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL])
>                 conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL;
>
> @@ -2977,6 +3105,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
>         case -EEXIST:
>                 pr_info("duplicate VNI %u\n", be32_to_cpu(conf.vni));
>                 break;
> +
> +       case -EINVAL:
> +               pr_info("unsupported combination of extensions\n");
> +               break;
>         }
>
>         return err;
> @@ -3104,6 +3236,10 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
>             nla_put_flag(skb, IFLA_VXLAN_GBP))
>                 goto nla_put_failure;
>
> +       if (vxlan->flags & VXLAN_F_GPE &&
> +           nla_put_flag(skb, IFLA_VXLAN_GPE))
> +               goto nla_put_failure;
> +
>         if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL &&
>             nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
>                 goto nla_put_failure;
> diff --git a/include/net/vxlan.h b/include/net/vxlan.h
> index 73ed2e951c02..dcc6f4057115 100644
> --- a/include/net/vxlan.h
> +++ b/include/net/vxlan.h
> @@ -119,6 +119,64 @@ struct vxlanhdr_gbp {
>  #define VXLAN_GBP_POLICY_APPLIED       (BIT(3) << 16)
>  #define VXLAN_GBP_ID_MASK              (0xFFFF)
>
> +/*
> + * VXLAN Generic Protocol Extension (VXLAN_F_GPE):
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * |R|R|Ver|I|P|R|O|       Reserved                |Next Protocol  |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * |                VXLAN Network Identifier (VNI) |   Reserved    |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *
> + * Ver = Version. Indicates VXLAN GPE protocol version.
> + *
> + * P = Next Protocol Bit. The P bit is set to indicate that the
> + *     Next Protocol field is present.
> + *
> + * O = OAM Flag Bit. The O bit is set to indicate that the packet
> + *     is an OAM packet.
> + *
> + * Next Protocol = This 8 bit field indicates the protocol header
> + * immediately following the VXLAN GPE header.
> + *
> + * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01
> + */
> +
> +struct vxlanhdr_gpe {
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +       u8      oam_flag:1,
> +               reserved_flags1:1,
> +               np_applied:1,
> +               instance_applied:1,
> +               version:2,
> +reserved_flags2:2;
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +       u8      reserved_flags2:2,
> +               version:2,
> +               instance_applied:1,
> +               np_applied:1,
> +               reserved_flags1:1,
> +               oam_flag:1;
> +#endif
> +       u8      reserved_flags3;
> +       u8      reserved_flags4;
> +       u8      next_protocol;
> +       __be32  vx_vni;
> +};
> +
> +/* VXLAN-GPE header flags. */
> +#define VXLAN_HF_VER   cpu_to_be32(BIT(29) | BIT(28))
> +#define VXLAN_HF_NP    cpu_to_be32(BIT(26))
> +#define VXLAN_HF_OAM   cpu_to_be32(BIT(24))
> +
> +#define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \
> +                            cpu_to_be32(0xff))
> +
> +/* VXLAN-GPE header Next Protocol. */
> +#define VXLAN_GPE_NP_IPV4      0x01
> +#define VXLAN_GPE_NP_IPV6      0x02
> +#define VXLAN_GPE_NP_ETHERNET  0x03
> +#define VXLAN_GPE_NP_NSH       0x04
> +
>  struct vxlan_metadata {
>         u32             gbp;
>  };
> @@ -206,16 +264,26 @@ struct vxlan_dev {
>  #define VXLAN_F_GBP                    0x800
>  #define VXLAN_F_REMCSUM_NOPARTIAL      0x1000
>  #define VXLAN_F_COLLECT_METADATA       0x2000
> +#define VXLAN_F_GPE                    0x4000
>
>  /* Flags that are used in the receive path. These flags must match in
>   * order for a socket to be shareable
>   */
>  #define VXLAN_F_RCV_FLAGS              (VXLAN_F_GBP |                  \
> +                                        VXLAN_F_GPE |                  \
>                                          VXLAN_F_UDP_ZERO_CSUM6_RX |    \
>                                          VXLAN_F_REMCSUM_RX |           \
>                                          VXLAN_F_REMCSUM_NOPARTIAL |    \
>                                          VXLAN_F_COLLECT_METADATA)
>
> +/* Flags that can be set together with VXLAN_F_GPE. */
> +#define VXLAN_F_ALLOWED_GPE            (VXLAN_F_GPE |                  \
> +                                        VXLAN_F_IPV6 |                 \
> +                                        VXLAN_F_UDP_ZERO_CSUM_TX |     \
> +                                        VXLAN_F_UDP_ZERO_CSUM6_TX |    \
> +                                        VXLAN_F_UDP_ZERO_CSUM6_RX |    \
> +                                        VXLAN_F_COLLECT_METADATA)
> +
>  struct net_device *vxlan_dev_create(struct net *net, const char *name,
>                                     u8 name_assign_type, struct vxlan_config *conf);
>
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index c488066fb53a..9427f17d06d6 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -488,6 +488,7 @@ enum {
>         IFLA_VXLAN_REMCSUM_NOPARTIAL,
>         IFLA_VXLAN_COLLECT_METADATA,
>         IFLA_VXLAN_LABEL,
> +       IFLA_VXLAN_GPE,
>         __IFLA_VXLAN_MAX
>  };
>  #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
> --
> 1.8.3.1
>

^ permalink raw reply

* Re: [PATCH net-next v3 4/4] vxlan: implement GPE
From: Jiri Benc @ 2016-04-05 13:57 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Linux Kernel Network Developers, Jesse Gross
In-Reply-To: <CALx6S35gFDe+hiOtQqDUuzahkeh7vB3du3TN=noYefuDNGmCJw@mail.gmail.com>

On Tue, 5 Apr 2016 10:50:57 -0300, Tom Herbert wrote:
> I requested input from VXLAN protocol experts on whether RCO is
> architecturally correct in VXLAN and whether we are using the right
> fields both on the mailing list and in WG meeting yesterday @IETF.
> Have not gotten any response, so I am going to assume all this is
> reasonable. I will add explicit support to VXLAN-RCO draft for
> VXLAN-GPE. The configuration option for RX RCO can be removed and RCO
> can be supported for VXLAN/VXLAN-GPE in same way. Presumably, GBP
> might make same assumptions but GBP format as defined for VXLAN isn't
> compatible with VXLAN-GPE.

Cool, thanks! We'll loosen the restriction after it's added to the RFC,
it should be a simple one line patch.

 Jiri

^ permalink raw reply

* Hiking Enthusiasts List
From: Connie Garrett @ 2016-04-05 13:45 UTC (permalink / raw)
  To: netdev


Hi,

Hope this email finds you well!

Would you be interested in acquiring an email list of "Hiking Enthusiasts List" from USA?

We also have data for  Fishing Enthusiasts, Sports Enthusiasts, Outdoor Enthusiasts, Spa and Resort visitors, Camping Enthusiasts, Food Enthusiasts, Scuba Divers List, Cruise Travelers, Boat Owners, Foodies, Apparel Buyers, travellers, Luxury Brand Buyers, Gift buyers and many more.

Each record in the list contains Contact Name (First, Middle and Last Name), Mailing Address, List type and Opt-in email address.

All the contacts are opt-in verified, 100% permission based and can be used for unlimited multi-channel marketing.

Please let me know your thoughts towards procuring the Hiking Enthusiasts List.

Best Regards,
Connie Garrett
Research Analyst



We respect your privacy, if you do not wish to receive any further emails from our end, please reply with a subject “Leave Out”.


---
This email has been checked for viruses by Avast antivirus software.
https://www.avast.com/antivirus

^ permalink raw reply

* Re: [PATCH net-next 2/8] perf, bpf: allow bpf programs attach to tracepoints
From: Peter Zijlstra @ 2016-04-05 14:18 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Steven Rostedt, David S . Miller, Ingo Molnar, Daniel Borkmann,
	Arnaldo Carvalho de Melo, Wang Nan, Josef Bacik, Brendan Gregg,
	netdev, linux-kernel, kernel-team
In-Reply-To: <1459831974-2891931-3-git-send-email-ast@fb.com>

On Mon, Apr 04, 2016 at 09:52:48PM -0700, Alexei Starovoitov wrote:
> introduce BPF_PROG_TYPE_TRACEPOINT program type and allow it to be
> attached to tracepoints.

More specifically the perf tracepoint handler, not tracepoints directly.

> The tracepoint will copy the arguments in the per-cpu buffer and pass
> it to the bpf program as its first argument.

> The layout of the fields can be discovered by doing
> 'cat /sys/kernel/debug/tracing/events/sched/sched_switch/format'
> prior to the compilation of the program with exception that first 8 bytes
> are reserved and not accessible to the program. This area is used to store
> the pointer to 'struct pt_regs' which some of the bpf helpers will use:
> +---------+
> | 8 bytes | hidden 'struct pt_regs *' (inaccessible to bpf program)
> +---------+
> | N bytes | static tracepoint fields defined in tracepoint/format (bpf readonly)
> +---------+
> | dynamic | __dynamic_array bytes of tracepoint (inaccessible to bpf yet)
> +---------+
> 
> Not that all of the fields are already dumped to user space via perf ring buffer
> and some application access it directly without consulting tracepoint/format.

We call those apps broken..

> Same rule applies here: static tracepoint fields should only be accessed
> in a format defined in tracepoint/format. The order of fields and
> field sizes are not an ABI.


> @@ -56,8 +57,9 @@ perf_trace_##call(void *__data, proto)					\
>  			     sizeof(u64));				\
>  	__entry_size -= sizeof(u32);					\
>  									\
> -	entry = perf_trace_buf_prepare(__entry_size,			\
> -			event_call->event.type, &__regs, &rctx);	\
> +	event_type = prog ? TRACE_EVENT_TYPE_MAX : event_call->event.type; \
> +	entry = perf_trace_buf_prepare(__entry_size, event_type,	\
> +				       &__regs, &rctx);			\
>  	if (!entry)							\
>  		return;							\
>  									\
> @@ -67,6 +69,14 @@ perf_trace_##call(void *__data, proto)					\
>  									\
>  	{ assign; }							\
>  									\
> +	if (prog) {							\
> +		*(struct pt_regs **)entry = __regs;			\
> +		if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \
> +			perf_swevent_put_recursion_context(rctx);	\
> +			return;						\

So if the prog 'fails' you consume the entry,

> +		}							\
> +		memset(&entry->ent, 0, sizeof(entry->ent));		\

But if not, you destroy it and then feed it to perf?

> +	}								\
>  	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
>  		__count, __regs, head, __task);				\
>  }


> diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
> index 7a68afca8249..7ada829029d3 100644
> --- a/kernel/trace/trace_event_perf.c
> +++ b/kernel/trace/trace_event_perf.c
> @@ -284,6 +284,9 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
>  		*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
>  	raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
>  
> +	if (type == TRACE_EVENT_TYPE_MAX)
> +		return raw_data;
> +
>  	/* zero the dead bytes from align to not leak stack to user */
>  	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));

What's this hunk do? Why can you skip this stuff for BPF attached
events?

^ permalink raw reply

* [PATCH V2 0/2] net: socket: return a proper error code when source address becomes nonlocal
From: Liping Zhang @ 2016-04-05 14:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Liping Zhang

From: Liping Zhang <liping.zhang@spreadtrum.com>

This patch version 2 spilt the original patch into 2 patches, because it fix
two separate problems actually.

Liping Zhang (2):
  net: socket: return EADDRNOTAVAIL when source address becomes
    nonlocal
  net: socket: return EADDRNOTAVAIL when IPV6_PKTINFO's ipi6_addr is
    not available

 net/ipv4/route.c    |    6 ++++--
 net/ipv6/datagram.c |    2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

-- 
1.7.9.5

^ permalink raw reply

* [PATCH V2 1/2] net: socket: return EADDRNOTAVAIL when source address becomes nonlocal
From: Liping Zhang @ 2016-04-05 14:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Liping Zhang
In-Reply-To: <1459865986-18271-1-git-send-email-zlpnobody@163.com>

From: Liping Zhang <liping.zhang@spreadtrum.com>

A socket can use bind(directly) or connect(indirectly) to bind to a local
ip address, and later if the network becomes down, that cause the source
address becomes nonlocal, then send() call will fail and return EINVAL.
But this error code is confusing, acctually we did not pass any invalid
arguments. Furthermore, send() maybe return ok at first, it now returns
fail just because of a temporary network problem, i.e. when the network
recovery, send() call will become ok. Return EADDRNOTAVAIL instead of
EINVAL in such situation is better.

Take the ping utility for example, we can use -I option to specify the
source address (e.g ping -I 10.19.145.26 117.135.169.41), when network
becomes down, error message will be printed out as follows:
    64 bytes from  (117.135.169.41): icmp_seq=9 ttl=54 time=46.7 ms
    ping: sendmsg: Invalid argument
    ping: sendmsg: Invalid argument

Apply this patch, error message will become:
    64 bytes from  (117.135.169.41): icmp_seq=5 ttl=54 time=47.5 ms
    ping: sendmsg: Cannot assign requested address
    ping: sendmsg: Cannot assign requested address

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
---
 net/ipv4/route.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 02c6229..857f7b3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2149,11 +2149,13 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 
 	rcu_read_lock();
 	if (fl4->saddr) {
-		rth = ERR_PTR(-EINVAL);
+		rth = ERR_PTR(-EADDRNOTAVAIL);
 		if (ipv4_is_multicast(fl4->saddr) ||
 		    ipv4_is_lbcast(fl4->saddr) ||
-		    ipv4_is_zeronet(fl4->saddr))
+		    ipv4_is_zeronet(fl4->saddr)) {
+			rth = ERR_PTR(-EINVAL);
 			goto out;
+		}
 
 		/* I removed check for oif == dev_out->oif here.
 		   It was wrong for two reasons:
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH V2 2/2] net: socket: return EADDRNOTAVAIL when IPV6_PKTINFO's ipi6_addr is not available
From: Liping Zhang @ 2016-04-05 14:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Liping Zhang
In-Reply-To: <1459865986-18271-1-git-send-email-zlpnobody@163.com>

From: Liping Zhang <liping.zhang@spreadtrum.com>

We can use IPV6_PKTINFO to specify the ipv6 source address when call
sendmsg() to send packet, but if the address is not available, call will
fail and EINVAL is returned. This error code is not very appropriate,
it failed maybe just because of a temporary network problem, i.e. when
the network recovery, sendmsg() call will become ok. Also RFC3542,
section 6.6 describe an example in error scenario that returns
EADDRNOTAVAIL: "ipi6_ifindex specifies an interface but the address
ipi6_addr is not available for use on that interface.". So return
EADDRNOTAVAIL instead of EINVAL here.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
---
 net/ipv6/datagram.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index a73d701..20a968b 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -753,7 +753,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 						   strict ? dev : NULL, 0) &&
 				    !ipv6_chk_acast_addr_src(net, dev,
 							     &src_info->ipi6_addr))
-					err = -EINVAL;
+					err = -EADDRNOTAVAIL;
 				else
 					fl6->saddr = src_info->ipi6_addr;
 			}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH] ipsec: Fix error handling in the function xfrm6_get_addr
From: Bastien Philbert @ 2016-04-05 14:23 UTC (permalink / raw)
  To: herbert; +Cc: davem, kuznet, jmorris, yoshfuji, kaber, netdev, linux-kernel

This fixes the error handling in the function xfrm_get_addr by
checking if the call to ipv6_dev_get_saddr has failed and if
so return -EADDRNOTAVAIL to signal to the caller of xfrm6_get_addr
that the call has failed the error should be handled by the caller
while also freeing the dst_entry structure pointer in use by this

Signed-off-by: Bastien Philbert <bastienphilbert@gmail.com>
---
 net/ipv6/xfrm6_policy.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index c074771..759473e 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -64,7 +64,10 @@ static int xfrm6_get_saddr(struct net *net, int oif,
 		return -EHOSTUNREACH;
 
 	dev = ip6_dst_idev(dst)->dev;
-	ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6);
+	if (ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6)) {
+		dst_release(dst);
+		return -EADDRNOTAVAIL;
+	}
 	dst_release(dst);
 	return 0;
 }
-- 
2.5.0

^ permalink raw reply related

* [PATCH net-next] net: intel: remove dead links
From: Jiri Benc @ 2016-04-05 14:25 UTC (permalink / raw)
  To: netdev
  Cc: Jeff Kirsher, Jesse Brandeburg, Shannon Nelson, Carolyn Wyborny,
	Don Skidmore, Bruce Allan, John Ronciak, Mitch Williams,
	intel-wired-lan

The Kconfig for Intel NICs references two different URLs for the "Adapter
& Driver ID Guide". Neither of those two links works. The current URL seems
to be
http://www.intel.com/content/www/us/en/support/network-and-i-o/ethernet-products/000005584.html
but given it's apparently constantly changing, there's no point in having it
in the help text.

Just keep a generic pointer to http://support.intel.com. Hopefully, this one
will have a longer live. It still works, at least.

Futhermore, remove a link to "the latest Intel PRO/100 network driver for
Linux", this has no place in the mainline kernel and the latest Linux driver
it offers is from 2006, anyway.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
---
 drivers/net/ethernet/intel/Kconfig | 80 +++++++-------------------------------
 1 file changed, 14 insertions(+), 66 deletions(-)

diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index 3772f3ac956e..714bd1014ddb 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -25,16 +25,13 @@ config E100
 	  on the adapter. Look for a label that has a barcode and a number
 	  in the format 123456-001 (six digits hyphen three digits).
 
-	  Use the above information and the Adapter & Driver ID Guide at:
+	  Use the above information and the Adapter & Driver ID Guide that
+	  can be located at:
 
-	  <http://support.intel.com/support/network/adapter/pro100/21397.htm>
+	  <http://support.intel.com>
 
 	  to identify the adapter.
 
-	  For the latest Intel PRO/100 network driver for Linux, see:
-
-	  <http://www.intel.com/p/en_US/support/highlights/network/pro100plus>
-
 	  More specific information on configuring the driver is in
 	  <file:Documentation/networking/e100.txt>.
 
@@ -47,12 +44,7 @@ config E1000
 	---help---
 	  This driver supports Intel(R) PRO/1000 gigabit ethernet family of
 	  adapters.  For more information on how to identify your adapter, go
-	  to the Adapter & Driver ID Guide at:
-
-	  <http://support.intel.com/support/network/adapter/pro100/21397.htm>
-
-	  For general information and support, go to the Intel support
-	  website at:
+	  to the Adapter & Driver ID Guide that can be located at:
 
 	  <http://support.intel.com>
 
@@ -71,12 +63,8 @@ config E1000E
 	  This driver supports the PCI-Express Intel(R) PRO/1000 gigabit
 	  ethernet family of adapters. For PCI or PCI-X e1000 adapters,
 	  use the regular e1000 driver For more information on how to
-	  identify your adapter, go to the Adapter & Driver ID Guide at:
-
-	  <http://support.intel.com/support/network/adapter/pro100/21397.htm>
-
-	  For general information and support, go to the Intel support
-	  website at:
+	  identify your adapter, go to the Adapter & Driver ID Guide that
+	  can be located at:
 
 	  <http://support.intel.com>
 
@@ -101,12 +89,7 @@ config IGB
 	---help---
 	  This driver supports Intel(R) 82575/82576 gigabit ethernet family of
 	  adapters.  For more information on how to identify your adapter, go
-	  to the Adapter & Driver ID Guide at:
-
-	  <http://support.intel.com/support/network/adapter/pro100/21397.htm>
-
-	  For general information and support, go to the Intel support
-	  website at:
+	  to the Adapter & Driver ID Guide that can be located at:
 
 	  <http://support.intel.com>
 
@@ -142,12 +125,7 @@ config IGBVF
 	---help---
 	  This driver supports Intel(R) 82576 virtual functions.  For more
 	  information on how to identify your adapter, go to the Adapter &
-	  Driver ID Guide at:
-
-	  <http://support.intel.com/support/network/adapter/pro100/21397.htm>
-
-	  For general information and support, go to the Intel support
-	  website at:
+	  Driver ID Guide that can be located at:
 
 	  <http://support.intel.com>
 
@@ -164,12 +142,7 @@ config IXGB
 	  This driver supports Intel(R) PRO/10GbE family of adapters for
 	  PCI-X type cards. For PCI-E type cards, use the "ixgbe" driver
 	  instead. For more information on how to identify your adapter, go
-	  to the Adapter & Driver ID Guide at:
-
-	  <http://support.intel.com/support/network/adapter/pro100/21397.htm>
-
-	  For general information and support, go to the Intel support
-	  website at:
+	  to the Adapter & Driver ID Guide that can be located at:
 
 	  <http://support.intel.com>
 
@@ -187,12 +160,7 @@ config IXGBE
 	---help---
 	  This driver supports Intel(R) 10GbE PCI Express family of
 	  adapters.  For more information on how to identify your adapter, go
-	  to the Adapter & Driver ID Guide at:
-
-	  <http://support.intel.com/support/network/adapter/pro100/21397.htm>
-
-	  For general information and support, go to the Intel support
-	  website at:
+	  to the Adapter & Driver ID Guide that can be located at:
 
 	  <http://support.intel.com>
 
@@ -243,12 +211,7 @@ config IXGBEVF
 	---help---
 	  This driver supports Intel(R) PCI Express virtual functions for the
 	  Intel(R) ixgbe driver.  For more information on how to identify your
-	  adapter, go to the Adapter & Driver ID Guide at:
-
-	  <http://support.intel.com/support/network/sb/CS-008441.htm>
-
-	  For general information and support, go to the Intel support
-	  website at:
+	  adapter, go to the Adapter & Driver ID Guide that can be located at:
 
 	  <http://support.intel.com>
 
@@ -266,12 +229,7 @@ config I40E
 	---help---
 	  This driver supports Intel(R) Ethernet Controller XL710 Family of
 	  devices.  For more information on how to identify your adapter, go
-	  to the Adapter & Driver ID Guide at:
-
-	  <http://support.intel.com/support/network/adapter/pro100/21397.htm>
-
-	  For general information and support, go to the Intel support
-	  website at:
+	  to the Adapter & Driver ID Guide that can be located at:
 
 	  <http://support.intel.com>
 
@@ -326,12 +284,7 @@ config I40EVF
 	---help---
 	  This driver supports Intel(R) XL710 and X710 virtual functions.
 	  For more information on how to identify your adapter, go to the
-	  Adapter & Driver ID Guide at:
-
-	  <http://support.intel.com/support/network/sb/CS-008441.htm>
-
-	  For general information and support, go to the Intel support
-	  website at:
+	  Adapter & Driver ID Guide that can be located at:
 
 	  <http://support.intel.com>
 
@@ -347,12 +300,7 @@ config FM10K
 	---help---
 	  This driver supports Intel(R) FM10000 Ethernet Switch Host
 	  Interface.  For more information on how to identify your adapter,
-	  go to the Adapter & Driver ID Guide at:
-
-	  <http://support.intel.com/support/network/sb/CS-008441.htm>
-
-	  For general information and support, go to the Intel support
-	  website at:
+	  go to the Adapter & Driver ID Guide that can be located at:
 
 	  <http://support.intel.com>
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net] MAINTAINERS: intel-wired-lan list is moderated
From: Jiri Benc @ 2016-04-05 14:39 UTC (permalink / raw)
  To: netdev
  Cc: Jeff Kirsher, Jesse Brandeburg, Shannon Nelson, Carolyn Wyborny,
	Don Skidmore, Bruce Allan, John Ronciak, Mitch Williams

I got the following message:

> Your mail to 'Intel-wired-lan' with the subject
>
>     [PATCH net-next] net: intel: remove dead links
>
> Is being held until the list moderator can review it for approval.
>
> The reason it is being held:
>
>     Post by non-member to a members-only list

Mark the list as moderated.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 7ba7bc485d74..1d8ce9b9e4f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5750,7 +5750,7 @@ R:	Don Skidmore <donald.c.skidmore@intel.com>
 R:	Bruce Allan <bruce.w.allan@intel.com>
 R:	John Ronciak <john.ronciak@intel.com>
 R:	Mitch Williams <mitch.a.williams@intel.com>
-L:	intel-wired-lan@lists.osuosl.org
+L:	intel-wired-lan@lists.osuosl.org (moderated for non-subscribers)
 W:	http://www.intel.com/support/feedback.htm
 W:	http://e1000.sourceforge.net/
 Q:	http://patchwork.ozlabs.org/project/intel-wired-lan/list/
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net-next] net: intel: remove dead links
From: Jiri Benc @ 2016-04-05 14:44 UTC (permalink / raw)
  To: netdev
  Cc: Jeff Kirsher, Jesse Brandeburg, Shannon Nelson, Carolyn Wyborny,
	Don Skidmore, Bruce Allan, John Ronciak, Mitch Williams
In-Reply-To: <3c9f8ea384ba033f83ec360eef17732ff0a5a8dd.1459866280.git.jbenc@redhat.com>

[removing intel-wired-lan@lists.osuosl.org as I have no interest in the
annoying "awaits moderator approval" emails]

On Tue,  5 Apr 2016 16:25:07 +0200, Jiri Benc wrote:
> The Kconfig for Intel NICs references two different URLs for the "Adapter
> & Driver ID Guide". Neither of those two links works. The current URL seems
> to be
> http://www.intel.com/content/www/us/en/support/network-and-i-o/ethernet-products/000005584.html
> but given it's apparently constantly changing, there's no point in having it
> in the help text.
> 
> Just keep a generic pointer to http://support.intel.com. Hopefully, this one
> will have a longer live. It still works, at least.
> 
> Futhermore, remove a link to "the latest Intel PRO/100 network driver for
> Linux", this has no place in the mainline kernel and the latest Linux driver
> it offers is from 2006, anyway.

The patch also applies to net. It's probably better to apply it there
than to net-next.

 Jiri

^ permalink raw reply

* Re: [PATCH net-next 1/6] net: skbuff: don't use union for napi_id and sender_cpu
From: Michael S. Tsirkin @ 2016-04-05 15:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, davem, netdev, linux-kernel
In-Reply-To: <1459515859.6473.277.camel@edumazet-glaptop3.roam.corp.google.com>

On Fri, Apr 01, 2016 at 06:04:19AM -0700, Eric Dumazet wrote:
> On Fri, 2016-04-01 at 12:49 +0800, Jason Wang wrote:
> > 
> > On 04/01/2016 10:55 AM, Eric Dumazet wrote:
> > > On Fri, 2016-04-01 at 10:13 +0800, Jason Wang wrote:
> > >
> > >
> > >> The problem is we want to support busy polling for tun. This needs
> > >> napi_id to be passed to tun socket by sk_mark_napi_id() during
> > >> tun_net_xmit(). But before reaching this, XPS will set sender_cpu will
> > >> make us can't see correct napi_id.
> > >>
> > > Looks like napi_id should have precedence then ?
> > 
> > But then when busy polling is enabled, we may still hit the issue before
> > commit 2bd82484bb4c5db1d5dc983ac7c409b2782e0154? So looks like sometimes
> > (e.g for tun), we need both two fields.
> 
> You did not clearly show me the path you take where both fields would be
> needed. If you expect me to do that, it wont happen.
> 
> > 
> > >
> > > Only forwarding should allow the field to be cleared to allow XPS to do
> > > its job.
> > >
> > > Maybe skb_sender_cpu_clear() was removed too early (commit
> > > 64d4e3431e686dc37ce388ba531c4c4e866fb141)
> > 
> > Not sure I get you, but this will clear napi_id too.
> 
> Only when allowed. In your case it would not be called.
> 
> Some people do not use tun, and want to forward or cook millions of
> packets per second. sk_buff size is critical. 
> 
> If busy polling gives you 5 % of performance improvement, but cost
> everyone else a performance decrease, this is a serious problem.
> 
> XPS is a sender problem, NAPI is a receiver problem. Fields should be
> shared.

Right. The issue IIUC is the weird way tun behaves: it's a netdev
so linux is a sender, but it has a socket in it and then linux
is the receiver too. I guess we need to find a way to special-case
tun somehow?

-- 
MST

^ permalink raw reply

* Re: [net PATCH v2 2/2] ipv4/GRO: Make GRO conform to RFC 6864
From: Edward Cree @ 2016-04-05 15:07 UTC (permalink / raw)
  To: Herbert Xu, Alexander Duyck
  Cc: Alexander Duyck, Tom Herbert, Jesse Gross, Eric Dumazet, Netdev,
	David Miller
In-Reply-To: <20160405043209.GA9822@gondor.apana.org.au>

On 05/04/16 05:32, Herbert Xu wrote:
> On Mon, Apr 04, 2016 at 09:26:55PM -0700, Alexander Duyck wrote:
>> The question I would have is what are you really losing with increment
>> from 0 versus fixed 0?  From what I see it is essentially just garbage
>> in/garbage out.
> GRO is meant to be lossless, that is, you should not be able to
> detect its presence from the outside.  If you lose information then
> you're breaking this rule and people will soon start asking for it
> to be disabled in various situations.
>
> I'm not against doing this per se but it should not be part of the
> default configuration.
I'm certainly in favour of this being configurable - indeed IMHO it should
also be possible to configure GRO with the 'looser' semantics of LRO, so
that people who want that can get it without all the horrible "don't confuse
Slow Start" hacks, and so that LRO can go away (AIUI the only reasons it
exists are (a) improved performance from the 'loose' semantics and (b) old
kernels without GRO.  We may not be able to kill (b) but we can certainly
address (a)).

But I don't agree that the default has to be totally lossless; anyone who is
caring about the ID fields in atomic datagrams is breaking the RFCs, and can
be assumed to Know What They're Doing sufficiently to configure this.

On the gripping hand, I feel like GRO+TSO is the wrong model for speeding up
forwarding/routing workloads.  Instead we should be looking into having lists
of SKBs traverse the stack together, splitting the list whenever e.g. the
destination changes.  That seems like it ought to be much more efficient than
rewriting headers twice, once to coalesce a superframe and once to segment it
again - and it also means this worry about GRO being lossless can go away.
But until someone tries implementing skb batches, we won't know for sure if
it works (and I don't have time right now ;)

-Ed

^ permalink raw reply

* [PATCH net-next 0/3] sock: lockdep tightening
From: Hannes Frederic Sowa @ 2016-04-05 15:10 UTC (permalink / raw)
  To: netdev; +Cc: daniel

First patch is from Eric Dumazet and improves lockdep accuracy for
socket locks. After that, second patch introduces lockdep_sock_is_held
and uses it. Final patch reverts and reworks the lockdep fix from Daniel
in the filter code, as we now have tighter lockdep support.

Hannes Frederic Sowa (3):
  sock: fix lockdep annotation in release_sock
  net: introduce lockdep_is_held and update various places to use it
  tun: use socket locks for sk_{attach,detatch}_filter

 drivers/net/tun.c        | 14 +++++++++-----
 include/linux/filter.h   |  4 ----
 include/net/sock.h       | 19 ++++++++++++++++---
 net/core/filter.c        | 35 +++++++++++++----------------------
 net/core/sock.c          |  5 -----
 net/dccp/ipv4.c          |  2 +-
 net/dccp/ipv6.c          |  2 +-
 net/ipv4/af_inet.c       |  2 +-
 net/ipv4/cipso_ipv4.c    |  3 ++-
 net/ipv4/ip_sockglue.c   |  4 ++--
 net/ipv4/tcp_ipv4.c      |  8 +++-----
 net/ipv6/ipv6_sockglue.c |  6 ++++--
 net/ipv6/tcp_ipv6.c      |  2 +-
 net/socket.c             |  2 +-
 14 files changed, 54 insertions(+), 54 deletions(-)

-- 
2.5.5

^ permalink raw reply

* [PATCH net-next 1/3] sock: fix lockdep annotation in release_sock
From: Hannes Frederic Sowa @ 2016-04-05 15:10 UTC (permalink / raw)
  To: netdev; +Cc: daniel, Eric Dumazet
In-Reply-To: <1459869016-13896-1-git-send-email-hannes@stressinduktion.org>

During release_sock we use callbacks to finish the processing
of outstanding skbs on the socket. We actually are still locked,
sk_locked.owned == 1, but we already told lockdep that the mutex
is released. This could lead to false positives in lockdep for
lockdep_sock_is_held (we don't hold the slock spinlock during processing
the outstanding skbs).

I took over this patch from Eric Dumazet and tested it.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 include/net/sock.h | 7 ++++++-
 net/core/sock.c    | 5 -----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 310c4367ea83f6..4654ad45e84e7f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1329,7 +1329,12 @@ static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 
 static inline void sock_release_ownership(struct sock *sk)
 {
-	sk->sk_lock.owned = 0;
+	if (sk->sk_lock.owned) {
+		sk->sk_lock.owned = 0;
+
+		/* The sk_lock has mutex_unlock() semantics: */
+		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+	}
 }
 
 /*
diff --git a/net/core/sock.c b/net/core/sock.c
index 2f517ea5678612..f5923e9c92bb0d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2467,11 +2467,6 @@ EXPORT_SYMBOL(lock_sock_nested);
 
 void release_sock(struct sock *sk)
 {
-	/*
-	 * The sk_lock has mutex_unlock() semantics:
-	 */
-	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-
 	spin_lock_bh(&sk->sk_lock.slock);
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
-- 
2.5.5

^ permalink raw reply related

* [PATCH net-next 2/3] net: introduce lockdep_is_held and update various places to use it
From: Hannes Frederic Sowa @ 2016-04-05 15:10 UTC (permalink / raw)
  To: netdev; +Cc: daniel
In-Reply-To: <1459869016-13896-1-git-send-email-hannes@stressinduktion.org>

The socket is either locked if we hold the slock spin_lock for
lock_sock_fast and unlock_sock_fast or we own the lock (sk_lock.owned
!= 0). Check for this and at the same time improve that the current
thread/cpu is really holding the lock.

Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 include/net/sock.h       | 12 ++++++++++--
 net/dccp/ipv4.c          |  2 +-
 net/dccp/ipv6.c          |  2 +-
 net/ipv4/af_inet.c       |  2 +-
 net/ipv4/cipso_ipv4.c    |  3 ++-
 net/ipv4/ip_sockglue.c   |  4 ++--
 net/ipv4/tcp_ipv4.c      |  8 +++-----
 net/ipv6/ipv6_sockglue.c |  6 ++++--
 net/ipv6/tcp_ipv6.c      |  2 +-
 net/socket.c             |  2 +-
 10 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 4654ad45e84e7f..57c894ef37bac8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1356,6 +1356,14 @@ do {									\
 	lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);	\
 } while (0)
 
+static bool lockdep_sock_is_held(const struct sock *csk)
+{
+	struct sock *sk = (struct sock *)csk;
+
+	return lockdep_is_held(&sk->sk_lock) ||
+	       lockdep_is_held(&sk->sk_lock.slock);
+}
+
 void lock_sock_nested(struct sock *sk, int subclass);
 
 static inline void lock_sock(struct sock *sk)
@@ -1594,8 +1602,8 @@ static inline void sk_rethink_txhash(struct sock *sk)
 static inline struct dst_entry *
 __sk_dst_get(struct sock *sk)
 {
-	return rcu_dereference_check(sk->sk_dst_cache, sock_owned_by_user(sk) ||
-						       lockdep_is_held(&sk->sk_lock.slock));
+	return rcu_dereference_check(sk->sk_dst_cache,
+				     lockdep_sock_is_held(sk));
 }
 
 static inline struct dst_entry *
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 6438c5a7efc411..f6d183f8f33222 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -62,7 +62,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	nexthop = daddr = usin->sin_addr.s_addr;
 
 	inet_opt = rcu_dereference_protected(inet->inet_opt,
-					     sock_owned_by_user(sk));
+					     lockdep_sock_is_held(sk));
 	if (inet_opt != NULL && inet_opt->opt.srr) {
 		if (daddr == 0)
 			return -EINVAL;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 71bf1deba4c5ed..8ceb3cebcad4b6 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -868,7 +868,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl6.fl6_sport = inet->inet_sport;
 	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
-	opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+	opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
 	final_p = fl6_update_dst(&fl6, opt, &final);
 
 	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9e481992dbaef2..7e37ebb5af396e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1106,7 +1106,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	struct ip_options_rcu *inet_opt;
 
 	inet_opt = rcu_dereference_protected(inet->inet_opt,
-					     sock_owned_by_user(sk));
+					     lockdep_sock_is_held(sk));
 	if (inet_opt && inet_opt->opt.srr)
 		daddr = inet_opt->opt.faddr;
 
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index bdb2a07ec363b7..40d6b87713a132 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1933,7 +1933,8 @@ int cipso_v4_sock_setattr(struct sock *sk,
 
 	sk_inet = inet_sk(sk);
 
-	old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
+	old = rcu_dereference_protected(sk_inet->inet_opt,
+					lockdep_sock_is_held(sk));
 	if (sk_inet->is_icsk) {
 		sk_conn = inet_csk(sk);
 		if (old)
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 1b7c0776c805b5..89b5f3bd669436 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -642,7 +642,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		if (err)
 			break;
 		old = rcu_dereference_protected(inet->inet_opt,
-						sock_owned_by_user(sk));
+						lockdep_sock_is_held(sk));
 		if (inet->is_icsk) {
 			struct inet_connection_sock *icsk = inet_csk(sk);
 #if IS_ENABLED(CONFIG_IPV6)
@@ -1302,7 +1302,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 		struct ip_options_rcu *inet_opt;
 
 		inet_opt = rcu_dereference_protected(inet->inet_opt,
-						     sock_owned_by_user(sk));
+						     lockdep_sock_is_held(sk));
 		opt->optlen = 0;
 		if (inet_opt)
 			memcpy(optbuf, &inet_opt->opt,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 456ff3d6a13223..f4f2a0a3849d3d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -157,7 +157,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	nexthop = daddr = usin->sin_addr.s_addr;
 	inet_opt = rcu_dereference_protected(inet->inet_opt,
-					     sock_owned_by_user(sk));
+					     lockdep_sock_is_held(sk));
 	if (inet_opt && inet_opt->opt.srr) {
 		if (!daddr)
 			return -EINVAL;
@@ -882,8 +882,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 
 	/* caller either holds rcu_read_lock() or socket lock */
 	md5sig = rcu_dereference_check(tp->md5sig_info,
-				       sock_owned_by_user(sk) ||
-				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
+				       lockdep_sock_is_held(sk));
 	if (!md5sig)
 		return NULL;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -928,8 +927,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 	}
 
 	md5sig = rcu_dereference_protected(tp->md5sig_info,
-					   sock_owned_by_user(sk) ||
-					   lockdep_is_held(&sk->sk_lock.slock));
+					   lockdep_sock_is_held(sk));
 	if (!md5sig) {
 		md5sig = kmalloc(sizeof(*md5sig), gfp);
 		if (!md5sig)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a5557d22f89ea9..4ff4b29894ebfe 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -407,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
 			break;
 
-		opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+		opt = rcu_dereference_protected(np->opt,
+						lockdep_sock_is_held(sk));
 		opt = ipv6_renew_options(sk, opt, optname,
 					 (struct ipv6_opt_hdr __user *)optval,
 					 optlen);
@@ -1124,7 +1125,8 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		struct ipv6_txoptions *opt;
 
 		lock_sock(sk);
-		opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+		opt = rcu_dereference_protected(np->opt,
+						lockdep_sock_is_held(sk));
 		len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len);
 		release_sock(sk);
 		/* check if ipv6_getsockopt_sticky() returns err code */
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7cde1b6fdda3fc..0e621bc1ae11c8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -234,7 +234,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl6.fl6_dport = usin->sin6_port;
 	fl6.fl6_sport = inet->inet_sport;
 
-	opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+	opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
 	final_p = fl6_update_dst(&fl6, opt, &final);
 
 	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
diff --git a/net/socket.c b/net/socket.c
index 979d3146b081d7..afa3c347071735 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1046,7 +1046,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
 		return -EINVAL;
 
 	lock_sock(sk);
-	wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));
+	wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk));
 	fasync_helper(fd, filp, on, &wq->fasync_list);
 
 	if (!wq->fasync_list)
-- 
2.5.5

^ permalink raw reply related

* [PATCH net-next 3/3] tun: use socket locks for sk_{attach,detatch}_filter
From: Hannes Frederic Sowa @ 2016-04-05 15:10 UTC (permalink / raw)
  To: netdev; +Cc: daniel
In-Reply-To: <1459869016-13896-1-git-send-email-hannes@stressinduktion.org>

This reverts commit 5a5abb1fa3b05dd ("tun, bpf: fix suspicious RCU usage
in tun_{attach, detach}_filter") and replaces it to use lock_sock around
sk_{attach,detach}_filter. The checks inside filter.c are updated with
lockdep_sock_is_held to check for proper socket locks.

It keeps the code cleaner by ensuring that only one lock governs the
socket filter instead of two independent locks.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 drivers/net/tun.c      | 14 +++++++++-----
 include/linux/filter.h |  4 ----
 net/core/filter.c      | 35 +++++++++++++----------------------
 3 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9abc36bf77eae3..64bc143eddd983 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -622,8 +622,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
 
 	/* Re-attach the filter to persist device */
 	if (!skip_filter && (tun->filter_attached == true)) {
-		err = __sk_attach_filter(&tun->fprog, tfile->socket.sk,
-					 lockdep_rtnl_is_held());
+		lock_sock(tfile->socket.sk);
+		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+		release_sock(tfile->socket.sk);
 		if (!err)
 			goto out;
 	}
@@ -1824,7 +1825,9 @@ static void tun_detach_filter(struct tun_struct *tun, int n)
 
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
-		__sk_detach_filter(tfile->socket.sk, lockdep_rtnl_is_held());
+		lock_sock(tfile->socket.sk);
+		sk_detach_filter(tfile->socket.sk);
+		release_sock(tfile->socket.sk);
 	}
 
 	tun->filter_attached = false;
@@ -1837,8 +1840,9 @@ static int tun_attach_filter(struct tun_struct *tun)
 
 	for (i = 0; i < tun->numqueues; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
-		ret = __sk_attach_filter(&tun->fprog, tfile->socket.sk,
-					 lockdep_rtnl_is_held());
+		lock_sock(tfile->socket.sk);
+		ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+		release_sock(tfile->socket.sk);
 		if (ret) {
 			tun_detach_filter(tun, i);
 			return ret;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a51a5361695fbf..43aa1f8855c7ff 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -465,14 +465,10 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
 void bpf_prog_destroy(struct bpf_prog *fp);
 
 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
-int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
-		       bool locked);
 int sk_attach_bpf(u32 ufd, struct sock *sk);
 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
 int sk_detach_filter(struct sock *sk);
-int __sk_detach_filter(struct sock *sk, bool locked);
-
 int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
 		  unsigned int len);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index ca7f832b29802d..e8486ba601eae7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1149,8 +1149,7 @@ void bpf_prog_destroy(struct bpf_prog *fp)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_destroy);
 
-static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk,
-			    bool locked)
+static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
 {
 	struct sk_filter *fp, *old_fp;
 
@@ -1166,8 +1165,10 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk,
 		return -ENOMEM;
 	}
 
-	old_fp = rcu_dereference_protected(sk->sk_filter, locked);
+	old_fp = rcu_dereference_protected(sk->sk_filter,
+					   lockdep_sock_is_held(sk));
 	rcu_assign_pointer(sk->sk_filter, fp);
+
 	if (old_fp)
 		sk_filter_uncharge(sk, old_fp);
 
@@ -1246,8 +1247,7 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
  * occurs or there is insufficient memory for the filter a negative
  * errno code is returned. On success the return is zero.
  */
-int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
-		       bool locked)
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 {
 	struct bpf_prog *prog = __get_filter(fprog, sk);
 	int err;
@@ -1255,7 +1255,7 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	err = __sk_attach_prog(prog, sk, locked);
+	err = __sk_attach_prog(prog, sk);
 	if (err < 0) {
 		__bpf_prog_release(prog);
 		return err;
@@ -1263,12 +1263,7 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__sk_attach_filter);
-
-int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
-{
-	return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk));
-}
+EXPORT_SYMBOL_GPL(sk_attach_filter);
 
 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 {
@@ -1314,7 +1309,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk));
+	err = __sk_attach_prog(prog, sk);
 	if (err < 0) {
 		bpf_prog_put(prog);
 		return err;
@@ -2255,7 +2250,7 @@ static int __init register_sk_filter_ops(void)
 }
 late_initcall(register_sk_filter_ops);
 
-int __sk_detach_filter(struct sock *sk, bool locked)
+int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
 	struct sk_filter *filter;
@@ -2263,7 +2258,8 @@ int __sk_detach_filter(struct sock *sk, bool locked)
 	if (sock_flag(sk, SOCK_FILTER_LOCKED))
 		return -EPERM;
 
-	filter = rcu_dereference_protected(sk->sk_filter, locked);
+	filter = rcu_dereference_protected(sk->sk_filter,
+					   lockdep_sock_is_held(sk));
 	if (filter) {
 		RCU_INIT_POINTER(sk->sk_filter, NULL);
 		sk_filter_uncharge(sk, filter);
@@ -2272,12 +2268,7 @@ int __sk_detach_filter(struct sock *sk, bool locked)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__sk_detach_filter);
-
-int sk_detach_filter(struct sock *sk)
-{
-	return __sk_detach_filter(sk, sock_owned_by_user(sk));
-}
+EXPORT_SYMBOL_GPL(sk_detach_filter);
 
 int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
 		  unsigned int len)
@@ -2288,7 +2279,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
 
 	lock_sock(sk);
 	filter = rcu_dereference_protected(sk->sk_filter,
-					   sock_owned_by_user(sk));
+					   lockdep_sock_is_held(sk));
 	if (!filter)
 		goto out;
 
-- 
2.5.5

^ permalink raw reply related

* [PATCH net-next] net: dsa: document missing functions
From: Vivien Didelot @ 2016-04-05 15:22 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn,
	Florian Fainelli, Vivien Didelot

Add description for the missing port_vlan_prepare, port_fdb_prepare,
port_fdb_dump functions in the DSA documentation.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 Documentation/networking/dsa/dsa.txt | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index 3b196c3..8ba3369 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -542,6 +542,12 @@ Bridge layer
 Bridge VLAN filtering
 ---------------------
 
+- port_vlan_prepare: bridge layer function invoked when the bridge prepares the
+  configuration of a VLAN on the given port. If the operation is not
+  programmable, this function should return -EOPNOTSUPP to inform the bridge
+  code to fallback to a software implementation. No hardware programmation
+  must be done in this function. See port_vlan_add for this and details.
+
 - port_vlan_add: bridge layer function invoked when a VLAN is configured
   (tagged or untagged) for the given switch port
 
@@ -552,6 +558,12 @@ Bridge VLAN filtering
   function that the driver has to call for each VLAN the given port is a member
   of. A switchdev object is used to carry the VID and bridge flags.
 
+- port_fdb_prepare: bridge layer function invoked when the bridge prepares the
+  installation of a Forwarding Database entry. If the operation is not
+  programmable, this function should return -EOPNOTSUPP to inform the bridge
+  code to fallback to a software implementation. No hardware programmation
+  must be done in this function. See port_fdb_add for this and details.
+
 - port_fdb_add: bridge layer function invoked when the bridge wants to install a
   Forwarding Database entry, the switch hardware should be programmed with the
   specified address in the specified VLAN Id in the forwarding database
@@ -565,6 +577,10 @@ of DSA, would be the its port-based VLAN, used by the associated bridge device.
   the specified MAC address from the specified VLAN ID if it was mapped into
   this port forwarding database
 
+- port_fdb_dump: bridge layer function invoked with a switchdev callback
+  function that the driver has to call for each MAC address known to be behind
+  the given port. A switchdev object is used to carry the VID and FDB info.
+
 TODO
 ====
 
-- 
2.8.0

^ permalink raw reply related

* [PATCH net-next 0/8] udp: GRO in UDP sockets
From: Tom Herbert @ 2016-04-05 15:22 UTC (permalink / raw)
  To: davem, netdev; +Cc: kernel-team

This patch set adds GRO functions (gro_receive and gro_complete) to UDP
sockets and removes udp_offload infrastructure.

Add GRO functions (gro_receive and gro_complete) to UDP sockets. In
udp_gro_receive and udp_gro_complete a socket lookup is done instead of
looking up the port number in udp_offloads.  If a socket is found and
there are GRO functions for it then those are called. This feature
allows binding GRO functions to more than just a port number.
Eventually, we will be able to use this technique to allow application
defined GRO for an application protocol by attaching BPF porgrams to UDP
sockets for doing GRO.

In order to implement these functions, we added exported
udp6_lib_lookup_skb and udp4_lib_lookup_skb functions in ipv4/udp.c and
ipv6/udp.c. Also, inet_iif and references to skb_dst() were changed to
check that dst is set in skbuf before derefencing. In the GRO path there
is now a UDP socket lookup performed before dst is set, to the get the
device in that case we simply use skb->dev.

Tested:

Ran various combinations of VXLAN and GUE TCP_STREAM and TCP_RR tests.
Did not see any material regression.

Tom Herbert (8):
  net: Checks skb_dst to be NULL in inet_iif
  udp: Add udp6_lib_lookup_skb and udp4_lib_lookup_skb
  udp: Add GRO functions to UDP socket
  udp: Add socket based GRO and config
  vxlan: change vxlan to use UDP socket GRO
  fou: change to use UDP socket GRO
  geneve: change to use UDP socket GRO
  udp: Remove udp_offloads

 drivers/net/geneve.c      |  28 ++++--------
 drivers/net/vxlan.c       |  30 ++++--------
 include/linux/netdevice.h |  17 -------
 include/linux/udp.h       |   8 ++++
 include/net/protocol.h    |   3 --
 include/net/route.h       |   7 +--
 include/net/udp.h         |  11 ++++-
 include/net/udp_tunnel.h  |   7 +++
 include/net/vxlan.h       |   1 -
 net/ipv4/fou.c            |  48 +++++++-------------
 net/ipv4/udp.c            |  13 ++++++
 net/ipv4/udp_offload.c    | 113 ++++++++--------------------------------------
 net/ipv4/udp_tunnel.c     |   2 +
 net/ipv6/Makefile         |   5 +-
 net/ipv6/af_inet6.c       |   8 ++++
 net/ipv6/ip6_offload.c    |   2 -
 net/ipv6/ip6_offload.h    |   3 +-
 net/ipv6/udp.c            |  13 ++++++
 net/ipv6/udp_offload.c    |  11 +++--
 19 files changed, 129 insertions(+), 201 deletions(-)

-- 
2.8.0.rc2

^ permalink raw reply

* [PATCH net-next 1/8] net: Checks skb_dst to be NULL in inet_iif
From: Tom Herbert @ 2016-04-05 15:22 UTC (permalink / raw)
  To: davem, netdev; +Cc: kernel-team
In-Reply-To: <1459869776-2090500-1-git-send-email-tom@herbertland.com>

In inet_iif check if skb_rtable is NULL for the skb and return
skb->skb_iif if it is.

This change allows inet_iif to be called before the dst
information has been set in the skb (e.g. when doing socket based
UDP GRO).

Signed-off-by: Tom Herbert <tom@herbertland.com>
---
 include/net/route.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 9b0a523..f4b11ee 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -322,10 +322,11 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable
 
 static inline int inet_iif(const struct sk_buff *skb)
 {
-	int iif = skb_rtable(skb)->rt_iif;
+	struct rtable *rt = skb_rtable(skb);
+
+	if (rt && rt->rt_iif)
+		return rt->rt_iif;
 
-	if (iif)
-		return iif;
 	return skb->skb_iif;
 }
 
-- 
2.8.0.rc2

^ permalink raw reply related

* [PATCH net-next 4/8] udp: Add socket based GRO and config
From: Tom Herbert @ 2016-04-05 15:22 UTC (permalink / raw)
  To: davem, netdev; +Cc: kernel-team
In-Reply-To: <1459869776-2090500-1-git-send-email-tom@herbertland.com>

Add gro_receive and  gro_complete to struct udp_tunnel_sock_cfg.

Signed-off-by: Tom Herbert <tom@herbertland.com>
---
 include/net/udp_tunnel.h | 7 +++++++
 net/ipv4/udp_tunnel.c    | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index b831140..2dcf1de 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -64,6 +64,11 @@ static inline int udp_sock_create(struct net *net,
 
 typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
 typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk);
+typedef struct sk_buff **(*udp_tunnel_gro_receive_t)(struct sock *sk,
+						     struct sk_buff **head,
+						     struct sk_buff *skb);
+typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb,
+					 int nhoff);
 
 struct udp_tunnel_sock_cfg {
 	void *sk_user_data;     /* user data used by encap_rcv call back */
@@ -71,6 +76,8 @@ struct udp_tunnel_sock_cfg {
 	__u8  encap_type;
 	udp_tunnel_encap_rcv_t encap_rcv;
 	udp_tunnel_encap_destroy_t encap_destroy;
+	udp_tunnel_gro_receive_t gro_receive;
+	udp_tunnel_gro_complete_t gro_complete;
 };
 
 /* Setup the given (UDP) sock to receive UDP encapsulated packets */
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 96599d1..47f12c7 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -69,6 +69,8 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 	udp_sk(sk)->encap_type = cfg->encap_type;
 	udp_sk(sk)->encap_rcv = cfg->encap_rcv;
 	udp_sk(sk)->encap_destroy = cfg->encap_destroy;
+	udp_sk(sk)->gro_receive = cfg->gro_receive;
+	udp_sk(sk)->gro_complete = cfg->gro_complete;
 
 	udp_tunnel_encap_enable(sock);
 }
-- 
2.8.0.rc2

^ permalink raw reply related

* [PATCH net-next 3/8] udp: Add GRO functions to UDP socket
From: Tom Herbert @ 2016-04-05 15:22 UTC (permalink / raw)
  To: davem, netdev; +Cc: kernel-team
In-Reply-To: <1459869776-2090500-1-git-send-email-tom@herbertland.com>

This patch adds GRO functions (gro_receive and gro_complete) to UDP
sockets. udp_gro_receive is changed to perform socket lookup on a
packet. If a socket is found the related GRO functions are called.

This features obsoletes using UDP offload infrastructure for GRO
(udp_offload). This has the advantage of not being limited to provide
offload on a per port basis, GRO is now applied to whatever individual
UDP sockets are bound to.  This also allows the possbility of
"application defined GRO"-- that is we can attach something like
a BPF program to a UDP socket to perfrom GRO on an application
layer protocol.

Signed-off-by: Tom Herbert <tom@herbertland.com>
---
 include/linux/udp.h    |  8 ++++++++
 include/net/udp.h      |  7 +++++--
 net/ipv4/udp_offload.c | 52 +++++++++++++++++++-------------------------------
 net/ipv6/Makefile      |  5 +++--
 net/ipv6/af_inet6.c    |  8 ++++++++
 net/ipv6/ip6_offload.c |  2 --
 net/ipv6/ip6_offload.h |  3 ++-
 net/ipv6/udp_offload.c | 11 ++++++++---
 8 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 3234275..d1fd8cd 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -71,6 +71,14 @@ struct udp_sock {
 	 */
 	int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
 	void (*encap_destroy)(struct sock *sk);
+
+	/* GRO functions for UDP socket */
+	struct sk_buff **	(*gro_receive)(struct sock *sk,
+					       struct sk_buff **head,
+					       struct sk_buff *skb);
+	int			(*gro_complete)(struct sock *sk,
+						struct sk_buff *skb,
+						int nhoff);
 };
 
 static inline struct udp_sock *udp_sk(const struct sock *sk)
diff --git a/include/net/udp.h b/include/net/udp.h
index e2e3803..e245a1d 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -158,9 +158,12 @@ static inline __sum16 udp_v4_check(int len, __be32 saddr,
 void udp_set_csum(bool nocheck, struct sk_buff *skb,
 		  __be32 saddr, __be32 daddr, int len);
 
+typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport,
+				     __be16 dport);
+
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
-				 struct udphdr *uh);
-int udp_gro_complete(struct sk_buff *skb, int nhoff);
+				 struct udphdr *uh, udp_lookup_t lookup);
+int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0ed2daf..65c3fd3 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -179,6 +179,7 @@ out_unlock:
 
 	return segs;
 }
+EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
@@ -304,13 +305,13 @@ unlock:
 EXPORT_SYMBOL(udp_del_offload);
 
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
-				 struct udphdr *uh)
+				 struct udphdr *uh, udp_lookup_t lookup)
 {
-	struct udp_offload_priv *uo_priv;
 	struct sk_buff *p, **pp = NULL;
 	struct udphdr *uh2;
 	unsigned int off = skb_gro_offset(skb);
 	int flush = 1;
+	struct sock *sk;
 
 	if (NAPI_GRO_CB(skb)->encap_mark ||
 	    (skb->ip_summed != CHECKSUM_PARTIAL &&
@@ -322,13 +323,11 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 	NAPI_GRO_CB(skb)->encap_mark = 1;
 
 	rcu_read_lock();
-	uo_priv = rcu_dereference(udp_offload_base);
-	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
-		if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
-		    uo_priv->offload->port == uh->dest &&
-		    uo_priv->offload->callbacks.gro_receive)
-			goto unflush;
-	}
+	sk = (*lookup)(skb, uh->source, uh->dest);
+
+	if (sk && udp_sk(sk)->gro_receive)
+		goto unflush;
+
 	goto out_unlock;
 
 unflush:
@@ -352,9 +351,7 @@ unflush:
 
 	skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
 	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
-	NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
-	pp = uo_priv->offload->callbacks.gro_receive(head, skb,
-						     uo_priv->offload);
+	pp = udp_sk(sk)->gro_receive(sk, head, skb);
 
 out_unlock:
 	rcu_read_unlock();
@@ -362,6 +359,7 @@ out:
 	NAPI_GRO_CB(skb)->flush |= flush;
 	return pp;
 }
+EXPORT_SYMBOL(udp_gro_receive);
 
 static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
 					 struct sk_buff *skb)
@@ -383,39 +381,28 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
 					     inet_gro_compute_pseudo);
 skip:
 	NAPI_GRO_CB(skb)->is_ipv6 = 0;
-	return udp_gro_receive(head, skb, uh);
+	return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
 
 flush:
 	NAPI_GRO_CB(skb)->flush = 1;
 	return NULL;
 }
 
-int udp_gro_complete(struct sk_buff *skb, int nhoff)
+int udp_gro_complete(struct sk_buff *skb, int nhoff,
+		     udp_lookup_t lookup)
 {
-	struct udp_offload_priv *uo_priv;
 	__be16 newlen = htons(skb->len - nhoff);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 	int err = -ENOSYS;
+	struct sock *sk;
 
 	uh->len = newlen;
 
 	rcu_read_lock();
-
-	uo_priv = rcu_dereference(udp_offload_base);
-	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
-		if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
-		    uo_priv->offload->port == uh->dest &&
-		    uo_priv->offload->callbacks.gro_complete)
-			break;
-	}
-
-	if (uo_priv) {
-		NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
-		err = uo_priv->offload->callbacks.gro_complete(skb,
-				nhoff + sizeof(struct udphdr),
-				uo_priv->offload);
-	}
-
+	sk = (*lookup)(skb, uh->source, uh->dest);
+	if (sk && udp_sk(sk)->gro_complete)
+		err = udp_sk(sk)->gro_complete(sk, skb,
+				nhoff + sizeof(struct udphdr));
 	rcu_read_unlock();
 
 	if (skb->remcsum_offload)
@@ -426,6 +413,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
 
 	return err;
 }
+EXPORT_SYMBOL(udp_gro_complete);
 
 static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 {
@@ -440,7 +428,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
 	}
 
-	return udp_gro_complete(skb, nhoff);
+	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
 
 static const struct net_offload udpv4_offload = {
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 2fbd90b..5e9d6bf 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,9 +8,10 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
 		addrlabel.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
 		raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
-		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o
+		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
+		udp_offload.o
 
-ipv6-offload :=	ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o
+ipv6-offload :=	ip6_offload.o tcpv6_offload.o exthdrs_offload.o
 
 ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
 ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b11c37c..9a2fec5 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -64,6 +64,8 @@
 #include <asm/uaccess.h>
 #include <linux/mroute6.h>
 
+#include "ip6_offload.h"
+
 MODULE_AUTHOR("Cast of dozens");
 MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
 MODULE_LICENSE("GPL");
@@ -958,6 +960,10 @@ static int __init inet6_init(void)
 	if (err)
 		goto udplitev6_fail;
 
+	err = udpv6_offload_init();
+	if (err)
+		goto udpv6_offload_fail;
+
 	err = tcpv6_init();
 	if (err)
 		goto tcpv6_fail;
@@ -987,6 +993,8 @@ pingv6_fail:
 ipv6_packet_fail:
 	tcpv6_exit();
 tcpv6_fail:
+	udpv6_offload_exit();
+udpv6_offload_fail:
 	udplitev6_exit();
 udplitev6_fail:
 	udpv6_exit();
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 82e9f30..204af22 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -325,8 +325,6 @@ static int __init ipv6_offload_init(void)
 
 	if (tcpv6_offload_init() < 0)
 		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
-	if (udp_offload_init() < 0)
-		pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
 	if (ipv6_exthdrs_offload_init() < 0)
 		pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);
 
diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h
index 2e155c6..96b40e4 100644
--- a/net/ipv6/ip6_offload.h
+++ b/net/ipv6/ip6_offload.h
@@ -12,7 +12,8 @@
 #define __ip6_offload_h
 
 int ipv6_exthdrs_offload_init(void);
-int udp_offload_init(void);
+int udpv6_offload_init(void);
+int udpv6_offload_exit(void);
 int tcpv6_offload_init(void);
 
 #endif
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 2b0fbe6..5429f6b 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -153,7 +153,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
 
 skip:
 	NAPI_GRO_CB(skb)->is_ipv6 = 1;
-	return udp_gro_receive(head, skb, uh);
+	return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb);
 
 flush:
 	NAPI_GRO_CB(skb)->flush = 1;
@@ -173,7 +173,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
 	}
 
-	return udp_gro_complete(skb, nhoff);
+	return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
 
 static const struct net_offload udpv6_offload = {
@@ -184,7 +184,12 @@ static const struct net_offload udpv6_offload = {
 	},
 };
 
-int __init udp_offload_init(void)
+int udpv6_offload_init(void)
 {
 	return inet6_add_offload(&udpv6_offload, IPPROTO_UDP);
 }
+
+int udpv6_offload_exit(void)
+{
+	return inet6_del_offload(&udpv6_offload, IPPROTO_UDP);
+}
-- 
2.8.0.rc2

^ permalink raw reply related

* [PATCH net-next 5/8] vxlan: change vxlan to use UDP socket GRO
From: Tom Herbert @ 2016-04-05 15:22 UTC (permalink / raw)
  To: davem, netdev; +Cc: kernel-team
In-Reply-To: <1459869776-2090500-1-git-send-email-tom@herbertland.com>

Adapt vxlan_gro_receive, vxlan_gro_complete to take a socket argument.
Set these functions in tunnel_config.  Don't set udp_offloads any more.

Signed-off-by: Tom Herbert <tom@herbertland.com>
---
 drivers/net/vxlan.c | 30 ++++++++----------------------
 include/net/vxlan.h |  1 -
 2 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 1c0fa36..29e7269 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -551,16 +551,15 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 	return vh;
 }
 
-static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
-					  struct sk_buff *skb,
-					  struct udp_offload *uoff)
+static struct sk_buff **vxlan_gro_receive(struct sock *sk,
+					  struct sk_buff **head,
+					  struct sk_buff *skb)
 {
 	struct sk_buff *p, **pp = NULL;
 	struct vxlanhdr *vh, *vh2;
 	unsigned int hlen, off_vx;
 	int flush = 1;
-	struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock,
-					     udp_offloads);
+	struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
 	__be32 flags;
 	struct gro_remcsum grc;
 
@@ -613,8 +612,7 @@ out:
 	return pp;
 }
 
-static int vxlan_gro_complete(struct sk_buff *skb, int nhoff,
-			      struct udp_offload *uoff)
+static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
 {
 	udp_tunnel_gro_complete(skb, nhoff);
 
@@ -629,13 +627,6 @@ static void vxlan_notify_add_rx_port(struct vxlan_sock *vs)
 	struct net *net = sock_net(sk);
 	sa_family_t sa_family = vxlan_get_sk_family(vs);
 	__be16 port = inet_sk(sk)->inet_sport;
-	int err;
-
-	if (sa_family == AF_INET) {
-		err = udp_add_offload(net, &vs->udp_offloads);
-		if (err)
-			pr_warn("vxlan: udp_add_offload failed with status %d\n", err);
-	}
 
 	rcu_read_lock();
 	for_each_netdev_rcu(net, dev) {
@@ -662,9 +653,6 @@ static void vxlan_notify_del_rx_port(struct vxlan_sock *vs)
 							    port);
 	}
 	rcu_read_unlock();
-
-	if (sa_family == AF_INET)
-		udp_del_offload(&vs->udp_offloads);
 }
 
 /* Add new entry to forwarding table -- assumes lock held */
@@ -2640,21 +2628,19 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 	atomic_set(&vs->refcnt, 1);
 	vs->flags = (flags & VXLAN_F_RCV_FLAGS);
 
-	/* Initialize the vxlan udp offloads structure */
-	vs->udp_offloads.port = port;
-	vs->udp_offloads.callbacks.gro_receive  = vxlan_gro_receive;
-	vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete;
-
 	spin_lock(&vn->sock_lock);
 	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
 	vxlan_notify_add_rx_port(vs);
 	spin_unlock(&vn->sock_lock);
 
 	/* Mark socket as an encapsulation socket. */
+	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
 	tunnel_cfg.sk_user_data = vs;
 	tunnel_cfg.encap_type = 1;
 	tunnel_cfg.encap_rcv = vxlan_rcv;
 	tunnel_cfg.encap_destroy = NULL;
+	tunnel_cfg.gro_receive = vxlan_gro_receive;
+	tunnel_cfg.gro_complete = vxlan_gro_complete;
 
 	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
 
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 73ed2e9..2ad2136 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -131,7 +131,6 @@ struct vxlan_sock {
 	struct rcu_head	  rcu;
 	struct hlist_head vni_list[VNI_HASH_SIZE];
 	atomic_t	  refcnt;
-	struct udp_offload udp_offloads;
 	u32		  flags;
 };
 
-- 
2.8.0.rc2

^ permalink raw reply related

* [PATCH net-next 2/8] udp: Add udp6_lib_lookup_skb and udp4_lib_lookup_skb
From: Tom Herbert @ 2016-04-05 15:22 UTC (permalink / raw)
  To: davem, netdev; +Cc: kernel-team
In-Reply-To: <1459869776-2090500-1-git-send-email-tom@herbertland.com>

Add externally visible functions to lookup a UDP socket by skb. This
will be used for GRO in UDP sockets. These functions also check
if skb->dst is set, and if it is not skb->dev is used to get dev_net.
This allows calling lookup functions before dst has been set on the
skbuff.

Signed-off-by: Tom Herbert <tom@herbertland.com>
---
 include/net/udp.h |  4 ++++
 net/ipv4/udp.c    | 13 +++++++++++++
 net/ipv6/udp.c    | 13 +++++++++++++
 3 files changed, 30 insertions(+)

diff --git a/include/net/udp.h b/include/net/udp.h
index d870ec1..e2e3803 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -260,6 +260,8 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 			       __be32 daddr, __be16 dport, int dif,
 			       struct udp_table *tbl, struct sk_buff *skb);
+struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
+				 __be16 sport, __be16 dport);
 struct sock *udp6_lib_lookup(struct net *net,
 			     const struct in6_addr *saddr, __be16 sport,
 			     const struct in6_addr *daddr, __be16 dport,
@@ -269,6 +271,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
 			       const struct in6_addr *daddr, __be16 dport,
 			       int dif, struct udp_table *tbl,
 			       struct sk_buff *skb);
+struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
+				 __be16 sport, __be16 dport);
 
 /*
  * 	SNMP statistics for UDP and UDP-Lite
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 355bdb2..b39c566 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -604,6 +604,19 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 				 udptable, skb);
 }
 
+struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
+				 __be16 sport, __be16 dport)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct net_device *dev =
+	    skb_dst(skb) ? skb_dst(skb)->dev : skb->dev;
+
+	return __udp4_lib_lookup(dev_net(dev), iph->saddr, sport,
+				 iph->daddr, dport, inet_iif(skb),
+				 &udp_table, skb);
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
+
 /* Must be called under rcu_read_lock().
  * Does increment socket refcount.
  */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 78a7dfd..33a58eb 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -326,6 +326,19 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
 				 udptable, skb);
 }
 
+struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
+				 __be16 sport, __be16 dport)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const struct net_device *dev =
+	    skb_dst(skb) ? skb_dst(skb)->dev : skb->dev;
+
+	return __udp6_lib_lookup(dev_net(dev), &iph->saddr, sport,
+				 &iph->daddr, dport, inet6_iif(skb),
+				 &udp_table, skb);
+}
+EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
+
 /* Must be called under rcu_read_lock().
  * Does increment socket refcount.
  */
-- 
2.8.0.rc2

^ permalink raw reply related

* [PATCH net-next 6/8] fou: change to use UDP socket GRO
From: Tom Herbert @ 2016-04-05 15:22 UTC (permalink / raw)
  To: davem, netdev; +Cc: kernel-team
In-Reply-To: <1459869776-2090500-1-git-send-email-tom@herbertland.com>

Adapt gue_gro_receive, gue_gro_complete to take a socket argument.
Don't set udp_offloads any more.

Signed-off-by: Tom Herbert <tom@herbertland.com>
---
 net/ipv4/fou.c | 48 +++++++++++++++++-------------------------------
 1 file changed, 17 insertions(+), 31 deletions(-)

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 5a94aea..5738b97 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -22,7 +22,6 @@ struct fou {
 	u8 flags;
 	__be16 port;
 	u16 type;
-	struct udp_offload udp_offloads;
 	struct list_head list;
 	struct rcu_head rcu;
 };
@@ -186,13 +185,13 @@ drop:
 	return 0;
 }
 
-static struct sk_buff **fou_gro_receive(struct sk_buff **head,
-					struct sk_buff *skb,
-					struct udp_offload *uoff)
+static struct sk_buff **fou_gro_receive(struct sock *sk,
+					struct sk_buff **head,
+					struct sk_buff *skb)
 {
 	const struct net_offload *ops;
 	struct sk_buff **pp = NULL;
-	u8 proto = NAPI_GRO_CB(skb)->proto;
+	u8 proto = fou_from_sock(sk)->protocol;
 	const struct net_offload **offloads;
 
 	/* We can clear the encap_mark for FOU as we are essentially doing
@@ -217,11 +216,11 @@ out_unlock:
 	return pp;
 }
 
-static int fou_gro_complete(struct sk_buff *skb, int nhoff,
-			    struct udp_offload *uoff)
+static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
+			    int nhoff)
 {
 	const struct net_offload *ops;
-	u8 proto = NAPI_GRO_CB(skb)->proto;
+	u8 proto = fou_from_sock(sk)->protocol;
 	int err = -ENOSYS;
 	const struct net_offload **offloads;
 
@@ -264,9 +263,9 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 	return guehdr;
 }
 
-static struct sk_buff **gue_gro_receive(struct sk_buff **head,
-					struct sk_buff *skb,
-					struct udp_offload *uoff)
+static struct sk_buff **gue_gro_receive(struct sock *sk,
+					struct sk_buff **head,
+					struct sk_buff *skb)
 {
 	const struct net_offload **offloads;
 	const struct net_offload *ops;
@@ -277,7 +276,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
 	void *data;
 	u16 doffset = 0;
 	int flush = 1;
-	struct fou *fou = container_of(uoff, struct fou, udp_offloads);
+	struct fou *fou = fou_from_sock(sk);
 	struct gro_remcsum grc;
 
 	skb_gro_remcsum_init(&grc);
@@ -386,8 +385,7 @@ out:
 	return pp;
 }
 
-static int gue_gro_complete(struct sk_buff *skb, int nhoff,
-			    struct udp_offload *uoff)
+static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
 {
 	const struct net_offload **offloads;
 	struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
@@ -435,10 +433,7 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou)
 static void fou_release(struct fou *fou)
 {
 	struct socket *sock = fou->sock;
-	struct sock *sk = sock->sk;
 
-	if (sk->sk_family == AF_INET)
-		udp_del_offload(&fou->udp_offloads);
 	list_del(&fou->list);
 	udp_tunnel_sock_release(sock);
 
@@ -448,11 +443,9 @@ static void fou_release(struct fou *fou)
 static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
 {
 	udp_sk(sk)->encap_rcv = fou_udp_recv;
-	fou->protocol = cfg->protocol;
-	fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
-	fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
-	fou->udp_offloads.port = cfg->udp_config.local_udp_port;
-	fou->udp_offloads.ipproto = cfg->protocol;
+	udp_sk(sk)->gro_receive = fou_gro_receive;
+	udp_sk(sk)->gro_complete = fou_gro_complete;
+	fou_from_sock(sk)->protocol = cfg->protocol;
 
 	return 0;
 }
@@ -460,9 +453,8 @@ static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
 static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
 {
 	udp_sk(sk)->encap_rcv = gue_udp_recv;
-	fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;
-	fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
-	fou->udp_offloads.port = cfg->udp_config.local_udp_port;
+	udp_sk(sk)->gro_receive = gue_gro_receive;
+	udp_sk(sk)->gro_complete = gue_gro_complete;
 
 	return 0;
 }
@@ -521,12 +513,6 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
 
 	sk->sk_allocation = GFP_ATOMIC;
 
-	if (cfg->udp_config.family == AF_INET) {
-		err = udp_add_offload(net, &fou->udp_offloads);
-		if (err)
-			goto error;
-	}
-
 	err = fou_add_to_port_list(net, fou);
 	if (err)
 		goto error;
-- 
2.8.0.rc2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox