From mboxrd@z Thu Jan 1 00:00:00 1970 From: Yongseok Koh Subject: Re: [PATCH v2 6/7] net/mlx5: e-switch VXLAN encapsulation rules management Date: Thu, 25 Oct 2018 00:33:14 +0000 Message-ID: <20181025003305.GB26874@mtidpdk.mti.labs.mlnx> References: <1538461807-37507-1-git-send-email-viacheslavo@mellanox.com> <1539612815-47199-1-git-send-email-viacheslavo@mellanox.com> <1539612815-47199-7-git-send-email-viacheslavo@mellanox.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: quoted-printable Cc: Shahaf Shuler , "dev@dpdk.org" To: Slava Ovsiienko Return-path: Received: from EUR01-DB5-obe.outbound.protection.outlook.com (mail-db5eur01on0075.outbound.protection.outlook.com [104.47.2.75]) by dpdk.org (Postfix) with ESMTP id 3AD9644BE for ; Thu, 25 Oct 2018 02:33:16 +0200 (CEST) In-Reply-To: <1539612815-47199-7-git-send-email-viacheslavo@mellanox.com> Content-Language: en-US Content-ID: <96A32FC103CD96498220FA3E08CB98CD@eurprd05.prod.outlook.com> List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" On Mon, Oct 15, 2018 at 02:13:34PM +0000, Viacheslav Ovsiienko wrote: > VXLAN encap rules are applied to the VF ingress traffic and have the > VTEP as actual redirection destinations instead of outer PF. > The encapsulation rule should provide: > - redirection action VF->PF > - VF port ID > - some inner network parameters (MACs/IP) > - the tunnel outer source IP (v4/v6) > - the tunnel outer destination IP (v4/v6). Current > - VNI - Virtual Network Identifier >=20 > There is no direct way found to provide kernel with all required > encapsulatioh header parameters. The encapsulation VTEP is created > attached to the outer interface and assumed as default path for > egress encapsulated traffic. 
The outer tunnel IP addresses are > assigned to interface using Netlink, the implicit route is > created like this: >=20 > ip addr add peer dev scope link >=20 > Peer address provides implicit route, and scope link reduces > the risk of conflicts. At initialization time all local scope > link addresses are flushed from device (see next part of patchset). >=20 > The destination MAC address is provided via permanent neigh rule: >=20 > ip neigh add dev lladdr to nud permanent >=20 > At initialization time all neigh rules of this type are flushed > from device (see the next part of patchset). >=20 > Suggested-by: Adrien Mazarguil > Signed-off-by: Viacheslav Ovsiienko > --- > drivers/net/mlx5/mlx5_flow_tcf.c | 394 +++++++++++++++++++++++++++++++++= +++++- > 1 file changed, 389 insertions(+), 5 deletions(-) >=20 > diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flo= w_tcf.c > index efa9c3b..a1d7733 100644 > --- a/drivers/net/mlx5/mlx5_flow_tcf.c > +++ b/drivers/net/mlx5/mlx5_flow_tcf.c > @@ -3443,6 +3443,376 @@ struct pedit_parser { > return -err; > } > =20 > +/** > + * Emit Netlink message to add/remove local address to the outer device. > + * The address being added is visible within the link only (scope link). > + * > + * Note that an implicit route is maintained by the kernel due to the > + * presence of a peer address (IFA_ADDRESS). > + * > + * These rules are used for encapsulation only and allow to assign > + * the outer tunnel source IP address. > + * > + * @param[in] tcf > + * Libmnl socket context object. > + * @param[in] encap > + * Encapsulation properties (source address and its peer). > + * @param[in] ifindex > + * Network interface to apply rule. > + * @param[in] enable > + * Toggle between add and remove. > + * @param[out] error > + * Perform verbose error reporting if not NULL. > + * > + * @return > + * 0 on success, a negative errno value otherwise and rte_errno is set= . 
> + */ > +static int > +flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf, > + const struct mlx5_flow_tcf_vxlan_encap *encap, > + unsigned int ifindex, > + bool enable, > + struct rte_flow_error *error) > +{ > + struct nlmsghdr *nlh; > + struct ifaddrmsg *ifa; > + alignas(struct nlmsghdr) > + uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)]; > + > + nlh =3D mnl_nlmsg_put_header(buf); > + nlh->nlmsg_type =3D enable ? RTM_NEWADDR : RTM_DELADDR; > + nlh->nlmsg_flags =3D > + NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0); > + nlh->nlmsg_seq =3D 0; > + ifa =3D mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa)); > + ifa->ifa_flags =3D IFA_F_PERMANENT; > + ifa->ifa_scope =3D RT_SCOPE_LINK; > + ifa->ifa_index =3D ifindex; > + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) { > + ifa->ifa_family =3D AF_INET; > + ifa->ifa_prefixlen =3D 32; > + mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src); > + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) > + mnl_attr_put_u32(nlh, IFA_ADDRESS, > + encap->ipv4.dst); > + } else { > + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC); > + ifa->ifa_family =3D AF_INET6; > + ifa->ifa_prefixlen =3D 128; > + mnl_attr_put(nlh, IFA_LOCAL, > + sizeof(encap->ipv6.src), > + &encap->ipv6.src); > + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST) > + mnl_attr_put(nlh, IFA_ADDRESS, > + sizeof(encap->ipv6.dst), > + &encap->ipv6.dst); > + } > + if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL)) > + return 0; > + return rte_flow_error_set > + (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, > + "netlink: cannot complete IFA request (ip addr add)"); > +} > + > +/** > + * Emit Netlink message to add/remove neighbor. > + * > + * @param[in] tcf > + * Libmnl socket context object. > + * @param[in] encap > + * Encapsulation properties (destination address). > + * @param[in] ifindex > + * Network interface. > + * @param[in] enable > + * Toggle between add and remove. > + * @param[out] error > + * Perform verbose error reporting if not NULL. 
> + * > + * @return > + * 0 on success, a negative errno value otherwise and rte_errno is set= . > + */ > +static int > +flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf, > + const struct mlx5_flow_tcf_vxlan_encap *encap, > + unsigned int ifindex, > + bool enable, > + struct rte_flow_error *error) > +{ > + struct nlmsghdr *nlh; > + struct ndmsg *ndm; > + alignas(struct nlmsghdr) > + uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)]; > + > + nlh =3D mnl_nlmsg_put_header(buf); > + nlh->nlmsg_type =3D enable ? RTM_NEWNEIGH : RTM_DELNEIGH; > + nlh->nlmsg_flags =3D > + NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0); > + nlh->nlmsg_seq =3D 0; > + ndm =3D mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm)); > + ndm->ndm_ifindex =3D ifindex; > + ndm->ndm_state =3D NUD_PERMANENT; > + ndm->ndm_flags =3D 0; > + ndm->ndm_type =3D 0; > + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) { > + ndm->ndm_family =3D AF_INET; > + mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst); > + } else { > + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST); > + ndm->ndm_family =3D AF_INET6; > + mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst), > + &encap->ipv6.dst); > + } > + if (encap->mask & MLX5_FLOW_TCF_ENCAP_ETH_SRC && enable) > + DRV_LOG(WARNING, > + "Outer ethernet source address cannot be " > + "forced for VXLAN encapsulation"); > + if (encap->mask & MLX5_FLOW_TCF_ENCAP_ETH_DST) > + mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst), > + &encap->eth.dst); > + if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL)) > + return 0; > + return rte_flow_error_set > + (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, > + "netlink: cannot complete ND request (ip neigh)"); > +} > + > +/** > + * Manage the local IP addresses and their peers IP addresses on the > + * outer interface for encapsulation purposes. 
The kernel searches the > + * appropriate device for tunnel egress traffic using the outer source > + * IP, this IP should be assigned to the outer network device, otherwise > + * kernel rejects the rule. > + * > + * Adds or removes the addresses using the Netlink command like this: > + * ip addr add peer scope link dev > + * > + * The addresses are local to the netdev ("scope link"), this reduces > + * the risk of conflicts. Note that an implicit route is maintained by > + * the kernel due to the presence of a peer address (IFA_ADDRESS). > + * > + * @param[in] tcf > + * Libmnl socket context object. > + * @param[in] vtep > + * VTEP object, contains rule database and ifouter index. > + * @param[in] dev_flow > + * Flow object, contains the tunnel parameters (for encap only). > + * @param[in] enable > + * Toggle between add and remove. > + * @param[out] error > + * Perform verbose error reporting if not NULL. > + * > + * @return > + * 0 on success, a negative errno value otherwise and rte_errno is set= . 
> + */ > +static int > +flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf, > + struct mlx5_flow_tcf_vtep *vtep, > + struct mlx5_flow *dev_flow, > + bool enable, > + struct rte_flow_error *error) > +{ > + const struct mlx5_flow_tcf_vxlan_encap *encap =3D > + dev_flow->tcf.vxlan_encap; > + struct tcf_local_rule *rule; > + bool found =3D false; > + int ret; > + > + assert(encap); > + assert(encap->hdr.type =3D=3D MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP); > + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) { > + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST); > + LIST_FOREACH(rule, &vtep->local, next) { > + if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC && > + encap->ipv4.src =3D=3D rule->ipv4.src && > + encap->ipv4.dst =3D=3D rule->ipv4.dst) { > + found =3D true; > + break; > + } > + } > + } else { > + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC); > + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST); > + LIST_FOREACH(rule, &vtep->local, next) { > + if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC && > + !memcmp(&encap->ipv6.src, &rule->ipv6.src, > + sizeof(encap->ipv6.src)) && > + !memcmp(&encap->ipv6.dst, &rule->ipv6.dst, > + sizeof(encap->ipv6.dst))) { > + found =3D true; > + break; > + } > + } > + } > + if (found) { > + if (enable) { > + rule->refcnt++; > + return 0; > + } > + if (!rule->refcnt || !--rule->refcnt) { Same suggestion for this as that of vtep - refcnt handling and adding get() func. 
> + LIST_REMOVE(rule, next); > + return flow_tcf_rule_local(tcf, encap, > + vtep->ifouter, false, error); > + } > + return 0; > + } > + if (!enable) { > + DRV_LOG(WARNING, "Disabling not existing local rule"); > + rte_flow_error_set > + (error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, > + NULL, "Disabling not existing local rule"); > + return -ENOENT; > + } > + rule =3D rte_zmalloc(__func__, sizeof(struct tcf_local_rule), > + alignof(struct tcf_local_rule)); > + if (!rule) { > + rte_flow_error_set > + (error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, > + NULL, "unable to allocate memory for local rule"); > + return -rte_errno; > + } > + *rule =3D (struct tcf_local_rule){.refcnt =3D 0, > + .mask =3D 0, > + }; Is it effective? The allocated memory is already zeroed out. > + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) { > + rule->mask =3D MLX5_FLOW_TCF_ENCAP_IPV4_SRC > + | MLX5_FLOW_TCF_ENCAP_IPV4_DST; > + rule->ipv4.src =3D encap->ipv4.src; > + rule->ipv4.dst =3D encap->ipv4.dst; > + } else { > + rule->mask =3D MLX5_FLOW_TCF_ENCAP_IPV6_SRC > + | MLX5_FLOW_TCF_ENCAP_IPV6_DST; > + memcpy(&rule->ipv6.src, &encap->ipv6.src, > + sizeof(rule->ipv6.src)); > + memcpy(&rule->ipv6.dst, &encap->ipv6.dst, > + sizeof(rule->ipv6.dst)); > + } > + ret =3D flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error); > + if (ret) { > + rte_free(rule); > + return ret; > + } > + rule->refcnt++; > + LIST_INSERT_HEAD(&vtep->local, rule, next); > + return 0; > +} > + > +/** > + * Manage the destination MAC/IP addresses neigh database, kernel uses > + * this one to determine the destination MAC address within encapsulatio= n > + * header. Adds or removes the entries using the Netlink command like th= is: > + * ip neigh add dev lladdr to nud permane= nt > + * > + * @param[in] tcf > + * Libmnl socket context object. > + * @param[in] vtep > + * VTEP object, contains rule database and ifouter index. > + * @param[in] dev_flow > + * Flow object, contains the tunnel parameters (for encap only). 
> + * @param[in] enable > + * Toggle between add and remove. > + * @param[out] error > + * Perform verbose error reporting if not NULL. > + * > + * @return > + * 0 on success, a negative errno value otherwise and rte_errno is set= . > + */ > +static int > +flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf, > + struct mlx5_flow_tcf_vtep *vtep, > + struct mlx5_flow *dev_flow, > + bool enable, > + struct rte_flow_error *error) > +{ > + const struct mlx5_flow_tcf_vxlan_encap *encap =3D > + dev_flow->tcf.vxlan_encap; > + struct tcf_neigh_rule *rule; > + bool found =3D false; > + int ret; > + > + assert(encap); > + assert(encap->hdr.type =3D=3D MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP); > + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) { > + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC); > + LIST_FOREACH(rule, &vtep->neigh, next) { > + if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST && > + encap->ipv4.dst =3D=3D rule->ipv4.dst) { > + found =3D true; > + break; > + } > + } > + } else { > + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC); > + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST); > + LIST_FOREACH(rule, &vtep->neigh, next) { > + if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST && > + !memcmp(&encap->ipv6.dst, &rule->ipv6.dst, > + sizeof(encap->ipv6.dst))) { > + found =3D true; > + break; > + } > + } > + } > + if (found) { > + if (memcmp(&encap->eth.dst, &rule->eth, > + sizeof(encap->eth.dst))) { > + DRV_LOG(WARNING, "Destination MAC differs" > + " in neigh rule"); > + rte_flow_error_set(error, EEXIST, > + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, > + NULL, "Different MAC address" > + " neigh rule for the same" > + " destination IP"); > + return -EEXIST; > + } > + if (enable) { > + rule->refcnt++; > + return 0; > + } > + if (!rule->refcnt || !--rule->refcnt) { Same suggestion for this as that of vtep - refcnt handling by adding create()/get()/release() func. 
> + LIST_REMOVE(rule, next); > + return flow_tcf_rule_neigh(tcf, encap, > + vtep->ifouter, > + false, error); > + } > + return 0; > + } > + if (!enable) { > + DRV_LOG(WARNING, "Disabling not existing neigh rule"); > + rte_flow_error_set > + (error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, > + NULL, "unable to allocate memory for neigh rule"); > + return -ENOENT; > + } > + rule =3D rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule), > + alignof(struct tcf_neigh_rule)); > + if (!rule) { > + rte_flow_error_set > + (error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, > + NULL, "unadble to allocate memory for neigh rule"); > + return -rte_errno; > + } > + *rule =3D (struct tcf_neigh_rule){.refcnt =3D 0, > + .mask =3D 0, > + }; Is it effective? The allocated memory is already zeroed out. > + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) { > + rule->mask =3D MLX5_FLOW_TCF_ENCAP_IPV4_DST; > + rule->ipv4.dst =3D encap->ipv4.dst; > + } else { > + rule->mask =3D MLX5_FLOW_TCF_ENCAP_IPV6_DST; > + memcpy(&rule->ipv6.dst, &encap->ipv6.dst, > + sizeof(rule->ipv6.dst)); > + } > + memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth)); > + ret =3D flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error); > + if (ret) { > + rte_free(rule); > + return ret; > + } > + rule->refcnt++; > + LIST_INSERT_HEAD(&vtep->neigh, rule, next); > + return 0; > +} > + > /* VTEP device list is shared between PMD port instances. */ > static LIST_HEAD(, mlx5_flow_tcf_vtep) > vtep_list_vxlan =3D LIST_HEAD_INITIALIZER(); > @@ -3715,6 +4085,7 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep) > { > static uint16_t encap_port =3D MLX5_VXLAN_PORT_RANGE_MIN - 1; > struct mlx5_flow_tcf_vtep *vtep, *vlst; > + int ret; > =20 > assert(ifouter); > /* Look whether the attached VTEP for encap is created. */ > @@ -3766,6 +4137,21 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep) > } > if (!vtep) > return 0; > + /* Create local ipaddr with peer to specify the outer IPs. 
*/ > + ret =3D flow_tcf_encap_local(tcf, vtep, dev_flow, true, error); > + if (ret) { > + if (!vtep->refcnt) > + flow_tcf_delete_iface(tcf, vtep); There's no possibility of decreasing vtep->refcnt in flow_tcf_encap_local(), then why do you expect it to be zero here? If it is already zero at this point, it should've been deleted when it became zero. > + return 0; > + } > + /* Create neigh rule to specify outer destination MAC. */ > + ret =3D flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error); > + if (ret) { > + flow_tcf_encap_local(tcf, vtep, dev_flow, false, error); > + if (!vtep->refcnt) > + flow_tcf_delete_iface(tcf, vtep); Same here. Thanks, Yongseok > + return 0; > + } > vtep->refcnt++; > assert(vtep->ifindex); > return vtep->ifindex; > @@ -3848,11 +4234,9 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep) > case MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP: > break; > case MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP: > -/* > - * TODO: Remove the encap ancillary rules first. > - * flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL); > - * flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL); > - */ > + /* Remove the encap ancillary rules first. */ > + flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL); > + flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL); > break; > default: > assert(false); >=20