From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jianfeng Tan Subject: Re: [PATCH] net/packet: support vhost mrg_rxbuf Date: Mon, 29 Oct 2018 12:19:11 +0800 Message-ID: <7e124a5b-0e95-a3a5-4d19-4b356b7b4942@linux.alibaba.com> References: <20181027120445.21552-1-jianfeng.tan@linux.alibaba.com> <11ee5374-2df6-1d73-2d99-932b6117ccea@redhat.com> Mime-Version: 1.0 Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 8bit Cc: davem@davemloft.net, mst@redhat.com To: Jason Wang , netdev@vger.kernel.org Return-path: Received: from out30-132.freemail.mail.aliyun.com ([115.124.30.132]:60450 "EHLO out30-132.freemail.mail.aliyun.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1729256AbeJ2NGe (ORCPT ); Mon, 29 Oct 2018 09:06:34 -0400 In-Reply-To: <11ee5374-2df6-1d73-2d99-932b6117ccea@redhat.com> Content-Language: en-US Sender: netdev-owner@vger.kernel.org List-ID: On 10/29/2018 10:54 AM, Jason Wang wrote: > > On 2018/10/27 下午8:04, Jianfeng Tan wrote: >> Previouly, virtio net header size is hardcoded to be 10, which makes >> the feature mrg_rxbuf not available. >> >> We redefine PACKET_VNET_HDR ioctl which treats user input as boolean, >> but now as int, 0, 10, 12, or everything else be treated as 10. >> >> There will be one case which is treated differently: if user input is >> 12, previously, the header size will be 10; but now it's 12. >> >> Signed-off-by: Jianfeng Tan > > > This should go for net-next which is closed. You may consider to > re-submit when it was open. Thank you for the reminder. We'll re-evaluate the necessity of this patch. 
> > >> --- >>   net/packet/af_packet.c | 97 ++++++++++++++++++++++++++---------------- >>   net/packet/diag.c      |  2 +- >>   net/packet/internal.h  |  2 +- >>   3 files changed, 63 insertions(+), 38 deletions(-) >> >> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c >> index ec3095f13aae..1bd7f4cdcc80 100644 >> --- a/net/packet/af_packet.c >> +++ b/net/packet/af_packet.c >> @@ -1999,18 +1999,24 @@ static unsigned int run_filter(struct sk_buff >> *skb, >>   } >>     static int packet_rcv_vnet(struct msghdr *msg, const struct >> sk_buff *skb, >> -               size_t *len) >> +               size_t *len, int vnet_hdr_len) >>   { >> +    int res; >>       struct virtio_net_hdr vnet_hdr; >>   -    if (*len < sizeof(vnet_hdr)) >> +    if (*len < vnet_hdr_len) >>           return -EINVAL; >> -    *len -= sizeof(vnet_hdr); >> +    *len -= vnet_hdr_len; >>         if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0)) >>           return -EINVAL; >>   -    return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); >> +    res = memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); >> +    if (res == 0) >> +        iov_iter_advance(&msg->msg_iter, >> +                 vnet_hdr_len - sizeof(vnet_hdr)); >> + >> +    return res; >>   } >>     /* >> @@ -2206,11 +2212,13 @@ static int tpacket_rcv(struct sk_buff *skb, >> struct net_device *dev, >>                     po->tp_reserve; >>       } else { >>           unsigned int maclen = skb_network_offset(skb); >> +        int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); >> + >>           netoff = TPACKET_ALIGN(po->tp_hdrlen + >>                          (maclen < 16 ? 
16 : maclen)) + >>                          po->tp_reserve; >> -        if (po->has_vnet_hdr) { >> -            netoff += sizeof(struct virtio_net_hdr); >> +        if (vnet_hdr_sz) { >> +            netoff += vnet_hdr_sz; >>               do_vnet = true; >>           } >>           macoff = netoff - maclen; >> @@ -2429,19 +2437,6 @@ static int __packet_snd_vnet_parse(struct >> virtio_net_hdr *vnet_hdr, size_t len) >>       return 0; >>   } >>   -static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, >> -                 struct virtio_net_hdr *vnet_hdr) >> -{ >> -    if (*len < sizeof(*vnet_hdr)) >> -        return -EINVAL; >> -    *len -= sizeof(*vnet_hdr); >> - >> -    if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), >> &msg->msg_iter)) >> -        return -EFAULT; >> - >> -    return __packet_snd_vnet_parse(vnet_hdr, *len); >> -} >> - >>   static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff >> *skb, >>           void *frame, struct net_device *dev, void *data, int tp_len, >>           __be16 proto, unsigned char *addr, int hlen, int copylen, >> @@ -2609,6 +2604,7 @@ static int tpacket_snd(struct packet_sock *po, >> struct msghdr *msg) >>       int len_sum = 0; >>       int status = TP_STATUS_AVAILABLE; >>       int hlen, tlen, copylen = 0; >> +    int vnet_hdr_sz; >>         mutex_lock(&po->pg_vec_lock); >>   @@ -2648,7 +2644,8 @@ static int tpacket_snd(struct packet_sock >> *po, struct msghdr *msg) >>       size_max = po->tx_ring.frame_size >>           - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); >>   -    if ((size_max > dev->mtu + reserve + VLAN_HLEN) && >> !po->has_vnet_hdr) >> +    vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); >> +    if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) >>           size_max = dev->mtu + reserve + VLAN_HLEN; >>         do { >> @@ -2668,10 +2665,10 @@ static int tpacket_snd(struct packet_sock >> *po, struct msghdr *msg) >>           status = TP_STATUS_SEND_REQUEST; >>           hlen 
= LL_RESERVED_SPACE(dev); >>           tlen = dev->needed_tailroom; >> -        if (po->has_vnet_hdr) { >> +        if (vnet_hdr_sz) { >>               vnet_hdr = data; >> -            data += sizeof(*vnet_hdr); >> -            tp_len -= sizeof(*vnet_hdr); >> +            data += vnet_hdr_sz; >> +            tp_len -= vnet_hdr_sz; >>               if (tp_len < 0 || >>                   __packet_snd_vnet_parse(vnet_hdr, tp_len)) { >>                   tp_len = -EINVAL; >> @@ -2696,7 +2693,7 @@ static int tpacket_snd(struct packet_sock *po, >> struct msghdr *msg) >>                         addr, hlen, copylen, &sockc); >>           if (likely(tp_len >= 0) && >>               tp_len > dev->mtu + reserve && >> -            !po->has_vnet_hdr && >> +            !vnet_hdr_sz && >>               !packet_extra_vlan_len_allowed(dev, skb)) >>               tp_len = -EMSGSIZE; >>   @@ -2715,7 +2712,7 @@ static int tpacket_snd(struct packet_sock >> *po, struct msghdr *msg) >>               } >>           } >>   -        if (po->has_vnet_hdr) { >> +        if (vnet_hdr_sz) { >>               if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { >>                   tp_len = -EINVAL; >>                   goto tpacket_error; >> @@ -2802,9 +2799,9 @@ static int packet_snd(struct socket *sock, >> struct msghdr *msg, size_t len) >>       int err, reserve = 0; >>       struct sockcm_cookie sockc; >>       struct virtio_net_hdr vnet_hdr = { 0 }; >> +    int vnet_hdr_sz; >>       int offset = 0; >>       struct packet_sock *po = pkt_sk(sk); >> -    bool has_vnet_hdr = false; >>       int hlen, tlen, linear; >>       int extra_len = 0; >>   @@ -2844,11 +2841,29 @@ static int packet_snd(struct socket *sock, >> struct msghdr *msg, size_t len) >>         if (sock->type == SOCK_RAW) >>           reserve = dev->hard_header_len; >> -    if (po->has_vnet_hdr) { >> -        err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); >> -        if (err) >> + >> +    vnet_hdr_sz = 
READ_ONCE(po->vnet_hdr_sz); >> +    if (vnet_hdr_sz) { >> +        if (len < vnet_hdr_sz) { >> +            err = -EINVAL; >>               goto out_unlock; >> -        has_vnet_hdr = true; >> +        } >> +        len -= vnet_hdr_sz; >> + >> +        if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), >> +                     &msg->msg_iter)) { >> +            err = -EFAULT; >> +            goto out_unlock; >> +        } >> + >> +        if (__packet_snd_vnet_parse(&vnet_hdr, len)) { >> +            err = -EINVAL; >> +            goto out_unlock; >> +        } > > > Any reason to open code packet_snd_vnet_parse() here? No particular reason. I will try to add a parameter and keep the vnet-related code inside that function if the patch is resubmitted. > > >> + >> +        /* TODO: check hdr_len with len? */ >> + >> +        iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - >> sizeof(vnet_hdr)); >>       } >>         if (unlikely(sock_flag(sk, SOCK_NOFCS))) { >> @@ -2912,7 +2927,7 @@ static int packet_snd(struct socket *sock, >> struct msghdr *msg, size_t len) >>       skb->mark = sockc.mark; >>       skb->tstamp = sockc.transmit_time; >>   -    if (has_vnet_hdr) { >> +    if (vnet_hdr_sz) { >>           err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); >>           if (err) >>               goto out_free; >> @@ -3307,11 +3322,11 @@ static int packet_recvmsg(struct socket >> *sock, struct msghdr *msg, size_t len, >>       if (pkt_sk(sk)->pressure) >>           packet_rcv_has_room(pkt_sk(sk), NULL); >>   -    if (pkt_sk(sk)->has_vnet_hdr) { >> -        err = packet_rcv_vnet(msg, skb, &len); >> +    vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); >> +    if (vnet_hdr_len) { >> +        err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); >>           if (err) >>               goto out_free; >> -        vnet_hdr_len = sizeof(struct virtio_net_hdr); >>       } >>         /* You lose any data beyond the buffer you gave. 
If it worries >> @@ -3772,7 +3787,17 @@ packet_setsockopt(struct socket *sock, int >> level, int optname, char __user *optv >>           if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { >>               ret = -EBUSY; >>           } else { >> -            po->has_vnet_hdr = !!val; >> +            /* Previouly we treat user input as boolean (!!val), >> +             * now we treat it as int. After the below correction, >> +             * the only violation case is 12, which results in >> +             * vnet header size of 12 instead of 10. >> +             */ >> +            if (val && >> +                val != sizeof(struct virtio_net_hdr) && >> +                val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) >> +                val = sizeof(struct virtio_net_hdr); >> + >> +            po->vnet_hdr_sz = val; >>               ret = 0; >>           } >>           release_sock(sk); >> @@ -3903,7 +3928,7 @@ static int packet_getsockopt(struct socket >> *sock, int level, int optname, >>           val = po->origdev; >>           break; >>       case PACKET_VNET_HDR: >> -        val = po->has_vnet_hdr; >> +        val = po->vnet_hdr_sz; > > > So the change here is noticeable by userspace. Maybe we need a new opt > for this? Nice catch, users may assume that only 0 or 1 is returned. 
Thanks, Jianfeng > > Thanks > > >>           break; >>       case PACKET_VERSION: >>           val = po->tp_version; >> diff --git a/net/packet/diag.c b/net/packet/diag.c >> index 7ef1c881ae74..950015b6704f 100644 >> --- a/net/packet/diag.c >> +++ b/net/packet/diag.c >> @@ -26,7 +26,7 @@ static int pdiag_put_info(const struct packet_sock >> *po, struct sk_buff *nlskb) >>           pinfo.pdi_flags |= PDI_AUXDATA; >>       if (po->origdev) >>           pinfo.pdi_flags |= PDI_ORIGDEV; >> -    if (po->has_vnet_hdr) >> +    if (po->vnet_hdr_sz) >>           pinfo.pdi_flags |= PDI_VNETHDR; >>       if (po->tp_loss) >>           pinfo.pdi_flags |= PDI_LOSS; >> diff --git a/net/packet/internal.h b/net/packet/internal.h >> index 3bb7c5fb3bff..11bc75950f28 100644 >> --- a/net/packet/internal.h >> +++ b/net/packet/internal.h >> @@ -115,9 +115,9 @@ struct packet_sock { >>       unsigned int        running;    /* bind_lock must be held */ >>       unsigned int        auxdata:1,    /* writer must hold sock lock */ >>                   origdev:1, >> -                has_vnet_hdr:1, >>                   tp_loss:1, >>                   tp_tx_has_off:1; >> +    int            vnet_hdr_sz; >>       int            pressure; >>       int            ifindex;    /* bound device        */ >>       __be16            num;